
SPEC-7821 3P Conversion: Squish-ccr (#26)

It uses extras/pull_and_build_from_git.py with a template JSON file to build the library. The main differences compared to the previous package are:

    Switched from SSE4.1 to SSE2, since SSE4.1 leads to compression issues with the BC6 format (some black pixel blocks); see the sketch after this list.
    Switched from a static library to a dynamic library, since it improves performance with the AP debug build.
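
For context, squish-ccr selects its SIMD level at compile time through the SQUISH_USE_SSE define (set to 2 by the CMakeLists.txt below). A minimal sketch of the gating pattern, mirroring the header guards visible in the simd_sse.h hunks of the patch:

    // Sketch of squish-ccr's compile-time SIMD gating (cf. simd_sse.h).
    // With SQUISH_USE_SSE=2 only the SSE/SSE2 headers are pulled in, so no
    // SSE4.1 instructions (the source of the BC6 black-block issue) are emitted.
    #include <xmmintrin.h>   // SSE
    #if (SQUISH_USE_SSE >= 2)
    #include <emmintrin.h>   // SSE2
    #endif
    #if (SQUISH_USE_SSE >= 3)
    #include <pmmintrin.h>   // SSE3
    #endif
    #if (SQUISH_USE_SSE >= 4)
    #include <smmintrin.h>   // SSE4.1 -- never reached when SQUISH_USE_SSE=2
    #endif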

Signed-off-by: qingtao <[email protected]>
Qing Tao 3 years ago
parent
commit
0787f06ecb

+ 138 - 0
package-system/squish-ccr/CMakeLists.txt

@@ -0,0 +1,138 @@
+#
+# Copyright (c) Contributors to the Open 3D Engine Project. For complete copyright and license terms please see the LICENSE at the root of this distribution.
+# 
+# SPDX-License-Identifier: Apache-2.0 OR MIT
+#
+#
+
+# CMake definition for squish-ccr 2.00 alpha2
+cmake_minimum_required(VERSION 3.17)
+
+project(squish-ccr)
+
+set(SQUISH_SOURCE_FILES 
+    alpha.cpp
+    alphanormalfit.cpp
+    bitoneblock.cpp
+    bitoneclusterfit.cpp
+    bitonefit.cpp
+    bitonenormalfit.cpp
+    bitonerangefit.cpp
+    bitoneset.cpp
+    colourblock.cpp
+    colourclusterfit.cpp
+    colourfit.cpp
+    colournormalfit.cpp
+    colourrangefit.cpp
+    colourset.cpp
+    hdrblock.cpp
+    hdrfit.cpp
+    hdrindexfit.cpp
+    hdrrangefit.cpp
+    hdrset.cpp
+    hdrsinglefit.cpp
+    hdrsinglesnap.cpp
+    maths.cpp
+    paletteblock.cpp
+    palettechannelfit.cpp
+    paletteclusterfit.cpp
+    palettefit.cpp
+    paletteindexfit.cpp
+    palettenormalfit.cpp
+    paletterangefit.cpp
+    paletteset.cpp
+    coloursinglefit.cpp
+    coloursinglesnap.cpp
+    palettesinglefit.cpp
+    palettesinglesnap.cpp
+    simd.cpp
+    squish.cpp
+)
+
+set(SQUISH_HEADER_FILES
+    alpha.h
+    alphanormalfit.h
+    bitoneblock.h
+    bitoneclusterfit.h
+    bitonefit.h
+    bitonenormalfit.h
+    bitonerangefit.h
+    bitoneset.h
+    colourblock.h
+    colourclusterfit.h
+    colourfit.h
+    colournormalfit.h
+    colourrangefit.h
+    colourset.h
+    config.h
+    helpers.h
+    hdrblock.h
+    hdrfit.h
+    hdrindexfit.h
+    hdrrangefit.h
+    hdrset.h
+    hdrsinglefit.h
+    hdrsinglesnap.h
+    maths.h
+    paletteblock.h
+    palettechannelfit.h
+    paletteclusterfit.h
+    palettefit.h
+    paletteindexfit.h
+    palettenormalfit.h
+    paletterangefit.h
+    paletteset.h
+    simd.h
+    simd_float.h
+    simd_sse.h
+    simd_ve.h
+    coloursinglefit.h
+    coloursinglesnap.h
+    palettesinglefit.h
+    palettesinglesnap.h
+    squish.h
+)
+set(SQUISH_INLINE_FILES
+    bitoneclusterfit.inl
+    coloursinglelookup.inl
+    palettesinglelookup.inl
+)
+
+set(SQUISH_PUBLIC_INCLUDE_FILES
+    squish.h
+    config.h
+    coloursinglelookup_ccr.inl
+    coloursinglelookup_ccr_vector.inl
+    degeneracy_ccr.inl
+)
+
+add_library(squish-ccr SHARED ${SQUISH_SOURCE_FILES} ${SQUISH_HEADER_FILES} ${SQUISH_INLINE_FILES})
+target_include_directories(squish-ccr PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_compile_definitions(squish-ccr PRIVATE SQUISH_USE_SSE=2 SQUISH_USE_CPP SQUISH_USE_CCR)
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    target_compile_options(squish-ccr PRIVATE -msse2 -Wno-unused-value)
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    target_compile_definitions(squish-ccr PRIVATE NDEBUG USE_CPP)
+endif()
+
+set_target_properties(squish-ccr
+    PROPERTIES
+        LIBRARY_OUTPUT_DIRECTORY_RELEASE "${CMAKE_BINARY_DIR}/bin/"
+        PUBLIC_HEADER "${SQUISH_PUBLIC_INCLUDE_FILES}"
+)
+
+include(GNUInstallDirs)
+
+install(TARGETS squish-ccr
+        PUBLIC_HEADER
+            DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/squish-ccr"
+        ARCHIVE
+            DESTINATION ${CMAKE_INSTALL_BINDIR}
+        LIBRARY
+            DESTINATION ${CMAKE_INSTALL_BINDIR}
+        RUNTIME
+            DESTINATION ${CMAKE_INSTALL_BINDIR}
+        FRAMEWORK
+            DESTINATION ${CMAKE_INSTALL_BINDIR}
+)

+ 45 - 0
package-system/squish-ccr/Findsquish-ccr.cmake.template

@@ -0,0 +1,45 @@
+#
+# Copyright (c) Contributors to the Open 3D Engine Project. For complete copyright and license terms please see the LICENSE at the root of this distribution.
+# 
+# SPDX-License-Identifier: Apache-2.0 OR MIT
+#
+
+# this file actually ingests the library and defines targets.
+
+set(LIB_NAME "squish-ccr")
+set(TARGET_WITH_NAMESPACE "3rdParty::$${LIB_NAME}")
+if (TARGET $${TARGET_WITH_NAMESPACE})
+    return()
+endif()
+
+set($${LIB_NAME}_INCLUDE_DIR $${CMAKE_CURRENT_LIST_DIR}/$${LIB_NAME}/include)
+set($${LIB_NAME}_LIBRARY_DIR $${CMAKE_CURRENT_LIST_DIR}/$${LIB_NAME}/bin)
+
+add_library($${TARGET_WITH_NAMESPACE} INTERFACE IMPORTED GLOBAL)
+
+# add include directory
+ly_target_include_system_directories(TARGET $${TARGET_WITH_NAMESPACE} INTERFACE $${$${LIB_NAME}_INCLUDE_DIR})
+
+if ($${PAL_PLATFORM_NAME} STREQUAL "Windows")
+    set($${LIB_NAME}_LIBRARY   $${$${LIB_NAME}_LIBRARY_DIR}/$${LIB_NAME}.lib)
+else()
+    set($${LIB_NAME}_LIBRARY   $${$${LIB_NAME}_LIBRARY_DIR}/$${CMAKE_SHARED_LIBRARY_PREFIX}$${LIB_NAME}$${CMAKE_SHARED_LIBRARY_SUFFIX})
+endif()
+
+set($${LIB_NAME}_RUNTIME_DEPENDENCIES $${$${LIB_NAME}_LIBRARY_DIR}/$${CMAKE_SHARED_LIBRARY_PREFIX}$${LIB_NAME}$${CMAKE_SHARED_LIBRARY_SUFFIX})
+
+# for linking
+target_link_libraries($${TARGET_WITH_NAMESPACE} INTERFACE $${$${LIB_NAME}_LIBRARY})
+
+# add runtime dependencies
+ly_add_target_files(TARGETS $${TARGET_WITH_NAMESPACE} FILES $${$${LIB_NAME}_RUNTIME_DEPENDENCIES})
+
+# using squish causes your target to get a USING_SQUISH_SDK define applied to it.
+target_compile_definitions($${TARGET_WITH_NAMESPACE} INTERFACE 
+    USING_SQUISH_SDK
+    SQUISH_USE_SSE=2
+    SQUISH_USE_CPP
+    SQUISH_USE_CCR
+    )
+
+set($${LIB_NAME}_FOUND True)
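
Downstream, any target that links 3rdParty::squish-ccr picks up the USING_SQUISH_SDK define from the find module above, so engine code can guard the integration. A hypothetical consumer sketch — the CompressImage/GetStorageRequirements names and the kDxt1 flag are assumptions carried over from the upstream libsquish API (the fork may name the BC-format flags differently), so verify against the shipped squish.h:

    // Hypothetical consumer of the squish-ccr shared library; the API names
    // are assumed from upstream libsquish and should be checked against squish.h.
    #if defined(USING_SQUISH_SDK)
    #include <squish.h>
    #include <vector>

    void CompressOneImage(const squish::u8* rgba, int width, int height,
                          std::vector<squish::u8>& out)
    {
        // Size the output for the block-compressed payload, then compress.
        out.resize(squish::GetStorageRequirements(width, height, squish::kDxt1));
        squish::CompressImage(rgba, width, height, out.data(), squish::kDxt1);
    }
    #endif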

+ 32 - 0
package-system/squish-ccr/LICENSE.txt

@@ -0,0 +1,32 @@
+LICENSE
+-------
+
+The squish library is distributed under the terms and conditions of the MIT
+license. This license is specified at the top of each source file and must be
+preserved in its entirety.
+
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          [email protected]
+	Copyright (c) 2012 Niels Fröhling              [email protected]
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to
+	permit persons to whom the Software is furnished to do so, subject to
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+   -------------------------------------------------------------------------- */

+ 78 - 0
package-system/squish-ccr/build_config.json

@@ -0,0 +1,78 @@
+{
+   "git_url":"https://github.com/Ethatron/squish-ccr.git",
+   "git_tag":"master",
+   "git_commit":"deb557d2fa647b191b37a2d8682df54ec8a7cfba",
+   "package_name":"squish-ccr",
+   "package_version":"deb557d-rev1",
+   "package_url":"http://sjbrown.co.uk/2006/01/19/dxt-compression-techniques/",
+   "package_license":"MIT",
+   "package_license_file":"LICENSE.txt",
+   "cmake_find_template":"Findsquish-ccr.cmake.template",
+   "cmake_find_target":"Findsquish-ccr.cmake",
+   "patch_file":"squish-ccr-deb557d-rev1.patch",
+   "additional_src_files":[
+      "CMakeLists.txt",
+      "LICENSE.txt"
+   ],
+   "Platforms":{
+      "Windows":{
+        "Windows": {
+            "custom_cmake_install": true,
+            "cmake_generate_args_release": [
+                "-G",
+                "\"Visual Studio 16 2019\"",
+                "-DCMAKE_CXX_STANDARD=17",
+                "-DCMAKE_WINDOWS_EXPORT_ALL_SYMBOLS=TRUE",
+                "-DBUILD_SHARED_LIBS=TRUE"
+            ],
+            "cmake_build_args": [
+                "-j"
+            ],
+            "build_configs": [
+                "Release"
+            ]
+        }
+      },
+      "Darwin":{
+        "Mac": {
+            "custom_cmake_install": true,
+            "cmake_generate_args_release": [
+                "-G",
+                "Xcode",
+                "-DCMAKE_OSX_DEPLOYMENT_TARGET=10.12",
+                "-DCMAKE_OSX_ARCHITECTURES=x86_64",
+                "-DCMAKE_CXX_FLAGS=\"-fPIC -O2\"",
+                "-DCMAKE_CXX_STANDARD=17",
+                "-DCMAKE_BUILD_TYPE=Release"
+            ],
+            "cmake_build_args": [
+                "-j",
+                "8"
+            ],
+            "build_configs": [
+                "Release"
+            ]
+        }
+      },
+      "Linux":{
+         "Linux":{
+            "custom_cmake_install":true,
+            "cmake_generate_args_release": [
+                "-G",
+                "Unix\\ Makefiles",
+                "-DCMAKE_C_COMPILER=clang-6.0",
+                "-DCMAKE_CXX_COMPILER=clang++-6.0",
+                "-DCMAKE_CXX_FLAGS=\"-fPIC -O2\"",
+                "-DCMAKE_CXX_STANDARD=17",
+                "-DCMAKE_BUILD_TYPE=Release"
+            ],
+            "cmake_build_args":[
+               "-j"
+            ],
+            "build_configs":[
+                "Release"
+            ]
+         }
+      }
+   }
+}
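
Most hunks in the patch below are mechanical (moving friend-function bodies out of class scope so clang accepts them, adding static_casts on array indices), but one recurring substantive fix replaces left-shifts of negative signed values — undefined behavior in C++ — with shifts on the unsigned bit pattern. A minimal illustration of the pattern applied in Codebook6or8 and the Col8 mask construction:

    #include <cstdint>

    // Before the patch (undefined behavior, the left operand is negative):
    //     codes[6 + i] = (s16)-127 << prc;
    // After the patch: shift the unsigned representation, then cast back.
    int16_t ShiftNegative(int16_t v, int prc)
    {
        return (int16_t)((uint16_t)v << prc);  // unsigned shift is well-defined
    }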

+ 2537 - 0
package-system/squish-ccr/squish-ccr-deb557d-rev1.patch

@@ -0,0 +1,2537 @@
+diff --git a/bitoneset.cpp b/bitoneset.cpp
+index bc0a0a7..3dc456d 100644
+--- a/bitoneset.cpp
++++ b/bitoneset.cpp
+@@ -371,7 +371,7 @@ BitoneSet::BitoneSet(f23 const* rgba, int mask, int flags)
+ void BitoneSet::RemapIndices(u8 const* source, u8* target) const
+ {
+   for (int i = 0; i < 16; ++i) {
+-    u8 t = 3; t = ((m_remap[i] == -1) ? t : source[m_remap[i]]); target[i] = t;
++    u8 t = 3; t = ((m_remap[i] == -1) ? t : source[static_cast<int>(m_remap[i])]); target[i] = t;
+   }
+ }
+ #endif
+diff --git a/colourset.cpp b/colourset.cpp
+index 9af55ef..dcc4a5d 100644
+--- a/colourset.cpp
++++ b/colourset.cpp
+@@ -25,6 +25,7 @@
+    -------------------------------------------------------------------------- */
+ 
+ #include <assert.h>
++#include <string.h>
+ #include "colourset.h"
+ #include "helpers.h"
+ 
+@@ -409,7 +410,7 @@ bool ColourSet::RemoveBlack(const Vec3 &metric, Scr3 &error)
+ 	continue;
+ 
+       // maps to black
+-      Vec3 colour = m_points[m_remap[i]];
++      Vec3 colour = m_points[static_cast<int>(m_remap[i])];
+       /*Vec3 result = q.SnapToLattice(colour);*/
+       if (true /*CompareAllEqualTo(result, Vec3(0.0f))*/) {
+ 	Scr3 len = LengthSquared(metric * colour);
+@@ -451,7 +452,7 @@ bool ColourSet::RemoveBlack(const Vec3 &metric, Scr3 &error)
+ void ColourSet::RemapIndices(u8 const* source, u8* target) const
+ {
+   for (int i = 0; i < 16; ++i) {
+-    u8 t = 3; t = ((m_remap[i] == -1) ? t : source[m_remap[i]]); target[i] = t;
++    u8 t = 3; t = ((m_remap[i] == -1) ? t : source[static_cast<int>(m_remap[i])]); target[i] = t;
+   }
+ }
+ #endif
+diff --git a/config.h b/config.h
+index ef7dbbd..9b1bf89 100644
+--- a/config.h
++++ b/config.h
+@@ -413,7 +413,8 @@ using namespace ::Concurrency;
+ #ifdef __GNUC__
+ #define assume
+ #define doinline
+-#define	passreg		__fastcall
++// clang reports warnings with __fastcall with x86_64 and __fastcall only works for i386 anyway
++#define	passreg
+ #else
+ #define assume		__assume
+ #define doinline	__forceinline
+diff --git a/inlineables.cpp b/inlineables.cpp
+index f2e0ca1..cdb51bc 100644
+--- a/inlineables.cpp
++++ b/inlineables.cpp
+@@ -162,6 +162,8 @@ static const vQuantizer q8880s1(8, 8, 8, 0, ~0);
+ static const vQuantizer q7770s1(7, 7, 7, 0, ~0);
+ static const vQuantizer q5550s1(5, 5, 5, 0, ~0);
+ 
++static const vQuantizer invalidQuantizer(0, 0, 0, 0, 0);
++
+ #define vGetQuantizer(r, g, b, a)					\
+ 	(((r) == 7) && ((a) == 8)                ? q7778s1 :		\
+ 	(((r) == 5) && ((a) == 6)                ? q5556s1 :		\
+@@ -171,7 +173,7 @@ static const vQuantizer q5550s1(5, 5, 5, 0, ~0);
+ 	(((r) == 8) && ((a) == 1)                ? q8880s1 :		\
+ 	(((r) == 7) && ((a) == 1)                ? q7770s1 :		\
+ 	(((r) == 5) && ((a) == 1)                ? q5550s1 :		\
+-	(vQuantizer&)*(vQuantizer*)nullptr))))))))
++	invalidQuantizer))))))))
+ 
+ #define eGetQuantizer(r, g, b, a, e)					\
+ 	(((r) == 7) && ((a) == 8) && ((e) == ~0) ? q7778s1 :		\
+@@ -182,7 +184,7 @@ static const vQuantizer q5550s1(5, 5, 5, 0, ~0);
+ 	(((r) == 8) && ((a) == 1) && ((e) ==  0) ? q8880s0 :		\
+ 	(((r) == 7) && ((a) == 1) && ((e) ==  0) ? q7770s0 :		\
+ 	(((r) == 5) && ((a) == 1) && ((e) ==  0) ? q5550s0 :		\
+-	(vQuantizer&)*(vQuantizer*)nullptr))))))))
++	invalidQuantizer))))))))
+ 
+ template<const int rb, const int gb, const int bb, const int ab, const int eb, const int sb>
+ static doinline void passreg FloatTo(Vec4 (&colour)[1], Col4 (&field)[1][FIELDN], int bitset) ccr_restricted
+@@ -900,15 +902,16 @@ static doinline void passreg Codebook6or8(s16 (&codes)[8*1], bool bw) ccr_restri
+       cd = (2 * c + 3 * d); codes[4 + i] = (s16)((cd * 0x3334) >> 16) + (cd < 0);
+       cd = (1 * c + 4 * d); codes[5 + i] = (s16)((cd * 0x3334) >> 16) + (cd < 0);
+ 
+-      codes[6 + i] = (s16)-127 << prc;
+-      codes[7 + i] = (s16) 127 << prc;
++      // Negative number doesn't support shift. Need to convert it to unsigned first
++      codes[6 + i] = (s16) (((u16)(-127)) << prc);
++      codes[7 + i] = (s16) (127 << prc);
+ 
+       assert(s16(codes[2]) == (((s16(4) * s16(codes[0])) + (s16(1) * s16(codes[1]))) / 5));
+       assert(s16(codes[3]) == (((s16(3) * s16(codes[0])) + (s16(2) * s16(codes[1]))) / 5));
+       assert(s16(codes[4]) == (((s16(2) * s16(codes[0])) + (s16(3) * s16(codes[1]))) / 5));
+       assert(s16(codes[5]) == (((s16(1) * s16(codes[0])) + (s16(4) * s16(codes[1]))) / 5));
+-      assert(s16(codes[6]) == (-127 << prc));
+-      assert(s16(codes[7]) == ( 127 << prc));
++      assert(s16(codes[6]) == (((u16)(-127)) << prc));
++      assert(s16(codes[7]) == (127 << prc));
+     }
+     else {
+       cd = (6 * c + 1 * d); codes[2 + i] = (s16)((cd * 0x4925) >> 17) + (cd < 0);
+@@ -1063,7 +1066,8 @@ static doinline void passreg Codebook6(Col8 &codes, Col8::Arg start, Col8::Arg e
+   // max   signed: (5 * 127) << 5 = 20320 / 0x4F60 fits   signed short
+   const Col8 smul = Col8(0x05 << pb, 0x00 << pb, 0x04 << pb, 0x03 << pb, 0x02 << pb, 0x01 << pb, 0x00 << pb, 0x00 << pb);
+   const Col8 emul = Col8(0x00 << pb, 0x05 << pb, 0x01 << pb, 0x02 << pb, 0x03 << pb, 0x04 << pb, 0x00 << pb, 0x00 << pb);
+-  const Col8 mask = Col8(0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, min  << pb, max  << pb);
++  // Negative number doesn't support shift. Need to convert it to unsigned first
+  const Col8 mask = Col8(0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, ((u16)min) << pb, ((u16)max) << pb);
+ 
+   // range [0,2*5*255]
+   Col8 ipol = (smul * start) + (emul * end);
+diff --git a/maths.cpp b/maths.cpp
+index d9c3808..b58c36a 100644
+--- a/maths.cpp
++++ b/maths.cpp
+@@ -790,7 +790,16 @@ void EstimatePrincipleComponent(Sym3x3 const& matrix, Vec4 &out)
+     Scr4 y = Dot(v, row1);
+     Scr4 z = Dot(v, row2);
+ 
+-    v  = Vec4(x, y, z);
++    //This is to fix Nans caused by really really small values.
++    if(Vec3(x,y,z) < Vec3(FLT_EPSILON))
++    {
++        v  = Vec4(FLT_EPSILON,FLT_EPSILON,FLT_EPSILON);
++    }
++    else
++    {
++        v  = Vec4(x, y, z);
++    }
++
+     v *= Reciprocal(HorizontalMax(Abs(v)));
+   }
+ #if POWER_ITERATION_COUNT <= 0
+diff --git a/paletteclusterfit.cpp b/paletteclusterfit.cpp
+index 2d6f5a1..b98e975 100644
+--- a/paletteclusterfit.cpp
++++ b/paletteclusterfit.cpp
+@@ -26,6 +26,7 @@
+    -------------------------------------------------------------------------- */
+ 
+ #include <assert.h>
++#include <stdio.h>
+ 
+ #include "paletteclusterfit.h"
+ #include "paletteset.h"
+diff --git a/palettefit.cpp b/palettefit.cpp
+index 062f45c..120da27 100644
+--- a/palettefit.cpp
++++ b/palettefit.cpp
+@@ -150,9 +150,9 @@ const int *PaletteFit::GetSharedMap(int mode) {
+ }
+ 
+ int PaletteFit::GetSharedSkip(int mode) {
+-  if (PBcfg[mode].EPB) return skip[1][PBcfg[mode].NS];
+-  if (PBcfg[mode].SPB) return skip[0][PBcfg[mode].NS];
+-  return NULL;
++  if (PBcfg[mode].EPB) return skip[1][static_cast<int>(PBcfg[mode].NS)];
++  if (PBcfg[mode].SPB) return skip[0][static_cast<int>(PBcfg[mode].NS)];
++  return 0;
+ }
+ 
+ int PaletteFit::GetPrecisionBits(int mode) {
+diff --git a/paletteset.cpp b/paletteset.cpp
+index bee740c..8c7aea0 100644
+--- a/paletteset.cpp
++++ b/paletteset.cpp
+@@ -1248,7 +1248,7 @@ void PaletteSet::RemapIndices(u8 const* source, u8* target, int set) const
+       if ((imask & 1) == 0)
+ 	continue;
+ 
+-      u8 t = 0; t = ((m_remap[s][i] == -1) ? t : source[m_remap[s][i]]); target[i] = t;
++      u8 t = 0; t = ((m_remap[s][i] == -1) ? t : source[static_cast<int>(m_remap[s][i])]); target[i] = t;
+     }
+   }
+ }
+diff --git a/simd_sse.h b/simd_sse.h
+index f959e20..1a2f6b8 100644
+--- a/simd_sse.h
++++ b/simd_sse.h
+@@ -1,7 +1,7 @@
+ /* -----------------------------------------------------------------------------
+ 
+ 	Copyright (c) 2006 Simon Brown                          [email protected]
+-	Copyright (c) 2012 Niels Fröhling              [email protected]
++	Copyright (c) 2012 Niels Fr?hling              [email protected]
+ 
+ 	Permission is hereby granted, free of charge, to any person obtaining
+ 	a copy of this software and associated documentation files (the
+@@ -33,6 +33,7 @@
+ #endif
+ #if ( SQUISH_USE_SSE >= 3 )
+ #include <pmmintrin.h>
++#include <smmintrin.h>
+ #endif
+ #if ( SQUISH_USE_SSE >= 4 )
+ #include <smmintrin.h>
+@@ -69,6 +70,12 @@
+ 
+ namespace squish {
+ 
++class Col3;
++class Col4;
++class Col8;
++class Vec3;
++class Vec4;
++
+ #define COL4_CONST( X ) Col4( X )
+ 
+ 
+@@ -263,7 +270,7 @@ public:
+ 	Col3& operator/=( short v )
+ 	{
+ 		__m128
+-			
++
+ 		fp = _mm_cvtepi32_ps(m_v);
+ 		fp = _mm_div_ps(fp, _mm_set1_ps(v));
+ 		m_v = _mm_cvttps_epi32(fp);
+@@ -351,64 +358,18 @@ public:
+ 
+ 	template<const int n>
+ 	friend Col3 ShiftLeft( Arg a );
+-	template<const int n>
+-	friend Col3 ShiftLeft( Arg a )
+-	{
+-		if ((n) <= 0)
+-			return Col3( a.m_v );
+-		if ((n) <= 7)
+-			return Col3( _mm_slli_epi32( a.m_v, (n) & 7 ) );
+-		if ((n) & 7)
+-			return Col3( _mm_slli_epi32( _mm_slli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
+-
+-			return Col3( _mm_slli_si128( a.m_v, (n) >> 3 ) );
+-	}
+ 
+ 	template<const int n>
+ 	friend Col3 ShiftRight( Arg a );
+-	template<const int n>
+-	friend Col3 ShiftRight( Arg a )
+-	{
+-		if ((n) <= 0)
+-			return Col3( a.m_v );
+-		if ((n) <= 7)
+-			return Col3( _mm_srli_epi32( a.m_v, (n) & 7 ) );
+-		if ((n) & 7)
+-			return Col3( _mm_srli_epi32( _mm_srli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
+-
+-			return Col3( _mm_srli_si128( a.m_v, (n) >> 3 ) );
+-	}
+ 
+ 	template<const int n>
+ 	friend Col3 ShiftRightHalf( Arg a );
+-	template<const int n>
+-	friend Col3 ShiftRightHalf( Arg a )
+-	{
+-		return Col3( (n) > 0 ? _mm_srli_epi64( a.m_v, (n) ) : a.m_v );
+-	}
+-
+-	friend Col3 ShiftRightHalf( Arg a, const int n )
+-	{
+-		return Col3( _mm_srl_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
+-	}
+-
+-	friend Col3 ShiftRightHalf( Arg a, Arg b )
+-	{
+-		return Col3( _mm_srl_epi64( a.m_v, b.m_v ) );
+-	}
++	friend Col3 ShiftRightHalf( Arg a, const int n );
++	friend Col3 ShiftRightHalf( Arg a, Arg b );
+ 
+ 	template<const int n>
+ 	friend Col3 ShiftLeftHalf( Arg a );
+-	template<const int n>
+-	friend Col3 ShiftLeftHalf( Arg a )
+-	{
+-		return Col3( (n) > 0 ? _mm_slli_epi64( a.m_v, (n) ) : a.m_v );
+-	}
+-
+-	friend Col3 ShiftLeftHalf( Arg a, const int n )
+-	{
+-		return Col3( _mm_sll_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
+-	}
++	friend Col3 ShiftLeftHalf( Arg a, const int n );
+ 
+ 	template<const int r, const int g, const int b>
+ 	friend Col3 ShiftLeftLo( Arg v )
+@@ -422,140 +383,24 @@ public:
+ 
+ 	template<const int n, const int p>
+ 	friend Col3 MaskBits( Arg a );
+-	template<const int n, const int p>
+-	friend Col3 MaskBits( Arg a )
+-	{
+-		if ((p + n) <= 0)
+-			return Col3(0);
+-		if ((p + n) >= 64)
+-			return a;
+-
+-		// compile time
+-		__int64 base = ~(0xFFFFFFFFFFFFFFFFULL << (     (p + n) & 63));
+-	//	__int64 base =  (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63));
+-		__m128i mask = _mm_setr_epi32(
+-		  (int)(base >>  0),
+-		  (int)(base >> 32), 0, 0
+-		);
+-
+-		return Col3( _mm_and_si128( a.m_v, mask ) );
+-	}
+-
+-	friend Col3 MaskBits( Arg a, const int n, const int p )
+-	{
+-		const int val = 64 - (p + n);
+-
+-		__m128i shift = _mm_max_epi16( _mm_cvtsi32_si128( val ), _mm_set1_epi32( 0 ) );
+-		__m128i mask = _mm_setr_epi32(
+-		  0xFFFFFFFF,
+-		  0xFFFFFFFF, 0, 0
+-		);
+-
+-		mask = _mm_srl_epi64( mask, shift );
+-
+-		// (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63))
+-		return Col3( _mm_and_si128( a.m_v, mask ) );
+-	}
++    friend Col3 MaskBits(Arg a, const int n, const int p);
+ 
+ 	template<const int n, const int p>
+ 	friend Col3 CopyBits( Arg left, Arg right );
+-	template<const int n, const int p>
+-	friend Col3 CopyBits( Arg left, Arg right )
+-	{
+-		if (!(n))
+-			return left;
+-		if (!(p))
+-			return MaskBits<n, 0>(right);
+-		if (((p) + (n)) >= 64)
+-			return (left) + ShiftLeftHalf<p>(right);
+-
+-#if ( SQUISH_USE_XSSE == 4 )
+-		return Col3( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
+-#else
+-		return MaskBits<p, 0>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
+-	//	return               (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
+-#endif
+-	}
+-
+-	friend Col3 CopyBits( Arg left, Col3 &right, const int n, const int p )
+-	{
+-#if ( SQUISH_USE_XSSE == 4 )
+-		/* ---- ---bl xxxx xxxx */
+-		const int val = (p << 8) + (n << 0);
+-
+-		right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
+-		return Col3( _mm_insert_si64( left.m_v, right.m_v ) );
+-#else
+-		return MaskBits(left, p, 0) + MaskBits(ShiftLeftHalf(right, p), n, p);
+-	//	return         (left      ) + MaskBits(ShiftLeftHalf(right, p), n, p);
+-#endif
+-	}
+ 
++	friend Col3 CopyBits( Arg left, Col3 &right, const int n, const int p );
+ 	template<const int n, const int p>
+ 	friend Col3 ExtrBits( Arg a );
+-	template<const int n, const int p>
+-	friend Col3 ExtrBits( Arg a )
+-	{
+-		if (!(n))
+-			return Col3(0);
+-		if (!(p))
+-			return MaskBits<n, 0>(a);
+-		if (((n) + (p)) >= 64)
+-			return ShiftRightHalf<p>(a);
+-
+-#if ( SQUISH_USE_XSSE == 4 )
+-		return Col3( _mm_extracti_si64( a.m_v, n, p ) );
+-#else
+-		return MaskBits<n, 0>(ShiftRightHalf<p>(a));
+-#endif
+-	}
+-
+-	friend Col3 ExtrBits( Arg a, const int n, const int p )
+-	{
+-#if ( SQUISH_USE_XSSE == 4 )
+-		/* ---- ----- ---- ---bl */
+-		const int val = (p << 8) + (n << 0);
+-
+-		return Col3( _mm_extract_si64( a.m_v, _mm_cvtsi32_si128( val ) ) );
+-#else
+-		return MaskBits(ShiftRightHalf(a, p), n, 0);
+-#endif
+-	}
+ 
++	friend Col3 ExtrBits( Arg a, const int n, const int p );
+ 	template<const int n, const int p>
+ 	friend void ExtrBits( Arg left, Col3 &right );
+-	template<const int n, const int p>
+-	friend void ExtrBits( Arg left, Col3 &right )
+-	{
+-		right  = ExtrBits<n, p>( left );
+-	}
+ 
+ 	template<const int n, const int p>
+ 	friend void ConcBits( Arg left, Col3 &right );
+-	template<const int n, const int p>
+-	friend void ConcBits( Arg left, Col3 &right )
+-	{
+-		right  = ShiftLeft<32>( right );
+-		if (n > 0)
+-			right += ExtrBits<n, p>( left );
+-	}
+ 
+ 	template<const int n, const int p>
+ 	friend void ReplBits( Arg left, Col3 &right );
+-	template<const int n, const int p>
+-	friend void ReplBits( Arg left, Col3 &right )
+-	{
+-		if (!n)
+-			return;
+-		if ((n < 0)) {
+-			right  = ExtrBits<-n, p>( left );
+-			right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 3 ) );
+-		}
+-		else {
+-			right  = ExtrBits< n, p>( left );
+-			right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
+-		}
+-	}
+ 
+ 	friend Col3 Mul16x16u( Arg a, Arg b )
+ 	{
+@@ -652,18 +497,7 @@ public:
+ 	template<const int f, const int t>
+ 	friend Col3 Exchange( Arg a );
+ 	template<const int f, const int t>
+-	friend Col3 Exchange( Arg a )
+-	{
+-		if (f == t)
+-			return a;
+-
+-		return Col3( _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SHUF(
+-			(t == 0 ? f : (f == 0 ? t : 0)),
+-			(t == 1 ? f : (f == 1 ? t : 1)),
+-			(t == 2 ? f : (f == 2 ? t : 2)),
+-			(t == 3 ? f : (f == 3 ? t : 3))
+-		) ) );
+-	}
++	friend Col3 Exchange( Arg a );
+ 
+ 	friend Col3 HorizontalAdd( Arg a )
+ 	{
+@@ -751,7 +585,7 @@ public:
+ 		return HorizontalAdd( a, b );
+ #endif
+ 	}
+-	
++
+ 	friend Col3 HorizontalMaxTiny( Arg a )
+ 	{
+ #if ( SQUISH_USE_SSE >= 4 ) && 0
+@@ -867,7 +701,7 @@ public:
+ 
+ 	      return Col3( _mm_castps_si128 ( resc ) );
+ 	}
+-	
++
+ 	friend bool CompareFirstLessThan( Arg left, Arg right )
+ 	{
+ 		__m128i bits = _mm_cmplt_epi32( left.m_v, right.m_v );
+@@ -937,7 +771,7 @@ public:
+ 
+ 		loc = _mm_cvtsi128_si32( r );
+ 	}
+-	
++
+ 	friend void PackBytes( Arg a, int &loc )
+ 	{
+ 		__m128i
+@@ -947,7 +781,7 @@ public:
+ 
+ 		loc = _mm_cvtsi128_si32( r );
+ 	}
+-	
++
+ 	friend void PackWords( Arg a, unsigned__int64 &loc )
+ 	{
+ 		__m128i
+@@ -964,17 +798,17 @@ public:
+ //		loc = _mm_cvtsi128_si64( r );
+ 		_mm_storel_epi64( (__m128i *)&loc, r );
+ 	}
+-	
++
+ 	friend void PackWords( Arg a, __int64 &loc )
+ 	{
+ 		__m128i
+-		  
++
+ 		r = _mm_packs_epi32( a.m_v, a.m_v );
+ 
+ //		loc = _mm_cvtsi128_si64( r );
+ 		_mm_storel_epi64( (__m128i *)&loc, r );
+ 	}
+-	
++
+ 	// clamp the output to [0, 1]
+ 	Col3 Clamp() const {
+ 		Col3 const one (0xFF);
+@@ -1020,17 +854,17 @@ public:
+ 	{
+ 		_mm_store_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
+ 	}
+-	
++
+ 	friend void StoreUnaligned( Arg a, void *destination )
+ 	{
+ 		_mm_storeu_si128( (__m128i *)destination, a.m_v );
+ 	}
+-	
++
+ 	friend void StoreUnaligned( Arg a, Arg b, void *destination )
+ 	{
+ 		_mm_storeu_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
+ 	}
+-	
++
+ 	friend void StoreUnaligned( Arg a, u8* loc ) {
+ 	  PackBytes( a, (unsigned int&) (*((unsigned int *)loc)) ); }
+ 	friend void StoreUnaligned( Arg a, u16* loc ) {
+@@ -1043,10 +877,202 @@ public:
+ private:
+ 	__m128i m_v;
+ 
+-	friend class Col4;
+-	friend class Vec3;
++	friend squish::Col4;
++	friend squish::Vec3;
+ };
+ 
++template<const int f, const int t>
++Col3 Exchange( Col3::Arg a )
++{
++    if (f == t)
++        return a;
++
++    return Col3( _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SHUF(
++                                                            (t == 0 ? f : (f == 0 ? t : 0)),
++                                                            (t == 1 ? f : (f == 1 ? t : 1)),
++                                                            (t == 2 ? f : (f == 2 ? t : 2)),
++                                                            (t == 3 ? f : (f == 3 ? t : 3))
++                                                            ) ) );
++}
++
++template<const int n>
++Col3 ShiftRight(Col3::Arg a)
++{
++	if ((n) <= 0)
++		return Col3(a.m_v);
++	if ((n) <= 7)
++		return Col3(_mm_srli_epi32(a.m_v, (n) & 7));
++	if ((n) & 7)
++		return Col3(_mm_srli_epi32(_mm_srli_si128(a.m_v, (n) >> 3), (n) & 7));
++
++	return Col3(_mm_srli_si128(a.m_v, (n) >> 3));
++}
++
++template<const int n>
++Col3 ShiftLeftHalf( Col3::Arg a )
++{
++    return Col3( (n) > 0 ? _mm_slli_epi64( a.m_v, (n) ) : a.m_v );
++}
++
++inline Col3 ShiftLeftHalf( Col3::Arg a, const int n )
++{
++    return Col3( _mm_sll_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
++}
++
++template<const int n>
++Col3 ShiftRightHalf( Col3::Arg a )
++{
++    return Col3( (n) > 0 ? _mm_srli_epi64( a.m_v, (n) ) : a.m_v );
++}
++
++inline Col3 ShiftRightHalf( Col3::Arg a, const int n )
++{
++    return Col3( _mm_srl_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
++}
++
++inline Col3 ShiftRightHalf( Col3::Arg a, Col3::Arg b )
++{
++    return Col3( _mm_srl_epi64( a.m_v, b.m_v ) );
++}
++
++template<const int n, const int p>
++Col3 MaskBits( Col3::Arg a )
++{
++    if ((p + n) <= 0)
++        return Col3(0);
++    if ((p + n) >= 64)
++        return a;
++
++    // compile time
++    __int64 base = ~(0xFFFFFFFFFFFFFFFFULL << (     (p + n) & 63));
++    //	__int64 base =  (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63));
++    __m128i mask = _mm_setr_epi32(
++                                  (int)(base >>  0),
++                                  (int)(base >> 32), 0, 0
++                                  );
++
++    return Col3( _mm_and_si128( a.m_v, mask ) );
++}
++
++inline Col3 MaskBits( Col3::Arg a, const int n, const int p )
++{
++    const int val = 64 - (p + n);
++
++    __m128i shift = _mm_max_epi16( _mm_cvtsi32_si128( val ), _mm_set1_epi32( 0 ) );
++    __m128i mask = _mm_setr_epi32(
++                                  0xFFFFFFFF,
++                                  0xFFFFFFFF, 0, 0
++                                  );
++
++    mask = _mm_srl_epi64( mask, shift );
++
++    // (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63))
++    return Col3( _mm_and_si128( a.m_v, mask ) );
++}
++
++template<const int n, const int p>
++Col3 CopyBits( Col3::Arg left, Col3::Arg right )
++{
++    if (!(n))
++        return left;
++    if (!(p))
++        return MaskBits<n, 0>(right);
++    if (((p) + (n)) >= 64)
++        return (left) + ShiftLeftHalf<p>(right);
++
++#if ( SQUISH_USE_XSSE == 4 )
++    return Col3( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
++#else
++    return MaskBits<p, 0>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
++    //	return               (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
++#endif
++}
++
++inline Col3 CopyBits( Col3::Arg left, Col3 &right, const int n, const int p )
++{
++#if ( SQUISH_USE_XSSE == 4 )
++    /* ---- ---bl xxxx xxxx */
++    const int val = (p << 8) + (n << 0);
++
++    right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
++    return Col3( _mm_insert_si64( left.m_v, right.m_v ) );
++#else
++    return MaskBits(left, p, 0) + MaskBits(ShiftLeftHalf(right, p), n, p);
++    //	return         (left      ) + MaskBits(ShiftLeftHalf(right, p), n, p);
++#endif
++}
++
++template<const int n, const int p>
++Col3 ExtrBits( Col3::Arg a )
++{
++    if (!(n))
++        return Col3(0);
++    if (!(p))
++        return MaskBits<n, 0>(a);
++    if (((n) + (p)) >= 64)
++        return ShiftRightHalf<p>(a);
++
++#if ( SQUISH_USE_XSSE == 4 )
++    return Col3( _mm_extracti_si64( a.m_v, n, p ) );
++#else
++    return MaskBits<n, 0>(ShiftRightHalf<p>(a));
++#endif
++}
++
++inline Col3 ExtrBits( Col3::Arg a, const int n, const int p )
++{
++#if ( SQUISH_USE_XSSE == 4 )
++    /* ---- ----- ---- ---bl */
++    const int val = (p << 8) + (n << 0);
++
++    return Col3( _mm_extract_si64( a.m_v, _mm_cvtsi32_si128( val ) ) );
++#else
++    return MaskBits(ShiftRightHalf(a, p), n, 0);
++#endif
++}
++
++template<const int n>
++Col3 ShiftLeft( Col3::Arg a )
++{
++    if ((n) <= 0)
++        return Col3( a.m_v );
++    if ((n) <= 7)
++        return Col3( _mm_slli_epi32( a.m_v, (n) & 7 ) );
++    if ((n) & 7)
++        return Col3( _mm_slli_epi32( _mm_slli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
++
++    return Col3( _mm_slli_si128( a.m_v, (n) >> 3 ) );
++}
++
++template<const int n, const int p>
++void ExtrBits( Col3::Arg left, Col3 &right )
++{
++    right  = ExtrBits<n, p>( left );
++}
++
++template<const int n, const int p>
++void ConcBits( Col3::Arg left, Col3 &right )
++{
++    right  = ShiftLeft<32>( right );
++    if (n > 0)
++        right += ExtrBits<n, p>( left );
++}
++
++template<const int n, const int p>
++void ReplBits( Col3::Arg left, Col3 &right )
++{
++    if (!n)
++        return;
++    if ((n < 0)) {
++        right  = ExtrBits<-n, p>( left );
++        right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 3 ) );
++    }
++    else {
++        right  = ExtrBits< n, p>( left );
++        right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
++    }
++}
++
+ class Col4
+ {
+ public:
+@@ -1305,317 +1331,56 @@ public:
+ 
+ 	template<const int n>
+ 	friend Col4 FillSign( Arg a );
+-	template<const int n>
+-	friend Col4 FillSign( Arg a )
+-	{
+-		return Col4( _mm_srai_epi32( _mm_slli_epi32( a.m_v, n ), n ) );
+-	}
+ 
+ 	template<const int n>
+ 	friend Col4 ExtendSign( Arg a );
+-	template<const int n>
+-	friend Col4 ExtendSign( Arg a )
+-	{
+-		return Col4( _mm_srai_epi32( a.m_v, n ) );
+-	}
+-	
++
+ 	template<const int n>
+ 	friend Col4 ShiftLeft( Arg a );
+-	template<const int n>
+-	friend Col4 ShiftLeft( Arg a )
+-	{
+-		if ((n) <= 0)
+-			return Col4( a.m_v );
+-		if ((n) <= 7)
+-			return Col4( _mm_slli_epi32( a.m_v, (n) & 7 ) );
+-		if ((n) & 7)
+-			return Col4( _mm_slli_epi32( _mm_slli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
+-
+-			return Col4( _mm_slli_si128( a.m_v, (n) >> 3 ) );
+-	}
+ 
+ 	template<const int n>
+ 	friend Col4 ShiftRight( Arg a );
+-	template<const int n>
+-	friend Col4 ShiftRight( Arg a )
+-	{
+-		if ((n) <= 0)
+-			return Col4( a.m_v );
+-		if ((n) <= 7)
+-			return Col4( _mm_srli_epi32( a.m_v, (n) & 7 ) );
+-		if ((n) & 7)
+-			return Col4( _mm_srli_epi32( _mm_srli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
+-
+-			return Col4( _mm_srli_si128( a.m_v, (n) >> 3 ) );
+-	}
+ 
+ 	template<const int n>
+ 	friend Col4 ShiftRightHalf( Arg a );
+-	template<const int n>
+-	friend Col4 ShiftRightHalf( Arg a )
+-	{
+-		return Col4( (n) > 0 ? _mm_srli_epi64( a.m_v, (n) ) : a.m_v );
+-	}
+-
+-	friend Col4 ShiftRightHalf( Arg a, const int n )
+-	{
+-		return Col4( _mm_srl_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
+-	}
+-
+-	friend Col4 ShiftRightHalf( Arg a, Arg b )
+-	{
+-		return Col4( _mm_srl_epi64( a.m_v, b.m_v ) );
+-	}
++	friend Col4 ShiftRightHalf( Arg a, const int n );
++	friend Col4 ShiftRightHalf( Arg a, Arg b );
+ 
+ 	template<const int n>
+ 	friend Col4 ShiftLeftHalf( Arg a );
+-	template<const int n>
+-	friend Col4 ShiftLeftHalf( Arg a )
+-	{
+-		return Col4( (n) > 0 ? _mm_slli_epi64( a.m_v, (n) ) : a.m_v );
+-	}
+-
+-	friend Col4 ShiftLeftHalf( Arg a, const int n )
+-	{
+-		return Col4( _mm_sll_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
+-	}
++    friend Col4 ShiftLeftHalf( Arg a, const int n  );
+ 
+ 	template<const int r, const int g, const int b, const int a>
+ 	friend Col4 ShiftLeftLo( Arg v );
+-	template<const int r, const int g, const int b, const int a>
+-	friend Col4 ShiftLeftLo( Arg v )
+-	{
+-		// (1 << r, 1 << g, 1 << b, 1 << a);
+-		Col4 p2; p2.SetRGBApow2<0>(r, g, b, a);
+-
+-#if ( SQUISH_USE_SSE >= 4 )
+-		return Col4( _mm_mullo_epi32( v.m_v, p2.m_v ) );
+-#else
+-		return Col4( _mm_mullo_epi16( v.m_v, p2.m_v ) );
+-#endif
+-	}
+ 
+ 	template<const int n, const int p>
+ 	friend Col4 MaskBits( Arg a );
+-	template<const int n, const int p>
+-	friend Col4 MaskBits( Arg a )
+-	{
+-		if (((p) + (n)) <= 0)
+-			return Col4(0);
+-		if (((p) + (n)) >= 64)
+-			return a;
+-
+-		// compile time
+-		__int64 base = ~(0xFFFFFFFFFFFFFFFFULL << (     ((p) + (n)) & 63));
+-	//	__int64 base =  (0xFFFFFFFFFFFFFFFFULL >> (64 - ((p) + (n)) & 63));
+-		__m128i mask = _mm_setr_epi32(
+-		  (int)(base >>  0),
+-		  (int)(base >> 32), 0, 0
+-		);
+-
+-		return Col4( _mm_and_si128( a.m_v, mask ) );
+-	}
+-
+-	friend Col4 MaskBits( Arg a, const int n, const int p )
+-	{
+-		const int val = 64 - ((p) + (n));
+-
+-		__m128i shift = _mm_max_epi16( _mm_cvtsi32_si128( val ), _mm_set1_epi32( 0 ) );
+-		__m128i mask = _mm_setr_epi32(
+-		  0xFFFFFFFF,
+-		  0xFFFFFFFF, 0, 0
+-		);
+-
+-		mask = _mm_srl_epi64( mask, shift );
+-
+-		// (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63))
+-		return Col4( _mm_and_si128( a.m_v, mask ) );
+-	}
++    friend Col4 MaskBits( Arg a, const int n, const int p );
+ 
+ 	template<const int n, const int p>
+ 	friend Col4 CopyBits( Arg left, Arg right );
+-	template<const int n, const int p>
+-	friend Col4 CopyBits( Arg left, Arg right )
+-	{
+-		if (!(n))
+-			return left;
+-		if (!(p))
+-			return MaskBits<n, 0>(right);
+-		if (((p) + (n)) >= 64)
+-			return (left) + ShiftLeftHalf<p>(right);
+-
+-#if ( SQUISH_USE_XSSE == 4 )
+-		return Col4( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
+-#else
+-		return MaskBits<p, 0>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
+-	//	return               (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
+-#endif
+-	}
+-
+-	friend Col4 CopyBits( Arg left, Col4& right, const int n, const int p )
+-	{
+-#if ( SQUISH_USE_XSSE == 4 )
+-		/* ---- ---bl xxxx xxxx */
+-		const int val = (p << 8) + (n << 0);
+-
+-		right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
+-		return Col4( _mm_insert_si64( left.m_v, right.m_v ) );
+-#else
+-		return MaskBits(left, p, 0) + MaskBits(ShiftLeftHalf(right, p), n, p);
+-	//	return         (left      ) + MaskBits(ShiftLeftHalf(right, p), n, p);
+-#endif
+-	}
++    friend Col4 CopyBits( Arg left, Col4& right, const int n, const int p );
+ 
+ 	template<const int n, const int p>
+ 	friend Col4 KillBits( Arg a );
+-	template<const int n, const int p>
+-	friend Col4 KillBits( Arg a )
+-	{
+-		if (!n || (p >= 64))
+-			return a;
+-		if (!p && (n >= 64))
+-			return Col4(0);
+-
+-		// compile time
+-		__int64 base1 =  (0xFFFFFFFFFFFFFFFFULL << (     (p + 0) & 63));
+-		__int64 base2 =  (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63));
+-	//	__int64 base1 = ~(0xFFFFFFFFFFFFFFFFULL >> (64 - (p + 0) & 63));
+-	//	__int64 base2 = ~(0xFFFFFFFFFFFFFFFFULL << (64 - (p + n) & 63));
+-
+-		__m128i mask;
+-
+-		if ((p + n) >= 64)
+-		  base2 = 0xFFFFFFFFFFFFFFFFULL;
+-
+-		mask = _mm_setr_epi32(
+-		  (int)((base1 ^ base2) >>  0),
+-		  (int)((base1 ^ base2) >> 32), 0, 0
+-		);
+-
+-		return Col4( _mm_and_si128( a.m_v, mask ) );
+-	}
+-
+-	friend Col4 KillBits( Arg a, const int n, const int p )
+-	{
+-		const int val1 =      (p + 0);
+-		const int val2 = 64 - (p + n);
+-
+-		__m128i shift1 = _mm_max_epi16( _mm_cvtsi32_si128( val1 ), _mm_set1_epi32( 0 ) );
+-		__m128i shift2 = _mm_max_epi16( _mm_cvtsi32_si128( val2 ), _mm_set1_epi32( 0 ) );
+-		__m128i mask1 = _mm_setr_epi32(
+-		  0xFFFFFFFF,
+-		  0xFFFFFFFF, 0, 0
+-		);
+-		__m128i mask2 = _mm_setr_epi32(
+-		  0xFFFFFFFF,
+-		  0xFFFFFFFF, 0, 0
+-		);
+-
+-		mask1 = _mm_sll_epi64( mask1, shift1 );
+-		mask2 = _mm_srl_epi64( mask2, shift2 );
+-
+-		return Col4( _mm_and_si128( a.m_v, _mm_xor_si128( mask1, mask2 ) ) );
+-	}
++    friend Col4 KillBits( Arg a, const int n, const int p );
+ 
+ 	template<const int n, const int p>
+ 	friend Col4 InjtBits( Arg left, Arg right );
+-	template<const int n, const int p>
+-	friend Col4 InjtBits( Arg left, Arg right )
+-	{
+-		if (!n || (p >= 64))
+-			return right;
+-		if ((p + n) >= 64)
+-			return KillBits<n, p>(left) + ShiftLeftHalf<p>(right);
+-	//		return               (left) + ShiftLeftHalf<p>(right);
+-
+-
+-#if ( SQUISH_USE_XSSE == 4 )
+-		return Col4( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
+-#else
+-		return KillBits<n, p>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
+-	//	return               (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
+-#endif
+-	}
+-
+-	friend Col4 InjtBits( Arg left, Col4& right, const int n, const int p )
+-	{
+-#if ( SQUISH_USE_XSSE == 4 )
+-		/* ---- ---bl xxxx xxxx */
+-		const int val = (p << 8) + (n << 0);
+-
+-		right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
+-		return Col4( _mm_insert_si64( left.m_v, right.m_v ) );
+-#else
+-		return KillBits(left, n, p) + MaskBits(ShiftLeftHalf(right, p), n, p);
+-	//	return         (left      ) + MaskBits(ShiftLeftHalf(right, p), n, p);
+-#endif
+-	}
++    friend Col4 InjtBits( Arg left, Col4& right, const int n, const int p );
+ 
+ 	template<const int n, const int p>
+ 	friend Col4 ExtrBits( Arg a );
+-	template<const int n, const int p>
+-	friend Col4 ExtrBits( Arg a )
+-	{
+-		if (!n)
+-			return Col4(0);
+-		if (!p)
+-			return MaskBits<n, 0>(a);
+-		if ((n + p) >= 64)
+-			return ShiftRightHalf<p>(a);
+-
+-#if ( SQUISH_USE_XSSE == 4 )
+-		return Col4( _mm_extracti_si64( a.m_v, n, p ) );
+-#else
+-		return MaskBits<n, 0>(ShiftRightHalf<p>(a));
+-#endif
+-	}
+-
+-	friend Col4 ExtrBits( Arg a, const int n, const int p )
+-	{
+-#if ( SQUISH_USE_XSSE == 4 )
+-		/* ---- ----- ---- ---bl */
+-		const int val = (p << 8) + (n << 0);
+-
+-		return Col4( _mm_extract_si64( a.m_v, _mm_cvtsi32_si128( val ) ) );
+-#else
+-		return MaskBits(ShiftRightHalf(a, p), n, 0);
+-#endif
+-	}
++    friend Col4 ExtrBits( Arg a, const int n, const int p );
+ 
+ 	template<const int n, const int p>
+ 	friend void ExtrBits( Arg left, Col4 &right );
+-	template<const int n, const int p>
+-	friend void ExtrBits( Arg left, Col4 &right )
+-	{
+-		right  = ExtrBits<n, p>( left );
+-	}
+ 
+ 	template<const int n, const int p>
+ 	friend void ConcBits( Arg left, Col4 &right );
+-	template<const int n, const int p>
+-	friend void ConcBits( Arg left, Col4 &right )
+-	{
+-		right  = ShiftLeft<32>( right );
+-		if (n > 0)
+-			right += ExtrBits<n, p>( left );
+-	}
+ 
+ 	template<const int n, const int p>
+ 	friend void ReplBits( Arg left, Col4 &right );
+-	template<const int n, const int p>
+-	friend void ReplBits( Arg left, Col4 &right )
+-	{
+-		if (!n)
+-			return;
+-		if ((n < 0)) {
+-			right  = ExtrBits<-n, p>( left );
+-			right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 3 ) );
+-		}
+-		else {
+-			right  = ExtrBits< n, p>( left );
+-			right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
+-		}
+-	}
+ 
+ 	friend Col4 RevsBits( Col4::Arg v )
+ 	{
+@@ -1679,19 +1444,7 @@ public:
+ 
+ 	template<const int f, const int t>
+ 	friend Col4 Shuffle( Arg a );
+-	template<const int f, const int t>
+-	friend Col4 Shuffle( Arg a )
+-	{
+-		if (f == t)
+-			return a;
+ 
+-		return Col4( _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SHUF(
+-			(t == 0 ? f : 0),
+-			(t == 1 ? f : 1),
+-			(t == 2 ? f : 2),
+-			(t == 3 ? f : 3)
+-		) ) );
+-	}
+ 
+ 	template<const int f, const int t>
+ 	friend Col4 Exchange( Arg a );
+@@ -1888,7 +1641,7 @@ public:
+ 		return Col4( _mm_max_epi16( left.m_v, right.m_v ) );
+ #endif
+ 	}
+-	
++
+ 	friend Col4 MaxTiny( Arg left, Arg right )
+ 	{
+ 		__m128 resa = _mm_castsi128_ps( left.m_v );
+@@ -1973,7 +1726,7 @@ public:
+ 	{
+ 		return Col4( _mm_cmplt_epi8( left.m_v, right.m_v ) );
+ 	}
+-	
++
+ 	friend Col4 CompareAllEqualTo_M8( Arg left, Arg right )
+ 	{
+ 		return Col4( _mm_cmpeq_epi8( left.m_v, right.m_v ) );
+@@ -1996,11 +1749,6 @@ public:
+ 
+ 	template<const int value>
+ 	friend Col4 IsValue( Arg v );
+-	template<const int value>
+-	friend Col4 IsValue( Arg v )
+-	{
+-		return Col4( _mm_cmpeq_epi32( v.m_v, _mm_set1_epi32( value ) ) );
+-	}
+ 
+ 	friend Col4 TransferA( Arg left, Arg right )
+ 	{
+@@ -2014,7 +1762,7 @@ public:
+ 	{
+ 		return Col4( _mm_or_si128( left.m_v, _mm_setr_epi32( 0x00, 0x00, 0x00, 0xFF ) ) );
+ 	}
+-	
++
+ 	friend Col4 CollapseA( Arg r, Arg g, Arg b, Arg a )
+ 	{
+ 		return Col4( _mm_packus_epi16(
+@@ -2032,7 +1780,7 @@ public:
+ 
+ 		loc = _mm_cvtsi128_si32 ( r );
+ 	}
+-	
++
+ 	friend void PackBytes( Arg a, int &loc )
+ 	{
+ 		__m128i
+@@ -2042,7 +1790,7 @@ public:
+ 
+ 		loc = _mm_cvtsi128_si32 ( r );
+ 	}
+-	
++
+ 	friend void PackWords( Arg a, unsigned__int64 &loc )
+ 	{
+ 		__m128i
+@@ -2059,11 +1807,11 @@ public:
+ //		loc = _mm_cvtsi128_si64( r );
+ 		_mm_storel_epi64( (__m128i *)&loc, r );
+ 	}
+-	
++
+ 	friend void PackWords( Arg a, __int64 &loc )
+ 	{
+ 		__m128i
+-		  
++
+ 		r = _mm_packs_epi32( a.m_v, a.m_v );
+ 
+ //		loc = _mm_cvtsi128_si64( r );
+@@ -2100,18 +1848,9 @@ public:
+ 
+ 		a = Col4( r );
+ 	}
+-	
+-	friend void UnpackBytes( Col4 &a, const int &loc )
+-	{
+-		__m128i
+ 
+-		r = _mm_cvtsi32_si128 ( loc );
+-		r = _mm_unpacklo_epi8( r, r );
+-		r = _mm_unpacklo_epi16( r, r );
+-		
+-		a = ExtendSign<24>( Col4( r ) );
+-	}
+-	
++    friend void UnpackBytes( Col4 &a, const int &loc );
++
+ 	friend void UnpackWords( Col4 &a, const unsigned__int64 &loc )
+ 	{
+ 		__m128i
+@@ -2121,110 +1860,447 @@ public:
+ 
+ 		a = Col4( r );
+ 	}
+-	
+-	friend void UnpackWords( Col4 &a, const __int64 &loc )
++
++    friend void UnpackWords( Col4 &a, const __int64 &loc );
++
++	// clamp the output to [0, 1]
++	Col4 Clamp() const {
++		Col4 const one (0xFF);
++		Col4 const zero(0x00);
++
++		return Min(one, Max(zero, *this));
++	}
++
++	friend void Interleave( Col4 &a, Arg b, Arg c )
+ 	{
+-		__m128i
++		a = Col4( _mm_shuffle_epi32( _mm_unpacklo_epi32( b.m_v , c.m_v ), SQUISH_SSE_SHUF(0, 3, 0, 3) ) );
++	}
++
++	friend void LoadAligned( Col4 &a, Col4 &b, Arg c )
++	{
++	        a.m_v = c.m_v;
++		b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
++	}
++
++	friend void LoadAligned( Col4 &a, void const *source )
++	{
++		a.m_v = _mm_load_si128( (__m128i const *)source );
++	}
++
++	friend void LoadAligned( Col4 &a, Col4 &b, void const *source )
++	{
++		a.m_v = _mm_load_si128( (__m128i const *)source );
++		b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
++	}
++
++	friend void LoadUnaligned( Col4 &a, Col4 &b, void const *source )
++	{
++		a.m_v = _mm_loadu_si128( (__m128i const *)source );
++		b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
++	}
++
++	friend void StoreAligned( Arg a, Arg b, Col4 &c )
++	{
++		c.m_v = _mm_unpacklo_epi64( a.m_v, b.m_v );
++	}
++
++	friend void StoreAligned( Arg a, void *destination )
++	{
++		_mm_store_si128( (__m128i *)destination, a.m_v );
++	}
++
++	friend void StoreAligned( Arg a, Arg b, void *destination )
++	{
++		_mm_store_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
++	}
++
++	friend void StoreUnaligned( Arg a, void *destination )
++	{
++		_mm_storeu_si128( (__m128i *)destination, a.m_v );
++	}
++
++	friend void StoreUnaligned( Arg a, Arg b, void *destination )
++	{
++		_mm_storeu_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
++	}
++
++	friend void StoreUnaligned( Arg a, u8* loc )
++	{
++		PackBytes( a, (unsigned int&) (*((unsigned int *)loc)) );
++	}
++	friend void StoreUnaligned( Arg a, u16* loc )
++	{
++		PackWords( a, (unsigned__int64&) (*((unsigned__int64 *)loc)) );
++	}
++	friend void StoreUnaligned( Arg a, s8* loc )
++	{
++		PackBytes( a, (int&) (*((int *)loc)) );
++	}
++	friend void StoreUnaligned( Arg a, s16* loc )
++	{
++		PackWords( a, (__int64&) (*((__int64 *)loc)) );
++	}
++
++	friend void LoadUnaligned( Col4 &a, const u8* loc );
++	friend void LoadUnaligned( Col4 &a, const u16* loc );
++	friend void LoadUnaligned( Col4 &a, const s8* loc )
++	{
++	    UnpackBytes( a, (const int&) (*((const int *)loc)) );
++	}
++	friend void LoadUnaligned( Col4 &a, const s16* loc )
++	{
++	    UnpackWords( a, (const __int64&) (*((const __int64 *)loc)) );
++	}
++
++	void SwapRGBA( Col4 &with )
++	{
++	  /* inplace swap based on xors */
++	       m_v = _mm_xor_si128( m_v, with.m_v );
++	  with.m_v = _mm_xor_si128( with.m_v, m_v );
++	       m_v = _mm_xor_si128( m_v, with.m_v );
++	}
++
++private:
++	__m128i m_v;
++
++	friend squish::Vec4;
++	friend squish::Col8;
++};
++
++template<const int n>
++Col4 ExtendSign( Col4::Arg a )
++{
++    return Col4( _mm_srai_epi32( a.m_v, n ) );
++}
++
++inline void UnpackBytes( Col4 &a, const int &loc )
++{
++    __m128i
++
++    r = _mm_cvtsi32_si128 ( loc );
++    r = _mm_unpacklo_epi8( r, r );
++    r = _mm_unpacklo_epi16( r, r );
++
++    a = ExtendSign<24>( Col4( r ) );
++}
++
++inline void UnpackWords( Col4 &a, const __int64 &loc )
++{
++    __m128i
++
++    r = _mm_loadl_epi64( (__m128i *)&loc );
++    r = _mm_unpacklo_epi16( r, r );
++
++    a = ExtendSign<16>( Col4( r ) );
++}
++
++inline void LoadUnaligned( Col4 &a, const u8* loc )
++{
++    UnpackBytes( a, (const unsigned int&) (*((const unsigned int *)loc)) );
++}
++
++inline void LoadUnaligned( Col4 &a, const u16* loc )
++{
++    UnpackWords( a, (const unsigned__int64&) (*((const unsigned__int64 *)loc)) );
++}
++
++template<const int n>
++Col4 ShiftLeft( Col4::Arg a )
++{
++    if ((n) <= 0)
++        return Col4( a.m_v );
++    if ((n) <= 7)
++        return Col4( _mm_slli_epi32( a.m_v, (n) & 7 ) );
++    if ((n) & 7)
++        return Col4( _mm_slli_epi32( _mm_slli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
++
++    return Col4( _mm_slli_si128( a.m_v, (n) >> 3 ) );
++}
++
++template<const int n, const int p>
++void ReplBits( Col4::Arg left, Col4 &right )
++{
++    if (!n)
++        return;
++    if ((n < 0)) {
++        right  = ExtrBits<-n, p>( left );
++        right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 3 ) );
++    }
++    else {
++        right  = ExtrBits< n, p>( left );
++        right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
++    }
++}
++
++template<const int value>
++Col4 IsValue( Col4::Arg v )
++{
++    return Col4( _mm_cmpeq_epi32( v.m_v, _mm_set1_epi32( value ) ) );
++}
++
++template<const int n>
++Col4 ShiftLeftHalf( Col4::Arg a )
++{
++    return Col4( (n) > 0 ? _mm_slli_epi64( a.m_v, (n) ) : a.m_v );
++}
++
++inline Col4 ShiftLeftHalf( Col4::Arg a, const int n )
++{
++    return Col4( _mm_sll_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
++}
++
++template<const int n>
++Col4 ShiftRightHalf( Col4::Arg a )
++{
++    return Col4( (n) > 0 ? _mm_srli_epi64( a.m_v, (n) ) : a.m_v );
++}
++
++inline Col4 ShiftRightHalf( Col4::Arg a, const int n )
++{
++    return Col4( _mm_srl_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
++}
++
++inline Col4 ShiftRightHalf( Col4::Arg a, Col4::Arg b )
++{
++    return Col4( _mm_srl_epi64( a.m_v, b.m_v ) );
++}
++
++template<const int n>
++Col4 ShiftRight( Col4::Arg a )
++{
++    if ((n) <= 0)
++        return Col4( a.m_v );
++    if ((n) <= 7)
++        return Col4( _mm_srli_epi32( a.m_v, (n) & 7 ) );
++    if ((n) & 7)
++        return Col4( _mm_srli_epi32( _mm_srli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
++
++    return Col4( _mm_srli_si128( a.m_v, (n) >> 3 ) );
++}
++
++template<const int f, const int t>
++Col4 Shuffle( Col4::Arg a )
++{
++    if (f == t)
++        return a;
++
++    return Col4( _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SHUF(
++                                                           (t == 0 ? f : 0),
++                                                           (t == 1 ? f : 1),
++                                                           (t == 2 ? f : 2),
++                                                           (t == 3 ? f : 3)
++                                                           ) ) );
++}
++
++template<const int n>
++Col4 FillSign( Col4::Arg a )
++{
++    return Col4( _mm_srai_epi32( _mm_slli_epi32( a.m_v, n ), n ) );
++}
++
++template<const int n, const int p>
++Col4 MaskBits( Col4::Arg a )
++{
++    if (((p) + (n)) <= 0)
++        return Col4(0);
++    if (((p) + (n)) >= 64)
++        return a;
++
++    // compile time
++    __int64 base = ~(0xFFFFFFFFFFFFFFFFULL << (     ((p) + (n)) & 63));
++    //	__int64 base =  (0xFFFFFFFFFFFFFFFFULL >> (64 - ((p) + (n)) & 63));
++    __m128i mask = _mm_setr_epi32(
++                                  (int)(base >>  0),
++                                  (int)(base >> 32), 0, 0
++                                  );
++
++    return Col4( _mm_and_si128( a.m_v, mask ) );
++}
++
++inline Col4 MaskBits( Col4::Arg a, const int n, const int p )
++{
++    const int val = 64 - ((p) + (n));
++
++    __m128i shift = _mm_max_epi16( _mm_cvtsi32_si128( val ), _mm_set1_epi32( 0 ) );
++    __m128i mask = _mm_setr_epi32(
++                                  0xFFFFFFFF,
++                                  0xFFFFFFFF, 0, 0
++                                  );
++
++    mask = _mm_srl_epi64( mask, shift );
++
++    // (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63))
++    return Col4( _mm_and_si128( a.m_v, mask ) );
++}
++
++template<const int n, const int p>
++Col4 CopyBits( Col4::Arg left, Col4::Arg right )
++{
++    if (!(n))
++        return left;
++    if (!(p))
++        return MaskBits<n, 0>(right);
++    if (((p) + (n)) >= 64)
++        return (left) + ShiftLeftHalf<p>(right);
++
++#if ( SQUISH_USE_XSSE == 4 )
++    return Col4( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
++#else
++    return MaskBits<p, 0>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
++    //	return               (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
++#endif
++}
++
++inline Col4 CopyBits( Col4::Arg left, Col4& right, const int n, const int p )
++{
++#if ( SQUISH_USE_XSSE == 4 )
++    /* ---- ---bl xxxx xxxx */
++    const int val = (p << 8) + (n << 0);
++
++    right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
++    return Col4( _mm_insert_si64( left.m_v, right.m_v ) );
++#else
++    return MaskBits(left, p, 0) + MaskBits(ShiftLeftHalf(right, p), n, p);
++    //	return         (left      ) + MaskBits(ShiftLeftHalf(right, p), n, p);
++#endif
++}
++
++template<const int r, const int g, const int b, const int a>
++Col4 ShiftLeftLo( Col4::Arg v )
++{
++    // (1 << r, 1 << g, 1 << b, 1 << a);
++    Col4 p2; p2.SetRGBApow2<0>(r, g, b, a);
++
++#if ( SQUISH_USE_SSE >= 4 )
++    return Col4( _mm_mullo_epi32( v.m_v, p2.m_v ) );
++#else
++    return Col4( _mm_mullo_epi16( v.m_v, p2.m_v ) );
++#endif
++}
++
++template<const int n, const int p>
++void ExtrBits( Col4::Arg left, Col4 &right )
++{
++    right  = ExtrBits<n, p>( left );
++}
++
++template<const int n, const int p>
++Col4 ExtrBits( Col4::Arg a )
++{
++    if (!n)
++        return Col4(0);
++    if (!p)
++        return MaskBits<n, 0>(a);
++    if ((n + p) >= 64)
++        return ShiftRightHalf<p>(a);
++
++#if ( SQUISH_USE_XSSE == 4 )
++    return Col4( _mm_extracti_si64( a.m_v, n, p ) );
++#else
++    return MaskBits<n, 0>(ShiftRightHalf<p>(a));
++#endif
++}
+ 
+-		r = _mm_loadl_epi64( (__m128i *)&loc );
+-		r = _mm_unpacklo_epi16( r, r );
+-		
+-		a = ExtendSign<16>( Col4( r ) );
+-	}
+-	
+-	// clamp the output to [0, 1]
+-	Col4 Clamp() const {
+-		Col4 const one (0xFF);
+-		Col4 const zero(0x00);
++inline Col4 ExtrBits( Col4::Arg a, const int n, const int p )
++{
++#if ( SQUISH_USE_XSSE == 4 )
++    /* ---- ----- ---- ---bl */
++    const int val = (p << 8) + (n << 0);
+ 
+-		return Min(one, Max(zero, *this));
+-	}
++    return Col4( _mm_extract_si64( a.m_v, _mm_cvtsi32_si128( val ) ) );
++#else
++    return MaskBits(ShiftRightHalf(a, p), n, 0);
++#endif
++}
+ 
+-	friend void Interleave( Col4 &a, Arg b, Arg c )
+-	{
+-		a = Col4( _mm_shuffle_epi32( _mm_unpacklo_epi32( b.m_v , c.m_v ), SQUISH_SSE_SHUF(0, 3, 0, 3) ) );
+-	}
++template<const int n, const int p>
++void ConcBits( Col4::Arg left, Col4 &right )
++{
++    right  = ShiftLeft<32>( right );
++    if (n > 0)
++        right += ExtrBits<n, p>( left );
++}
+ 
+-	friend void LoadAligned( Col4 &a, Col4 &b, Arg c )
+-	{
+-	        a.m_v = c.m_v;
+-		b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
+-	}
++template<const int n, const int p>
++Col4 KillBits( Col4::Arg a )
++{
++    if (!n || (p >= 64))
++        return a;
++    if (!p && (n >= 64))
++        return Col4(0);
+ 
+-	friend void LoadAligned( Col4 &a, void const *source )
+-	{
+-		a.m_v = _mm_load_si128( (__m128i const *)source );
+-	}
++    // compile time
++    __int64 base1 =  (0xFFFFFFFFFFFFFFFFULL << (     (p + 0) & 63));
++    __int64 base2 =  (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63));
++    //	__int64 base1 = ~(0xFFFFFFFFFFFFFFFFULL >> (64 - (p + 0) & 63));
++    //	__int64 base2 = ~(0xFFFFFFFFFFFFFFFFULL << (64 - (p + n) & 63));
+ 
+-	friend void LoadAligned( Col4 &a, Col4 &b, void const *source )
+-	{
+-		a.m_v = _mm_load_si128( (__m128i const *)source );
+-		b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
+-	}
++    __m128i mask;
+ 
+-	friend void LoadUnaligned( Col4 &a, Col4 &b, void const *source )
+-	{
+-		a.m_v = _mm_loadu_si128( (__m128i const *)source );
+-		b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
+-	}
++    if ((p + n) >= 64)
++        base2 = 0xFFFFFFFFFFFFFFFFULL;
+ 
+-	friend void StoreAligned( Arg a, Arg b, Col4 &c )
+-	{
+-		c.m_v = _mm_unpacklo_epi64( a.m_v, b.m_v );
+-	}
++    mask = _mm_setr_epi32(
++                          (int)((base1 ^ base2) >>  0),
++                          (int)((base1 ^ base2) >> 32), 0, 0
++                          );
+ 
+-	friend void StoreAligned( Arg a, void *destination )
+-	{
+-		_mm_store_si128( (__m128i *)destination, a.m_v );
+-	}
++    return Col4( _mm_and_si128( a.m_v, mask ) );
++}
+ 
+-	friend void StoreAligned( Arg a, Arg b, void *destination )
+-	{
+-		_mm_store_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
+-	}
++inline Col4 KillBits( Col4::Arg a, const int n, const int p )
++{
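++    // runtime variant: _mm_max_epi16 clamps a negative shift count to zero
++    // before the variable 64-bit shifts build the two half-masks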
++    const int val1 =      (p + 0);
++    const int val2 = 64 - (p + n);
++
++    __m128i shift1 = _mm_max_epi16( _mm_cvtsi32_si128( val1 ), _mm_set1_epi32( 0 ) );
++    __m128i shift2 = _mm_max_epi16( _mm_cvtsi32_si128( val2 ), _mm_set1_epi32( 0 ) );
++    __m128i mask1 = _mm_setr_epi32(
++                                   0xFFFFFFFF,
++                                   0xFFFFFFFF, 0, 0
++                                   );
++    __m128i mask2 = _mm_setr_epi32(
++                                   0xFFFFFFFF,
++                                   0xFFFFFFFF, 0, 0
++                                   );
++
++    mask1 = _mm_sll_epi64( mask1, shift1 );
++    mask2 = _mm_srl_epi64( mask2, shift2 );
++
++    return Col4( _mm_and_si128( a.m_v, _mm_xor_si128( mask1, mask2 ) ) );
++}
+ 
+-	friend void StoreUnaligned( Arg a, void *destination )
+-	{
+-		_mm_storeu_si128( (__m128i *)destination, a.m_v );
+-	}
++template<const int n, const int p>
++Col4 InjtBits( Col4::Arg left, Col4::Arg right )
++{
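++    // inserts the low n bits of right into left at bit position p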
++    if (!n || (p >= 64))
++        return right;
++    if ((p + n) >= 64)
++        return KillBits<n, p>(left) + ShiftLeftHalf<p>(right);
++    //		return               (left) + ShiftLeftHalf<p>(right);
+ 
+-	friend void StoreUnaligned( Arg a, Arg b, void *destination )
+-	{
+-		_mm_storeu_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
+-	}
+-	
+-	friend void StoreUnaligned( Arg a, u8* loc ) {
+-	  PackBytes( a, (unsigned int&) (*((unsigned int *)loc)) ); }
+-	friend void StoreUnaligned( Arg a, u16* loc ) {
+-	  PackWords( a, (unsigned__int64&) (*((unsigned__int64 *)loc)) ); }
+-	friend void StoreUnaligned( Arg a, s8* loc ) {
+-	  PackBytes( a, (int&) (*((int *)loc)) ); }
+-	friend void StoreUnaligned( Arg a, s16* loc ) {
+-	  PackWords( a, (__int64&) (*((__int64 *)loc)) ); }
+-	
+-	friend void LoadUnaligned( Col4 &a, const u8* loc ) {
+-	  UnpackBytes( a, (const unsigned int&) (*((const unsigned int *)loc)) ); }
+-	friend void LoadUnaligned( Col4 &a, const u16* loc ) {
+-	  UnpackWords( a, (const unsigned__int64&) (*((const unsigned__int64 *)loc)) ); }
+-	friend void LoadUnaligned( Col4 &a, const s8* loc ) {
+-	  UnpackBytes( a, (const int&) (*((const int *)loc)) ); }
+-	friend void LoadUnaligned( Col4 &a, const s16* loc ) {
+-	  UnpackWords( a, (const __int64&) (*((const __int64 *)loc)) ); }
+ 
+-	void SwapRGBA( Col4 &with )
+-	{
+-	  /* inplace swap based on xors */
+-	       m_v = _mm_xor_si128( m_v, with.m_v );
+-	  with.m_v = _mm_xor_si128( with.m_v, m_v );
+-	       m_v = _mm_xor_si128( m_v, with.m_v );
+-	}
++#if ( SQUISH_USE_XSSE == 4 )
++    return Col4( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
++#else
++    return KillBits<n, p>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
++    //	return               (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
++#endif
++}
+ 
+-private:
+-	__m128i m_v;
++inline Col4 InjtBits( Col4::Arg left, Col4& right, const int n, const int p )
++{
++#if ( SQUISH_USE_XSSE == 4 )
++    /* ---- ---bl xxxx xxxx */
++    const int val = (p << 8) + (n << 0);
+ 
+-	friend class Vec4;
+-	friend class Col8;
+-};
++    right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
++    return Col4( _mm_insert_si64( left.m_v, right.m_v ) );
++#else
++    return KillBits(left, n, p) + MaskBits(ShiftLeftHalf(right, p), n, p);
++    //	return         (left      ) + MaskBits(ShiftLeftHalf(right, p), n, p);
++#endif
++}
+ 
+ #if	!defined(SQUISH_USE_PRE)
+ inline Col3 LengthSquared( Col3::Arg v )
+@@ -2291,30 +2367,30 @@ public:
+ 	{
+ 		return _mm_extract_epi16( m_v, 0 );
+ 	}
+-	
++
+ #pragma warning ( push )
+ #pragma warning ( disable : 4100 )
+ 	friend Col4 LoCol4(Arg v, const unsigned dummy)
+ 	{
+ 		return Col4( _mm_unpacklo_epi16( v.m_v, _mm_setzero_si128() ) );
+ 	}
+-	
++
+ 	friend Col4 HiCol4(Arg v, const unsigned dummy)
+ 	{
+ 		return Col4( _mm_unpackhi_epi16( v.m_v, _mm_setzero_si128() ) );
+ 	}
+-	
++
+ 	friend Col4 LoCol4(Arg v, const signed dummy)
+ 	{
+ 		return Col4( _mm_srai_epi32( _mm_unpacklo_epi16( _mm_setzero_si128(), v.m_v ), 16 ) );
+ 	}
+-	
++
+ 	friend Col4 HiCol4(Arg v, const signed dummy)
+ 	{
+ 		return Col4( _mm_srai_epi32( _mm_unpackhi_epi16( _mm_setzero_si128(), v.m_v ), 16 ) );
+ 	}
+ #pragma warning ( pop )
+-	
++
+ 	const u16 &operator[]( int pos ) const
+ 	{
+ 		return ((u16 *)&m_v)[pos];
+@@ -2331,7 +2407,7 @@ public:
+ 	{
+ 		return Col8( _mm_srli_epi16( left.m_v, right ) );
+ 	}
+-	
++
+ 	friend Col8 operator>>( Arg left, int right )
+ 	{
+ 		return Col8( _mm_srai_epi16( left.m_v, right ) );
+@@ -2341,7 +2417,7 @@ public:
+ 	{
+ 		return Col8( _mm_slli_epi16( left.m_v, right ) );
+ 	}
+-	
++
+ 	friend Col8 operator<<( Arg left, int right )
+ 	{
+ 		return Col8( _mm_slli_epi16( left.m_v, right ) );
+@@ -2366,7 +2442,7 @@ public:
+ 	{
+ 		return Col8( _mm_mulhi_epu16( left.m_v, _mm_set1_epi16( (unsigned short)right ) ) );
+ 	}
+-	
++
+ 	friend Col8 operator*( Arg left, int right )
+ 	{
+ 		return Col8( _mm_mulhi_epi16( left.m_v, _mm_set1_epi16( (short)right ) ) );
+@@ -2374,12 +2450,7 @@ public:
+ 
+ 	template<const int n>
+ 	friend Col8 ExtendSign(Arg a);
+-	template<const int n>
+-	friend Col8 ExtendSign(Arg a)
+-	{
+-		return Col8( _mm_srai_epi16( a.m_v, n ) );
+-	}
+-	
++
+ 	friend Col8 HorizontalMin(Arg a)
+ 	{
+ 		__m128i res = a.m_v;
+@@ -2420,17 +2491,13 @@ public:
+ 
+ 	template<const int n>
+ 	friend Col8 ShiftUp(Arg a);
+-	template<const int n>
+-	friend Col8 ShiftUp(Arg a)
+-	{
+-		return Col8( _mm_slli_si128( a.m_v, n << 1 ) );
+-	}
+-	
++
++
+ #pragma warning ( push )
+ #pragma warning ( disable : 4100 )
+ 	friend Col4 ExpandUpper(Arg a, const unsigned dummy) {
+ 		__m128i res = a.m_v;
+-		
++
+ 		res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
+ 
+#ifdef _MSC_VER
+@@ -2445,7 +2512,7 @@ public:
+ 
+ 	friend Col4 RepeatUpper(Arg a, const unsigned dummy) {
+ 		__m128i res = a.m_v;
+-		
++
+ 		res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
+ 		res = _mm_shuffle_epi32( res, SQUISH_SSE_SPLAT(3) );
+ 
+@@ -2458,10 +2525,10 @@ public:
+ 
+ 		return Col4( res );
+ 	}
+-	
++
+ 	friend Col4 InterleaveUpper(Arg a, Arg b, const unsigned dummy) {
+ 		__m128i res;
+-		
++
+ 		res = _mm_unpackhi_epi16( a.m_v, b.m_v );
+ 		res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
+ 		res = _mm_unpackhi_epi64( res, res );
+@@ -2478,7 +2545,7 @@ public:
+ 
+ 	friend Col4 ReplicateUpper(Arg a, Arg b, const unsigned dummy) {
+ 		__m128i res;
+-		
++
+ 		res = _mm_unpackhi_epi16( a.m_v, b.m_v );
+ 		res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
+ 		res = _mm_unpackhi_epi32( res, res );
+@@ -2495,7 +2562,7 @@ public:
+ 
+ 	friend Col4 ExpandUpper(Arg a, const signed dummy) {
+ 		__m128i res = a.m_v;
+-		
++
+ 		res = _mm_unpackhi_epi16( res, res );
+ 		res = _mm_srai_epi32( res, 16 );
+ 
+@@ -2524,10 +2591,10 @@ public:
+ 
+ 		return Col4( res );
+ 	}
+-	
++
+ 	friend Col4 InterleaveUpper(Arg a, Arg b, const signed dummy) {
+ 		__m128i res;
+-		
++
+ 		res = _mm_unpackhi_epi32( a.m_v, b.m_v );
+ 		res = _mm_srai_epi32( res, 16 );
+ 		res = _mm_unpackhi_epi64( res, res );
+@@ -2544,11 +2611,11 @@ public:
+ 
+ 	friend Col4 ReplicateUpper(Arg a, Arg b, const signed dummy) {
+ 		__m128i res;
+-		
++
+ 		res = _mm_unpackhi_epi32( a.m_v, b.m_v );
+ 		res = _mm_srai_epi32( res, 16 );
+ 		res = _mm_unpackhi_epi32( res, res );
+-		
++
+ #ifdef _MSC_VER
+ 		assert(res.m128i_i32[0] == a.m_v.m128i_i16[7]);
+ 		assert(res.m128i_i32[1] == a.m_v.m128i_i16[7]);
+@@ -2559,7 +2626,7 @@ public:
+ 		return Col4( res );
+ 	}
+ #pragma warning ( pop )
+-	
++
+ 	/*
+ 	friend Col4 Expand(Arg a, int ia) {
+ 		__m128i res = _mm_setzero_si128();
+@@ -2601,17 +2668,17 @@ public:
+ 		return Col4( res );
+ 	}
+ 	*/
+-	
++
+ 	friend int CompareEqualTo( Arg left, Arg right )
+ 	{
+ 		return _mm_movemask_epi8( _mm_cmpeq_epi16( left.m_v, right.m_v ) );
+ 	}
+-	
++
+ 	friend Col8 CompareAllEqualTo( Arg left, Arg right )
+ 	{
+ 		return Col8( _mm_cmpeq_epi16( left.m_v, right.m_v ) );
+ 	}
+-	
++
+ 	friend Col8 CompareAllLessThan( Arg left, Arg right )
+ 	{
+ 		return Col8( _mm_cmplt_epi16( left.m_v, right.m_v ) );
+@@ -2620,9 +2687,21 @@ public:
+ private:
+ 	__m128i m_v;
+ 
+-	friend class Vec4;
++	friend squish::Vec4;
+ };
+ 
++template<const int n>
++Col8 ExtendSign(Col8::Arg a)
++{
++	return Col8(_mm_srai_epi16(a.m_v, n));
++}
++
++template<const int n>
++Col8 ShiftUp(Col8::Arg a)
++{
++    return Col8( _mm_slli_si128( a.m_v, n << 1 ) );
++}
++
+ #define VEC4_CONST( X ) Vec4( X )
+ 
+ class Vec3
+@@ -2649,7 +2728,7 @@ public:
+ 		m_v = _mm_unpacklo_ps(_mm_load_ss(x), _mm_load_ss(y));
+ 		m_v = _mm_movelh_ps(m_v, _mm_load_ss(z));
+ 	}
+-	
++
+ 	Vec3( bool x, bool y, bool z ) : m_v( _mm_castsi128_ps( _mm_setr_epi32( x ? ~0 : 0, y ? ~0 : 0, z ? ~0 : 0, 0 ) ) ) {}
+ 
+ 	Vec3( float x, float y, float z ) : m_v( _mm_setr_ps( x, y, z, 0.0f ) ) {}
+@@ -2662,7 +2741,7 @@ public:
+ 	void StoreX(float *x) const { _mm_store_ss(x, m_v); }
+ 	void StoreY(float *y) const { _mm_store_ss(y, _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) )); }
+ 	void StoreZ(float *z) const { _mm_store_ss(z, _mm_movehl_ps( m_v, m_v ) ); }
+-	
++
+ 	float X() const { return ((float *)&m_v)[0]; }
+ 	float Y() const { return ((float *)&m_v)[1]; }
+ 	float Z() const { return ((float *)&m_v)[2]; }
+@@ -2729,7 +2808,7 @@ public:
+ 		m_v = _mm_mul_ps( m_v, v.m_v );
+ 		return *this;
+ 	}
+-	
++
+ 	Vec3& operator/=( Arg v )
+ 	{
+ 		*this *= Reciprocal( v );
+@@ -2863,16 +2942,7 @@ public:
+ 
+ 	template<const int n>
+ 	friend Vec3 RotateLeft( Arg a );
+-	template<const int n>
+-	friend Vec3 RotateLeft( Arg a )
+-	{
+-		return Vec3( _mm_shuffle_ps( a.m_v , a.m_v , SQUISH_SSE_SHUF(
+-			(n + 0) % 3,
+-			(n + 1) % 3,
+-			(n + 2) % 3,
+-			3
+-		) ) );
+-	}
++
+ 
+ 	friend Vec3 HorizontalAdd( Arg a )
+ 	{
+@@ -2974,7 +3044,7 @@ public:
+ 
+ 		return Vec3( res );
+ 	}
+-	
++
+ 	friend Vec3 HorizontalMaxXY( Arg a )
+ 	{
+ 		__m128 res = a.m_v;
+@@ -2986,7 +3056,7 @@ public:
+ 
+ 		return Vec3( res );
+ 	}
+-	
++
+ 	friend Vec3 HorizontalMinXY( Arg a )
+ 	{
+ 		__m128 res = a.m_v;
+@@ -3063,37 +3133,6 @@ public:
+ 
+ 	template<const bool disarm>
+ 	friend Vec3 Complement( Arg left );
+-	template<const bool disarm>
+-	friend Vec3 Complement( Arg left )
+-	{
+-		__m128 ren, res, rez;
+-
+-		ren = left.m_v;
+-		rez = _mm_set1_ps( 1.0f );
+-		res = _mm_mul_ps( left.m_v, left.m_v );
+-#if ( SQUISH_USE_SSE >= 3 )
+-		res = _mm_hadd_ps( res, res );
+-#else
+-		res = _mm_add_ps( res, _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 1, 0, 1, 0 ) ) );
+-#endif
+-		if (!disarm) {
+-			// correct x² + y² > 1.0f by renormalization
+-			if ( _mm_comigt_ss( res, rez ) ) {
+-				res = ReciprocalSqrt( Vec3(res) ).m_v;
+-				res = _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
+-
+-				ren = _mm_mul_ps( ren, res );
+-				res = rez;
+-			}
+-		}
+-		
+-		rez = _mm_sub_ps( rez, _mm_min_ps( rez, res ) );
+-		rez = _mm_sqrt_ps( rez );
+-		res = _mm_movelh_ps( left.m_v, rez );
+-
+-		// sqrt(1.0f - (x*x + y*y))
+-		return Vec3( res );
+-	}
+ 
+ 	template<const bool disarm>
+ 	friend Vec3 Complement( Vec3 &left, Vec3 &right );
+@@ -3104,20 +3143,20 @@ public:
+ 			Vec3 len = (left * left) + (right * right);
+ 			Vec3 adj = ReciprocalSqrt(Max(Vec3(1.0f), len));
+ 
+-			// correct x² + y² > 1.0f by renormalization
++			// correct x² + y² > 1.0f by renormalization
+ 			left  *= adj;
+ 			right *= adj;
+ 
+-			// sqrt(1.0f - (x² + y²))
++			// sqrt(1.0f - (x² + y²))
+ 			return Sqrt(Vec3(1.0f) - Min(Vec3(1.0f), len));
+ 		}
+ 		else {
+ 			Vec3 len = (left * left) + (right * right);
+ 
+-			// disarm x² + y² > 1.0f by letting NaN happen
++			// disarm x² + y² > 1.0f by letting NaN happen
+ 			// ...
+ 
+-			// sqrt(1.0f - (x² + y²))
++			// sqrt(1.0f - (x² + y²))
+ 			return Sqrt(Vec3(1.0f) - len);
+ 		}
+ 	}
+@@ -3168,7 +3207,7 @@ public:
+ 	{
+ 		return Vec3( _mm_and_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x7FFFFFFF ) ) ) );
+ 	}
+-	
++
+ 	friend Vec3 Neg( Arg a )
+ 	{
+ 		return Vec3( _mm_or_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x80000000 ) ) ) );
+@@ -3192,21 +3231,9 @@ public:
+ 		return Min(one, Max(zero, *this));
+ 	}
+ 
+-	template<const bool round>
+-	friend Col3 FloatToInt( Arg v );
+-	template<const bool round>
+-	friend Col3 FloatToInt( Arg v )
+-	{
+-#if ( SQUISH_USE_SSE == 1 )
+-		...
+-#else
+-		// use SSE2 instructions
+-		if (round)
+-		      return Col3( _mm_cvtps_epi32( v.m_v ) );
+-		else
+-		      return Col3( _mm_cvttps_epi32( v.m_v ) );
+-#endif
+-	}
++    template<const bool round>
++    friend Col3 FloatToInt( Arg v );
++
+ 
+ 	friend Vec3 Truncate( Arg v )
+ 	{
+@@ -3296,7 +3323,7 @@ public:
+ 	{
+ 		return Vec3( _mm_cmpneq_ps( m_v, _mm_set1_ps( 1.0f ) ) );
+ 	}
+-	
++
+ 	friend Vec3 TransferZ( Arg left, Arg right )
+ 	{
+ 		return Vec3( _mm_shuffle_ps( left.m_v, right.m_v, SQUISH_SSE_SHUF( 0, 1, 2, 3 ) ) );
+@@ -3351,9 +3378,70 @@ public:
+ private:
+ 	__m128 m_v;
+ 
+-	friend class Vec4;
++	friend squish::Vec4;
+ };
+ 
++
++template<const bool round>
++Col3 FloatToInt( Vec3::Arg v )
++{
++#if ( SQUISH_USE_SSE == 1 )
++        ...
++#else
++        // use SSE2 instructions
++        if (round)
++            return Col3( _mm_cvtps_epi32( v.m_v ) );
++        else
++            return Col3( _mm_cvttps_epi32( v.m_v ) );
++#endif
++}
++
++template<const int n>
++Vec3 RotateLeft( Vec3::Arg a )
++{
++    return Vec3( _mm_shuffle_ps( a.m_v , a.m_v , SQUISH_SSE_SHUF(
++                                                                 (n + 0) % 3,
++                                                                 (n + 1) % 3,
++                                                                 (n + 2) % 3,
++                                                                 3
++                                                                 ) ) );
++}
++
++template<const bool disarm>
++Vec3 Complement( Vec3::Arg left )
++{
++    __m128 ren, res, rez;
++
++    ren = left.m_v;
++    rez = _mm_set1_ps( 1.0f );
++    res = _mm_mul_ps( left.m_v, left.m_v );
++#if ( SQUISH_USE_SSE >= 3 )
++    res = _mm_hadd_ps( res, res );
++#else
++    res = _mm_add_ps( res, _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 1, 0, 1, 0 ) ) );
++#endif
++    if (!disarm) {
++        // correct x² + y² > 1.0f by renormalization
++        if ( _mm_comigt_ss( res, rez ) ) {
++            res = ReciprocalSqrt( Vec3(res) ).m_v;
++            res = _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
++
++            ren = _mm_mul_ps( ren, res );
++            res = rez;
++        }
++    }
++
++    rez = _mm_sub_ps( rez, _mm_min_ps( rez, res ) );
++    rez = _mm_sqrt_ps( rez );
++    res = _mm_movelh_ps( left.m_v, rez );
++
++    // sqrt(1.0f - (x*x + y*y))
++    return Vec3( res );
++}
++
+ template<const bool round>
+ Col3 FloatToUHalf( Vec3::Arg v );
+ template<const bool round>
+@@ -3382,7 +3470,7 @@ Col3 FloatToSHalf( Vec3::Arg v )
+ 	return h;
+ }
+ 
+-Vec3 UHalfToFloat( Col3::Arg v )
++inline Vec3 UHalfToFloat( Col3::Arg v )
+ {
+ 	Vec3 f;
+ 
+@@ -3393,7 +3481,7 @@ Vec3 UHalfToFloat( Col3::Arg v )
+ 	return f;
+ }
+ 
+-Vec3 SHalfToFloat( Col3::Arg v )
++inline Vec3 SHalfToFloat( Col3::Arg v )
+ {
+ 	Vec3 f;
+ 
+@@ -3427,7 +3515,7 @@ public:
+ 		m_v = arg.m_v;
+ 		return *this;
+ 	}
+-	
++
+ 	operator Vec3()
+ 	{
+ 		return Vec3(m_v);
+@@ -3458,21 +3546,21 @@ public:
+ 		m_v = _mm_load_ss(x);
+ 		m_v = _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) );
+ 	}
+-	
++
+ 	Vec4( const unsigned short* x ) {
+ 		__m128i v = _mm_setzero_si128();
+ 
+ 		m_v = _mm_cvtepi32_ps( _mm_insert_epi16( v, *x, 0 ) );
+ 		m_v = _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) );
+ 	}
+-	
++
+ 	Vec4( const signed short* x ) {
+ 		__m128i v = _mm_setzero_si128();
+ 
+ 		m_v = _mm_cvtepi32_ps( _mm_insert_epi16( v, *x, 0 ) );
+ 		m_v = _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) );
+ 	}
+-	
++
+ 	Vec4( bool x, bool y, bool z, bool w ) : m_v( _mm_castsi128_ps( _mm_setr_epi32( x ? ~0 : 0, y ? ~0 : 0, z ? ~0 : 0, w ? ~0 : 0 ) ) ) {}
+ 
+ 	Vec4( int x, int y, int z, int w ) : m_v( _mm_cvtepi32_ps( _mm_setr_epi32( x, y, z, w ) ) ) {}
+@@ -3498,23 +3586,17 @@ public:
+ 	{
+ 		return Vec3( m_v );
+ 	}
+-	
++
+ 	int GetM4() const
+ 	{
+ 		return _mm_movemask_ps( m_v );
+ 	}
+ 
+ 	template<class dtyp> friend Vec4 LoVec4(Col8 const&v, const dtyp& dummy);
+-	template<class dtyp> friend Vec4 LoVec4(Col8 const&v, const dtyp& dummy)
+-	{
+-		return Vec4( LoCol4( v, dummy ) );
+-	}
++
+ 
+ 	template<class dtyp> friend Vec4 HiVec4(Col8 const&v, const dtyp& dummy);
+-	template<class dtyp> friend Vec4 HiVec4(Col8 const&v, const dtyp& dummy)
+-	{
+-		return Vec4( HiCol4( v, dummy ) );
+-	}
++
+ 
+ 	void StoreX(float *x) const { _mm_store_ss(x, m_v); }
+ 	void StoreY(float *y) const { _mm_store_ss(y, _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) )); }
+@@ -3619,7 +3701,7 @@ public:
+ 		m_v = _mm_mul_ps( m_v, v.m_v );
+ 		return *this;
+ 	}
+-	
++
+ 	Vec4& operator*=( float v )
+ 	{
+ 		m_v = _mm_mul_ps( m_v, Vec4( v ).m_v );
+@@ -3631,7 +3713,7 @@ public:
+ 		*this *= Reciprocal( v );
+ 		return *this;
+ 	}
+-	
++
+ 	Vec4& operator/=( float v )
+ 	{
+ 		*this *= Reciprocal( Vec4( v ) );
+@@ -3732,16 +3814,7 @@ public:
+ 
+ 	template<const int a, const int b, const int c, const int d>
+ 	friend Vec4 Merge( Arg lo, Arg hi );
+-	template<const int a, const int b, const int c, const int d>
+-	friend Vec4 Merge( Arg lo, Arg hi )
+-	{
+-		return Vec4( _mm_shuffle_ps( lo.m_v , hi.m_v , SQUISH_SSE_SHUF(
+-			a % 4,
+-			b % 4,
+-			c % 4,
+-			d % 4
+-		) ) );
+-	}
++
+ 
+ 	template<const int f, const int t>
+ 	friend Vec4 Shuffle( Arg a );
+@@ -3900,7 +3973,7 @@ public:
+ 
+ 		return Vec4( res );
+ 	}
+-	
++
+ 	friend Vec4 HorizontalMaxXY( Arg a )
+ 	{
+ 		__m128 res = a.m_v;
+@@ -3912,7 +3985,7 @@ public:
+ 
+ 		return Vec4( res );
+ 	}
+-	
++
+ 	friend Vec4 HorizontalMinXY( Arg a )
+ 	{
+ 		__m128 res = a.m_v;
+@@ -3965,7 +4038,7 @@ public:
+ 
+ 		return rsq;
+ 	}
+-	
++
+ 	friend Vec4 Normalize( Arg left )
+ 	{
+ 		Vec4 sum = HorizontalAdd( Vec4( _mm_mul_ps( left.m_v, left.m_v ) ) );
+@@ -3973,7 +4046,7 @@ public:
+ 
+ 		return left * rsq;
+ 	}
+-	
++
+ 	friend Vec4 Normalize( Vec4& x, Vec4& y, Vec4& z )
+ 	{
+ 		Vec4 xx = x * x;
+@@ -4006,7 +4079,7 @@ public:
+ 		res = _mm_add_ps( res, _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 1, 0, 1, 0 ) ) );
+ #endif
+ 		if (!disarm) {
+-			// correct x² + y² > 1.0f by renormalization
++			// correct x² + y² > 1.0f by renormalization
+ 			if ( _mm_comigt_ss( res, rez ) ) {
+ 				res = ReciprocalSqrt( Vec4(res) ).m_v;
+ 				res = _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
+@@ -4028,7 +4101,7 @@ public:
+ 			res = _mm_and_ps( res, _mm_castsi128_ps ( _mm_setr_epi32( ~0, ~0, ~0,  0 ) ) );
+ 		}
+ 
+-		// sqrt(1.0f - (x² + y²))
++		// sqrt(1.0f - (x² + y²))
+ 		return Vec4( res );
+ 	}
+ 
+@@ -4041,20 +4114,20 @@ public:
+ 			Vec4 len = left * left + right * right;
+ 			Vec4 adj = ReciprocalSqrt(Max(Vec4(1.0f), len));
+ 
+-			// correct x² + y² > 1.0f by renormalization
++			// correct x² + y² > 1.0f by renormalization
+ 			left  *= adj;
+ 			right *= adj;
+ 
+-			// sqrt(1.0f - (x² + y²))
++			// sqrt(1.0f - (x² + y²))
+ 			return Sqrt(Vec4(1.0f) - Min(Vec4(1.0f), len));
+ 		}
+ 		else {
+ 			Vec4 len = (left * left) + (right * right);
+ 
+-			// disarm x² + y² > 1.0f by letting NaN happen
++			// disarm x? + y? > 1.0f by letting NaN happen
+ 			// ...
+ 
+-			// sqrt(1.0f - (x² + y²))
++			// sqrt(1.0f - (x² + y²))
+ 			return Sqrt(Vec4(1.0f) - len);
+ 		}
+ 	}
+@@ -4105,7 +4178,7 @@ public:
+ 	{
+ 		return Vec4( _mm_and_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x7FFFFFFF ) ) ) );
+ 	}
+-	
++
+ 	friend Vec4 Neg( Arg a )
+ 	{
+ 		return Vec4( _mm_or_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x80000000 ) ) ) );
+@@ -4131,19 +4204,7 @@ public:
+ 
+ 	template<const bool round>
+ 	friend Col4 FloatToInt( Vec4::Arg v );
+-	template<const bool round>
+-	friend Col4 FloatToInt( Vec4::Arg v )
+-	{
+-#if ( SQUISH_USE_SSE == 1 )
+-		...
+-#else
+-		// use SSE2 instructions
+-		if (round)
+-		      return Col4( _mm_cvtps_epi32( v.m_v ) );
+-		else
+-		      return Col4( _mm_cvttps_epi32( v.m_v ) );
+-#endif
+-	}
++
+ 
+ 	friend Vec4 Truncate( Arg v )
+ 	{
+@@ -4159,7 +4220,7 @@ public:
+ 
+ 		// clear out the MMX multimedia state to allow FP calls later
+ 		_mm_empty();
+-		
++
+ 		return Vec4( truncated );
+ #else
+ 		// use SSE2 instructions
+@@ -4188,7 +4249,7 @@ public:
+ 	{
+ 		return _mm_movemask_ps( _mm_cmpeq_ps( left.m_v, right.m_v ) );
+ 	}
+-	
++
+ 	friend int CompareNotEqualTo( Arg left, Arg right )
+ 	{
+ 		return _mm_movemask_ps( _mm_cmpneq_ps( left.m_v, right.m_v ) );
+@@ -4198,7 +4259,7 @@ public:
+ 	{
+ 		return _mm_movemask_ps( _mm_cmplt_ps( left.m_v, right.m_v ) );
+ 	}
+-	
++
+ 	friend int CompareGreaterThan( Arg left, Arg right )
+ 	{
+ 		return _mm_movemask_ps( _mm_cmpgt_ps( left.m_v, right.m_v ) );
+@@ -4234,17 +4295,17 @@ public:
+ 	{
+ 		return Col4( _mm_cmpeq_epi32( _mm_castps_si128 ( left.m_v ), _mm_castps_si128 ( right.m_v ) ) );
+ 	}
+-	
++
+ 	friend Col4 CompareAllEqualTo_M8( Arg left, Arg right )
+ 	{
+ 		return Col4( _mm_cmpeq_epi8( _mm_castps_si128 ( left.m_v ), _mm_castps_si128 ( right.m_v ) ) );
+ 	}
+-	
++
+ 	friend int CompareFirstLessThan( Arg left, Arg right )
+ 	{
+ 		return _mm_comilt_ss( left.m_v, right.m_v );
+ 	}
+-	
++
+ 	friend int CompareFirstLessEqualTo( Arg left, Arg right )
+ 	{
+ 		return _mm_comile_ss( left.m_v, right.m_v );
+@@ -4264,17 +4325,17 @@ public:
+ 	{
+ 		return _mm_comieq_ss( left.m_v, right.m_v );
+ 	}
+-	
++
+ 	friend Vec4 IsGreaterThan( Arg left, Arg right )
+ 	{
+ 		return Vec4( _mm_cmpgt_ps( left.m_v, right.m_v ) );
+ 	}
+-	
++
+ 	friend Vec4 IsGreaterEqual( Arg left, Arg right )
+ 	{
+ 		return Vec4( _mm_cmpge_ps( left.m_v, right.m_v ) );
+ 	}
+-	
++
+ 	friend Vec4 IsNotEqualTo( Arg left, Arg right )
+ 	{
+ 		return Vec4( _mm_cmpneq_ps( left.m_v, right.m_v ) );
+@@ -4326,7 +4387,7 @@ public:
+ 	{
+ 		return Vec4( _mm_and_ps( left.m_v, _mm_castsi128_ps ( _mm_setr_epi32(  0,  0,  0, ~0 ) ) ) );
+ 	}
+-	
++
+ 	friend Vec4 CollapseW( Arg x, Arg y, Arg z, Arg w )
+ 	{
+ 		return Vec4( _mm_unpackhi_ps( _mm_unpackhi_ps( x.m_v, z.m_v ), _mm_unpackhi_ps( y.m_v, w.m_v ) ) );
+@@ -4420,6 +4481,41 @@ private:
+ 	__m128 m_v;
+ };
+ 
++template<const bool round>
++Col4 FloatToInt( Vec4::Arg v )
++{
++#if ( SQUISH_USE_SSE == 1 )
++    ...
++#else
++    // use SSE2 instructions
++    if (round)
++        return Col4( _mm_cvtps_epi32( v.m_v ) );
++    else
++        return Col4( _mm_cvttps_epi32( v.m_v ) );
++#endif
++}
++
++template<class dtyp> Vec4 LoVec4(Col8 const&v, const dtyp& dummy)
++{
++    return Vec4( LoCol4( v, dummy ) );
++}
++
++template<class dtyp> Vec4 HiVec4(Col8 const&v, const dtyp& dummy)
++{
++    return Vec4( HiCol4( v, dummy ) );
++}
++
++template<const int a, const int b, const int c, const int d>
++Vec4 Merge( Vec4::Arg lo, Vec4::Arg hi )
++{
++    return Vec4( _mm_shuffle_ps( lo.m_v , hi.m_v , SQUISH_SSE_SHUF(
++                                                                   a % 4,
++                                                                   b % 4,
++                                                                   c % 4,
++                                                                   d % 4
++                                                                   ) ) );
++}
++
+ template<const bool round>
+ Col4 FloatToUHalf( Vec4::Arg v );
+ template<const bool round>
+@@ -4450,7 +4546,7 @@ Col4 FloatToSHalf( Vec4::Arg v )
+ 	return h;
+ }
+ 
+-Vec4 UHalfToFloat( Col4::Arg v )
++inline Vec4 UHalfToFloat( Col4::Arg v )
+ {
+ 	Vec4 f;
+ 
+@@ -4462,7 +4558,7 @@ Vec4 UHalfToFloat( Col4::Arg v )
+ 	return f;
+ }
+ 
+-Vec4 SHalfToFloat( Col4::Arg v )
++inline Vec4 SHalfToFloat( Col4::Arg v )
+ {
+ 	Vec4 f;
+ 

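Note on the recurring pattern above: the bulk of this diff moves definitions of friend function templates (ExtendSign, ShiftUp, RotateLeft, Complement, Merge, FloatToInt, LoVec4/HiVec4) out of the class bodies to namespace scope, leaving only a friend declaration inside the class; the old declare-then-define-again arrangement inside the class is a redefinition that stricter compilers such as Clang reject. A minimal sketch of the pattern, using a hypothetical Packed wrapper rather than the real squish types:

    #include <emmintrin.h>

    class Packed
    {
    public:
        // hypothetical stand-in for squish's Col8
        explicit Packed( __m128i v ) : m_v( v ) {}

        // declaration only; the definition lives at namespace scope
        template<const int n>
        friend Packed ShiftUp( const Packed &a );

    private:
        __m128i m_v;
    };

    // still has access to m_v through the friend declaration above
    template<const int n>
    Packed ShiftUp( const Packed &a )
    {
        return Packed( _mm_slli_si128( a.m_v, n << 1 ) );
    }

The non-template free functions defined in the header (UHalfToFloat, SHalfToFloat and their Vec4 counterparts) instead gain inline, which avoids duplicate-symbol errors when the header is included from more than one translation unit.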
+ 4 - 2
package_build_list_host_darwin.json

@@ -28,7 +28,8 @@
         "poly2tri-7f0487a-rev1-mac": "package-system/poly2tri/build_package_image.py --platform-name mac",
         "poly2tri-7f0487a-rev1-mac": "package-system/poly2tri/build_package_image.py --platform-name mac",
         "v-hacd-2.3-1a49edf-rev1-mac": "package-system/v-hacd/build_package_image.py --platform-name mac",
         "v-hacd-2.3-1a49edf-rev1-mac": "package-system/v-hacd/build_package_image.py --platform-name mac",
         "SPIRVCross-2021.04.29-rev1-mac": "Scripts/extras/pull_and_build_from_git.py ../../package-system/SPIRVCross --platform-name Mac --package-root ../../package-system --clean",
         "SPIRVCross-2021.04.29-rev1-mac": "Scripts/extras/pull_and_build_from_git.py ../../package-system/SPIRVCross --platform-name Mac --package-root ../../package-system --clean",
-        "DirectXShaderCompilerDxc-1.6.2104-o3de-rev3-mac": "Scripts/extras/pull_and_build_from_git.py ../../package-system/DirectXShaderCompiler --platform-name Mac --package-root ../../package-system --clean",
+        "squish-ccr-deb557d-rev1-mac" : "Scripts/extras/pull_and_build_from_git.py ../../package-system/squish-ccr --platform-name Mac --package-root ../../package-system --clean",
+        "DirectXShaderCompilerDxc-1.6.2104-o3de-rev2-mac": "Scripts/extras/pull_and_build_from_git.py ../../package-system/DirectXShaderCompiler --platform-name Mac --package-root ../../package-system --clean",
         "azslc-1.7.23-rev2-mac": "Scripts/extras/pull_and_build_from_git.py ../../package-system/azslc --platform-name Mac --package-root ../../package-system --clean",
         "azslc-1.7.23-rev2-mac": "Scripts/extras/pull_and_build_from_git.py ../../package-system/azslc --platform-name Mac --package-root ../../package-system --clean",
         "tiff-4.2.0.10-mac" : "package-system/tiff/build_package_image.py --platform mac",
         "tiff-4.2.0.10-mac" : "package-system/tiff/build_package_image.py --platform mac",
         "tiff-4.2.0.10-ios" : "package-system/tiff/build_package_image.py --platform ios",
         "tiff-4.2.0.10-ios" : "package-system/tiff/build_package_image.py --platform ios",
@@ -72,7 +73,8 @@
         "v-hacd-2.3-1a49edf-rev1-mac": "package-system/v-hacd-mac",
         "v-hacd-2.3-1a49edf-rev1-mac": "package-system/v-hacd-mac",
         "mcpp-2.7.2_az.1-rev1-mac": "package-system/mcpp-mac",
         "mcpp-2.7.2_az.1-rev1-mac": "package-system/mcpp-mac",
         "SPIRVCross-2021.04.29-rev1-mac": "package-system/SPIRVCross-mac",
         "SPIRVCross-2021.04.29-rev1-mac": "package-system/SPIRVCross-mac",
-        "DirectXShaderCompilerDxc-1.6.2104-o3de-rev3-mac": "package-system/DirectXShaderCompilerDxc-mac",
+        "squish-ccr-deb557d-rev1-mac": "package-system/squish-ccr-mac",
+        "DirectXShaderCompilerDxc-1.6.2104-o3de-rev2-mac": "package-system/DirectXShaderCompilerDxc-mac",
         "azslc-1.7.23-rev2-mac": "package-system/azslc-mac",
         "azslc-1.7.23-rev2-mac": "package-system/azslc-mac",
         "SQLite-3.32.2-rev3-multiplatform" : "package-system/SQLite-multiplatform",
         "SQLite-3.32.2-rev3-multiplatform" : "package-system/SQLite-multiplatform",
         "xxhash-0.7.4-rev1-multiplatform":  "package-system/xxhash-multiplatform",
         "xxhash-0.7.4-rev1-multiplatform":  "package-system/xxhash-multiplatform",

+ 4 - 2
package_build_list_host_linux.json

@@ -20,7 +20,8 @@
         "poly2tri-7f0487a-rev1-linux": "package-system/poly2tri/build_package_image.py --platform-name linux",
         "poly2tri-7f0487a-rev1-linux": "package-system/poly2tri/build_package_image.py --platform-name linux",
         "v-hacd-2.3-1a49edf-rev1-linux": "package-system/v-hacd/build_package_image.py --platform-name linux",
         "v-hacd-2.3-1a49edf-rev1-linux": "package-system/v-hacd/build_package_image.py --platform-name linux",
         "SPIRVCross-2021.04.29-rev1-linux": "Scripts/extras/pull_and_build_from_git.py ../../package-system/SPIRVCross --platform-name Linux --package-root ../../package-system --clean",
         "SPIRVCross-2021.04.29-rev1-linux": "Scripts/extras/pull_and_build_from_git.py ../../package-system/SPIRVCross --platform-name Linux --package-root ../../package-system --clean",
-        "DirectXShaderCompilerDxc-1.6.2104-o3de-rev3-linux": "Scripts/extras/pull_and_build_from_git.py ../../package-system/DirectXShaderCompiler --platform-name Linux --package-root ../../package-system --clean",
+        "squish-ccr-deb557d-rev1-linux" : "Scripts/extras/pull_and_build_from_git.py ../../package-system/squish-ccr --platform-name Linux --package-root ../../package-system --clean",
+        "DirectXShaderCompilerDxc-1.6.2104-o3de-rev2-linux": "Scripts/extras/pull_and_build_from_git.py ../../package-system/DirectXShaderCompiler --platform-name Linux --package-root ../../package-system --clean",
         "azslc-1.7.23-rev2-linux": "Scripts/extras/pull_and_build_from_git.py ../../package-system/azslc --platform-name Linux --package-root ../../package-system --clean",
         "azslc-1.7.23-rev2-linux": "Scripts/extras/pull_and_build_from_git.py ../../package-system/azslc --platform-name Linux --package-root ../../package-system --clean",
         "tiff-4.2.0.10-linux" : "package-system/tiff/build_package_image.py --platform linux",
         "tiff-4.2.0.10-linux" : "package-system/tiff/build_package_image.py --platform linux",
         "python-3.7.10-rev2-linux" : "package-system/python/build_package_image.py",
         "python-3.7.10-rev2-linux" : "package-system/python/build_package_image.py",
@@ -41,7 +42,8 @@
         "OpenSSL-1.1.1b-rev2-linux": "package-system/OpenSSL-linux",
         "OpenSSL-1.1.1b-rev2-linux": "package-system/OpenSSL-linux",
         "ilmbase-2.3.0-rev4-linux": "package-system/ilmbase-linux",
         "ilmbase-2.3.0-rev4-linux": "package-system/ilmbase-linux",
         "SPIRVCross-2021.04.29-rev1-linux": "package-system/SPIRVCross-linux",
         "SPIRVCross-2021.04.29-rev1-linux": "package-system/SPIRVCross-linux",
-        "DirectXShaderCompilerDxc-1.6.2104-o3de-rev3-linux": "package-system/DirectXShaderCompilerDxc-linux",
+        "squish-ccr-deb557d-rev1-linux" : "package-system/squish-ccr-linux",
+        "DirectXShaderCompilerDxc-1.6.2104-o3de-rev2-linux": "package-system/DirectXShaderCompilerDxc-linux",
         "azslc-1.7.23-rev2-linux": "package-system/azslc-linux",
         "azslc-1.7.23-rev2-linux": "package-system/azslc-linux",
         "tiff-4.2.0.10-linux" : "package-system/tiff-linux",
         "tiff-4.2.0.10-linux" : "package-system/tiff-linux",
         "python-3.7.10-rev2-linux" : "package-system/python/linux_x64/package",
         "python-3.7.10-rev2-linux" : "package-system/python/linux_x64/package",

+ 4 - 3
package_build_list_host_windows.json

@@ -26,7 +26,8 @@
         "OpenSSL-1.1.1b-rev1-android": "package-system/OpenSSL/build_package_image.py --platform-name android",
         "OpenSSL-1.1.1b-rev1-android": "package-system/OpenSSL/build_package_image.py --platform-name android",
         "ilmbase-2.3.0-rev4-windows": "Scripts/extras/pull_and_build_from_git.py ../../package-system/OpenEXR --platform-name Windows --package-root ../../package-system --clean",
         "ilmbase-2.3.0-rev4-windows": "Scripts/extras/pull_and_build_from_git.py ../../package-system/OpenEXR --platform-name Windows --package-root ../../package-system --clean",
         "SPIRVCross-2021.04.29-rev1-windows": "Scripts/extras/pull_and_build_from_git.py ../../package-system/SPIRVCross --platform-name Windows --package-root ../../package-system --clean",
         "SPIRVCross-2021.04.29-rev1-windows": "Scripts/extras/pull_and_build_from_git.py ../../package-system/SPIRVCross --platform-name Windows --package-root ../../package-system --clean",
-        "DirectXShaderCompilerDxc-1.6.2104-o3de-rev3-windows": "Scripts/extras/pull_and_build_from_git.py ../../package-system/DirectXShaderCompiler --platform-name Windows --package-root ../../package-system --clean",
+        "squish-ccr-deb557d-rev1-windows" : "Scripts/extras/pull_and_build_from_git.py ../../package-system/squish-ccr --platform-name Windows --package-root ../../package-system --clean",
+        "DirectXShaderCompilerDxc-1.6.2104-o3de-rev2-windows": "Scripts/extras/pull_and_build_from_git.py ../../package-system/DirectXShaderCompiler --platform-name Windows --package-root ../../package-system --clean",
         "azslc-1.7.23-rev2-windows": "Scripts/extras/pull_and_build_from_git.py ../../package-system/azslc --platform-name Windows --package-root ../../package-system --clean",
         "azslc-1.7.23-rev2-windows": "Scripts/extras/pull_and_build_from_git.py ../../package-system/azslc --platform-name Windows --package-root ../../package-system --clean",
         "PhysX-4.1.2.29882248-rev3-windows" : "package-system/PhysX/build_package_image.py --platform windows",
         "PhysX-4.1.2.29882248-rev3-windows" : "package-system/PhysX/build_package_image.py --platform windows",
         "PhysX-4.1.2.29882248-rev3-android" : "package-system/PhysX/build_package_image.py --platform android",
         "PhysX-4.1.2.29882248-rev3-android" : "package-system/PhysX/build_package_image.py --platform android",
@@ -72,7 +73,6 @@
     "alembic-1.7.11-rev3-multiplatform": "package-system/alembic-multiplatform",
     "alembic-1.7.11-rev3-multiplatform": "package-system/alembic-multiplatform",
     "ilmbase-2.3.0-rev4-windows": "package-system/ilmbase-windows",
     "ilmbase-2.3.0-rev4-windows": "package-system/ilmbase-windows",
     "assimp-5.0.1-rev11-multiplatform": "package-system/assimp-multiplatform",
     "assimp-5.0.1-rev11-multiplatform": "package-system/assimp-multiplatform",
-    "squish-ccr-20150601-rev3-multiplatform": "package-system/squish-ccr-multiplatform",
     "md5-2.0-multiplatform": "package-system/md5-multiplatform",
     "md5-2.0-multiplatform": "package-system/md5-multiplatform",
     "RapidJSON-1.1.0-rev1-multiplatform": "package-system/RapidJSON-multiplatform",
     "RapidJSON-1.1.0-rev1-multiplatform": "package-system/RapidJSON-multiplatform",
     "RapidXML-1.13-multiplatform": "package-system/RapidXML-multiplatform",
     "RapidXML-1.13-multiplatform": "package-system/RapidXML-multiplatform",
@@ -94,7 +94,8 @@
     "openimageio-2.1.16.0-rev2-windows": "package-system/openimageio-windows",
     "openimageio-2.1.16.0-rev2-windows": "package-system/openimageio-windows",
     "v-hacd-2.3-1a49edf-rev1-windows": "package-system/v-hacd-windows",
     "v-hacd-2.3-1a49edf-rev1-windows": "package-system/v-hacd-windows",
     "SPIRVCross-2021.04.29-rev1-windows": "package-system/SPIRVCross-windows",
     "SPIRVCross-2021.04.29-rev1-windows": "package-system/SPIRVCross-windows",
-    "DirectXShaderCompilerDxc-1.6.2104-o3de-rev3-windows": "package-system/DirectXShaderCompilerDxc-windows",
+    "squish-ccr-deb557d-rev1-windows" : "package-system/squish-ccr-windows",
+    "DirectXShaderCompilerDxc-1.6.2104-o3de-rev2-windows": "package-system/DirectXShaderCompilerDxc-windows",
     "azslc-1.7.23-rev2-windows": "package-system/azslc-windows",
     "azslc-1.7.23-rev2-windows": "package-system/azslc-windows",
     "zstd-1.35-multiplatform": "package-system/zstd-multiplatform",
     "zstd-1.35-multiplatform": "package-system/zstd-multiplatform",
     "SQLite-3.32.2-rev3-multiplatform": "package-system/SQLite-multiplatform",
     "SQLite-3.32.2-rev3-multiplatform": "package-system/SQLite-multiplatform",