|
@@ -0,0 +1,2537 @@
|
|
|
+diff --git a/bitoneset.cpp b/bitoneset.cpp
|
|
|
+index bc0a0a7..3dc456d 100644
|
|
|
+--- a/bitoneset.cpp
|
|
|
++++ b/bitoneset.cpp
|
|
|
+@@ -371,7 +371,7 @@ BitoneSet::BitoneSet(f23 const* rgba, int mask, int flags)
|
|
|
+ void BitoneSet::RemapIndices(u8 const* source, u8* target) const
|
|
|
+ {
|
|
|
+ for (int i = 0; i < 16; ++i) {
|
|
|
+- u8 t = 3; t = ((m_remap[i] == -1) ? t : source[m_remap[i]]); target[i] = t;
|
|
|
++ u8 t = 3; t = ((m_remap[i] == -1) ? t : source[static_cast<int>(m_remap[i])]); target[i] = t;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ #endif
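
Note on the RemapIndices change above (the same cast recurs in colourset.cpp and paletteset.cpp below): m_remap presumably holds small signed, char-typed entries with -1 as the "unused" sentinel, and using such a value directly as an array subscript trips -Wchar-subscripts on GCC/clang, which is what the static_cast<int> silences. A minimal standalone sketch of the pattern, with hypothetical names standing in for the members used in the patch:

    #include <cstdint>

    // Hypothetical remap table: -1 marks a slot with no source index.
    static const int8_t remap[16] = { 0, 1, 2, 3, -1, -1, 4, 5,
                                      6, 7, -1, 8, 9, 10, 11, -1 };

    void RemapIndicesSketch(const uint8_t* source, uint8_t* target)
    {
      for (int i = 0; i < 16; ++i) {
        uint8_t t = 3;                                  // default for unmapped slots
        if (remap[i] != -1)
          t = source[static_cast<int>(remap[i])];       // int subscript: no -Wchar-subscripts
        target[i] = t;
      }
    }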
|
|
|
+diff --git a/colourset.cpp b/colourset.cpp
|
|
|
+index 9af55ef..dcc4a5d 100644
|
|
|
+--- a/colourset.cpp
|
|
|
++++ b/colourset.cpp
|
|
|
+@@ -25,6 +25,7 @@
|
|
|
+ -------------------------------------------------------------------------- */
|
|
|
+
|
|
|
+ #include <assert.h>
|
|
|
++#include <string.h>
|
|
|
+ #include "colourset.h"
|
|
|
+ #include "helpers.h"
|
|
|
+
|
|
|
+@@ -409,7 +410,7 @@ bool ColourSet::RemoveBlack(const Vec3 &metric, Scr3 &error)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ // maps to black
|
|
|
+- Vec3 colour = m_points[m_remap[i]];
|
|
|
++ Vec3 colour = m_points[static_cast<int>(m_remap[i])];
|
|
|
+ /*Vec3 result = q.SnapToLattice(colour);*/
|
|
|
+ if (true /*CompareAllEqualTo(result, Vec3(0.0f))*/) {
|
|
|
+ Scr3 len = LengthSquared(metric * colour);
|
|
|
+@@ -451,7 +452,7 @@ bool ColourSet::RemoveBlack(const Vec3 &metric, Scr3 &error)
|
|
|
+ void ColourSet::RemapIndices(u8 const* source, u8* target) const
|
|
|
+ {
|
|
|
+ for (int i = 0; i < 16; ++i) {
|
|
|
+- u8 t = 3; t = ((m_remap[i] == -1) ? t : source[m_remap[i]]); target[i] = t;
|
|
|
++ u8 t = 3; t = ((m_remap[i] == -1) ? t : source[static_cast<int>(m_remap[i])]); target[i] = t;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ #endif
|
|
|
+diff --git a/config.h b/config.h
|
|
|
+index ef7dbbd..9b1bf89 100644
|
|
|
+--- a/config.h
|
|
|
++++ b/config.h
|
|
|
+@@ -413,7 +413,8 @@ using namespace ::Concurrency;
|
|
|
+ #ifdef __GNUC__
|
|
|
+ #define assume
|
|
|
+ #define doinline
|
|
|
+-#define passreg __fastcall
|
|
|
++// clang warns about __fastcall on x86_64, and __fastcall only applies to i386 anyway
|
|
|
++#define passreg
|
|
|
+ #else
|
|
|
+ #define assume __assume
|
|
|
+ #define doinline __forceinline
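
For context on the passreg change: __fastcall is a 32-bit x86 calling-convention keyword; on x86_64 the standard ABI already passes arguments in registers, and clang warns that the attribute is ignored, so expanding passreg to nothing under __GNUC__ loses nothing. A small illustration of how the macro is consumed (the function itself is a made-up example, not from the library):

    // Simplified excerpt of the macro logic from config.h plus a sample use.
    #ifdef __GNUC__
    #define passreg              /* expands to nothing: default ABI, no warning */
    #else
    #define passreg __fastcall   /* MSVC x86-32: first arguments in ECX/EDX */
    #endif

    static int passreg AddSample(int a, int b) { return a + b; }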
|
|
|
+diff --git a/inlineables.cpp b/inlineables.cpp
|
|
|
+index f2e0ca1..cdb51bc 100644
|
|
|
+--- a/inlineables.cpp
|
|
|
++++ b/inlineables.cpp
|
|
|
+@@ -162,6 +162,8 @@ static const vQuantizer q8880s1(8, 8, 8, 0, ~0);
|
|
|
+ static const vQuantizer q7770s1(7, 7, 7, 0, ~0);
|
|
|
+ static const vQuantizer q5550s1(5, 5, 5, 0, ~0);
|
|
|
+
|
|
|
++static const vQuantizer invalidQuantizer(0, 0, 0, 0, 0);
|
|
|
++
|
|
|
+ #define vGetQuantizer(r, g, b, a) \
|
|
|
+ (((r) == 7) && ((a) == 8) ? q7778s1 : \
|
|
|
+ (((r) == 5) && ((a) == 6) ? q5556s1 : \
|
|
|
+@@ -171,7 +173,7 @@ static const vQuantizer q5550s1(5, 5, 5, 0, ~0);
|
|
|
+ (((r) == 8) && ((a) == 1) ? q8880s1 : \
|
|
|
+ (((r) == 7) && ((a) == 1) ? q7770s1 : \
|
|
|
+ (((r) == 5) && ((a) == 1) ? q5550s1 : \
|
|
|
+- (vQuantizer&)*(vQuantizer*)nullptr))))))))
|
|
|
++ invalidQuantizer))))))))
|
|
|
+
|
|
|
+ #define eGetQuantizer(r, g, b, a, e) \
|
|
|
+ (((r) == 7) && ((a) == 8) && ((e) == ~0) ? q7778s1 : \
|
|
|
+@@ -182,7 +184,7 @@ static const vQuantizer q5550s1(5, 5, 5, 0, ~0);
|
|
|
+ (((r) == 8) && ((a) == 1) && ((e) == 0) ? q8880s0 : \
|
|
|
+ (((r) == 7) && ((a) == 1) && ((e) == 0) ? q7770s0 : \
|
|
|
+ (((r) == 5) && ((a) == 1) && ((e) == 0) ? q5550s0 : \
|
|
|
+- (vQuantizer&)*(vQuantizer*)nullptr))))))))
|
|
|
++ invalidQuantizer))))))))
|
|
|
+
|
|
|
+ template<const int rb, const int gb, const int bb, const int ab, const int eb, const int sb>
|
|
|
+ static doinline void passreg FloatTo(Vec4 (&colour)[1], Col4 (&field)[1][FIELDN], int bitset) ccr_restricted
|
|
|
+@@ -900,15 +902,16 @@ static doinline void passreg Codebook6or8(s16 (&codes)[8*1], bool bw) ccr_restri
|
|
|
+ cd = (2 * c + 3 * d); codes[4 + i] = (s16)((cd * 0x3334) >> 16) + (cd < 0);
|
|
|
+ cd = (1 * c + 4 * d); codes[5 + i] = (s16)((cd * 0x3334) >> 16) + (cd < 0);
|
|
|
+
|
|
|
+- codes[6 + i] = (s16)-127 << prc;
|
|
|
+- codes[7 + i] = (s16) 127 << prc;
|
|
|
++    // Left-shifting a negative value is undefined behaviour; convert to unsigned first
|
|
|
++ codes[6 + i] = (s16) (((u16)(-127)) << prc);
|
|
|
++ codes[7 + i] = (s16) (127 << prc);
|
|
|
+
|
|
|
+ assert(s16(codes[2]) == (((s16(4) * s16(codes[0])) + (s16(1) * s16(codes[1]))) / 5));
|
|
|
+ assert(s16(codes[3]) == (((s16(3) * s16(codes[0])) + (s16(2) * s16(codes[1]))) / 5));
|
|
|
+ assert(s16(codes[4]) == (((s16(2) * s16(codes[0])) + (s16(3) * s16(codes[1]))) / 5));
|
|
|
+ assert(s16(codes[5]) == (((s16(1) * s16(codes[0])) + (s16(4) * s16(codes[1]))) / 5));
|
|
|
+- assert(s16(codes[6]) == (-127 << prc));
|
|
|
+- assert(s16(codes[7]) == ( 127 << prc));
|
|
|
++    assert(s16(codes[6]) == -(127 << prc));
|
|
|
++ assert(s16(codes[7]) == (127 << prc));
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ cd = (6 * c + 1 * d); codes[2 + i] = (s16)((cd * 0x4925) >> 17) + (cd < 0);
|
|
|
+@@ -1063,7 +1066,8 @@ static doinline void passreg Codebook6(Col8 &codes, Col8::Arg start, Col8::Arg e
|
|
|
+ // max signed: (5 * 127) << 5 = 20320 / 0x4F60 fits signed short
|
|
|
+ const Col8 smul = Col8(0x05 << pb, 0x00 << pb, 0x04 << pb, 0x03 << pb, 0x02 << pb, 0x01 << pb, 0x00 << pb, 0x00 << pb);
|
|
|
+ const Col8 emul = Col8(0x00 << pb, 0x05 << pb, 0x01 << pb, 0x02 << pb, 0x03 << pb, 0x04 << pb, 0x00 << pb, 0x00 << pb);
|
|
|
+- const Col8 mask = Col8(0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, min << pb, max << pb);
|
|
|
++  // Left-shifting a negative value is undefined behaviour; convert to unsigned first
|
|
|
++  const Col8 mask = Col8(0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, 0x00 << pb, ((u16)min) << pb, ((u16)max) << pb);
|
|
|
+
|
|
|
+ // range [0,2*5*255]
|
|
|
+ Col8 ipol = (smul * start) + (emul * end);
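
The casts introduced above address the fact that left-shifting a negative signed value is undefined behaviour in C/C++; shifting the value as unsigned and narrowing back to s16 yields the intended two's-complement bit pattern (the narrowing conversion is implementation-defined before C++20, but wraps on all relevant targets). A self-contained sketch of the pattern, assuming 16-bit s16/u16 aliases like the library's:

    #include <cassert>
    #include <cstdint>

    using s16 = int16_t;
    using u16 = uint16_t;

    // Shift a possibly negative 16-bit value left without signed-shift UB:
    // widen to unsigned, shift, then narrow back to the signed type.
    static s16 ShiftLeftSigned(s16 v, int amount)
    {
      return static_cast<s16>(static_cast<u16>(v) << amount);
    }

    int main()
    {
      const int prc = 5;
      assert(ShiftLeftSigned(-127, prc) == -(127 << prc));   // -4064
      assert(ShiftLeftSigned( 127, prc) ==  (127 << prc));    //  4064
      return 0;
    }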
|
|
|
+diff --git a/maths.cpp b/maths.cpp
|
|
|
+index d9c3808..b58c36a 100644
|
|
|
+--- a/maths.cpp
|
|
|
++++ b/maths.cpp
|
|
|
+@@ -790,7 +790,16 @@ void EstimatePrincipleComponent(Sym3x3 const& matrix, Vec4 &out)
|
|
|
+ Scr4 y = Dot(v, row1);
|
|
|
+ Scr4 z = Dot(v, row2);
|
|
|
+
|
|
|
+- v = Vec4(x, y, z);
|
|
|
++    // This guards against NaNs caused by extremely small values.
|
|
|
++ if(Vec3(x,y,z) < Vec3(FLT_EPSILON))
|
|
|
++ {
|
|
|
++ v = Vec4(FLT_EPSILON,FLT_EPSILON,FLT_EPSILON);
|
|
|
++ }
|
|
|
++ else
|
|
|
++ {
|
|
|
++ v = Vec4(x, y, z);
|
|
|
++ }
|
|
|
++
|
|
|
+ v *= Reciprocal(HorizontalMax(Abs(v)));
|
|
|
+ }
|
|
|
+ #if POWER_ITERATION_COUNT <= 0
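
Regarding the maths.cpp guard above: after the matrix multiply the vector is rescaled by Reciprocal(HorizontalMax(Abs(v))), so if every component has collapsed towards zero the reciprocal blows up and the power iteration fills with infs/NaNs; substituting a tiny non-zero vector keeps the iteration stable. A scalar sketch of the same idea in plain floats (the real code uses the library's Vec3/Vec4 comparison, which presumably tests all components at once):

    #include <cfloat>
    #include <cmath>

    // One normalisation step of a power iteration, guarded against a
    // (near-)zero vector: the reciprocal of ~0 would give inf/NaN.
    static void NormaliseStep(float& x, float& y, float& z)
    {
      if (std::fabs(x) < FLT_EPSILON &&
          std::fabs(y) < FLT_EPSILON &&
          std::fabs(z) < FLT_EPSILON) {
        x = y = z = FLT_EPSILON;                 // tiny but safe to normalise
      }
      const float m = std::fmax(std::fabs(x), std::fmax(std::fabs(y), std::fabs(z)));
      const float r = 1.0f / m;                  // m >= FLT_EPSILON here
      x *= r; y *= r; z *= r;
    }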
|
|
|
+diff --git a/paletteclusterfit.cpp b/paletteclusterfit.cpp
|
|
|
+index 2d6f5a1..b98e975 100644
|
|
|
+--- a/paletteclusterfit.cpp
|
|
|
++++ b/paletteclusterfit.cpp
|
|
|
+@@ -26,6 +26,7 @@
|
|
|
+ -------------------------------------------------------------------------- */
|
|
|
+
|
|
|
+ #include <assert.h>
|
|
|
++#include <stdio.h>
|
|
|
+
|
|
|
+ #include "paletteclusterfit.h"
|
|
|
+ #include "paletteset.h"
|
|
|
+diff --git a/palettefit.cpp b/palettefit.cpp
|
|
|
+index 062f45c..120da27 100644
|
|
|
+--- a/palettefit.cpp
|
|
|
++++ b/palettefit.cpp
|
|
|
+@@ -150,9 +150,9 @@ const int *PaletteFit::GetSharedMap(int mode) {
|
|
|
+ }
|
|
|
+
|
|
|
+ int PaletteFit::GetSharedSkip(int mode) {
|
|
|
+- if (PBcfg[mode].EPB) return skip[1][PBcfg[mode].NS];
|
|
|
+- if (PBcfg[mode].SPB) return skip[0][PBcfg[mode].NS];
|
|
|
+- return NULL;
|
|
|
++ if (PBcfg[mode].EPB) return skip[1][static_cast<int>(PBcfg[mode].NS)];
|
|
|
++ if (PBcfg[mode].SPB) return skip[0][static_cast<int>(PBcfg[mode].NS)];
|
|
|
++ return 0;
|
|
|
+ }
|
|
|
+
|
|
|
+ int PaletteFit::GetPrecisionBits(int mode) {
|
|
|
+diff --git a/paletteset.cpp b/paletteset.cpp
|
|
|
+index bee740c..8c7aea0 100644
|
|
|
+--- a/paletteset.cpp
|
|
|
++++ b/paletteset.cpp
|
|
|
+@@ -1248,7 +1248,7 @@ void PaletteSet::RemapIndices(u8 const* source, u8* target, int set) const
|
|
|
+ if ((imask & 1) == 0)
|
|
|
+ continue;
|
|
|
+
|
|
|
+- u8 t = 0; t = ((m_remap[s][i] == -1) ? t : source[m_remap[s][i]]); target[i] = t;
|
|
|
++ u8 t = 0; t = ((m_remap[s][i] == -1) ? t : source[static_cast<int>(m_remap[s][i])]); target[i] = t;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+diff --git a/simd_sse.h b/simd_sse.h
|
|
|
+index f959e20..1a2f6b8 100644
|
|
|
+--- a/simd_sse.h
|
|
|
++++ b/simd_sse.h
|
|
|
+@@ -1,7 +1,7 @@
|
|
|
+ /* -----------------------------------------------------------------------------
|
|
|
+
|
|
|
+ Copyright (c) 2006 Simon Brown [email protected]
|
|
|
+- Copyright (c) 2012 Niels Fröhling [email protected]
|
|
|
++ Copyright (c) 2012 Niels Fröhling [email protected]
|
|
|
+
|
|
|
+ Permission is hereby granted, free of charge, to any person obtaining
|
|
|
+ a copy of this software and associated documentation files (the
|
|
|
+@@ -33,6 +33,7 @@
|
|
|
+ #endif
|
|
|
+ #if ( SQUISH_USE_SSE >= 3 )
|
|
|
+ #include <pmmintrin.h>
|
|
|
++#include <smmintrin.h>
|
|
|
+ #endif
|
|
|
+ #if ( SQUISH_USE_SSE >= 4 )
|
|
|
+ #include <smmintrin.h>
|
|
|
+@@ -69,6 +70,12 @@
|
|
|
+
|
|
|
+ namespace squish {
|
|
|
+
|
|
|
++class Col3;
|
|
|
++class Col4;
|
|
|
++class Col8;
|
|
|
++class Vec3;
|
|
|
++class Vec4;
|
|
|
++
|
|
|
+ #define COL4_CONST( X ) Col4( X )
|
|
|
+
|
|
|
+
|
|
|
+@@ -263,7 +270,7 @@ public:
|
|
|
+ Col3& operator/=( short v )
|
|
|
+ {
|
|
|
+ __m128
|
|
|
+-
|
|
|
++
|
|
|
+ fp = _mm_cvtepi32_ps(m_v);
|
|
|
+ fp = _mm_div_ps(fp, _mm_set1_ps(v));
|
|
|
+ m_v = _mm_cvttps_epi32(fp);
|
|
|
+@@ -351,64 +358,18 @@ public:
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col3 ShiftLeft( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Col3 ShiftLeft( Arg a )
|
|
|
+- {
|
|
|
+- if ((n) <= 0)
|
|
|
+- return Col3( a.m_v );
|
|
|
+- if ((n) <= 7)
|
|
|
+- return Col3( _mm_slli_epi32( a.m_v, (n) & 7 ) );
|
|
|
+- if ((n) & 7)
|
|
|
+- return Col3( _mm_slli_epi32( _mm_slli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
|
|
|
+-
|
|
|
+- return Col3( _mm_slli_si128( a.m_v, (n) >> 3 ) );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col3 ShiftRight( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Col3 ShiftRight( Arg a )
|
|
|
+- {
|
|
|
+- if ((n) <= 0)
|
|
|
+- return Col3( a.m_v );
|
|
|
+- if ((n) <= 7)
|
|
|
+- return Col3( _mm_srli_epi32( a.m_v, (n) & 7 ) );
|
|
|
+- if ((n) & 7)
|
|
|
+- return Col3( _mm_srli_epi32( _mm_srli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
|
|
|
+-
|
|
|
+- return Col3( _mm_srli_si128( a.m_v, (n) >> 3 ) );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col3 ShiftRightHalf( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Col3 ShiftRightHalf( Arg a )
|
|
|
+- {
|
|
|
+- return Col3( (n) > 0 ? _mm_srli_epi64( a.m_v, (n) ) : a.m_v );
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col3 ShiftRightHalf( Arg a, const int n )
|
|
|
+- {
|
|
|
+- return Col3( _mm_srl_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col3 ShiftRightHalf( Arg a, Arg b )
|
|
|
+- {
|
|
|
+- return Col3( _mm_srl_epi64( a.m_v, b.m_v ) );
|
|
|
+- }
|
|
|
++ friend Col3 ShiftRightHalf( Arg a, const int n );
|
|
|
++ friend Col3 ShiftRightHalf( Arg a, Arg b );
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col3 ShiftLeftHalf( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Col3 ShiftLeftHalf( Arg a )
|
|
|
+- {
|
|
|
+- return Col3( (n) > 0 ? _mm_slli_epi64( a.m_v, (n) ) : a.m_v );
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col3 ShiftLeftHalf( Arg a, const int n )
|
|
|
+- {
|
|
|
+- return Col3( _mm_sll_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
|
|
|
+- }
|
|
|
++ friend Col3 ShiftLeftHalf( Arg a, const int n );
|
|
|
+
|
|
|
+ template<const int r, const int g, const int b>
|
|
|
+ friend Col3 ShiftLeftLo( Arg v )
|
|
|
+@@ -422,140 +383,24 @@ public:
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend Col3 MaskBits( Arg a );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend Col3 MaskBits( Arg a )
|
|
|
+- {
|
|
|
+- if ((p + n) <= 0)
|
|
|
+- return Col3(0);
|
|
|
+- if ((p + n) >= 64)
|
|
|
+- return a;
|
|
|
+-
|
|
|
+- // compile time
|
|
|
+- __int64 base = ~(0xFFFFFFFFFFFFFFFFULL << ( (p + n) & 63));
|
|
|
+- // __int64 base = (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63));
|
|
|
+- __m128i mask = _mm_setr_epi32(
|
|
|
+- (int)(base >> 0),
|
|
|
+- (int)(base >> 32), 0, 0
|
|
|
+- );
|
|
|
+-
|
|
|
+- return Col3( _mm_and_si128( a.m_v, mask ) );
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col3 MaskBits( Arg a, const int n, const int p )
|
|
|
+- {
|
|
|
+- const int val = 64 - (p + n);
|
|
|
+-
|
|
|
+- __m128i shift = _mm_max_epi16( _mm_cvtsi32_si128( val ), _mm_set1_epi32( 0 ) );
|
|
|
+- __m128i mask = _mm_setr_epi32(
|
|
|
+- 0xFFFFFFFF,
|
|
|
+- 0xFFFFFFFF, 0, 0
|
|
|
+- );
|
|
|
+-
|
|
|
+- mask = _mm_srl_epi64( mask, shift );
|
|
|
+-
|
|
|
+- // (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63))
|
|
|
+- return Col3( _mm_and_si128( a.m_v, mask ) );
|
|
|
+- }
|
|
|
++ friend Col3 MaskBits(Arg a, const int n, const int p);
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend Col3 CopyBits( Arg left, Arg right );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend Col3 CopyBits( Arg left, Arg right )
|
|
|
+- {
|
|
|
+- if (!(n))
|
|
|
+- return left;
|
|
|
+- if (!(p))
|
|
|
+- return MaskBits<n, 0>(right);
|
|
|
+- if (((p) + (n)) >= 64)
|
|
|
+- return (left) + ShiftLeftHalf<p>(right);
|
|
|
+-
|
|
|
+-#if ( SQUISH_USE_XSSE == 4 )
|
|
|
+- return Col3( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
|
|
|
+-#else
|
|
|
+- return MaskBits<p, 0>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
+- // return (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
+-#endif
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col3 CopyBits( Arg left, Col3 &right, const int n, const int p )
|
|
|
+- {
|
|
|
+-#if ( SQUISH_USE_XSSE == 4 )
|
|
|
+- /* ---- ---bl xxxx xxxx */
|
|
|
+- const int val = (p << 8) + (n << 0);
|
|
|
+-
|
|
|
+- right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
|
|
|
+- return Col3( _mm_insert_si64( left.m_v, right.m_v ) );
|
|
|
+-#else
|
|
|
+- return MaskBits(left, p, 0) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
+- // return (left ) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
+-#endif
|
|
|
+- }
|
|
|
+
|
|
|
++ friend Col3 CopyBits( Arg left, Col3 &right, const int n, const int p );
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend Col3 ExtrBits( Arg a );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend Col3 ExtrBits( Arg a )
|
|
|
+- {
|
|
|
+- if (!(n))
|
|
|
+- return Col3(0);
|
|
|
+- if (!(p))
|
|
|
+- return MaskBits<n, 0>(a);
|
|
|
+- if (((n) + (p)) >= 64)
|
|
|
+- return ShiftRightHalf<p>(a);
|
|
|
+-
|
|
|
+-#if ( SQUISH_USE_XSSE == 4 )
|
|
|
+- return Col3( _mm_extracti_si64( a.m_v, n, p ) );
|
|
|
+-#else
|
|
|
+- return MaskBits<n, 0>(ShiftRightHalf<p>(a));
|
|
|
+-#endif
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col3 ExtrBits( Arg a, const int n, const int p )
|
|
|
+- {
|
|
|
+-#if ( SQUISH_USE_XSSE == 4 )
|
|
|
+- /* ---- ----- ---- ---bl */
|
|
|
+- const int val = (p << 8) + (n << 0);
|
|
|
+-
|
|
|
+- return Col3( _mm_extract_si64( a.m_v, _mm_cvtsi32_si128( val ) ) );
|
|
|
+-#else
|
|
|
+- return MaskBits(ShiftRightHalf(a, p), n, 0);
|
|
|
+-#endif
|
|
|
+- }
|
|
|
+
|
|
|
++ friend Col3 ExtrBits( Arg a, const int n, const int p );
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend void ExtrBits( Arg left, Col3 &right );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend void ExtrBits( Arg left, Col3 &right )
|
|
|
+- {
|
|
|
+- right = ExtrBits<n, p>( left );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend void ConcBits( Arg left, Col3 &right );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend void ConcBits( Arg left, Col3 &right )
|
|
|
+- {
|
|
|
+- right = ShiftLeft<32>( right );
|
|
|
+- if (n > 0)
|
|
|
+- right += ExtrBits<n, p>( left );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend void ReplBits( Arg left, Col3 &right );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend void ReplBits( Arg left, Col3 &right )
|
|
|
+- {
|
|
|
+- if (!n)
|
|
|
+- return;
|
|
|
+- if ((n < 0)) {
|
|
|
+- right = ExtrBits<-n, p>( left );
|
|
|
+- right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 3 ) );
|
|
|
+- }
|
|
|
+- else {
|
|
|
+- right = ExtrBits< n, p>( left );
|
|
|
+- right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
|
|
|
+- }
|
|
|
+- }
|
|
|
+
|
|
|
+ friend Col3 Mul16x16u( Arg a, Arg b )
|
|
|
+ {
|
|
|
+@@ -652,18 +497,7 @@ public:
|
|
|
+ template<const int f, const int t>
|
|
|
+ friend Col3 Exchange( Arg a );
|
|
|
+ template<const int f, const int t>
|
|
|
+- friend Col3 Exchange( Arg a )
|
|
|
+- {
|
|
|
+- if (f == t)
|
|
|
+- return a;
|
|
|
+-
|
|
|
+- return Col3( _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SHUF(
|
|
|
+- (t == 0 ? f : (f == 0 ? t : 0)),
|
|
|
+- (t == 1 ? f : (f == 1 ? t : 1)),
|
|
|
+- (t == 2 ? f : (f == 2 ? t : 2)),
|
|
|
+- (t == 3 ? f : (f == 3 ? t : 3))
|
|
|
+- ) ) );
|
|
|
+- }
|
|
|
++ friend Col3 Exchange( Arg a );
|
|
|
+
|
|
|
+ friend Col3 HorizontalAdd( Arg a )
|
|
|
+ {
|
|
|
+@@ -751,7 +585,7 @@ public:
|
|
|
+ return HorizontalAdd( a, b );
|
|
|
+ #endif
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col3 HorizontalMaxTiny( Arg a )
|
|
|
+ {
|
|
|
+ #if ( SQUISH_USE_SSE >= 4 ) && 0
|
|
|
+@@ -867,7 +701,7 @@ public:
|
|
|
+
|
|
|
+ return Col3( _mm_castps_si128 ( resc ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend bool CompareFirstLessThan( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ __m128i bits = _mm_cmplt_epi32( left.m_v, right.m_v );
|
|
|
+@@ -937,7 +771,7 @@ public:
|
|
|
+
|
|
|
+ loc = _mm_cvtsi128_si32( r );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend void PackBytes( Arg a, int &loc )
|
|
|
+ {
|
|
|
+ __m128i
|
|
|
+@@ -947,7 +781,7 @@ public:
|
|
|
+
|
|
|
+ loc = _mm_cvtsi128_si32( r );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend void PackWords( Arg a, unsigned__int64 &loc )
|
|
|
+ {
|
|
|
+ __m128i
|
|
|
+@@ -964,17 +798,17 @@ public:
|
|
|
+ // loc = _mm_cvtsi128_si64( r );
|
|
|
+ _mm_storel_epi64( (__m128i *)&loc, r );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend void PackWords( Arg a, __int64 &loc )
|
|
|
+ {
|
|
|
+ __m128i
|
|
|
+-
|
|
|
++
|
|
|
+ r = _mm_packs_epi32( a.m_v, a.m_v );
|
|
|
+
|
|
|
+ // loc = _mm_cvtsi128_si64( r );
|
|
|
+ _mm_storel_epi64( (__m128i *)&loc, r );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ // clamp the output to [0, 1]
|
|
|
+ Col3 Clamp() const {
|
|
|
+ Col3 const one (0xFF);
|
|
|
+@@ -1020,17 +854,17 @@ public:
|
|
|
+ {
|
|
|
+ _mm_store_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend void StoreUnaligned( Arg a, void *destination )
|
|
|
+ {
|
|
|
+ _mm_storeu_si128( (__m128i *)destination, a.m_v );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend void StoreUnaligned( Arg a, Arg b, void *destination )
|
|
|
+ {
|
|
|
+ _mm_storeu_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend void StoreUnaligned( Arg a, u8* loc ) {
|
|
|
+ PackBytes( a, (unsigned int&) (*((unsigned int *)loc)) ); }
|
|
|
+ friend void StoreUnaligned( Arg a, u16* loc ) {
|
|
|
+@@ -1043,10 +877,202 @@ public:
|
|
|
+ private:
|
|
|
+ __m128i m_v;
|
|
|
+
|
|
|
+- friend class Col4;
|
|
|
+- friend class Vec3;
|
|
|
++ friend squish::Col4;
|
|
|
++ friend squish::Vec3;
|
|
|
+ };
|
|
|
+
|
|
|
++template<const int f, const int t>
|
|
|
++Col3 Exchange( Col3::Arg a )
|
|
|
++{
|
|
|
++ if (f == t)
|
|
|
++ return a;
|
|
|
++
|
|
|
++ return Col3( _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SHUF(
|
|
|
++ (t == 0 ? f : (f == 0 ? t : 0)),
|
|
|
++ (t == 1 ? f : (f == 1 ? t : 1)),
|
|
|
++ (t == 2 ? f : (f == 2 ? t : 2)),
|
|
|
++ (t == 3 ? f : (f == 3 ? t : 3))
|
|
|
++ ) ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col3 ShiftRight(Col3::Arg a)
|
|
|
++{
|
|
|
++ if ((n) <= 0)
|
|
|
++ return Col3(a.m_v);
|
|
|
++ if ((n) <= 7)
|
|
|
++ return Col3(_mm_srli_epi32(a.m_v, (n) & 7));
|
|
|
++ if ((n) & 7)
|
|
|
++ return Col3(_mm_srli_epi32(_mm_srli_si128(a.m_v, (n) >> 3), (n) & 7));
|
|
|
++
|
|
|
++ return Col3(_mm_srli_si128(a.m_v, (n) >> 3));
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col3 ShiftLeftHalf( Col3::Arg a )
|
|
|
++{
|
|
|
++ return Col3( (n) > 0 ? _mm_slli_epi64( a.m_v, (n) ) : a.m_v );
|
|
|
++}
|
|
|
++
|
|
|
++inline Col3 ShiftLeftHalf( Col3::Arg a, const int n )
|
|
|
++{
|
|
|
++ return Col3( _mm_sll_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col3 ShiftRightHalf( Col3::Arg a )
|
|
|
++{
|
|
|
++ return Col3( (n) > 0 ? _mm_srli_epi64( a.m_v, (n) ) : a.m_v );
|
|
|
++}
|
|
|
++
|
|
|
++inline Col3 ShiftRightHalf( Col3::Arg a, const int n )
|
|
|
++{
|
|
|
++ return Col3( _mm_srl_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
|
|
|
++}
|
|
|
++
|
|
|
++inline Col3 ShiftRightHalf( Col3::Arg a, Col3::Arg b )
|
|
|
++{
|
|
|
++ return Col3( _mm_srl_epi64( a.m_v, b.m_v ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++Col3 MaskBits( Col3::Arg a )
|
|
|
++{
|
|
|
++ if ((p + n) <= 0)
|
|
|
++ return Col3(0);
|
|
|
++ if ((p + n) >= 64)
|
|
|
++ return a;
|
|
|
++
|
|
|
++ // compile time
|
|
|
++ __int64 base = ~(0xFFFFFFFFFFFFFFFFULL << ( (p + n) & 63));
|
|
|
++ // __int64 base = (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63));
|
|
|
++ __m128i mask = _mm_setr_epi32(
|
|
|
++ (int)(base >> 0),
|
|
|
++ (int)(base >> 32), 0, 0
|
|
|
++ );
|
|
|
++
|
|
|
++ return Col3( _mm_and_si128( a.m_v, mask ) );
|
|
|
++}
|
|
|
++
|
|
|
++inline Col3 MaskBits( Col3::Arg a, const int n, const int p )
|
|
|
++{
|
|
|
++ const int val = 64 - (p + n);
|
|
|
++
|
|
|
++ __m128i shift = _mm_max_epi16( _mm_cvtsi32_si128( val ), _mm_set1_epi32( 0 ) );
|
|
|
++ __m128i mask = _mm_setr_epi32(
|
|
|
++ 0xFFFFFFFF,
|
|
|
++ 0xFFFFFFFF, 0, 0
|
|
|
++ );
|
|
|
++
|
|
|
++ mask = _mm_srl_epi64( mask, shift );
|
|
|
++
|
|
|
++ // (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63))
|
|
|
++ return Col3( _mm_and_si128( a.m_v, mask ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++Col3 CopyBits( Col3::Arg left, Col3::Arg right )
|
|
|
++{
|
|
|
++ if (!(n))
|
|
|
++ return left;
|
|
|
++ if (!(p))
|
|
|
++ return MaskBits<n, 0>(right);
|
|
|
++ if (((p) + (n)) >= 64)
|
|
|
++ return (left) + ShiftLeftHalf<p>(right);
|
|
|
++
|
|
|
++#if ( SQUISH_USE_XSSE == 4 )
|
|
|
++ return Col3( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
|
|
|
++#else
|
|
|
++ return MaskBits<p, 0>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
++ // return (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
++#endif
|
|
|
++}
|
|
|
++
|
|
|
++inline Col3 CopyBits( Col3::Arg left, Col3 &right, const int n, const int p )
|
|
|
++{
|
|
|
++#if ( SQUISH_USE_XSSE == 4 )
|
|
|
++ /* ---- ---bl xxxx xxxx */
|
|
|
++ const int val = (p << 8) + (n << 0);
|
|
|
++
|
|
|
++ right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
|
|
|
++ return Col3( _mm_insert_si64( left.m_v, right.m_v ) );
|
|
|
++#else
|
|
|
++ return MaskBits(left, p, 0) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
++ // return (left ) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
++#endif
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++Col3 ExtrBits( Col3::Arg a )
|
|
|
++{
|
|
|
++ if (!(n))
|
|
|
++ return Col3(0);
|
|
|
++ if (!(p))
|
|
|
++ return MaskBits<n, 0>(a);
|
|
|
++ if (((n) + (p)) >= 64)
|
|
|
++ return ShiftRightHalf<p>(a);
|
|
|
++
|
|
|
++#if ( SQUISH_USE_XSSE == 4 )
|
|
|
++ return Col3( _mm_extracti_si64( a.m_v, n, p ) );
|
|
|
++#else
|
|
|
++ return MaskBits<n, 0>(ShiftRightHalf<p>(a));
|
|
|
++#endif
|
|
|
++}
|
|
|
++
|
|
|
++inline Col3 ExtrBits( Col3::Arg a, const int n, const int p )
|
|
|
++{
|
|
|
++#if ( SQUISH_USE_XSSE == 4 )
|
|
|
++ /* ---- ----- ---- ---bl */
|
|
|
++ const int val = (p << 8) + (n << 0);
|
|
|
++
|
|
|
++ return Col3( _mm_extract_si64( a.m_v, _mm_cvtsi32_si128( val ) ) );
|
|
|
++#else
|
|
|
++ return MaskBits(ShiftRightHalf(a, p), n, 0);
|
|
|
++#endif
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col3 ShiftLeft( Col3::Arg a )
|
|
|
++{
|
|
|
++ if ((n) <= 0)
|
|
|
++ return Col3( a.m_v );
|
|
|
++ if ((n) <= 7)
|
|
|
++ return Col3( _mm_slli_epi32( a.m_v, (n) & 7 ) );
|
|
|
++ if ((n) & 7)
|
|
|
++ return Col3( _mm_slli_epi32( _mm_slli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
|
|
|
++
|
|
|
++ return Col3( _mm_slli_si128( a.m_v, (n) >> 3 ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++void ExtrBits( Col3::Arg left, Col3 &right )
|
|
|
++{
|
|
|
++ right = ExtrBits<n, p>( left );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++void ConcBits( Col3::Arg left, Col3 &right )
|
|
|
++{
|
|
|
++ right = ShiftLeft<32>( right );
|
|
|
++ if (n > 0)
|
|
|
++ right += ExtrBits<n, p>( left );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++void ReplBits( Col3::Arg left, Col3 &right )
|
|
|
++{
|
|
|
++ if (!n)
|
|
|
++ return;
|
|
|
++ if ((n < 0)) {
|
|
|
++ right = ExtrBits<-n, p>( left );
|
|
|
++ right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 3 ) );
|
|
|
++ }
|
|
|
++ else {
|
|
|
++ right = ExtrBits< n, p>( left );
|
|
|
++ right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
|
|
|
++ }
|
|
|
++}
|
|
|
++
|
|
|
+ class Col4
|
|
|
+ {
|
|
|
+ public:
|
|
|
+@@ -1305,317 +1331,56 @@ public:
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col4 FillSign( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Col4 FillSign( Arg a )
|
|
|
+- {
|
|
|
+- return Col4( _mm_srai_epi32( _mm_slli_epi32( a.m_v, n ), n ) );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col4 ExtendSign( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Col4 ExtendSign( Arg a )
|
|
|
+- {
|
|
|
+- return Col4( _mm_srai_epi32( a.m_v, n ) );
|
|
|
+- }
|
|
|
+-
|
|
|
++
|
|
|
+ template<const int n>
|
|
|
+ friend Col4 ShiftLeft( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Col4 ShiftLeft( Arg a )
|
|
|
+- {
|
|
|
+- if ((n) <= 0)
|
|
|
+- return Col4( a.m_v );
|
|
|
+- if ((n) <= 7)
|
|
|
+- return Col4( _mm_slli_epi32( a.m_v, (n) & 7 ) );
|
|
|
+- if ((n) & 7)
|
|
|
+- return Col4( _mm_slli_epi32( _mm_slli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
|
|
|
+-
|
|
|
+- return Col4( _mm_slli_si128( a.m_v, (n) >> 3 ) );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col4 ShiftRight( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Col4 ShiftRight( Arg a )
|
|
|
+- {
|
|
|
+- if ((n) <= 0)
|
|
|
+- return Col4( a.m_v );
|
|
|
+- if ((n) <= 7)
|
|
|
+- return Col4( _mm_srli_epi32( a.m_v, (n) & 7 ) );
|
|
|
+- if ((n) & 7)
|
|
|
+- return Col4( _mm_srli_epi32( _mm_srli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
|
|
|
+-
|
|
|
+- return Col4( _mm_srli_si128( a.m_v, (n) >> 3 ) );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col4 ShiftRightHalf( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Col4 ShiftRightHalf( Arg a )
|
|
|
+- {
|
|
|
+- return Col4( (n) > 0 ? _mm_srli_epi64( a.m_v, (n) ) : a.m_v );
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col4 ShiftRightHalf( Arg a, const int n )
|
|
|
+- {
|
|
|
+- return Col4( _mm_srl_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col4 ShiftRightHalf( Arg a, Arg b )
|
|
|
+- {
|
|
|
+- return Col4( _mm_srl_epi64( a.m_v, b.m_v ) );
|
|
|
+- }
|
|
|
++ friend Col4 ShiftRightHalf( Arg a, const int n );
|
|
|
++ friend Col4 ShiftRightHalf( Arg a, Arg b );
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col4 ShiftLeftHalf( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Col4 ShiftLeftHalf( Arg a )
|
|
|
+- {
|
|
|
+- return Col4( (n) > 0 ? _mm_slli_epi64( a.m_v, (n) ) : a.m_v );
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col4 ShiftLeftHalf( Arg a, const int n )
|
|
|
+- {
|
|
|
+- return Col4( _mm_sll_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
|
|
|
+- }
|
|
|
++ friend Col4 ShiftLeftHalf( Arg a, const int n );
|
|
|
+
|
|
|
+ template<const int r, const int g, const int b, const int a>
|
|
|
+ friend Col4 ShiftLeftLo( Arg v );
|
|
|
+- template<const int r, const int g, const int b, const int a>
|
|
|
+- friend Col4 ShiftLeftLo( Arg v )
|
|
|
+- {
|
|
|
+- // (1 << r, 1 << g, 1 << b, 1 << a);
|
|
|
+- Col4 p2; p2.SetRGBApow2<0>(r, g, b, a);
|
|
|
+-
|
|
|
+-#if ( SQUISH_USE_SSE >= 4 )
|
|
|
+- return Col4( _mm_mullo_epi32( v.m_v, p2.m_v ) );
|
|
|
+-#else
|
|
|
+- return Col4( _mm_mullo_epi16( v.m_v, p2.m_v ) );
|
|
|
+-#endif
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend Col4 MaskBits( Arg a );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend Col4 MaskBits( Arg a )
|
|
|
+- {
|
|
|
+- if (((p) + (n)) <= 0)
|
|
|
+- return Col4(0);
|
|
|
+- if (((p) + (n)) >= 64)
|
|
|
+- return a;
|
|
|
+-
|
|
|
+- // compile time
|
|
|
+- __int64 base = ~(0xFFFFFFFFFFFFFFFFULL << ( ((p) + (n)) & 63));
|
|
|
+- // __int64 base = (0xFFFFFFFFFFFFFFFFULL >> (64 - ((p) + (n)) & 63));
|
|
|
+- __m128i mask = _mm_setr_epi32(
|
|
|
+- (int)(base >> 0),
|
|
|
+- (int)(base >> 32), 0, 0
|
|
|
+- );
|
|
|
+-
|
|
|
+- return Col4( _mm_and_si128( a.m_v, mask ) );
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col4 MaskBits( Arg a, const int n, const int p )
|
|
|
+- {
|
|
|
+- const int val = 64 - ((p) + (n));
|
|
|
+-
|
|
|
+- __m128i shift = _mm_max_epi16( _mm_cvtsi32_si128( val ), _mm_set1_epi32( 0 ) );
|
|
|
+- __m128i mask = _mm_setr_epi32(
|
|
|
+- 0xFFFFFFFF,
|
|
|
+- 0xFFFFFFFF, 0, 0
|
|
|
+- );
|
|
|
+-
|
|
|
+- mask = _mm_srl_epi64( mask, shift );
|
|
|
+-
|
|
|
+- // (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63))
|
|
|
+- return Col4( _mm_and_si128( a.m_v, mask ) );
|
|
|
+- }
|
|
|
++ friend Col4 MaskBits( Arg a, const int n, const int p );
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend Col4 CopyBits( Arg left, Arg right );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend Col4 CopyBits( Arg left, Arg right )
|
|
|
+- {
|
|
|
+- if (!(n))
|
|
|
+- return left;
|
|
|
+- if (!(p))
|
|
|
+- return MaskBits<n, 0>(right);
|
|
|
+- if (((p) + (n)) >= 64)
|
|
|
+- return (left) + ShiftLeftHalf<p>(right);
|
|
|
+-
|
|
|
+-#if ( SQUISH_USE_XSSE == 4 )
|
|
|
+- return Col4( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
|
|
|
+-#else
|
|
|
+- return MaskBits<p, 0>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
+- // return (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
+-#endif
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col4 CopyBits( Arg left, Col4& right, const int n, const int p )
|
|
|
+- {
|
|
|
+-#if ( SQUISH_USE_XSSE == 4 )
|
|
|
+- /* ---- ---bl xxxx xxxx */
|
|
|
+- const int val = (p << 8) + (n << 0);
|
|
|
+-
|
|
|
+- right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
|
|
|
+- return Col4( _mm_insert_si64( left.m_v, right.m_v ) );
|
|
|
+-#else
|
|
|
+- return MaskBits(left, p, 0) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
+- // return (left ) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
+-#endif
|
|
|
+- }
|
|
|
++ friend Col4 CopyBits( Arg left, Col4& right, const int n, const int p );
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend Col4 KillBits( Arg a );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend Col4 KillBits( Arg a )
|
|
|
+- {
|
|
|
+- if (!n || (p >= 64))
|
|
|
+- return a;
|
|
|
+- if (!p && (n >= 64))
|
|
|
+- return Col4(0);
|
|
|
+-
|
|
|
+- // compile time
|
|
|
+- __int64 base1 = (0xFFFFFFFFFFFFFFFFULL << ( (p + 0) & 63));
|
|
|
+- __int64 base2 = (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63));
|
|
|
+- // __int64 base1 = ~(0xFFFFFFFFFFFFFFFFULL >> (64 - (p + 0) & 63));
|
|
|
+- // __int64 base2 = ~(0xFFFFFFFFFFFFFFFFULL << (64 - (p + n) & 63));
|
|
|
+-
|
|
|
+- __m128i mask;
|
|
|
+-
|
|
|
+- if ((p + n) >= 64)
|
|
|
+- base2 = 0xFFFFFFFFFFFFFFFFULL;
|
|
|
+-
|
|
|
+- mask = _mm_setr_epi32(
|
|
|
+- (int)((base1 ^ base2) >> 0),
|
|
|
+- (int)((base1 ^ base2) >> 32), 0, 0
|
|
|
+- );
|
|
|
+-
|
|
|
+- return Col4( _mm_and_si128( a.m_v, mask ) );
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col4 KillBits( Arg a, const int n, const int p )
|
|
|
+- {
|
|
|
+- const int val1 = (p + 0);
|
|
|
+- const int val2 = 64 - (p + n);
|
|
|
+-
|
|
|
+- __m128i shift1 = _mm_max_epi16( _mm_cvtsi32_si128( val1 ), _mm_set1_epi32( 0 ) );
|
|
|
+- __m128i shift2 = _mm_max_epi16( _mm_cvtsi32_si128( val2 ), _mm_set1_epi32( 0 ) );
|
|
|
+- __m128i mask1 = _mm_setr_epi32(
|
|
|
+- 0xFFFFFFFF,
|
|
|
+- 0xFFFFFFFF, 0, 0
|
|
|
+- );
|
|
|
+- __m128i mask2 = _mm_setr_epi32(
|
|
|
+- 0xFFFFFFFF,
|
|
|
+- 0xFFFFFFFF, 0, 0
|
|
|
+- );
|
|
|
+-
|
|
|
+- mask1 = _mm_sll_epi64( mask1, shift1 );
|
|
|
+- mask2 = _mm_srl_epi64( mask2, shift2 );
|
|
|
+-
|
|
|
+- return Col4( _mm_and_si128( a.m_v, _mm_xor_si128( mask1, mask2 ) ) );
|
|
|
+- }
|
|
|
++ friend Col4 KillBits( Arg a, const int n, const int p );
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend Col4 InjtBits( Arg left, Arg right );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend Col4 InjtBits( Arg left, Arg right )
|
|
|
+- {
|
|
|
+- if (!n || (p >= 64))
|
|
|
+- return right;
|
|
|
+- if ((p + n) >= 64)
|
|
|
+- return KillBits<n, p>(left) + ShiftLeftHalf<p>(right);
|
|
|
+- // return (left) + ShiftLeftHalf<p>(right);
|
|
|
+-
|
|
|
+-
|
|
|
+-#if ( SQUISH_USE_XSSE == 4 )
|
|
|
+- return Col4( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
|
|
|
+-#else
|
|
|
+- return KillBits<n, p>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
+- // return (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
+-#endif
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col4 InjtBits( Arg left, Col4& right, const int n, const int p )
|
|
|
+- {
|
|
|
+-#if ( SQUISH_USE_XSSE == 4 )
|
|
|
+- /* ---- ---bl xxxx xxxx */
|
|
|
+- const int val = (p << 8) + (n << 0);
|
|
|
+-
|
|
|
+- right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
|
|
|
+- return Col4( _mm_insert_si64( left.m_v, right.m_v ) );
|
|
|
+-#else
|
|
|
+- return KillBits(left, n, p) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
+- // return (left ) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
+-#endif
|
|
|
+- }
|
|
|
++ friend Col4 InjtBits( Arg left, Col4& right, const int n, const int p );
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend Col4 ExtrBits( Arg a );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend Col4 ExtrBits( Arg a )
|
|
|
+- {
|
|
|
+- if (!n)
|
|
|
+- return Col4(0);
|
|
|
+- if (!p)
|
|
|
+- return MaskBits<n, 0>(a);
|
|
|
+- if ((n + p) >= 64)
|
|
|
+- return ShiftRightHalf<p>(a);
|
|
|
+-
|
|
|
+-#if ( SQUISH_USE_XSSE == 4 )
|
|
|
+- return Col4( _mm_extracti_si64( a.m_v, n, p ) );
|
|
|
+-#else
|
|
|
+- return MaskBits<n, 0>(ShiftRightHalf<p>(a));
|
|
|
+-#endif
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend Col4 ExtrBits( Arg a, const int n, const int p )
|
|
|
+- {
|
|
|
+-#if ( SQUISH_USE_XSSE == 4 )
|
|
|
+- /* ---- ----- ---- ---bl */
|
|
|
+- const int val = (p << 8) + (n << 0);
|
|
|
+-
|
|
|
+- return Col4( _mm_extract_si64( a.m_v, _mm_cvtsi32_si128( val ) ) );
|
|
|
+-#else
|
|
|
+- return MaskBits(ShiftRightHalf(a, p), n, 0);
|
|
|
+-#endif
|
|
|
+- }
|
|
|
++ friend Col4 ExtrBits( Arg a, const int n, const int p );
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend void ExtrBits( Arg left, Col4 &right );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend void ExtrBits( Arg left, Col4 &right )
|
|
|
+- {
|
|
|
+- right = ExtrBits<n, p>( left );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend void ConcBits( Arg left, Col4 &right );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend void ConcBits( Arg left, Col4 &right )
|
|
|
+- {
|
|
|
+- right = ShiftLeft<32>( right );
|
|
|
+- if (n > 0)
|
|
|
+- right += ExtrBits<n, p>( left );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend void ReplBits( Arg left, Col4 &right );
|
|
|
+- template<const int n, const int p>
|
|
|
+- friend void ReplBits( Arg left, Col4 &right )
|
|
|
+- {
|
|
|
+- if (!n)
|
|
|
+- return;
|
|
|
+- if ((n < 0)) {
|
|
|
+- right = ExtrBits<-n, p>( left );
|
|
|
+- right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 3 ) );
|
|
|
+- }
|
|
|
+- else {
|
|
|
+- right = ExtrBits< n, p>( left );
|
|
|
+- right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
|
|
|
+- }
|
|
|
+- }
|
|
|
+
|
|
|
+ friend Col4 RevsBits( Col4::Arg v )
|
|
|
+ {
|
|
|
+@@ -1679,19 +1444,7 @@ public:
|
|
|
+
|
|
|
+ template<const int f, const int t>
|
|
|
+ friend Col4 Shuffle( Arg a );
|
|
|
+- template<const int f, const int t>
|
|
|
+- friend Col4 Shuffle( Arg a )
|
|
|
+- {
|
|
|
+- if (f == t)
|
|
|
+- return a;
|
|
|
+
|
|
|
+- return Col4( _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SHUF(
|
|
|
+- (t == 0 ? f : 0),
|
|
|
+- (t == 1 ? f : 1),
|
|
|
+- (t == 2 ? f : 2),
|
|
|
+- (t == 3 ? f : 3)
|
|
|
+- ) ) );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const int f, const int t>
|
|
|
+ friend Col4 Exchange( Arg a );
|
|
|
+@@ -1888,7 +1641,7 @@ public:
|
|
|
+ return Col4( _mm_max_epi16( left.m_v, right.m_v ) );
|
|
|
+ #endif
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col4 MaxTiny( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ __m128 resa = _mm_castsi128_ps( left.m_v );
|
|
|
+@@ -1973,7 +1726,7 @@ public:
|
|
|
+ {
|
|
|
+ return Col4( _mm_cmplt_epi8( left.m_v, right.m_v ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col4 CompareAllEqualTo_M8( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return Col4( _mm_cmpeq_epi8( left.m_v, right.m_v ) );
|
|
|
+@@ -1996,11 +1749,6 @@ public:
|
|
|
+
|
|
|
+ template<const int value>
|
|
|
+ friend Col4 IsValue( Arg v );
|
|
|
+- template<const int value>
|
|
|
+- friend Col4 IsValue( Arg v )
|
|
|
+- {
|
|
|
+- return Col4( _mm_cmpeq_epi32( v.m_v, _mm_set1_epi32( value ) ) );
|
|
|
+- }
|
|
|
+
|
|
|
+ friend Col4 TransferA( Arg left, Arg right )
|
|
|
+ {
|
|
|
+@@ -2014,7 +1762,7 @@ public:
|
|
|
+ {
|
|
|
+ return Col4( _mm_or_si128( left.m_v, _mm_setr_epi32( 0x00, 0x00, 0x00, 0xFF ) ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col4 CollapseA( Arg r, Arg g, Arg b, Arg a )
|
|
|
+ {
|
|
|
+ return Col4( _mm_packus_epi16(
|
|
|
+@@ -2032,7 +1780,7 @@ public:
|
|
|
+
|
|
|
+ loc = _mm_cvtsi128_si32 ( r );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend void PackBytes( Arg a, int &loc )
|
|
|
+ {
|
|
|
+ __m128i
|
|
|
+@@ -2042,7 +1790,7 @@ public:
|
|
|
+
|
|
|
+ loc = _mm_cvtsi128_si32 ( r );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend void PackWords( Arg a, unsigned__int64 &loc )
|
|
|
+ {
|
|
|
+ __m128i
|
|
|
+@@ -2059,11 +1807,11 @@ public:
|
|
|
+ // loc = _mm_cvtsi128_si64( r );
|
|
|
+ _mm_storel_epi64( (__m128i *)&loc, r );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend void PackWords( Arg a, __int64 &loc )
|
|
|
+ {
|
|
|
+ __m128i
|
|
|
+-
|
|
|
++
|
|
|
+ r = _mm_packs_epi32( a.m_v, a.m_v );
|
|
|
+
|
|
|
+ // loc = _mm_cvtsi128_si64( r );
|
|
|
+@@ -2100,18 +1848,9 @@ public:
|
|
|
+
|
|
|
+ a = Col4( r );
|
|
|
+ }
|
|
|
+-
|
|
|
+- friend void UnpackBytes( Col4 &a, const int &loc )
|
|
|
+- {
|
|
|
+- __m128i
|
|
|
+
|
|
|
+- r = _mm_cvtsi32_si128 ( loc );
|
|
|
+- r = _mm_unpacklo_epi8( r, r );
|
|
|
+- r = _mm_unpacklo_epi16( r, r );
|
|
|
+-
|
|
|
+- a = ExtendSign<24>( Col4( r ) );
|
|
|
+- }
|
|
|
+-
|
|
|
++ friend void UnpackBytes( Col4 &a, const int &loc );
|
|
|
++
|
|
|
+ friend void UnpackWords( Col4 &a, const unsigned__int64 &loc )
|
|
|
+ {
|
|
|
+ __m128i
|
|
|
+@@ -2121,110 +1860,447 @@ public:
|
|
|
+
|
|
|
+ a = Col4( r );
|
|
|
+ }
|
|
|
+-
|
|
|
+- friend void UnpackWords( Col4 &a, const __int64 &loc )
|
|
|
++
|
|
|
++ friend void UnpackWords( Col4 &a, const __int64 &loc );
|
|
|
++
|
|
|
++ // clamp the output to [0, 1]
|
|
|
++ Col4 Clamp() const {
|
|
|
++ Col4 const one (0xFF);
|
|
|
++ Col4 const zero(0x00);
|
|
|
++
|
|
|
++ return Min(one, Max(zero, *this));
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void Interleave( Col4 &a, Arg b, Arg c )
|
|
|
+ {
|
|
|
+- __m128i
|
|
|
++ a = Col4( _mm_shuffle_epi32( _mm_unpacklo_epi32( b.m_v , c.m_v ), SQUISH_SSE_SHUF(0, 3, 0, 3) ) );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void LoadAligned( Col4 &a, Col4 &b, Arg c )
|
|
|
++ {
|
|
|
++ a.m_v = c.m_v;
|
|
|
++ b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void LoadAligned( Col4 &a, void const *source )
|
|
|
++ {
|
|
|
++ a.m_v = _mm_load_si128( (__m128i const *)source );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void LoadAligned( Col4 &a, Col4 &b, void const *source )
|
|
|
++ {
|
|
|
++ a.m_v = _mm_load_si128( (__m128i const *)source );
|
|
|
++ b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void LoadUnaligned( Col4 &a, Col4 &b, void const *source )
|
|
|
++ {
|
|
|
++ a.m_v = _mm_loadu_si128( (__m128i const *)source );
|
|
|
++ b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void StoreAligned( Arg a, Arg b, Col4 &c )
|
|
|
++ {
|
|
|
++ c.m_v = _mm_unpacklo_epi64( a.m_v, b.m_v );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void StoreAligned( Arg a, void *destination )
|
|
|
++ {
|
|
|
++ _mm_store_si128( (__m128i *)destination, a.m_v );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void StoreAligned( Arg a, Arg b, void *destination )
|
|
|
++ {
|
|
|
++ _mm_store_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void StoreUnaligned( Arg a, void *destination )
|
|
|
++ {
|
|
|
++ _mm_storeu_si128( (__m128i *)destination, a.m_v );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void StoreUnaligned( Arg a, Arg b, void *destination )
|
|
|
++ {
|
|
|
++ _mm_storeu_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void StoreUnaligned( Arg a, u8* loc )
|
|
|
++ {
|
|
|
++ PackBytes( a, (unsigned int&) (*((unsigned int *)loc)) );
|
|
|
++ }
|
|
|
++ friend void StoreUnaligned( Arg a, u16* loc )
|
|
|
++ {
|
|
|
++ PackWords( a, (unsigned__int64&) (*((unsigned__int64 *)loc)) );
|
|
|
++ }
|
|
|
++ friend void StoreUnaligned( Arg a, s8* loc )
|
|
|
++ {
|
|
|
++ PackBytes( a, (int&) (*((int *)loc)) );
|
|
|
++ }
|
|
|
++ friend void StoreUnaligned( Arg a, s16* loc )
|
|
|
++ {
|
|
|
++ PackWords( a, (__int64&) (*((__int64 *)loc)) );
|
|
|
++ }
|
|
|
++
|
|
|
++ friend void LoadUnaligned( Col4 &a, const u8* loc );
|
|
|
++ friend void LoadUnaligned( Col4 &a, const u16* loc );
|
|
|
++ friend void LoadUnaligned( Col4 &a, const s8* loc )
|
|
|
++ {
|
|
|
++ UnpackBytes( a, (const int&) (*((const int *)loc)) );
|
|
|
++ }
|
|
|
++ friend void LoadUnaligned( Col4 &a, const s16* loc )
|
|
|
++ {
|
|
|
++ UnpackWords( a, (const __int64&) (*((const __int64 *)loc)) );
|
|
|
++ }
|
|
|
++
|
|
|
++ void SwapRGBA( Col4 &with )
|
|
|
++ {
|
|
|
++ /* inplace swap based on xors */
|
|
|
++ m_v = _mm_xor_si128( m_v, with.m_v );
|
|
|
++ with.m_v = _mm_xor_si128( with.m_v, m_v );
|
|
|
++ m_v = _mm_xor_si128( m_v, with.m_v );
|
|
|
++ }
|
|
|
++
|
|
|
++private:
|
|
|
++ __m128i m_v;
|
|
|
++
|
|
|
++ friend squish::Vec4;
|
|
|
++ friend squish::Col8;
|
|
|
++};
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col4 ExtendSign( Col4::Arg a )
|
|
|
++{
|
|
|
++ return Col4( _mm_srai_epi32( a.m_v, n ) );
|
|
|
++}
|
|
|
++
|
|
|
++inline void UnpackBytes( Col4 &a, const int &loc )
|
|
|
++{
|
|
|
++ __m128i
|
|
|
++
|
|
|
++ r = _mm_cvtsi32_si128 ( loc );
|
|
|
++ r = _mm_unpacklo_epi8( r, r );
|
|
|
++ r = _mm_unpacklo_epi16( r, r );
|
|
|
++
|
|
|
++ a = ExtendSign<24>( Col4( r ) );
|
|
|
++}
|
|
|
++
|
|
|
++inline void UnpackWords( Col4 &a, const __int64 &loc )
|
|
|
++{
|
|
|
++ __m128i
|
|
|
++
|
|
|
++ r = _mm_loadl_epi64( (__m128i *)&loc );
|
|
|
++ r = _mm_unpacklo_epi16( r, r );
|
|
|
++
|
|
|
++ a = ExtendSign<16>( Col4( r ) );
|
|
|
++}
|
|
|
++
|
|
|
++inline void LoadUnaligned( Col4 &a, const u8* loc )
|
|
|
++{
|
|
|
++ UnpackBytes( a, (const unsigned int&) (*((const unsigned int *)loc)) );
|
|
|
++}
|
|
|
++
|
|
|
++inline void LoadUnaligned( Col4 &a, const u16* loc )
|
|
|
++{
|
|
|
++ UnpackWords( a, (const unsigned__int64&) (*((const unsigned__int64 *)loc)) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col4 ShiftLeft( Col4::Arg a )
|
|
|
++{
|
|
|
++ if ((n) <= 0)
|
|
|
++ return Col4( a.m_v );
|
|
|
++ if ((n) <= 7)
|
|
|
++ return Col4( _mm_slli_epi32( a.m_v, (n) & 7 ) );
|
|
|
++ if ((n) & 7)
|
|
|
++ return Col4( _mm_slli_epi32( _mm_slli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
|
|
|
++
|
|
|
++ return Col4( _mm_slli_si128( a.m_v, (n) >> 3 ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++void ReplBits( Col4::Arg left, Col4 &right )
|
|
|
++{
|
|
|
++ if (!n)
|
|
|
++ return;
|
|
|
++ if ((n < 0)) {
|
|
|
++ right = ExtrBits<-n, p>( left );
|
|
|
++ right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 3 ) );
|
|
|
++ }
|
|
|
++ else {
|
|
|
++ right = ExtrBits< n, p>( left );
|
|
|
++ right.m_v = _mm_shuffle_epi32( right.m_v, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
|
|
|
++ }
|
|
|
++}
|
|
|
++
|
|
|
++template<const int value>
|
|
|
++Col4 IsValue( Col4::Arg v )
|
|
|
++{
|
|
|
++ return Col4( _mm_cmpeq_epi32( v.m_v, _mm_set1_epi32( value ) ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col4 ShiftLeftHalf( Col4::Arg a )
|
|
|
++{
|
|
|
++ return Col4( (n) > 0 ? _mm_slli_epi64( a.m_v, (n) ) : a.m_v );
|
|
|
++}
|
|
|
++
|
|
|
++inline Col4 ShiftLeftHalf( Col4::Arg a, const int n )
|
|
|
++{
|
|
|
++ return Col4( _mm_sll_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col4 ShiftRightHalf( Col4::Arg a )
|
|
|
++{
|
|
|
++ return Col4( (n) > 0 ? _mm_srli_epi64( a.m_v, (n) ) : a.m_v );
|
|
|
++}
|
|
|
++
|
|
|
++inline Col4 ShiftRightHalf( Col4::Arg a, const int n )
|
|
|
++{
|
|
|
++ return Col4( _mm_srl_epi64( a.m_v, _mm_cvtsi32_si128( n ) ) );
|
|
|
++}
|
|
|
++
|
|
|
++inline Col4 ShiftRightHalf( Col4::Arg a, Col4::Arg b )
|
|
|
++{
|
|
|
++ return Col4( _mm_srl_epi64( a.m_v, b.m_v ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col4 ShiftRight( Col4::Arg a )
|
|
|
++{
|
|
|
++ if ((n) <= 0)
|
|
|
++ return Col4( a.m_v );
|
|
|
++ if ((n) <= 7)
|
|
|
++ return Col4( _mm_srli_epi32( a.m_v, (n) & 7 ) );
|
|
|
++ if ((n) & 7)
|
|
|
++ return Col4( _mm_srli_epi32( _mm_srli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
|
|
|
++
|
|
|
++ return Col4( _mm_srli_si128( a.m_v, (n) >> 3 ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int f, const int t>
|
|
|
++Col4 Shuffle( Col4::Arg a )
|
|
|
++{
|
|
|
++ if (f == t)
|
|
|
++ return a;
|
|
|
++
|
|
|
++ return Col4( _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SHUF(
|
|
|
++ (t == 0 ? f : 0),
|
|
|
++ (t == 1 ? f : 1),
|
|
|
++ (t == 2 ? f : 2),
|
|
|
++ (t == 3 ? f : 3)
|
|
|
++ ) ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col4 FillSign( Col4::Arg a )
|
|
|
++{
|
|
|
++ return Col4( _mm_srai_epi32( _mm_slli_epi32( a.m_v, n ), n ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++Col4 MaskBits( Col4::Arg a )
|
|
|
++{
|
|
|
++ if (((p) + (n)) <= 0)
|
|
|
++ return Col4(0);
|
|
|
++ if (((p) + (n)) >= 64)
|
|
|
++ return a;
|
|
|
++
|
|
|
++ // compile time
|
|
|
++ __int64 base = ~(0xFFFFFFFFFFFFFFFFULL << ( ((p) + (n)) & 63));
|
|
|
++ // __int64 base = (0xFFFFFFFFFFFFFFFFULL >> (64 - ((p) + (n)) & 63));
|
|
|
++ __m128i mask = _mm_setr_epi32(
|
|
|
++ (int)(base >> 0),
|
|
|
++ (int)(base >> 32), 0, 0
|
|
|
++ );
|
|
|
++
|
|
|
++ return Col4( _mm_and_si128( a.m_v, mask ) );
|
|
|
++}
|
|
|
++
|
|
|
++inline Col4 MaskBits( Col4::Arg a, const int n, const int p )
|
|
|
++{
|
|
|
++ const int val = 64 - ((p) + (n));
|
|
|
++
|
|
|
++ __m128i shift = _mm_max_epi16( _mm_cvtsi32_si128( val ), _mm_set1_epi32( 0 ) );
|
|
|
++ __m128i mask = _mm_setr_epi32(
|
|
|
++ 0xFFFFFFFF,
|
|
|
++ 0xFFFFFFFF, 0, 0
|
|
|
++ );
|
|
|
++
|
|
|
++ mask = _mm_srl_epi64( mask, shift );
|
|
|
++
|
|
|
++ // (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63))
|
|
|
++ return Col4( _mm_and_si128( a.m_v, mask ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++Col4 CopyBits( Col4::Arg left, Col4::Arg right )
|
|
|
++{
|
|
|
++ if (!(n))
|
|
|
++ return left;
|
|
|
++ if (!(p))
|
|
|
++ return MaskBits<n, 0>(right);
|
|
|
++ if (((p) + (n)) >= 64)
|
|
|
++ return (left) + ShiftLeftHalf<p>(right);
|
|
|
++
|
|
|
++#if ( SQUISH_USE_XSSE == 4 )
|
|
|
++ return Col4( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
|
|
|
++#else
|
|
|
++ return MaskBits<p, 0>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
++ // return (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
++#endif
|
|
|
++}
|
|
|
++
|
|
|
++inline Col4 CopyBits( Col4::Arg left, Col4& right, const int n, const int p )
|
|
|
++{
|
|
|
++#if ( SQUISH_USE_XSSE == 4 )
|
|
|
++ /* ---- ---bl xxxx xxxx */
|
|
|
++ const int val = (p << 8) + (n << 0);
|
|
|
++
|
|
|
++ right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
|
|
|
++ return Col4( _mm_insert_si64( left.m_v, right.m_v ) );
|
|
|
++#else
|
|
|
++ return MaskBits(left, p, 0) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
++ // return (left ) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
++#endif
|
|
|
++}
|
|
|
++
|
|
|
++template<const int r, const int g, const int b, const int a>
|
|
|
++Col4 ShiftLeftLo( Col4::Arg v )
|
|
|
++{
|
|
|
++ // (1 << r, 1 << g, 1 << b, 1 << a);
|
|
|
++ Col4 p2; p2.SetRGBApow2<0>(r, g, b, a);
|
|
|
++
|
|
|
++#if ( SQUISH_USE_SSE >= 4 )
|
|
|
++ return Col4( _mm_mullo_epi32( v.m_v, p2.m_v ) );
|
|
|
++#else
|
|
|
++ return Col4( _mm_mullo_epi16( v.m_v, p2.m_v ) );
|
|
|
++#endif
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++void ExtrBits( Col4::Arg left, Col4 &right )
|
|
|
++{
|
|
|
++ right = ExtrBits<n, p>( left );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n, const int p>
|
|
|
++Col4 ExtrBits( Col4::Arg a )
|
|
|
++{
|
|
|
++ if (!n)
|
|
|
++ return Col4(0);
|
|
|
++ if (!p)
|
|
|
++ return MaskBits<n, 0>(a);
|
|
|
++ if ((n + p) >= 64)
|
|
|
++ return ShiftRightHalf<p>(a);
|
|
|
++
|
|
|
++#if ( SQUISH_USE_XSSE == 4 )
|
|
|
++ return Col4( _mm_extracti_si64( a.m_v, n, p ) );
|
|
|
++#else
|
|
|
++ return MaskBits<n, 0>(ShiftRightHalf<p>(a));
|
|
|
++#endif
|
|
|
++}
|
|
|
+
|
|
|
+- r = _mm_loadl_epi64( (__m128i *)&loc );
|
|
|
+- r = _mm_unpacklo_epi16( r, r );
|
|
|
+-
|
|
|
+- a = ExtendSign<16>( Col4( r ) );
|
|
|
+- }
|
|
|
+-
|
|
|
+- // clamp the output to [0, 1]
|
|
|
+- Col4 Clamp() const {
|
|
|
+- Col4 const one (0xFF);
|
|
|
+- Col4 const zero(0x00);
|
|
|
++inline Col4 ExtrBits( Col4::Arg a, const int n, const int p )
|
|
|
++{
|
|
|
++#if ( SQUISH_USE_XSSE == 4 )
|
|
|
++ /* ---- ----- ---- ---bl */
|
|
|
++ const int val = (p << 8) + (n << 0);
|
|
|
+
|
|
|
+- return Min(one, Max(zero, *this));
|
|
|
+- }
|
|
|
++ return Col4( _mm_extract_si64( a.m_v, _mm_cvtsi32_si128( val ) ) );
|
|
|
++#else
|
|
|
++ return MaskBits(ShiftRightHalf(a, p), n, 0);
|
|
|
++#endif
|
|
|
++}
|
|
|
+
|
|
|
+- friend void Interleave( Col4 &a, Arg b, Arg c )
|
|
|
+- {
|
|
|
+- a = Col4( _mm_shuffle_epi32( _mm_unpacklo_epi32( b.m_v , c.m_v ), SQUISH_SSE_SHUF(0, 3, 0, 3) ) );
|
|
|
+- }
|
|
|
++template<const int n, const int p>
|
|
|
++void ConcBits( Col4::Arg left, Col4 &right )
|
|
|
++{
|
|
|
++ right = ShiftLeft<32>( right );
|
|
|
++ if (n > 0)
|
|
|
++ right += ExtrBits<n, p>( left );
|
|
|
++}
|
|
|
+
|
|
|
+- friend void LoadAligned( Col4 &a, Col4 &b, Arg c )
|
|
|
+- {
|
|
|
+- a.m_v = c.m_v;
|
|
|
+- b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
|
|
|
+- }
|
|
|
++template<const int n, const int p>
|
|
|
++Col4 KillBits( Col4::Arg a )
|
|
|
++{
|
|
|
++ if (!n || (p >= 64))
|
|
|
++ return a;
|
|
|
++ if (!p && (n >= 64))
|
|
|
++ return Col4(0);
|
|
|
+
|
|
|
+- friend void LoadAligned( Col4 &a, void const *source )
|
|
|
+- {
|
|
|
+- a.m_v = _mm_load_si128( (__m128i const *)source );
|
|
|
+- }
|
|
|
++ // compile time
|
|
|
++ __int64 base1 = (0xFFFFFFFFFFFFFFFFULL << ( (p + 0) & 63));
|
|
|
++ __int64 base2 = (0xFFFFFFFFFFFFFFFFULL >> (64 - (p + n) & 63));
|
|
|
++ // __int64 base1 = ~(0xFFFFFFFFFFFFFFFFULL >> (64 - (p + 0) & 63));
|
|
|
++ // __int64 base2 = ~(0xFFFFFFFFFFFFFFFFULL << (64 - (p + n) & 63));
|
|
|
+
|
|
|
+- friend void LoadAligned( Col4 &a, Col4 &b, void const *source )
|
|
|
+- {
|
|
|
+- a.m_v = _mm_load_si128( (__m128i const *)source );
|
|
|
+- b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
|
|
|
+- }
|
|
|
++ __m128i mask;
|
|
|
+
|
|
|
+- friend void LoadUnaligned( Col4 &a, Col4 &b, void const *source )
|
|
|
+- {
|
|
|
+- a.m_v = _mm_loadu_si128( (__m128i const *)source );
|
|
|
+- b.m_v = _mm_shuffle_epi32( a.m_v, SQUISH_SSE_SWAP64() );
|
|
|
+- }
|
|
|
++ if ((p + n) >= 64)
|
|
|
++ base2 = 0xFFFFFFFFFFFFFFFFULL;
|
|
|
+
|
|
|
+- friend void StoreAligned( Arg a, Arg b, Col4 &c )
|
|
|
+- {
|
|
|
+- c.m_v = _mm_unpacklo_epi64( a.m_v, b.m_v );
|
|
|
+- }
|
|
|
++ mask = _mm_setr_epi32(
|
|
|
++ (int)((base1 ^ base2) >> 0),
|
|
|
++ (int)((base1 ^ base2) >> 32), 0, 0
|
|
|
++ );
|
|
|
+
|
|
|
+- friend void StoreAligned( Arg a, void *destination )
|
|
|
+- {
|
|
|
+- _mm_store_si128( (__m128i *)destination, a.m_v );
|
|
|
+- }
|
|
|
++ return Col4( _mm_and_si128( a.m_v, mask ) );
|
|
|
++}
|
|
|
+
|
|
|
+- friend void StoreAligned( Arg a, Arg b, void *destination )
|
|
|
+- {
|
|
|
+- _mm_store_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
|
|
|
+- }
|
|
|
++inline Col4 KillBits( Col4::Arg a, const int n, const int p )
|
|
|
++{
|
|
|
++ const int val1 = (p + 0);
|
|
|
++ const int val2 = 64 - (p + n);
|
|
|
++
|
|
|
++ __m128i shift1 = _mm_max_epi16( _mm_cvtsi32_si128( val1 ), _mm_set1_epi32( 0 ) );
|
|
|
++ __m128i shift2 = _mm_max_epi16( _mm_cvtsi32_si128( val2 ), _mm_set1_epi32( 0 ) );
|
|
|
++ __m128i mask1 = _mm_setr_epi32(
|
|
|
++ 0xFFFFFFFF,
|
|
|
++ 0xFFFFFFFF, 0, 0
|
|
|
++ );
|
|
|
++ __m128i mask2 = _mm_setr_epi32(
|
|
|
++ 0xFFFFFFFF,
|
|
|
++ 0xFFFFFFFF, 0, 0
|
|
|
++ );
|
|
|
++
|
|
|
++ mask1 = _mm_sll_epi64( mask1, shift1 );
|
|
|
++ mask2 = _mm_srl_epi64( mask2, shift2 );
|
|
|
++
|
|
|
++ return Col4( _mm_and_si128( a.m_v, _mm_xor_si128( mask1, mask2 ) ) );
|
|
|
++}
|
|
|
+
|
|
|
+- friend void StoreUnaligned( Arg a, void *destination )
|
|
|
+- {
|
|
|
+- _mm_storeu_si128( (__m128i *)destination, a.m_v );
|
|
|
+- }
|
|
|
++template<const int n, const int p>
|
|
|
++Col4 InjtBits( Col4::Arg left, Col4::Arg right )
|
|
|
++{
|
|
|
++ if (!n || (p >= 64))
|
|
|
++ return right;
|
|
|
++ if ((p + n) >= 64)
|
|
|
++ return KillBits<n, p>(left) + ShiftLeftHalf<p>(right);
|
|
|
++ // return (left) + ShiftLeftHalf<p>(right);
|
|
|
+
|
|
|
+- friend void StoreUnaligned( Arg a, Arg b, void *destination )
|
|
|
+- {
|
|
|
+- _mm_storeu_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
|
|
|
+- }
|
|
|
+-
|
|
|
+- friend void StoreUnaligned( Arg a, u8* loc ) {
|
|
|
+- PackBytes( a, (unsigned int&) (*((unsigned int *)loc)) ); }
|
|
|
+- friend void StoreUnaligned( Arg a, u16* loc ) {
|
|
|
+- PackWords( a, (unsigned__int64&) (*((unsigned__int64 *)loc)) ); }
|
|
|
+- friend void StoreUnaligned( Arg a, s8* loc ) {
|
|
|
+- PackBytes( a, (int&) (*((int *)loc)) ); }
|
|
|
+- friend void StoreUnaligned( Arg a, s16* loc ) {
|
|
|
+- PackWords( a, (__int64&) (*((__int64 *)loc)) ); }
|
|
|
+-
|
|
|
+- friend void LoadUnaligned( Col4 &a, const u8* loc ) {
|
|
|
+- UnpackBytes( a, (const unsigned int&) (*((const unsigned int *)loc)) ); }
|
|
|
+- friend void LoadUnaligned( Col4 &a, const u16* loc ) {
|
|
|
+- UnpackWords( a, (const unsigned__int64&) (*((const unsigned__int64 *)loc)) ); }
|
|
|
+- friend void LoadUnaligned( Col4 &a, const s8* loc ) {
|
|
|
+- UnpackBytes( a, (const int&) (*((const int *)loc)) ); }
|
|
|
+- friend void LoadUnaligned( Col4 &a, const s16* loc ) {
|
|
|
+- UnpackWords( a, (const __int64&) (*((const __int64 *)loc)) ); }
|
|
|
+
|
|
|
+- void SwapRGBA( Col4 &with )
|
|
|
+- {
|
|
|
+- /* inplace swap based on xors */
|
|
|
+- m_v = _mm_xor_si128( m_v, with.m_v );
|
|
|
+- with.m_v = _mm_xor_si128( with.m_v, m_v );
|
|
|
+- m_v = _mm_xor_si128( m_v, with.m_v );
|
|
|
+- }
|
|
|
++#if ( SQUISH_USE_XSSE == 4 )
|
|
|
++ return Col4( _mm_inserti_si64( left.m_v, right.m_v, n, p ) );
|
|
|
++#else
|
|
|
++ return KillBits<n, p>(left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
++ // return (left) + MaskBits<n, p>(ShiftLeftHalf<p>(right));
|
|
|
++#endif
|
|
|
++}
|
|
|
+
|
|
|
+-private:
|
|
|
+- __m128i m_v;
|
|
|
++inline Col4 InjtBits( Col4::Arg left, Col4& right, const int n, const int p )
|
|
|
++{
|
|
|
++#if ( SQUISH_USE_XSSE == 4 )
|
|
|
++ /* ---- ---bl xxxx xxxx */
|
|
|
++ const int val = (p << 8) + (n << 0);
|
|
|
+
|
|
|
+- friend class Vec4;
|
|
|
+- friend class Col8;
|
|
|
+-};
|
|
|
++ right.m_v = _mm_unpacklo_epi64( right.m_v, _mm_cvtsi32_si128( val ) );
|
|
|
++ return Col4( _mm_insert_si64( left.m_v, right.m_v ) );
|
|
|
++#else
|
|
|
++ return KillBits(left, n, p) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
++ // return (left ) + MaskBits(ShiftLeftHalf(right, p), n, p);
|
|
|
++#endif
|
|
|
++}
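
The MaskBits/KillBits/InjtBits helpers above all operate on an n-bit window that starts at bit p of the low 64-bit lane: MaskBits keeps only that window, KillBits clears it, and InjtBits copies the low n bits of `right` into the cleared window of `left` (addition is safe there precisely because the window was zeroed first). A minimal scalar sketch of the same bit arithmetic, assuming 0 < n and p + n <= 64; the *Scalar names are invented for illustration:

    #include <cstdint>
    #include <cassert>

    // ones at [p,63] XOR ones at [0,p+n-1] leaves zeros exactly on the window [p, p+n-1]
    static inline uint64_t KillBitsScalar(uint64_t a, int n, int p)
    {
      uint64_t hi = ~UINT64_C(0) << p;               // bits [p, 63]
      uint64_t lo = ~UINT64_C(0) >> (64 - (p + n));  // bits [0, p+n-1]
      return a & (hi ^ lo);                          // clear the window
    }

    static inline uint64_t InjtBitsScalar(uint64_t left, uint64_t right, int n, int p)
    {
      uint64_t window = (~UINT64_C(0) >> (64 - n)) << p;
      return KillBitsScalar(left, n, p) | ((right << p) & window);
    }

    int main()
    {
      assert(KillBitsScalar(0xFF, 4, 2) == 0xC3);       // bits 2..5 cleared
      assert(InjtBitsScalar(0x00, 0xF, 4, 2) == 0x3C);  // 4 bits injected at bit 2
      return 0;
    }

The SSE versions build the same two masks with _mm_sll_epi64/_mm_srl_epi64 and clamp the shift counts at zero via _mm_max_epi16, so out-of-range n/p combinations degrade gracefully instead of shifting by a negative amount.
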
|
|
|
+
|
|
|
+ #if !defined(SQUISH_USE_PRE)
|
|
|
+ inline Col3 LengthSquared( Col3::Arg v )
|
|
|
+@@ -2291,30 +2367,30 @@ public:
|
|
|
+ {
|
|
|
+ return _mm_extract_epi16( m_v, 0 );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ #pragma warning ( push )
|
|
|
+ #pragma warning ( disable : 4100 )
|
|
|
+ friend Col4 LoCol4(Arg v, const unsigned dummy)
|
|
|
+ {
|
|
|
+ return Col4( _mm_unpacklo_epi16( v.m_v, _mm_setzero_si128() ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col4 HiCol4(Arg v, const unsigned dummy)
|
|
|
+ {
|
|
|
+ return Col4( _mm_unpackhi_epi16( v.m_v, _mm_setzero_si128() ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col4 LoCol4(Arg v, const signed dummy)
|
|
|
+ {
|
|
|
+ return Col4( _mm_srai_epi32( _mm_unpacklo_epi16( _mm_setzero_si128(), v.m_v ), 16 ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col4 HiCol4(Arg v, const signed dummy)
|
|
|
+ {
|
|
|
+ return Col4( _mm_srai_epi32( _mm_unpackhi_epi16( _mm_setzero_si128(), v.m_v ), 16 ) );
|
|
|
+ }
|
|
|
+ #pragma warning ( pop )
|
|
|
+-
|
|
|
++
|
|
|
+ const u16 &operator[]( int pos ) const
|
|
|
+ {
|
|
|
+ return ((u16 *)&m_v)[pos];
|
|
|
+@@ -2331,7 +2407,7 @@ public:
|
|
|
+ {
|
|
|
+ return Col8( _mm_srli_epi16( left.m_v, right ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col8 operator>>( Arg left, int right )
|
|
|
+ {
|
|
|
+ return Col8( _mm_srai_epi16( left.m_v, right ) );
|
|
|
+@@ -2341,7 +2417,7 @@ public:
|
|
|
+ {
|
|
|
+ return Col8( _mm_slli_epi16( left.m_v, right ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col8 operator<<( Arg left, int right )
|
|
|
+ {
|
|
|
+ return Col8( _mm_slli_epi16( left.m_v, right ) );
|
|
|
+@@ -2366,7 +2442,7 @@ public:
|
|
|
+ {
|
|
|
+ return Col8( _mm_mulhi_epu16( left.m_v, _mm_set1_epi16( (unsigned short)right ) ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col8 operator*( Arg left, int right )
|
|
|
+ {
|
|
|
+ return Col8( _mm_mulhi_epi16( left.m_v, _mm_set1_epi16( (short)right ) ) );
|
|
|
+@@ -2374,12 +2450,7 @@ public:
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col8 ExtendSign(Arg a);
|
|
|
+- template<const int n>
|
|
|
+- friend Col8 ExtendSign(Arg a)
|
|
|
+- {
|
|
|
+- return Col8( _mm_srai_epi16( a.m_v, n ) );
|
|
|
+- }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col8 HorizontalMin(Arg a)
|
|
|
+ {
|
|
|
+ __m128i res = a.m_v;
|
|
|
+@@ -2420,17 +2491,13 @@ public:
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Col8 ShiftUp(Arg a);
|
|
|
+- template<const int n>
|
|
|
+- friend Col8 ShiftUp(Arg a)
|
|
|
+- {
|
|
|
+- return Col8( _mm_slli_si128( a.m_v, n << 1 ) );
|
|
|
+- }
|
|
|
+-
|
|
|
++
|
|
|
++
|
|
|
+ #pragma warning ( push )
|
|
|
+ #pragma warning ( disable : 4100 )
|
|
|
+ friend Col4 ExpandUpper(Arg a, const unsigned dummy) {
|
|
|
+ __m128i res = a.m_v;
|
|
|
+-
|
|
|
++
|
|
|
+ res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
|
|
|
+
|
|
|
+ #ifdef _MSV_VER
|
|
|
+@@ -2445,7 +2512,7 @@ public:
|
|
|
+
|
|
|
+ friend Col4 RepeatUpper(Arg a, const unsigned dummy) {
|
|
|
+ __m128i res = a.m_v;
|
|
|
+-
|
|
|
++
|
|
|
+ res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
|
|
|
+ res = _mm_shuffle_epi32( res, SQUISH_SSE_SPLAT(3) );
|
|
|
+
|
|
|
+@@ -2458,10 +2525,10 @@ public:
|
|
|
+
|
|
|
+ return Col4( res );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col4 InterleaveUpper(Arg a, Arg b, const unsigned dummy) {
|
|
|
+ __m128i res;
|
|
|
+-
|
|
|
++
|
|
|
+ res = _mm_unpackhi_epi16( a.m_v, b.m_v );
|
|
|
+ res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
|
|
|
+ res = _mm_unpackhi_epi64( res, res );
|
|
|
+@@ -2478,7 +2545,7 @@ public:
|
|
|
+
|
|
|
+ friend Col4 ReplicateUpper(Arg a, Arg b, const unsigned dummy) {
|
|
|
+ __m128i res;
|
|
|
+-
|
|
|
++
|
|
|
+ res = _mm_unpackhi_epi16( a.m_v, b.m_v );
|
|
|
+ res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
|
|
|
+ res = _mm_unpackhi_epi32( res, res );
|
|
|
+@@ -2495,7 +2562,7 @@ public:
|
|
|
+
|
|
|
+ friend Col4 ExpandUpper(Arg a, const signed dummy) {
|
|
|
+ __m128i res = a.m_v;
|
|
|
+-
|
|
|
++
|
|
|
+ res = _mm_unpackhi_epi16( res, res );
|
|
|
+ res = _mm_srai_epi32( res, 16 );
|
|
|
+
|
|
|
+@@ -2524,10 +2591,10 @@ public:
|
|
|
+
|
|
|
+ return Col4( res );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col4 InterleaveUpper(Arg a, Arg b, const signed dummy) {
|
|
|
+ __m128i res;
|
|
|
+-
|
|
|
++
|
|
|
+ res = _mm_unpackhi_epi32( a.m_v, b.m_v );
|
|
|
+ res = _mm_srai_epi32( res, 16 );
|
|
|
+ res = _mm_unpackhi_epi64( res, res );
|
|
|
+@@ -2544,11 +2611,11 @@ public:
|
|
|
+
|
|
|
+ friend Col4 ReplicateUpper(Arg a, Arg b, const signed dummy) {
|
|
|
+ __m128i res;
|
|
|
+-
|
|
|
++
|
|
|
+ res = _mm_unpackhi_epi32( a.m_v, b.m_v );
|
|
|
+ res = _mm_srai_epi32( res, 16 );
|
|
|
+ res = _mm_unpackhi_epi32( res, res );
|
|
|
+-
|
|
|
++
|
|
|
+ #ifdef _MSV_VER
|
|
|
+ assert(res.m128i_i32[0] == a.m_v.m128i_i16[7]);
|
|
|
+ assert(res.m128i_i32[1] == a.m_v.m128i_i16[7]);
|
|
|
+@@ -2559,7 +2626,7 @@ public:
|
|
|
+ return Col4( res );
|
|
|
+ }
|
|
|
+ #pragma warning ( pop )
|
|
|
+-
|
|
|
++
|
|
|
+ /*
|
|
|
+ friend Col4 Expand(Arg a, int ia) {
|
|
|
+ __m128i res = _mm_setzero_si128();
|
|
|
+@@ -2601,17 +2668,17 @@ public:
|
|
|
+ return Col4( res );
|
|
|
+ }
|
|
|
+ */
|
|
|
+-
|
|
|
++
|
|
|
+ friend int CompareEqualTo( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return _mm_movemask_epi8( _mm_cmpeq_epi16( left.m_v, right.m_v ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col8 CompareAllEqualTo( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return Col8( _mm_cmpeq_epi16( left.m_v, right.m_v ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col8 CompareAllLessThan( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return Col8( _mm_cmplt_epi16( left.m_v, right.m_v ) );
|
|
|
+@@ -2620,9 +2687,21 @@ public:
|
|
|
+ private:
|
|
|
+ __m128i m_v;
|
|
|
+
|
|
|
+- friend class Vec4;
|
|
|
++ friend squish::Vec4;
|
|
|
+ };
|
|
|
+
|
|
|
++template<const int n>
|
|
|
++Col8 ExtendSign(Col8::Arg a)
|
|
|
++{
|
|
|
++ return Col8(_mm_srai_epi16(a.m_v, n));
|
|
|
++}
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Col8 ShiftUp(Col8::Arg a)
|
|
|
++{
|
|
|
++ return Col8( _mm_slli_si128( a.m_v, n << 1 ) );
|
|
|
++}
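
ExtendSign and ShiftUp keep only their friend declarations inside Col8; the definitions now live at namespace scope, and the same split is applied further down to RotateLeft, Complement, FloatToInt, Merge, LoVec4 and HiVec4. Presumably this is done for GCC/Clang compatibility, since those compilers are stricter than MSVC about friend function templates that are both declared and defined inside the class body. A minimal sketch of the resulting shape, with invented Box/Shift names:

    #include <cassert>

    class Box
    {
    public:
      explicit Box(int v) : m_v(v) {}
      int Get() const { return m_v; }

      // the friend *declaration* stays inside the class ...
      template<const int n>
      friend Box Shift(const Box &a);

    private:
      int m_v;
    };

    // ... while the definition is a single namespace-scope template
    template<const int n>
    Box Shift(const Box &a)
    {
      return Box(a.m_v << n);
    }

    int main()
    {
      assert(Shift<2>(Box(1)).Get() == 4);
      return 0;
    }
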
|
|
|
++
|
|
|
+ #define VEC4_CONST( X ) Vec4( X )
|
|
|
+
|
|
|
+ class Vec3
|
|
|
+@@ -2649,7 +2728,7 @@ public:
|
|
|
+ m_v = _mm_unpacklo_ps(_mm_load_ss(x), _mm_load_ss(y));
|
|
|
+ m_v = _mm_movelh_ps(m_v, _mm_load_ss(z));
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ Vec3( bool x, bool y, bool z ) : m_v( _mm_castsi128_ps( _mm_setr_epi32( x ? ~0 : 0, y ? ~0 : 0, z ? ~0 : 0, 0 ) ) ) {}
|
|
|
+
|
|
|
+ Vec3( float x, float y, float z ) : m_v( _mm_setr_ps( x, y, z, 0.0f ) ) {}
|
|
|
+@@ -2662,7 +2741,7 @@ public:
|
|
|
+ void StoreX(float *x) const { _mm_store_ss(x, m_v); }
|
|
|
+ void StoreY(float *y) const { _mm_store_ss(y, _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) )); }
|
|
|
+ void StoreZ(float *z) const { _mm_store_ss(z, _mm_movehl_ps( m_v, m_v ) ); }
|
|
|
+-
|
|
|
++
|
|
|
+ float X() const { return ((float *)&m_v)[0]; }
|
|
|
+ float Y() const { return ((float *)&m_v)[1]; }
|
|
|
+ float Z() const { return ((float *)&m_v)[2]; }
|
|
|
+@@ -2729,7 +2808,7 @@ public:
|
|
|
+ m_v = _mm_mul_ps( m_v, v.m_v );
|
|
|
+ return *this;
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ Vec3& operator/=( Arg v )
|
|
|
+ {
|
|
|
+ *this *= Reciprocal( v );
|
|
|
+@@ -2863,16 +2942,7 @@ public:
|
|
|
+
|
|
|
+ template<const int n>
|
|
|
+ friend Vec3 RotateLeft( Arg a );
|
|
|
+- template<const int n>
|
|
|
+- friend Vec3 RotateLeft( Arg a )
|
|
|
+- {
|
|
|
+- return Vec3( _mm_shuffle_ps( a.m_v , a.m_v , SQUISH_SSE_SHUF(
|
|
|
+- (n + 0) % 3,
|
|
|
+- (n + 1) % 3,
|
|
|
+- (n + 2) % 3,
|
|
|
+- 3
|
|
|
+- ) ) );
|
|
|
+- }
|
|
|
++
|
|
|
+
|
|
|
+ friend Vec3 HorizontalAdd( Arg a )
|
|
|
+ {
|
|
|
+@@ -2974,7 +3044,7 @@ public:
|
|
|
+
|
|
|
+ return Vec3( res );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec3 HorizontalMaxXY( Arg a )
|
|
|
+ {
|
|
|
+ __m128 res = a.m_v;
|
|
|
+@@ -2986,7 +3056,7 @@ public:
|
|
|
+
|
|
|
+ return Vec3( res );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec3 HorizontalMinXY( Arg a )
|
|
|
+ {
|
|
|
+ __m128 res = a.m_v;
|
|
|
+@@ -3063,37 +3133,6 @@ public:
|
|
|
+
|
|
|
+ template<const bool disarm>
|
|
|
+ friend Vec3 Complement( Arg left );
|
|
|
+- template<const bool disarm>
|
|
|
+- friend Vec3 Complement( Arg left )
|
|
|
+- {
|
|
|
+- __m128 ren, res, rez;
|
|
|
+-
|
|
|
+- ren = left.m_v;
|
|
|
+- rez = _mm_set1_ps( 1.0f );
|
|
|
+- res = _mm_mul_ps( left.m_v, left.m_v );
|
|
|
+-#if ( SQUISH_USE_SSE >= 3 )
|
|
|
+- res = _mm_hadd_ps( res, res );
|
|
|
+-#else
|
|
|
+- res = _mm_add_ps( res, _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 1, 0, 1, 0 ) ) );
|
|
|
+-#endif
|
|
|
+- if (!disarm) {
|
|
|
+- // correct x² + y² > 1.0f by renormalization
|
|
|
+- if ( _mm_comigt_ss( res, rez ) ) {
|
|
|
+- res = ReciprocalSqrt( Vec3(res) ).m_v;
|
|
|
+- res = _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
|
|
|
+-
|
|
|
+- ren = _mm_mul_ps( ren, res );
|
|
|
+- res = rez;
|
|
|
+- }
|
|
|
+- }
|
|
|
+-
|
|
|
+- rez = _mm_sub_ps( rez, _mm_min_ps( rez, res ) );
|
|
|
+- rez = _mm_sqrt_ps( rez );
|
|
|
+- res = _mm_movelh_ps( left.m_v, rez );
|
|
|
+-
|
|
|
+- // sqrt(1.0f - (x*x + y*y))
|
|
|
+- return Vec3( res );
|
|
|
+- }
|
|
|
+
|
|
|
+ template<const bool disarm>
|
|
|
+ friend Vec3 Complement( Vec3 &left, Vec3 &right );
|
|
|
+@@ -3104,20 +3143,20 @@ public:
|
|
|
+ Vec3 len = (left * left) + (right * right);
|
|
|
+ Vec3 adj = ReciprocalSqrt(Max(Vec3(1.0f), len));
|
|
|
+
|
|
|
+- // correct x² + y² > 1.0f by renormalization
|
|
|
++      // correct x² + y² > 1.0f by renormalization
|
|
|
+ left *= adj;
|
|
|
+ right *= adj;
|
|
|
+
|
|
|
+- // sqrt(1.0f - (x² + y²))
|
|
|
++      // sqrt(1.0f - (x² + y²))
|
|
|
+ return Sqrt(Vec3(1.0f) - Min(Vec3(1.0f), len));
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ Vec3 len = (left * left) + (right * right);
|
|
|
+
|
|
|
+- // disarm x² + y² > 1.0f by letting NaN happen
|
|
|
++      // disarm x² + y² > 1.0f by letting NaN happen
|
|
|
+ // ...
|
|
|
+
|
|
|
+- // sqrt(1.0f - (x² + y²))
|
|
|
++      // sqrt(1.0f - (x² + y²))
|
|
|
+ return Sqrt(Vec3(1.0f) - len);
|
|
|
+ }
|
|
|
+ }
|
|
|
+@@ -3168,7 +3207,7 @@ public:
|
|
|
+ {
|
|
|
+ return Vec3( _mm_and_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x7FFFFFFF ) ) ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec3 Neg( Arg a )
|
|
|
+ {
|
|
|
+ return Vec3( _mm_or_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x80000000 ) ) ) );
|
|
|
+@@ -3192,21 +3231,9 @@ public:
|
|
|
+ return Min(one, Max(zero, *this));
|
|
|
+ }
|
|
|
+
|
|
|
+- template<const bool round>
|
|
|
+- friend Col3 FloatToInt( Arg v );
|
|
|
+- template<const bool round>
|
|
|
+- friend Col3 FloatToInt( Arg v )
|
|
|
+- {
|
|
|
+-#if ( SQUISH_USE_SSE == 1 )
|
|
|
+- ...
|
|
|
+-#else
|
|
|
+- // use SSE2 instructions
|
|
|
+- if (round)
|
|
|
+- return Col3( _mm_cvtps_epi32( v.m_v ) );
|
|
|
+- else
|
|
|
+- return Col3( _mm_cvttps_epi32( v.m_v ) );
|
|
|
+-#endif
|
|
|
+- }
|
|
|
++ template<const bool round>
|
|
|
++ friend Col3 FloatToInt( Arg v );
|
|
|
++
|
|
|
+
|
|
|
+ friend Vec3 Truncate( Arg v )
|
|
|
+ {
|
|
|
+@@ -3296,7 +3323,7 @@ public:
|
|
|
+ {
|
|
|
+ return Vec3( _mm_cmpneq_ps( m_v, _mm_set1_ps( 1.0f ) ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec3 TransferZ( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return Vec3( _mm_shuffle_ps( left.m_v, right.m_v, SQUISH_SSE_SHUF( 0, 1, 2, 3 ) ) );
|
|
|
+@@ -3351,9 +3378,70 @@ public:
|
|
|
+ private:
|
|
|
+ __m128 m_v;
|
|
|
+
|
|
|
+- friend class Vec4;
|
|
|
++ friend squish::Vec4;
|
|
|
+ };
|
|
|
+
|
|
|
++
|
|
|
++template<const bool round>
|
|
|
++Col3 FloatToInt(Vec3::Arg v )
|
|
|
++{
|
|
|
++
|
|
|
++#if ( SQUISH_USE_SSE == 1 )
|
|
|
|
|
|
++ ...
|
|
|
++#else
|
|
|
++ // use SSE2 instructions
|
|
|
++ if (round)
|
|
|
++ return Col3( _mm_cvtps_epi32( v.m_v ) );
|
|
|
++ else
|
|
|
++ return Col3( _mm_cvttps_epi32( v.m_v ) );
|
|
|
++#endif
|
|
|
++
|
|
|
++}
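
The round template parameter only selects between the two SSE2 conversions: _mm_cvtps_epi32 uses the current MXCSR rounding mode (round-to-nearest-even by default), while _mm_cvttps_epi32 always truncates toward zero. A small stand-alone check of the difference, assuming the default rounding mode:

    #include <emmintrin.h>
    #include <cstdio>

    int main()
    {
      __m128 v = _mm_setr_ps(1.7f, -1.7f, 2.5f, -2.5f);

      __m128i rounded   = _mm_cvtps_epi32(v);   // 2, -2, 2, -2  (2.5 rounds to even)
      __m128i truncated = _mm_cvttps_epi32(v);  // 1, -1, 2, -2  (toward zero)

      int r[4], t[4];
      _mm_storeu_si128((__m128i *)r, rounded);
      _mm_storeu_si128((__m128i *)t, truncated);
      std::printf("round: %d %d %d %d\n", r[0], r[1], r[2], r[3]);
      std::printf("trunc: %d %d %d %d\n", t[0], t[1], t[2], t[3]);
      return 0;
    }
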
|
|
|
++
|
|
|
++template<const int n>
|
|
|
++Vec3 RotateLeft( Vec3::Arg a )
|
|
|
++{
|
|
|
++ return Vec3( _mm_shuffle_ps( a.m_v , a.m_v , SQUISH_SSE_SHUF(
|
|
|
++ (n + 0) % 3,
|
|
|
++ (n + 1) % 3,
|
|
|
++ (n + 2) % 3,
|
|
|
++ 3
|
|
|
++ ) ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const bool disarm>
|
|
|
++Vec3 Complement( Vec3::Arg left )
|
|
|
++{
|
|
|
++ __m128 ren, res, rez;
|
|
|
++
|
|
|
++ ren = left.m_v;
|
|
|
++ rez = _mm_set1_ps( 1.0f );
|
|
|
++ res = _mm_mul_ps( left.m_v, left.m_v );
|
|
|
++#if ( SQUISH_USE_SSE >= 3 )
|
|
|
++ res = _mm_hadd_ps( res, res );
|
|
|
++#else
|
|
|
++ res = _mm_add_ps( res, _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 1, 0, 1, 0 ) ) );
|
|
|
++#endif
|
|
|
++ if (!disarm) {
|
|
|
++    // correct x² + y² > 1.0f by renormalization
|
|
|
++ if ( _mm_comigt_ss( res, rez ) ) {
|
|
|
++ res = ReciprocalSqrt( Vec3(res) ).m_v;
|
|
|
++ res = _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
|
|
|
++
|
|
|
++ ren = _mm_mul_ps( ren, res );
|
|
|
++ res = rez;
|
|
|
++ }
|
|
|
++ }
|
|
|
++
|
|
|
++ rez = _mm_sub_ps( rez, _mm_min_ps( rez, res ) );
|
|
|
++ rez = _mm_sqrt_ps( rez );
|
|
|
++ res = _mm_movelh_ps( left.m_v, rez );
|
|
|
++
|
|
|
++ // sqrt(1.0f - (x*x + y*y))
|
|
|
++ return Vec3( res );
|
|
|
++}
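
Complement<disarm> reconstructs the z component of a nominally unit-length xy pair as z = sqrt(1 - (x² + y²)), clamping the squared length at 1; the !disarm path also renormalizes the input when it overshoots, as the comments above say. A scalar paraphrase of the z value, written out only as an illustration:

    #include <cmath>
    #include <cstdio>

    // Mirrors the value produced in the z lane of the vector version above.
    static float ComplementScalarZ(float x, float y, bool disarm)
    {
      float len = x * x + y * y;
      if (!disarm && len > 1.0f)
        len = 1.0f;                                  // after renormalization x*x + y*y == 1
      return std::sqrt(1.0f - (len < 1.0f ? len : 1.0f));
    }

    int main()
    {
      std::printf("%f\n", ComplementScalarZ(0.6f, 0.8f, false));  // ~0: 0.36 + 0.64 == 1
      return 0;
    }
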
|
|
|
++
|
|
|
+ template<const bool round>
|
|
|
+ Col3 FloatToUHalf( Vec3::Arg v );
|
|
|
+ template<const bool round>
|
|
|
+@@ -3382,7 +3470,7 @@ Col3 FloatToSHalf( Vec3::Arg v )
|
|
|
+ return h;
|
|
|
+ }
|
|
|
+
|
|
|
+-Vec3 UHalfToFloat( Col3::Arg v )
|
|
|
++inline Vec3 UHalfToFloat( Col3::Arg v )
|
|
|
+ {
|
|
|
+ Vec3 f;
|
|
|
+
|
|
|
+@@ -3393,7 +3481,7 @@ Vec3 UHalfToFloat( Col3::Arg v )
|
|
|
+ return f;
|
|
|
+ }
|
|
|
+
|
|
|
+-Vec3 SHalfToFloat( Col3::Arg v )
|
|
|
++inline Vec3 SHalfToFloat( Col3::Arg v )
|
|
|
+ {
|
|
|
+ Vec3 f;
|
|
|
+
|
|
|
+@@ -3427,7 +3515,7 @@ public:
|
|
|
+ m_v = arg.m_v;
|
|
|
+ return *this;
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ operator Vec3()
|
|
|
+ {
|
|
|
+ return Vec3(m_v);
|
|
|
+@@ -3458,21 +3546,21 @@ public:
|
|
|
+ m_v = _mm_load_ss(x);
|
|
|
+ m_v = _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ Vec4( const unsigned short* x ) {
|
|
|
+ __m128i v = _mm_setzero_si128();
|
|
|
+
|
|
|
+ m_v = _mm_cvtepi32_ps( _mm_insert_epi16( v, *x, 0 ) );
|
|
|
+ m_v = _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ Vec4( const signed short* x ) {
|
|
|
+ __m128i v = _mm_setzero_si128();
|
|
|
+
|
|
|
+ m_v = _mm_cvtepi32_ps( _mm_insert_epi16( v, *x, 0 ) );
|
|
|
+ m_v = _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ Vec4( bool x, bool y, bool z, bool w ) : m_v( _mm_castsi128_ps( _mm_setr_epi32( x ? ~0 : 0, y ? ~0 : 0, z ? ~0 : 0, w ? ~0 : 0 ) ) ) {}
|
|
|
+
|
|
|
+ Vec4( int x, int y, int z, int w ) : m_v( _mm_cvtepi32_ps( _mm_setr_epi32( x, y, z, w ) ) ) {}
|
|
|
+@@ -3498,23 +3586,17 @@ public:
|
|
|
+ {
|
|
|
+ return Vec3( m_v );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ int GetM4() const
|
|
|
+ {
|
|
|
+ return _mm_movemask_ps( m_v );
|
|
|
+ }
|
|
|
+
|
|
|
+ template<class dtyp> friend Vec4 LoVec4(Col8 const&v, const dtyp& dummy);
|
|
|
+- template<class dtyp> friend Vec4 LoVec4(Col8 const&v, const dtyp& dummy)
|
|
|
+- {
|
|
|
+- return Vec4( LoCol4( v, dummy ) );
|
|
|
+- }
|
|
|
++
|
|
|
+
|
|
|
+ template<class dtyp> friend Vec4 HiVec4(Col8 const&v, const dtyp& dummy);
|
|
|
+- template<class dtyp> friend Vec4 HiVec4(Col8 const&v, const dtyp& dummy)
|
|
|
+- {
|
|
|
+- return Vec4( HiCol4( v, dummy ) );
|
|
|
+- }
|
|
|
++
|
|
|
+
|
|
|
+ void StoreX(float *x) const { _mm_store_ss(x, m_v); }
|
|
|
+ void StoreY(float *y) const { _mm_store_ss(y, _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) )); }
|
|
|
+@@ -3619,7 +3701,7 @@ public:
|
|
|
+ m_v = _mm_mul_ps( m_v, v.m_v );
|
|
|
+ return *this;
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ Vec4& operator*=( float v )
|
|
|
+ {
|
|
|
+ m_v = _mm_mul_ps( m_v, Vec4( v ).m_v );
|
|
|
+@@ -3631,7 +3713,7 @@ public:
|
|
|
+ *this *= Reciprocal( v );
|
|
|
+ return *this;
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ Vec4& operator/=( float v )
|
|
|
+ {
|
|
|
+ *this *= Reciprocal( Vec4( v ) );
|
|
|
+@@ -3732,16 +3814,7 @@ public:
|
|
|
+
|
|
|
+ template<const int a, const int b, const int c, const int d>
|
|
|
+ friend Vec4 Merge( Arg lo, Arg hi );
|
|
|
+- template<const int a, const int b, const int c, const int d>
|
|
|
+- friend Vec4 Merge( Arg lo, Arg hi )
|
|
|
+- {
|
|
|
+- return Vec4( _mm_shuffle_ps( lo.m_v , hi.m_v , SQUISH_SSE_SHUF(
|
|
|
+- a % 4,
|
|
|
+- b % 4,
|
|
|
+- c % 4,
|
|
|
+- d % 4
|
|
|
+- ) ) );
|
|
|
+- }
|
|
|
++
|
|
|
+
|
|
|
+ template<const int f, const int t>
|
|
|
+ friend Vec4 Shuffle( Arg a );
|
|
|
+@@ -3900,7 +3973,7 @@ public:
|
|
|
+
|
|
|
+ return Vec4( res );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec4 HorizontalMaxXY( Arg a )
|
|
|
+ {
|
|
|
+ __m128 res = a.m_v;
|
|
|
+@@ -3912,7 +3985,7 @@ public:
|
|
|
+
|
|
|
+ return Vec4( res );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec4 HorizontalMinXY( Arg a )
|
|
|
+ {
|
|
|
+ __m128 res = a.m_v;
|
|
|
+@@ -3965,7 +4038,7 @@ public:
|
|
|
+
|
|
|
+ return rsq;
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec4 Normalize( Arg left )
|
|
|
+ {
|
|
|
+ Vec4 sum = HorizontalAdd( Vec4( _mm_mul_ps( left.m_v, left.m_v ) ) );
|
|
|
+@@ -3973,7 +4046,7 @@ public:
|
|
|
+
|
|
|
+ return left * rsq;
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec4 Normalize( Vec4& x, Vec4& y, Vec4& z )
|
|
|
+ {
|
|
|
+ Vec4 xx = x * x;
|
|
|
+@@ -4006,7 +4079,7 @@ public:
|
|
|
+ res = _mm_add_ps( res, _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 1, 0, 1, 0 ) ) );
|
|
|
+ #endif
|
|
|
+ if (!disarm) {
|
|
|
+- // correct x² + y² > 1.0f by renormalization
|
|
|
++      // correct x² + y² > 1.0f by renormalization
|
|
|
+ if ( _mm_comigt_ss( res, rez ) ) {
|
|
|
+ res = ReciprocalSqrt( Vec4(res) ).m_v;
|
|
|
+ res = _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
|
|
|
+@@ -4028,7 +4101,7 @@ public:
|
|
|
+ res = _mm_and_ps( res, _mm_castsi128_ps ( _mm_setr_epi32( ~0, ~0, ~0, 0 ) ) );
|
|
|
+ }
|
|
|
+
|
|
|
+- // sqrt(1.0f - (x² + y²))
|
|
|
++      // sqrt(1.0f - (x² + y²))
|
|
|
+ return Vec4( res );
|
|
|
+ }
|
|
|
+
|
|
|
+@@ -4041,20 +4114,20 @@ public:
|
|
|
+ Vec4 len = left * left + right * right;
|
|
|
+ Vec4 adj = ReciprocalSqrt(Max(Vec4(1.0f), len));
|
|
|
+
|
|
|
+- // correct x² + y² > 1.0f by renormalization
|
|
|
++      // correct x² + y² > 1.0f by renormalization
|
|
|
+ left *= adj;
|
|
|
+ right *= adj;
|
|
|
+
|
|
|
+- // sqrt(1.0f - (x² + y²))
|
|
|
++      // sqrt(1.0f - (x² + y²))
|
|
|
+ return Sqrt(Vec4(1.0f) - Min(Vec4(1.0f), len));
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ Vec4 len = (left * left) + (right * right);
|
|
|
+
|
|
|
+- // disarm x² + y² > 1.0f by letting NaN happen
|
|
|
++      // disarm x² + y² > 1.0f by letting NaN happen
|
|
|
+ // ...
|
|
|
+
|
|
|
+- // sqrt(1.0f - (x² + y²))
|
|
|
++      // sqrt(1.0f - (x² + y²))
|
|
|
+ return Sqrt(Vec4(1.0f) - len);
|
|
|
+ }
|
|
|
+ }
|
|
|
+@@ -4105,7 +4178,7 @@ public:
|
|
|
+ {
|
|
|
+ return Vec4( _mm_and_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x7FFFFFFF ) ) ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec4 Neg( Arg a )
|
|
|
+ {
|
|
|
+ return Vec4( _mm_or_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x80000000 ) ) ) );
|
|
|
+@@ -4131,19 +4204,7 @@ public:
|
|
|
+
|
|
|
+ template<const bool round>
|
|
|
+ friend Col4 FloatToInt( Vec4::Arg v );
|
|
|
+- template<const bool round>
|
|
|
+- friend Col4 FloatToInt( Vec4::Arg v )
|
|
|
+- {
|
|
|
+-#if ( SQUISH_USE_SSE == 1 )
|
|
|
+- ...
|
|
|
+-#else
|
|
|
+- // use SSE2 instructions
|
|
|
+- if (round)
|
|
|
+- return Col4( _mm_cvtps_epi32( v.m_v ) );
|
|
|
+- else
|
|
|
+- return Col4( _mm_cvttps_epi32( v.m_v ) );
|
|
|
+-#endif
|
|
|
+- }
|
|
|
++
|
|
|
+
|
|
|
+ friend Vec4 Truncate( Arg v )
|
|
|
+ {
|
|
|
+@@ -4159,7 +4220,7 @@ public:
|
|
|
+
|
|
|
+ // clear out the MMX multimedia state to allow FP calls later
|
|
|
+ _mm_empty();
|
|
|
+-
|
|
|
++
|
|
|
+ return Vec4( truncated );
|
|
|
+ #else
|
|
|
+ // use SSE2 instructions
|
|
|
+@@ -4188,7 +4249,7 @@ public:
|
|
|
+ {
|
|
|
+ return _mm_movemask_ps( _mm_cmpeq_ps( left.m_v, right.m_v ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend int CompareNotEqualTo( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return _mm_movemask_ps( _mm_cmpneq_ps( left.m_v, right.m_v ) );
|
|
|
+@@ -4198,7 +4259,7 @@ public:
|
|
|
+ {
|
|
|
+ return _mm_movemask_ps( _mm_cmplt_ps( left.m_v, right.m_v ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend int CompareGreaterThan( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return _mm_movemask_ps( _mm_cmpgt_ps( left.m_v, right.m_v ) );
|
|
|
+@@ -4234,17 +4295,17 @@ public:
|
|
|
+ {
|
|
|
+ return Col4( _mm_cmpeq_epi32( _mm_castps_si128 ( left.m_v ), _mm_castps_si128 ( right.m_v ) ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Col4 CompareAllEqualTo_M8( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return Col4( _mm_cmpeq_epi8( _mm_castps_si128 ( left.m_v ), _mm_castps_si128 ( right.m_v ) ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend int CompareFirstLessThan( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return _mm_comilt_ss( left.m_v, right.m_v );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend int CompareFirstLessEqualTo( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return _mm_comile_ss( left.m_v, right.m_v );
|
|
|
+@@ -4264,17 +4325,17 @@ public:
|
|
|
+ {
|
|
|
+ return _mm_comieq_ss( left.m_v, right.m_v );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec4 IsGreaterThan( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return Vec4( _mm_cmpgt_ps( left.m_v, right.m_v ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec4 IsGreaterEqual( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return Vec4( _mm_cmpge_ps( left.m_v, right.m_v ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec4 IsNotEqualTo( Arg left, Arg right )
|
|
|
+ {
|
|
|
+ return Vec4( _mm_cmpneq_ps( left.m_v, right.m_v ) );
|
|
|
+@@ -4326,7 +4387,7 @@ public:
|
|
|
+ {
|
|
|
+ return Vec4( _mm_and_ps( left.m_v, _mm_castsi128_ps ( _mm_setr_epi32( 0, 0, 0, ~0 ) ) ) );
|
|
|
+ }
|
|
|
+-
|
|
|
++
|
|
|
+ friend Vec4 CollapseW( Arg x, Arg y, Arg z, Arg w )
|
|
|
+ {
|
|
|
+ return Vec4( _mm_unpackhi_ps( _mm_unpackhi_ps( x.m_v, z.m_v ), _mm_unpackhi_ps( y.m_v, w.m_v ) ) );
|
|
|
+@@ -4420,6 +4481,41 @@ private:
|
|
|
+ __m128 m_v;
|
|
|
+ };
|
|
|
+
|
|
|
++template<const bool round>
|
|
|
++Col4 FloatToInt( Vec4::Arg v )
|
|
|
++{
|
|
|
++#if ( SQUISH_USE_SSE == 1 )
|
|
|
++ ...
|
|
|
++#else
|
|
|
++ // use SSE2 instructions
|
|
|
++ if (round)
|
|
|
++ return Col4( _mm_cvtps_epi32( v.m_v ) );
|
|
|
++ else
|
|
|
++ return Col4( _mm_cvttps_epi32( v.m_v ) );
|
|
|
++#endif
|
|
|
++}
|
|
|
++
|
|
|
++template<class dtyp> Vec4 LoVec4(Col8 const&v, const dtyp& dummy)
|
|
|
++{
|
|
|
++ return Vec4( LoCol4( v, dummy ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<class dtyp> Vec4 HiVec4(Col8 const&v, const dtyp& dummy)
|
|
|
++{
|
|
|
++ return Vec4( HiCol4( v, dummy ) );
|
|
|
++}
|
|
|
++
|
|
|
++template<const int a, const int b, const int c, const int d>
|
|
|
++Vec4 Merge( Vec4::Arg lo, Vec4::Arg hi )
|
|
|
++{
|
|
|
++ return Vec4( _mm_shuffle_ps( lo.m_v , hi.m_v , SQUISH_SSE_SHUF(
|
|
|
++ a % 4,
|
|
|
++ b % 4,
|
|
|
++ c % 4,
|
|
|
++ d % 4
|
|
|
++ ) ) );
|
|
|
++}
|
|
|
++
|
|
|
+ template<const bool round>
|
|
|
+ Col4 FloatToUHalf( Vec4::Arg v );
|
|
|
+ template<const bool round>
|
|
|
+@@ -4450,7 +4546,7 @@ Col4 FloatToSHalf( Vec4::Arg v )
|
|
|
+ return h;
|
|
|
+ }
|
|
|
+
|
|
|
+-Vec4 UHalfToFloat( Col4::Arg v )
|
|
|
++inline Vec4 UHalfToFloat( Col4::Arg v )
|
|
|
+ {
|
|
|
+ Vec4 f;
|
|
|
+
|
|
|
+@@ -4462,7 +4558,7 @@ Vec4 UHalfToFloat( Col4::Arg v )
|
|
|
+ return f;
|
|
|
+ }
|
|
|
+
|
|
|
+-Vec4 SHalfToFloat( Col4::Arg v )
|
|
|
++inline Vec4 SHalfToFloat( Col4::Arg v )
|
|
|
+ {
|
|
|
+ Vec4 f;
|
|
|
+
|