Просмотр исходного кода

Feature: Added an API for vector (SIMD) instructions through simdpp library
- Transparent multi-architecture (SSE, AVX, NEON) support

BearishSun 8 лет назад
Родитель
Commit
d5ed0632f2
100 измененных файлов с 11034 добавлено и 2 удалено
  1. 8 2
      Source/BansheeUtility/CMakeLists.txt
  2. 1 0
      Source/BansheeUtility/CMakeSources.cmake
  3. 89 0
      Source/BansheeUtility/Math/BsSIMD.h
  4. 57 0
      Source/BansheeUtility/ThirdParty/simdpp/CMakeLists.txt
  5. 118 0
      Source/BansheeUtility/ThirdParty/simdpp/capabilities.h
  6. 200 0
      Source/BansheeUtility/ThirdParty/simdpp/core/align.h
  7. 127 0
      Source/BansheeUtility/ThirdParty/simdpp/core/aligned_allocator.h
  8. 123 0
      Source/BansheeUtility/ThirdParty/simdpp/core/bit_and.h
  9. 120 0
      Source/BansheeUtility/ThirdParty/simdpp/core/bit_andnot.h
  10. 68 0
      Source/BansheeUtility/ThirdParty/simdpp/core/bit_not.h
  11. 122 0
      Source/BansheeUtility/ThirdParty/simdpp/core/bit_or.h
  12. 126 0
      Source/BansheeUtility/ThirdParty/simdpp/core/bit_xor.h
  13. 193 0
      Source/BansheeUtility/ThirdParty/simdpp/core/blend.h
  14. 63 0
      Source/BansheeUtility/ThirdParty/simdpp/core/cache.h
  15. 104 0
      Source/BansheeUtility/ThirdParty/simdpp/core/cast.h
  16. 173 0
      Source/BansheeUtility/ThirdParty/simdpp/core/cmp_eq.h
  17. 149 0
      Source/BansheeUtility/ThirdParty/simdpp/core/cmp_ge.h
  18. 248 0
      Source/BansheeUtility/ThirdParty/simdpp/core/cmp_gt.h
  19. 141 0
      Source/BansheeUtility/ThirdParty/simdpp/core/cmp_le.h
  20. 246 0
      Source/BansheeUtility/ThirdParty/simdpp/core/cmp_lt.h
  21. 196 0
      Source/BansheeUtility/ThirdParty/simdpp/core/cmp_neq.h
  22. 97 0
      Source/BansheeUtility/ThirdParty/simdpp/core/combine.h
  23. 169 0
      Source/BansheeUtility/ThirdParty/simdpp/core/detail/get_expr_bitwise.h
  24. 233 0
      Source/BansheeUtility/ThirdParty/simdpp/core/detail/get_expr_uint.h
  25. 216 0
      Source/BansheeUtility/ThirdParty/simdpp/core/detail/scalar_arg_impl.h
  26. 103 0
      Source/BansheeUtility/ThirdParty/simdpp/core/detail/subvec_extract.h
  27. 62 0
      Source/BansheeUtility/ThirdParty/simdpp/core/detail/subvec_insert.h
  28. 103 0
      Source/BansheeUtility/ThirdParty/simdpp/core/extract.h
  29. 67 0
      Source/BansheeUtility/ThirdParty/simdpp/core/extract_bits.h
  30. 73 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_abs.h
  31. 71 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_add.h
  32. 54 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_ceil.h
  33. 73 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_div.h
  34. 55 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_floor.h
  35. 56 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_fmadd.h
  36. 56 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_fmsub.h
  37. 63 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_isnan.h
  38. 69 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_isnan2.h
  39. 73 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_max.h
  40. 74 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_min.h
  41. 73 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_mul.h
  42. 70 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_neg.h
  43. 50 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_rcp_e.h
  44. 64 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_rcp_rh.h
  45. 44 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_reduce_add.h
  46. 44 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_reduce_max.h
  47. 44 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_reduce_min.h
  48. 44 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_reduce_mul.h
  49. 50 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_rsqrt_e.h
  50. 53 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_rsqrt_rh.h
  51. 71 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_sign.h
  52. 70 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_sqrt.h
  53. 74 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_sub.h
  54. 53 0
      Source/BansheeUtility/ThirdParty/simdpp/core/f_trunc.h
  55. 41 0
      Source/BansheeUtility/ThirdParty/simdpp/core/for_each.h
  56. 117 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_abs.h
  57. 117 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_add.h
  58. 111 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_add_sat.h
  59. 162 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_avg.h
  60. 177 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_avg_trunc.h
  61. 131 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_div_p.h
  62. 193 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_max.h
  63. 194 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_min.h
  64. 129 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_mul.h
  65. 156 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_mull.h
  66. 97 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_neg.h
  67. 82 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_popcnt.h
  68. 82 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_add.h
  69. 80 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_and.h
  70. 80 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_max.h
  71. 80 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_min.h
  72. 58 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_mul.h
  73. 80 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_or.h
  74. 80 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_popcnt.h
  75. 303 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_shift_l.h
  76. 398 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_shift_r.h
  77. 117 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_sub.h
  78. 110 0
      Source/BansheeUtility/ThirdParty/simdpp/core/i_sub_sat.h
  79. 107 0
      Source/BansheeUtility/ThirdParty/simdpp/core/insert.h
  80. 62 0
      Source/BansheeUtility/ThirdParty/simdpp/core/load.h
  81. 49 0
      Source/BansheeUtility/ThirdParty/simdpp/core/load_packed2.h
  82. 52 0
      Source/BansheeUtility/ThirdParty/simdpp/core/load_packed3.h
  83. 55 0
      Source/BansheeUtility/ThirdParty/simdpp/core/load_packed4.h
  84. 51 0
      Source/BansheeUtility/ThirdParty/simdpp/core/load_splat.h
  85. 67 0
      Source/BansheeUtility/ThirdParty/simdpp/core/load_u.h
  86. 166 0
      Source/BansheeUtility/ThirdParty/simdpp/core/make_float.h
  87. 171 0
      Source/BansheeUtility/ThirdParty/simdpp/core/make_int.h
  88. 559 0
      Source/BansheeUtility/ThirdParty/simdpp/core/make_shuffle_bytes_mask.h
  89. 199 0
      Source/BansheeUtility/ThirdParty/simdpp/core/make_uint.h
  90. 139 0
      Source/BansheeUtility/ThirdParty/simdpp/core/move_l.h
  91. 141 0
      Source/BansheeUtility/ThirdParty/simdpp/core/move_r.h
  92. 135 0
      Source/BansheeUtility/ThirdParty/simdpp/core/permute2.h
  93. 142 0
      Source/BansheeUtility/ThirdParty/simdpp/core/permute4.h
  94. 62 0
      Source/BansheeUtility/ThirdParty/simdpp/core/permute_bytes16.h
  95. 64 0
      Source/BansheeUtility/ThirdParty/simdpp/core/permute_zbytes16.h
  96. 54 0
      Source/BansheeUtility/ThirdParty/simdpp/core/set_splat.h
  97. 65 0
      Source/BansheeUtility/ThirdParty/simdpp/core/shuffle1.h
  98. 122 0
      Source/BansheeUtility/ThirdParty/simdpp/core/shuffle2.h
  99. 75 0
      Source/BansheeUtility/ThirdParty/simdpp/core/shuffle2x2.h
  100. 79 0
      Source/BansheeUtility/ThirdParty/simdpp/core/shuffle4x2.h

+ 8 - 2
Source/BansheeUtility/CMakeLists.txt

@@ -17,10 +17,13 @@ if(LINUX)
 	endif()
 	endif()
 endif()
 endif()
 
 
+# Third party (non-package) libraries
+add_library(ThirdParty INTERFACE)
+target_include_directories(ThirdParty INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty")
+
 # Includes
 # Includes
 set(BansheeUtility_INC 
 set(BansheeUtility_INC 
-	"./" 
-	"ThirdParty")
+	"./")
 
 
 if(WIN32)
 if(WIN32)
 	set(BansheeUtility_INC ${BansheeUtility_INC} "Win32")
 	set(BansheeUtility_INC ${BansheeUtility_INC} "Win32")
@@ -41,6 +44,9 @@ target_compile_definitions(BansheeUtility PRIVATE -DBS_UTILITY_EXPORTS)
 ## External lib: Snappy
 ## External lib: Snappy
 target_link_libraries(BansheeUtility PRIVATE ${snappy_LIBRARIES})	
 target_link_libraries(BansheeUtility PRIVATE ${snappy_LIBRARIES})	
 
 
+## External libs: Header only libraries
+target_link_libraries(BansheeUtility PUBLIC ThirdParty)
+
 if(WIN32)
 if(WIN32)
 	## OS libs
 	## OS libs
 	target_link_libraries(BansheeUtility PRIVATE DbgHelp IPHLPAPI Rpcrt4)
 	target_link_libraries(BansheeUtility PRIVATE DbgHelp IPHLPAPI Rpcrt4)

+ 1 - 0
Source/BansheeUtility/CMakeSources.cmake

@@ -216,6 +216,7 @@ set(BS_BANSHEEUTILITY_INC_MATH
 	"Math/BsCapsule.h"
 	"Math/BsCapsule.h"
 	"Math/BsMatrixNxM.h"
 	"Math/BsMatrixNxM.h"
 	"Math/BsLine2.h"
 	"Math/BsLine2.h"
+	"Math/BsSIMD.h"
 )
 )
 
 
 set(BS_BANSHEEUTILITY_SRC_ERROR
 set(BS_BANSHEEUTILITY_SRC_ERROR

+ 89 - 0
Source/BansheeUtility/Math/BsSIMD.h

@@ -0,0 +1,89 @@
+//********************************** Banshee Engine (www.banshee3d.com) **************************************************//
+//**************** Copyright (c) 2017 Marko Pintera ([email protected]). All rights reserved. **********************//
+#pragma once
+
+#include "Prerequisites/BsPrerequisitesUtil.h"
+#include "Math/BsVector4.h"
+#include "Math/BsAABox.h"
+#include "Math/BsSphere.h"
+
+#define SIMDPP_ARCH_X86_SSE4_1
+
+// MSVC warning C4244 (conversion with possible loss of data) is silenced only while the simdpp headers are being
+// included. The warning state is saved and restored with push/pop so that whatever setting was active before this
+// header (disabled, default, or promoted to an error) is preserved for the code that follows.
+#if BS_COMPILER == BS_COMPILER_MSVC
+#pragma warning(push)
+#pragma warning(disable: 4244)
+#endif
+
+#include "ThirdParty/simdpp/simd.h"
+
+#if BS_COMPILER == BS_COMPILER_MSVC
+#pragma warning(pop)
+#endif
+
+namespace bs
+{
+	namespace simd
+	{
+		using namespace simdpp;
+
+		/** @addtogroup Math
+		 *  @{
+		 */
+
+		/** 
+		 * Version of bs::AABox suitable for SIMD use. Takes up a bit more memory than standard AABox and is always 16-byte
+		 * aligned.
+		 */
+		struct AABox
+		{
+			/** Position of the box center. The W component carries no information. */
+			SIMDPP_ALIGN(16) Vector4 center;
+
+			/** Half-size of the box along each axis. The W component carries no information. */
+			SIMDPP_ALIGN(16) Vector4 extents;
+
+			AABox()
+			{ }
+
+			/** Creates the bounds from the center and half-size of a regular bs::AABox. */
+			AABox(const bs::AABox& box)
+				: center(Vector4(box.getCenter()))
+				, extents(Vector4(box.getHalfSize()))
+			{ }
+
+			/** Creates the bounds from a Sphere: same center, extents equal to the radius on X, Y and Z. */
+			AABox(const Sphere& sphere)
+				: center(Vector4(sphere.getCenter()))
+			{
+				const float r = sphere.getRadius();
+				extents = Vector4(r, r, r, 0.0f);
+			}
+
+			/** Creates the bounds from a center point and a single extent that is applied to X, Y and Z. */
+			AABox(const Vector3& center, float extent)
+				: extents(Vector4(extent, extent, extent, 0.0f))
+			{
+				this->center = Vector4(center);
+			}
+
+			/**
+			 * Checks whether this box and @p other overlap. The boxes are reported as intersecting when, in no lane,
+			 * the absolute distance between the centers is greater than the sum of the extents.
+			 *
+			 * NOTE(review): all four lanes take part in the comparison, including W - confirm W is always zero.
+			 */
+			bool intersects(const AABox& other) const
+			{
+				// Per-lane distance between the two centers
+				auto centerA = load<float32x4>(&center);
+				auto centerB = load<float32x4>(&other.center);
+				float32x4 distance = abs(sub(centerA, centerB));
+
+				// Per-lane maximum distance at which the boxes still touch
+				auto extentsA = load<float32x4>(&extents);
+				auto extentsB = load<float32x4>(&other.extents);
+				float32x4 reach = add(extentsA, extentsB);
+
+				// Any lane with distance > reach separates the boxes
+				return !test_bits_any(bit_cast<uint32x4>(cmp_gt(distance, reach)));
+			}
+		};
+
+		/** @} */
+	}
+}

+ 57 - 0
Source/BansheeUtility/ThirdParty/simdpp/CMakeLists.txt

@@ -0,0 +1,57 @@
+#   Copyright (C) 2013  Povilas Kanapickas <[email protected]>
+#
+#   Distributed under the Boost Software License, Version 1.0.
+#       (See accompanying file LICENSE_1_0.txt or copy at
+#           http://www.boost.org/LICENSE_1_0.txt)
+
+file(GLOB_RECURSE HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.h *.inl)
+
+foreach(FILE ${HEADERS})
+    get_filename_component(FILE_PATH "${FILE}" PATH)
+    install(FILES "${FILE}" DESTINATION "${SIMDPP_INCLUDEDIR}/simdpp/${FILE_PATH}")
+endforeach()
+
+# Don't enable header tests by default because configuring it takes excessive
+# amount of time
+set(ENABLE_HEADER_TESTS "0")
+
+if(${ENABLE_HEADER_TESTS} STREQUAL "1")
+
+    simdpp_get_compilable_archs(COMPILABLE_ARCHS)
+
+    set(HEADER_TESTS "")
+    add_custom_target(check_headers)
+
+    # For every architecture that can be compiled, create one target per header that compiles that header on its
+    # own, and hook all of them into the check_headers target.
+    foreach(ARCH ${COMPILABLE_ARCHS})
+        simdpp_get_arch_info(CXX_FLAGS DEFINES_LIST SUFFIX ${ARCH})
+        foreach(FILE ${HEADERS})
+
+            # HEADERS also contains the *.inl files picked up by the glob. FILE is a relative path, so the
+            # extension has to be matched as a suffix; comparing the whole path to ".inl" never succeeds.
+            if("${FILE}" MATCHES "\\.inl$")
+                continue()
+            endif()
+
+            # Build a unique name from the path: "/", "." and "-" are replaced with "_"
+            string(REPLACE "/" "_" TEST "${FILE}")
+            string(REPLACE "." "_" TEST "${TEST}")
+            set(TEST "${TEST}${SUFFIX}")
+            set(TEST_OUT "check_headers/test_header_compiles_${TEST}")
+            string(REPLACE "-" "_" TEST_TARGET "check_headers_${TEST}")
+
+
+            file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/check_headers")
+            separate_arguments(CXX_FLAGS)
+            add_custom_command(
+                OUTPUT ${TEST_OUT}
+                COMMAND ${CMAKE_CXX_COMPILER}
+                        -DLIBSIMDPP_SIMD_H
+                        -I "${CMAKE_SOURCE_DIR}"
+                        ${CXX_FLAGS} -x c++ -std=c++11 -g2 -Wall
+                        ${CMAKE_SOURCE_DIR}/simdpp/${FILE}
+                        -c -o ${CMAKE_BINARY_DIR}/${TEST_OUT}
+                DEPENDS ${FILE} )
+            add_custom_target(${TEST_TARGET} DEPENDS ${TEST_OUT})
+            add_dependencies(check_headers "${TEST_TARGET}")
+        endforeach()
+    endforeach()
+
+endif()
+

+ 118 - 0
Source/BansheeUtility/ThirdParty/simdpp/capabilities.h

@@ -0,0 +1,118 @@
+/*  Copyright (C) 2017  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMD_CAPABILITIES_H
+#define LIBSIMDPP_SIMD_CAPABILITIES_H
+
+// Defines the SIMDPP_HAS_* capability macros. Every macro is always defined and expands to either 1 or 0,
+// depending on which SIMDPP_USE_* / SIMDPP_64_BITS switches are active (presumably provided by setup_arch.h,
+// which is included below - confirm).
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+#include <simdpp/setup_arch.h>
+
+// 8-, 16- and 32-bit integer flags: set together when SSE2, NEON, ALTIVEC or MSA is in use.
+#if SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
+#define SIMDPP_HAS_INT8_SIMD 1
+#define SIMDPP_HAS_INT16_SIMD 1
+#define SIMDPP_HAS_INT32_SIMD 1
+#else
+#define SIMDPP_HAS_INT8_SIMD 0
+#define SIMDPP_HAS_INT16_SIMD 0
+#define SIMDPP_HAS_INT32_SIMD 0
+#endif
+
+// 64-bit integer flag: same as above, except that the POWER requirement is VSX 2.07 instead of ALTIVEC.
+#if SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
+#define SIMDPP_HAS_INT64_SIMD 1
+#else
+#define SIMDPP_HAS_INT64_SIMD 0
+#endif
+
+// 32-bit float flag: on NEON it needs either SIMDPP_USE_NEON_FLT_SP or a 64-bit target.
+#if SIMDPP_USE_SSE2 || SIMDPP_USE_NEON_FLT_SP || (SIMDPP_USE_NEON && SIMDPP_64_BITS) || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
+#define SIMDPP_HAS_FLOAT32_SIMD 1
+#else
+#define SIMDPP_HAS_FLOAT32_SIMD 0
+#endif
+
+// 64-bit float flag: on NEON it needs a 64-bit target, on POWER it needs VSX 2.06.
+#if SIMDPP_USE_SSE2 || (SIMDPP_USE_NEON && SIMDPP_64_BITS) || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
+#define SIMDPP_HAS_FLOAT64_SIMD 1
+#else
+#define SIMDPP_HAS_FLOAT64_SIMD 0
+#endif
+
+// float64 -> uint32 conversion flag. From here on SIMDPP_USE_NULL also enables the capability.
+#if SIMDPP_USE_NULL || SIMDPP_USE_AVX512F || (SIMDPP_USE_NEON && SIMDPP_64_BITS) || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
+#define SIMDPP_HAS_FLOAT64_TO_UINT32_CONVERSION 1
+#else
+#define SIMDPP_HAS_FLOAT64_TO_UINT32_CONVERSION 0
+#endif
+
+// Conversions between 64-bit integers and floats (both directions, signed and unsigned): all eight flags share
+// one condition. On x86 the condition is AVX512DQ.
+#if SIMDPP_USE_NULL || SIMDPP_USE_AVX512DQ || (SIMDPP_USE_NEON && SIMDPP_64_BITS) || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
+#define SIMDPP_HAS_INT64_TO_FLOAT64_CONVERSION 1
+#define SIMDPP_HAS_INT64_TO_FLOAT32_CONVERSION 1
+#define SIMDPP_HAS_UINT64_TO_FLOAT64_CONVERSION 1
+#define SIMDPP_HAS_UINT64_TO_FLOAT32_CONVERSION 1
+
+#define SIMDPP_HAS_FLOAT32_TO_INT64_CONVERSION 1
+#define SIMDPP_HAS_FLOAT32_TO_UINT64_CONVERSION 1
+
+#define SIMDPP_HAS_FLOAT64_TO_INT64_CONVERSION 1
+#define SIMDPP_HAS_FLOAT64_TO_UINT64_CONVERSION 1
+#else
+#define SIMDPP_HAS_INT64_TO_FLOAT64_CONVERSION 0
+#define SIMDPP_HAS_INT64_TO_FLOAT32_CONVERSION 0
+#define SIMDPP_HAS_UINT64_TO_FLOAT64_CONVERSION 0
+#define SIMDPP_HAS_UINT64_TO_FLOAT32_CONVERSION 0
+
+#define SIMDPP_HAS_FLOAT32_TO_INT64_CONVERSION 0
+#define SIMDPP_HAS_FLOAT32_TO_UINT64_CONVERSION 0
+
+#define SIMDPP_HAS_FLOAT64_TO_INT64_CONVERSION 0
+#define SIMDPP_HAS_FLOAT64_TO_UINT64_CONVERSION 0
+#endif
+
+// Left shift by a per-element (vector) amount, 8- and 16-bit elements: on x86 the condition is SSSE3.
+#if SIMDPP_USE_NULL || SIMDPP_USE_SSSE3 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
+#define SIMDPP_HAS_INT8_SHIFT_L_BY_VECTOR 1
+#define SIMDPP_HAS_UINT8_SHIFT_L_BY_VECTOR 1
+#define SIMDPP_HAS_INT16_SHIFT_L_BY_VECTOR 1
+#define SIMDPP_HAS_UINT16_SHIFT_L_BY_VECTOR 1
+#else
+#define SIMDPP_HAS_INT8_SHIFT_L_BY_VECTOR 0
+#define SIMDPP_HAS_UINT8_SHIFT_L_BY_VECTOR 0
+#define SIMDPP_HAS_INT16_SHIFT_L_BY_VECTOR 0
+#define SIMDPP_HAS_UINT16_SHIFT_L_BY_VECTOR 0
+#endif
+
+// Left shift by vector, 32-bit elements: on x86 the condition is SSE2.
+#if SIMDPP_USE_NULL || SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
+#define SIMDPP_HAS_INT32_SHIFT_L_BY_VECTOR 1
+#define SIMDPP_HAS_UINT32_SHIFT_L_BY_VECTOR 1
+#else
+#define SIMDPP_HAS_INT32_SHIFT_L_BY_VECTOR 0
+#define SIMDPP_HAS_UINT32_SHIFT_L_BY_VECTOR 0
+#endif
+
+// Right shift by vector for int8, uint8 and uint16 (signed 16-bit is handled separately below).
+#if SIMDPP_USE_NULL || SIMDPP_USE_SSSE3 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
+#define SIMDPP_HAS_INT8_SHIFT_R_BY_VECTOR 1
+#define SIMDPP_HAS_UINT8_SHIFT_R_BY_VECTOR 1
+#define SIMDPP_HAS_UINT16_SHIFT_R_BY_VECTOR 1
+#else
+#define SIMDPP_HAS_INT8_SHIFT_R_BY_VECTOR 0
+#define SIMDPP_HAS_UINT8_SHIFT_R_BY_VECTOR 0
+#define SIMDPP_HAS_UINT16_SHIFT_R_BY_VECTOR 0
+#endif
+
+// Right shift by vector for int16: on x86 the condition is AVX512BW rather than SSSE3.
+#if SIMDPP_USE_NULL || SIMDPP_USE_AVX512BW || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
+#define SIMDPP_HAS_INT16_SHIFT_R_BY_VECTOR 1
+#else
+#define SIMDPP_HAS_INT16_SHIFT_R_BY_VECTOR 0
+#endif
+
+// Right shift by vector, 32-bit elements: on x86 the condition is SSE2.
+#if SIMDPP_USE_NULL || SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
+#define SIMDPP_HAS_INT32_SHIFT_R_BY_VECTOR 1
+#define SIMDPP_HAS_UINT32_SHIFT_R_BY_VECTOR 1
+#else
+#define SIMDPP_HAS_INT32_SHIFT_R_BY_VECTOR 0
+#define SIMDPP_HAS_UINT32_SHIFT_R_BY_VECTOR 0
+#endif
+
+#endif

+ 200 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/align.h

@@ -0,0 +1,200 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_ALIGN_H
+#define LIBSIMDPP_SIMDPP_CORE_ALIGN_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/align.h>
+#include <simdpp/detail/get_expr.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Extracts an int8x16 vector from two concatenated int8x16 vectors
+
+    @code
+    shift:  pos:| 0   1    .  14  15  |
+     0      r = [ l0  l1   .  l14 l15 ]
+     1      r = [ l1  l2   .  l15 u0  ]
+     2      r = [ l2  l3   .  u0  u1  ]
+      ...    ..   .. ..  ... .. ..
+     15     r = [ l15 u0   .  u13 u14 ]
+     16     r = [ u0  u1   .  u14 u15 ]
+    @endcode
+
+    @tparam shift Position of the first extracted element within the concatenation, counted in 8-bit
+                  elements. Must not exceed 16; this is checked at compile time.
+    @param lower  Vector supplying the elements l0..l15.
+    @param upper  Vector supplying the elements u0..u15.
+
+    @par 128-bit version:
+    @icost{SSE2-SSE3, 3}
+
+    @par 256-bit version:
+    The lower and higher 128-bit halves are processed as if 128-bit instruction
+    was applied to each of them separately.
+
+    @icost{SSE2-SSE3, 6}
+    @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+    align16(const any_vec8<N,V1>& lower,
+            const any_vec8<N,V2>& upper)
+{
+    static_assert(shift <= 16, "Shift out of bounds");
+    if (shift == 0) return lower.wrapped().eval(); // result is exactly the lower vector
+    if (shift == 16) return upper.wrapped().eval(); // result is exactly the upper vector
+
+    // Evaluate both operands into a common vector type before handing them to the implementation
+    typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
+    qlower = lower.wrapped().eval();
+    qupper = upper.wrapped().eval();
+    return detail::insn::i_align16<shift>(qlower, qupper);
+}
+
+/** Extracts an int16x8 vector from two concatenated int16x8 vectors
+
+    @code
+    shift:  pos:| 0  1    .  6  7  |
+     0      r = [ l0 l1   .  l6 l7 ]
+     1      r = [ l1 l2   .  l7 u0 ]
+     2      r = [ l2 l3   .  u0 u1 ]
+      ...    ..   .. ..  ... .. ..
+     7      r = [ l7 u0   .  u5 u6 ]
+     8      r = [ u0 u1   .  u6 u7 ]
+    @endcode
+
+    @tparam shift Position of the first extracted element within the concatenation, counted in 16-bit
+                  elements. Must not exceed 8; this is checked at compile time.
+    @param lower  Vector supplying the elements l0..l7.
+    @param upper  Vector supplying the elements u0..u7.
+
+    @par 128-bit version:
+    @icost{SSE2-SSE3, 3}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE3, 6}
+    @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
+
+    All 128-bit sub-vectors are processed as if 128-bit instruction
+    was applied to each of them separately.
+*/
+template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+    align8(const any_vec16<N,V1>& lower,
+           const any_vec16<N,V2>& upper)
+{
+    static_assert(shift <= 8, "Shift out of bounds");
+    if (shift == 0) return lower.wrapped().eval(); // result is exactly the lower vector
+    if (shift == 8) return upper.wrapped().eval(); // result is exactly the upper vector
+
+    // Evaluate both operands into a common vector type before handing them to the implementation
+    typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
+    qlower = lower.wrapped().eval();
+    qupper = upper.wrapped().eval();
+    return detail::insn::i_align8<shift>(qlower, qupper);
+}
+
+/** Extracts an int32x4 vector from two concatenated int32x4 vectors
+
+    @code
+    shift:  pos:| 0  1  2  3  |
+     0      r = [ l0 l1 l2 l3 ]
+     1      r = [ l1 l2 l3 u0 ]
+     2      r = [ l2 l3 u0 u1 ]
+     3      r = [ l3 u0 u1 u2 ]
+     4      r = [ u0 u1 u2 u3 ]
+    @endcode
+
+    @tparam shift Position of the first extracted element within the concatenation, counted in 32-bit
+                  elements. Must not exceed 4; this is checked at compile time.
+    @param lower  Vector supplying the elements l0..l3.
+    @param upper  Vector supplying the elements u0..u3.
+
+    @par int32
+
+    @par 128-bit version:
+    @icost{SSE2-SSE3, 3}
+
+    @par 256-bit version:
+    The lower and higher 128-bit halves are processed as if 128-bit instruction
+    was applied to each of them separately.
+
+    @icost{SSE2-SSE3, 6}
+    @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
+
+    @par float32
+
+    @par 128-bit version:
+    @icost{SSE2-SSE3, 3}
+
+    @par 256-bit version:
+    The lower and higher 128-bit halves are processed as if 128-bit instruction
+    was applied to each of them separately.
+
+    @icost{SSE2-SSE3, 6}
+    @icost{SSSE3-SSE4.1 NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+    align4(const any_vec32<N,V1>& lower,
+           const any_vec32<N,V2>& upper)
+{
+    static_assert(shift <= 4, "Shift out of bounds");
+    if (shift == 0) return lower.wrapped().eval(); // result is exactly the lower vector
+    if (shift == 4) return upper.wrapped().eval(); // result is exactly the upper vector
+
+    // Evaluate both operands into a common vector type before handing them to the implementation
+    typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
+    qlower = lower.wrapped().eval();
+    qupper = upper.wrapped().eval();
+    return detail::insn::i_align4<shift>(qlower, qupper);
+}
+
+
+/** Extracts an int64x2 vector from two concatenated int64x2 vectors
+
+    @code
+    shift:  pos:| 0  1  |
+     0      r = [ l0 l1 ]
+     1      r = [ l1 u0 ]
+     2      r = [ u0 u1 ]
+    @endcode
+
+    @tparam shift Position of the first extracted element within the concatenation, counted in 64-bit
+                  elements. Must not exceed 2; this is checked at compile time.
+    @param lower  Vector supplying the elements l0..l1.
+    @param upper  Vector supplying the elements u0..u1.
+
+    @par int64
+
+    @par 128-bit version:
+    @icost{SSE2-SSE3, 3}
+
+    @par 256-bit version:
+    The lower and higher 128-bit halves are processed as if 128-bit instruction
+    was applied to each of them separately.
+
+    @icost{SSE2-SSE3, 6}
+    @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
+
+    @par float64
+
+    @par 128-bit version:
+    @icost{SSE2-SSE3, 3}
+
+    @par 256-bit version:
+    The lower and higher 128-bit halves are processed as if 128-bit instruction
+    was applied to each of them separately.
+
+    @icost{SSE2-SSE3, 6}
+    @icost{SSSE3-SSE4.1 NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+    align2(const any_vec64<N,V1>& lower,
+           const any_vec64<N,V2>& upper)
+{
+    static_assert(shift <= 2, "Shift out of bounds");
+    if (shift == 0) return lower.wrapped().eval(); // result is exactly the lower vector
+    if (shift == 2) return upper.wrapped().eval(); // result is exactly the upper vector
+
+    // Evaluate both operands into a common vector type before handing them to the implementation
+    typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
+    qlower = lower.wrapped().eval();
+    qupper = upper.wrapped().eval();
+    return detail::insn::i_align2<shift>(qlower, qupper);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 127 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/aligned_allocator.h

@@ -0,0 +1,127 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_CORE_ALIGNED_ALLOCATOR_H
+#define LIBSIMDPP_CORE_ALIGNED_ALLOCATOR_H
+
+#include <memory>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** An allocator that allocates memory with stricter alignment requirements than
+    the defaults. @a A must be a power of two.
+
+    The allocator is stateless: all instances compare equal. Storage is obtained
+    with @c new[] and released with @c delete[]; the pointer returned by
+    @c new[] is stored directly in front of the aligned block handed to the user.
+*/
+template<class T, std::size_t A>
+class aligned_allocator {
+private:
+
+    static_assert(!(A & (A - 1)), "A is not a power of two");
+
+public:
+    using value_type = T;
+    using pointer = T*;
+    using const_pointer = const T*;
+    using reference = T&;
+    using const_reference = const T&;
+    using size_type = std::size_t;
+    using difference_type = std::ptrdiff_t;
+
+    aligned_allocator() = default;
+    aligned_allocator(const aligned_allocator&) = default;
+
+    /// Converting constructor from an allocator for another value type with the same alignment.
+    template<class U>
+    aligned_allocator(const aligned_allocator<U,A>&) {}
+
+    ~aligned_allocator() = default;
+
+    aligned_allocator& operator=(const aligned_allocator&) = delete;
+
+    template<class U>
+    struct rebind {
+        using other = aligned_allocator<U,A>;
+    };
+
+    /// Returns the address of @a x.
+    T* address(T& x) const
+    {
+        return &x;
+    }
+
+    /// Returns the largest element count whose payload size in bytes fits into std::size_t.
+    std::size_t max_size() const
+    {
+        return (static_cast<std::size_t>(0) - static_cast<std::size_t>(1)) / sizeof(T);
+    }
+
+    // stateless
+    bool operator!=(const aligned_allocator&) const { return false; }
+    bool operator==(const aligned_allocator&) const { return true; }
+
+    /// Copy-constructs a T from @a t in the storage pointed to by @a p.
+    void construct(T* p, const T& t) const
+    {
+        void* pv = static_cast<void*>(p);
+        new (pv) T(t);
+    }
+
+    /// Runs the destructor of the object pointed to by @a p without releasing its storage.
+    void destroy(T* p) const
+    {
+        p->~T();
+    }
+
+    /** Allocates uninitialized storage for @a n objects of type T.
+
+        @return @c nullptr when @a n is zero, otherwise a pointer aligned to
+                @a A bytes (or to 2*sizeof(void*) when that is larger).
+        @throws std::length_error when the request, including the extra
+                bytes needed for alignment, does not fit into std::size_t.
+    */
+    T* allocate(std::size_t n) const
+    {
+        if (n == 0) {
+            return nullptr;
+        }
+
+        /* We align the storage by adding @a alignment bytes and truncating the
+            pointer. The pointer to the original location returned by @a new is
+            stored just before the location the returned pointer refers to.
+            To ensure that there is always at least @a sizeof(void*) space
+            there, @a alignment is at least 2*sizeof(void*)
+        */
+        const std::size_t al = A < 2*sizeof(void*) ? 2*sizeof(void*) : A;
+
+        // The byte count requested from new[] is n*sizeof(T) + al. Checking n
+        // against max_size() alone is not enough: for n close to max_size()
+        // the addition of al would wrap around and a far too small buffer
+        // would be allocated.
+        const std::size_t size_limit = static_cast<std::size_t>(0) - static_cast<std::size_t>(1);
+        if (n > (size_limit - al) / sizeof(T)) {
+            throw std::length_error("aligned_allocator<T,A>::allocate() - Integer overflow.");
+        }
+
+        char* pv = new char[n*sizeof(T) + al];
+        std::uintptr_t upv = reinterpret_cast<std::uintptr_t>(pv);
+        upv = (upv + al) & ~(al - 1); // next multiple of al strictly above pv
+        char** aligned_pv = reinterpret_cast<char**>(upv);
+
+        *(aligned_pv-1) = pv; // original pointer
+
+        return reinterpret_cast<T*>(aligned_pv);
+    }
+
+    /** Releases storage previously returned by allocate(). A null @a p is
+        ignored; @a n is not used.
+    */
+    void deallocate(T* p, std::size_t n) const
+    {
+        (void) n;
+        if (!p) {
+            return;
+        }
+        // The pointer obtained from new[] was stored right in front of the aligned block
+        char** pptr = reinterpret_cast<char**>(p);
+        delete[](*(pptr - 1));
+    }
+
+    /// Same as allocate(n); the @a hint is ignored.
+    template<class U>
+    T * allocate(std::size_t n, const U* hint) const
+    {
+        (void) hint;
+        return allocate(n);
+    }
+};
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif

+ 123 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/bit_and.h

@@ -0,0 +1,123 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_BIT_AND_H
+#define LIBSIMDPP_SIMDPP_CORE_BIT_AND_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/bit_and.h>
+#include <simdpp/detail/expr/bit_and.h>
+#include <simdpp/detail/get_expr.h>
+#include <simdpp/core/detail/get_expr_bitwise.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Computes bitwise AND of integer or floating-point vectors.
+
+    @code
+    r0 = a0 & b0
+    ...
+    rN = aN & bN
+    @endcode
+
+    @param a  First operand.
+    @param b  Second operand. Both operands have the same number of elements N.
+    @return   A value of the type selected by detail::get_expr_bitwise2_and,
+              built from the wrapped operands (presumably an expression that
+              is evaluated later - confirm against expr_bit_and).
+
+    @todo: icost
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, V1, V2>::type
+        bit_and(const any_vec<N,V1>& a,
+                const any_vec<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+// Support scalar arguments: bit_and() overloads that take a plain integer as one of the operands.
+// There is one overload per integer type (unsigned, unsigned long, unsigned long long, int, long,
+// long long). The first group takes the scalar as the left operand.
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, unsigned, V>::type
+        bit_and(const unsigned& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, unsigned long, V>::type
+        bit_and(const unsigned long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, unsigned long long, V>::type
+        bit_and(const unsigned long long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, int, V>::type
+        bit_and(const int& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, long, V>::type
+        bit_and(const long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, long long, V>::type
+        bit_and(const long long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+
+// The second group takes the scalar as the right operand.
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, V, unsigned>::type
+        bit_and(const any_vec<N,V>& a, const unsigned& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, V, unsigned long>::type
+        bit_and(const any_vec<N,V>& a, const unsigned long& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, V, unsigned long long>::type
+        bit_and(const any_vec<N,V>& a, const unsigned long long& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, V, int>::type
+        bit_and(const any_vec<N,V>& a, const int& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, V, long>::type
+        bit_and(const any_vec<N,V>& a, const long& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_and, V, long long>::type
+        bit_and(const any_vec<N,V>& a, const long long& b)
+{
+    return { { a.wrapped(), b } };
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 120 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/bit_andnot.h

@@ -0,0 +1,120 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_BIT_ANDNOT_H
+#define LIBSIMDPP_SIMDPP_CORE_BIT_ANDNOT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/bit_andnot.h>
+#include <simdpp/detail/expr/bit_andnot.h>
+#include <simdpp/core/detail/get_expr_bitwise.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Computes bitwise AND NOT of two integer or floating-point vectors.
+
+    @code
+    r0 = a0 & ~b0
+    ...
+    rN = aN & ~bN
+    @endcode
+
+    The function body performs no computation itself: it brace-initializes the
+    return type chosen by detail::get_expr_bitwise2_and from the two wrapped
+    operands (presumably an expression object that is evaluated later --
+    confirm against simdpp/detail/expr/bit_andnot.h).
+
+    @param a vector whose bits are kept where the matching bit of @a b is clear
+    @param b vector whose complement is ANDed with @a a
+
+    @todo: icost
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, V1, V2>::type
+        bit_andnot(const any_vec<N,V1>& a, const any_vec<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+/* Scalar-argument overloads. One overload exists per built-in integer type
+   (unsigned, unsigned long, unsigned long long, int, long, long long), first
+   with the scalar as the left operand and then with the scalar as the right
+   operand. Operand order is passed through unchanged, so the scalar keeps the
+   position (a or b) it was given in. Each body only brace-initializes the
+   return type chosen by detail::get_expr_bitwise2_and from the operands. */
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, unsigned, V>::type
+        bit_andnot(const unsigned& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, unsigned long, V>::type
+        bit_andnot(const unsigned long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, unsigned long long, V>::type
+        bit_andnot(const unsigned long long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, int, V>::type
+        bit_andnot(const int& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, long, V>::type
+        bit_andnot(const long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, long long, V>::type
+        bit_andnot(const long long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, V, unsigned>::type
+        bit_andnot(const any_vec<N,V>& a, const unsigned& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, V, unsigned long>::type
+        bit_andnot(const any_vec<N,V>& a, const unsigned long& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, V, unsigned long long>::type
+        bit_andnot(const any_vec<N,V>& a, const unsigned long long& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, V, int>::type
+        bit_andnot(const any_vec<N,V>& a, const int& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, V, long>::type
+        bit_andnot(const any_vec<N,V>& a, const long& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_andnot, V, long long>::type
+        bit_andnot(const any_vec<N,V>& a, const long long& b)
+{
+    return { { a.wrapped(), b } };
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 68 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/bit_not.h

@@ -0,0 +1,68 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_BIT_NOT_H
+#define LIBSIMDPP_SIMDPP_CORE_BIT_NOT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/bit_not.h>
+#include <simdpp/detail/expr/bit_not.h>
+#include <simdpp/detail/get_expr.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Returns the bitwise complement of an integer or floating-point vector.
+
+    @code
+    r = ~a
+    @endcode
+
+    The operand is evaluated immediately and converted to the type given by
+    detail::get_expr_nosign<V> before it is handed to
+    detail::insn::i_bit_not.
+
+    @param a vector to complement
+
+    @todo icost
+*/
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr<V, expr_bit_not<V>>::empty
+    bit_not(const any_vec<N,V>& a)
+{
+    using operand_type = typename detail::get_expr_nosign<V>::type;
+
+    operand_type value;
+    value = a.wrapped().eval();
+    return detail::insn::i_bit_not(value);
+}
+
+/* FIXME
+template<unsigned N, class E> SIMDPP_INL
+mask_int32<N, expr_bit_not<mask_int32<N,E>>> bit_not(mask_int32<N,E> a)
+{
+    return { { a } };
+}
+template<unsigned N, class E> SIMDPP_INL
+mask_int64<N, expr_bit_not<mask_int64<N,E>>> bit_not(mask_int64<N,E> a)
+{
+    return { { a } };
+}
+
+template<unsigned N, class E> SIMDPP_INL
+mask_float32<N, expr_bit_not<mask_float32<N,E>>> bit_not(mask_float32<N,E> a)
+{
+    return { { a } };
+}
+template<unsigned N, class E> SIMDPP_INL
+mask_float64<N, expr_bit_not<mask_float64<N,E>>> bit_not(mask_float64<N,E> a)
+{
+    return { { a } };
+}
+*/
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 122 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/bit_or.h

@@ -0,0 +1,122 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_BIT_OR_H
+#define LIBSIMDPP_SIMDPP_CORE_BIT_OR_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/bit_or.h>
+#include <simdpp/detail/expr/bit_or.h>
+#include <simdpp/core/detail/get_expr_bitwise.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Computes bitwise OR of integer vectors.
+
+    @code
+    r0 = a0 | b0
+    ...
+    rN = aN | bN
+    @endcode
+
+    The function body performs no computation itself: it brace-initializes the
+    return type chosen by detail::get_expr_bit_or from the two wrapped
+    operands.
+
+    NOTE(review): the summary above mentions integer vectors only, while the
+    sibling bit_and/bit_andnot/bit_xor docs also mention floating-point
+    vectors -- confirm whether floating-point operands are supported here.
+
+    @todo icost
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_bit_or<V1, V2>::type
+        bit_or(const any_vec<N,V1>& a, const any_vec<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+/* Scalar-argument overloads. One overload exists per built-in integer type
+   (unsigned, unsigned long, unsigned long long, int, long, long long), first
+   with the scalar as the left operand and then with the scalar as the right
+   operand. Each body only brace-initializes its return type from the
+   operands, in the order given. Note that the return type is selected through
+   detail::get_expr_bitwise2_and<expr_bit_or, ...> here, not through
+   detail::get_expr_bit_or as in the vector-vector overload. */
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, unsigned, V>::type
+        bit_or(const unsigned& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, unsigned long, V>::type
+        bit_or(const unsigned long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, unsigned long long, V>::type
+        bit_or(const unsigned long long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, int, V>::type
+        bit_or(const int& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, long, V>::type
+        bit_or(const long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, long long, V>::type
+        bit_or(const long long& a, const any_vec<N,V>& b)
+{
+    return { { a, b.wrapped() } };
+}
+
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, V, unsigned>::type
+        bit_or(const any_vec<N,V>& a, const unsigned& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, V, unsigned long>::type
+        bit_or(const any_vec<N,V>& a, const unsigned long& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, V, unsigned long long>::type
+        bit_or(const any_vec<N,V>& a, const unsigned long long& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, V, int>::type
+        bit_or(const any_vec<N,V>& a, const int& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, V, long>::type
+        bit_or(const any_vec<N,V>& a, const long& b)
+{
+    return { { a.wrapped(), b } };
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_bitwise2_and<expr_bit_or, V, long long>::type
+        bit_or(const any_vec<N,V>& a, const long long& b)
+{
+    return { { a.wrapped(), b } };
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+
+

+ 126 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/bit_xor.h

@@ -0,0 +1,126 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_BIT_XOR_H
+#define LIBSIMDPP_SIMDPP_CORE_BIT_XOR_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/get_expr.h>
+#include <simdpp/detail/insn/bit_xor.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Returns the element-wise bitwise XOR of two integer or floating-point
+    vectors.
+
+    @code
+    r0 = a0 ^ b0
+    ...
+    rN = aN ^ bN
+    @endcode
+
+    Both operands are evaluated immediately and converted to the common type
+    given by detail::get_expr2_nosign before detail::insn::i_bit_xor is
+    called.
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr2<V1, V2>::empty
+    bit_xor(const any_vec<N,V1>& a, const any_vec<N,V2>& b)
+{
+    using operand_type = typename detail::get_expr2_nosign<V1, V2>::type;
+
+    operand_type lhs;
+    operand_type rhs;
+    lhs = a.wrapped().eval();
+    rhs = b.wrapped().eval();
+    return detail::insn::i_bit_xor(lhs, rhs);
+}
+
+/* Scalar-argument overloads. Each one converts the scalar with
+   detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type> and
+   then calls bit_xor again with the converted value in the scalar's original
+   position. Overloads exist for unsigned, unsigned long, unsigned long long,
+   int, long and long long, first with the scalar on the left, then with the
+   scalar on the right. */
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<typename detail::get_expr_nomask<V>::type, V>::empty
+        bit_xor(const unsigned& a, const any_vec<N,V>& b)
+{
+    return bit_xor(detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(a), b);
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<typename detail::get_expr_nomask<V>::type, V>::empty
+        bit_xor(const unsigned long& a, const any_vec<N,V>& b)
+{
+    return bit_xor(detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(a), b);
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<typename detail::get_expr_nomask<V>::type, V>::empty
+        bit_xor(const unsigned long long& a, const any_vec<N,V>& b)
+{
+    return bit_xor(detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(a), b);
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<typename detail::get_expr_nomask<V>::type, V>::empty
+        bit_xor(const int& a, const any_vec<N,V>& b)
+{
+    return bit_xor(detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(a), b);
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<typename detail::get_expr_nomask<V>::type, V>::empty
+        bit_xor(const long& a, const any_vec<N,V>& b)
+{
+    return bit_xor(detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(a), b);
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<typename detail::get_expr_nomask<V>::type, V>::empty
+        bit_xor(const long long& a, const any_vec<N,V>& b)
+{
+    return bit_xor(detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(a), b);
+}
+
+
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<V, typename detail::get_expr_nomask<V>::type>::empty
+        bit_xor(const any_vec<N,V>& a, const unsigned& b)
+{
+    return bit_xor(a, detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(b));
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<V, typename detail::get_expr_nomask<V>::type>::empty
+        bit_xor(const any_vec<N,V>& a, const unsigned long& b)
+{
+    return bit_xor(a, detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(b));
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<V, typename detail::get_expr_nomask<V>::type>::empty
+        bit_xor(const any_vec<N,V>& a, const unsigned long long& b)
+{
+    return bit_xor(a, detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(b));
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<V, typename detail::get_expr_nomask<V>::type>::empty
+        bit_xor(const any_vec<N,V>& a, const int& b)
+{
+    return bit_xor(a, detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(b));
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<V, typename detail::get_expr_nomask<V>::type>::empty
+        bit_xor(const any_vec<N,V>& a, const long& b)
+{
+    return bit_xor(a, detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(b));
+}
+template<unsigned N, class V> SIMDPP_INL
+typename detail::get_expr2<V, typename detail::get_expr_nomask<V>::type>::empty
+        bit_xor(const any_vec<N,V>& a, const long long& b)
+{
+    return bit_xor(a, detail::make_const_bitwise<typename detail::get_expr_nomask<V>::type>(b));
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+
+

+ 193 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/blend.h

@@ -0,0 +1,193 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_BLEND_H
+#define LIBSIMDPP_SIMDPP_CORE_BLEND_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/blend.h>
+#include <simdpp/detail/get_expr.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+namespace detail {
+
+/*  Blend is a special function in that the type of the returned expression
+    depends on three arguments.
+
+    As always, we want to reduce the number of overloads that need to be
+    created in order to match a specific case of an expression tree containing
+    'blend' nodes. In this case we do the following in an attempt to achieve
+    that:
+
+     * the first and the second types have the same type as the expression
+        itself, except that signed integer vectors are converted to unsigned
+     * the third type is the same as the expression itself, except when it is
+        a mask. In that case it is converted to floating-point mask if the
+        expression is floating-point expression and to integer mask otherwise
+     * TODO
+
+     So, as a result, the following tuples of types will appear as the arguments
+     of the returned expression:
+
+      * mask_int8, mask_int8, mask_int8
+      * uint8, uint8, uint8
+      * uint8, uint8, mask_int8
+      * mask_int16, mask_int16, mask_int16
+      * uint16, uint16, uint16
+      * uint16, uint16, mask_int16
+      * mask_int32, mask_int32, mask_int32
+      * mask_float32, mask_float32, mask_float32
+      * uint32, uint32, uint32
+      * uint32, uint32, mask_int32
+      * float32, float32, float32
+      * float32, float32, mask_float32
+      * mask_int64, mask_int64, mask_int64
+      * mask_float64, mask_float64, mask_float64
+      * uint64, uint64, uint64
+      * uint64, uint64, mask_int64
+      * float64, float64, float64
+      * float64, float64, mask_float64
+
+    The type of the returned expression is governed by the usual rules
+    (see simdpp/types/tag.h)
+*/
+
+template<class V1, class V2, class V3>
+class get_expr_blend {
+    /* Compile-time helper that derives the types involved in a blend
+       expression. V1, V2 and V3 are the types of the three operands in
+       argument order. Everything is computed from the operands' size_tag,
+       type_tag and length_bytes constants. */
+    // (size_tag) the largest size tag among the three operands
+    static const unsigned size_tag_t1 = V1::size_tag > V2::size_tag ? V1::size_tag : V2::size_tag;
+    static const unsigned size_tag = size_tag_t1 > V3::size_tag ? size_tag_t1 : V3::size_tag;
+
+    // (type_tag_t2) type tag shared by the first two operands: the larger of
+    // their tags, replaced by SIMDPP_TAG_UINT when it is a mask tag but the size tags differ.
+    // Intended to equal get_expr2<V1,V2>::type::type_tag
+    static const unsigned type_tag_t1 = V1::type_tag > V2::type_tag ? V1::type_tag : V2::type_tag;
+    static const bool is_mask_op1 = type_tag_t1 == SIMDPP_TAG_MASK_INT ||
+                                    type_tag_t1 == SIMDPP_TAG_MASK_FLOAT;
+    static const unsigned type_tag_t2 = (is_mask_op1 && V1::size_tag != V2::size_tag)
+                                    ? SIMDPP_TAG_UINT : type_tag_t1;
+
+    // (type_tag) type tag of the whole expression: the same promotion rule is
+    // applied again, this time to type_tag_t2 and the third operand's tag,
+    // comparing the third operand's size tag against size_tag_t1.
+    // Intended to equal get_expr2<get_expr2<V1,V2>::type, V3>::type::type_tag
+    static const unsigned type_tag_t3 = type_tag_t2 > V3::type_tag ? type_tag_t2 : V3::type_tag;
+    static const bool is_mask_op2 = type_tag_t3 == SIMDPP_TAG_MASK_INT ||
+                                    type_tag_t3 == SIMDPP_TAG_MASK_FLOAT;
+    static const unsigned type_tag = (is_mask_op2 && V3::size_tag != size_tag_t1)
+                                    ? SIMDPP_TAG_UINT : type_tag_t3;
+
+    // operand types never use the signed tag: SIMDPP_TAG_INT becomes SIMDPP_TAG_UINT
+    static const unsigned v12_type_tag = type_tag == SIMDPP_TAG_INT ? SIMDPP_TAG_UINT : type_tag;
+
+
+    static const bool is_v3_mask = V3::type_tag == SIMDPP_TAG_MASK_INT ||
+                                   V3::type_tag == SIMDPP_TAG_MASK_FLOAT;
+    static const bool is_v12_float = v12_type_tag == SIMDPP_TAG_FLOAT ||
+                                     v12_type_tag == SIMDPP_TAG_MASK_FLOAT;
+
+    // (v3_type_tag) when the third operand is a mask whose size tag equals the
+    // expression's size tag it stays a mask (float mask for float expressions,
+    // integer mask otherwise); in every other case it takes v12_type_tag
+    static const unsigned v3_type_tag = (!is_v3_mask || size_tag != V3::size_tag) ? v12_type_tag :
+                                        is_v12_float ? SIMDPP_TAG_MASK_FLOAT :
+                                        SIMDPP_TAG_MASK_INT;
+
+    /* Results: the converted operand types (v1/v2/v3_final_type, all built
+       with V1::length_bytes and an empty expression slot) and `type`, the
+       type of the blend expression itself. */
+public:
+    using v1_final_type = typename type_of_tag<v12_type_tag + size_tag,
+                                               V1::length_bytes, void>::type;
+
+    using v2_final_type = typename type_of_tag<v12_type_tag + size_tag,
+                                               V1::length_bytes, void>::type;
+
+    using v3_final_type = typename type_of_tag<v3_type_tag + size_tag,
+                                               V1::length_bytes, void>::type;
+
+    using type = typename type_of_tag<type_tag + size_tag, V1::length_bytes,
+                                      expr_blend<V1, V2, V3>>::type;
+};
+
+} // namespace detail
+
+/** Composes a vector from two sources according to a mask. Each element within
+    the mask must have either all bits set or all bits unset.
+
+    @code
+    r0 = (mask0 == 0xff ) ? on0 : off0
+    ...
+    rN = (maskN == 0xff ) ? onN : offN
+    @endcode
+
+    (0xff above presumably stands for "all bits of the element set",
+    whatever the element width is.)
+
+    @param on   source of the elements whose mask element is set
+    @param off  source of the elements whose mask element is unset
+    @param mask per-element selector; see the requirement above
+
+    The function body only packs the three wrapped operands into the return
+    type chosen by detail::get_expr_blend; it performs no blending itself.
+
+    @todo icost
+
+    @par int16
+
+    @par 128-bit version:
+    @icost{SSE2-AVX, 3}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 6}
+    @icost{NEON, ALTIVEC, 2}
+
+    @par int32
+
+    @par 128-bit version:
+    @icost{SSE2-AVX, 3}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 6}
+    @icost{NEON, ALTIVEC, 2}
+
+    @par int64
+
+    @par 128-bit version:
+    @icost{SSE2-AVX, 3}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 6}
+    @icost{NEON, ALTIVEC, 2}
+
+    @par float32
+
+    @par 128-bit version:
+    @icost{SSE2-SSE4.1, 3}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 6}
+    @icost{NEON, ALTIVEC, 2}
+
+    @par float64
+
+    @par 128-bit version:
+    @icost{SSE2-SSE4.1, 3}
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 6}
+    @novec{NEON, ALTIVEC}
+*/
+template<unsigned N, class V1, class V2, class V3> SIMDPP_INL
+typename detail::get_expr_blend<V1, V2, V3>::type
+        blend(const any_vec<N,V1>& on, const any_vec<N,V2>& off,
+              const any_vec<N,V3>& mask)
+{
+    return { { on.wrapped(), off.wrapped(), mask.wrapped() } };
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 63 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/cache.h

@@ -0,0 +1,63 @@
+/*  Copyright (C) 2011-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CACHE_H
+#define LIBSIMDPP_SIMDPP_CACHE_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+#include <simdpp/setup_arch.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Issues a prefetch hint for data that is about to be read.
+
+    With SSE2 this calls _mm_prefetch with _MM_HINT_T0. On NEON, Altivec and
+    MSA it calls __builtin_prefetch when compiling with a GNU-compatible
+    compiler. In every other configuration the function does nothing.
+
+    @param ptr pointer to the data to prefetch
+*/
+template<class T>
+SIMDPP_INL void prefetch_read(const T* ptr)
+{
+#if SIMDPP_USE_SSE2
+    _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
+#elif (SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA) && __GNUC__
+    // Second argument 0 requests a prefetch for reading.
+    // on NEON results in PLD
+    // on Altivec results in DST
+    // on MSA results in PREF
+    __builtin_prefetch(ptr, 0);
+#endif
+    // Keeps ptr "used" in configurations where no branch above is compiled.
+    static_cast<void>(ptr);
+}
+
+/** Issues a prefetch hint for data that is about to be written.
+
+    With SSE2 this calls _mm_prefetch with _MM_HINT_T0, i.e. the same hint
+    prefetch_read uses. On NEON, Altivec and MSA it calls __builtin_prefetch
+    with the write flag when compiling with a GNU-compatible compiler. In
+    every other configuration the function does nothing.
+
+    @param ptr pointer to the data to prefetch
+*/
+template<class T>
+SIMDPP_INL void prefetch_write(const T* ptr)
+{
+#if SIMDPP_USE_SSE2
+    _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
+#elif (SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA) && __GNUC__
+    // Second argument 1 requests a prefetch for writing.
+    // on NEON results in PLDW
+    // on Altivec results in DSTST
+    // on MSA results in PREF
+    __builtin_prefetch(ptr, 1);
+#endif
+    // Keeps ptr "used" in configurations where no branch above is compiled.
+    static_cast<void>(ptr);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 104 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/cast.h

@@ -0,0 +1,104 @@
+/*  Copyright (C) 2011-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_CAST_H
+#define LIBSIMDPP_SIMDPP_CORE_CAST_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/setup_arch.h>
+#include <simdpp/detail/cast.h>
+#include <simdpp/types/traits.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+namespace detail {
+
+/* Selects how a mask-to-mask bit_cast is carried out; it is queried as
+   cast_mask_override<R,T> with R the destination and T the source type. The
+   primary template yields CAST_MASK_MEMCPY. On certain architectures
+   mask-mask conversions may need unmasking or remasking, so the
+   specializations below replace the value with CAST_MASK_UNMASK or
+   CAST_MASK_REMASK for specific mask pairs, guarded by the SIMDPP_USE_* and
+   SIMDPP_32_BITS conditions around them. */
+template<class R, class T> struct cast_mask_override { static const unsigned value = CAST_MASK_MEMCPY; };
+#if SIMDPP_USE_NEON_NO_FLT_SP
+template<unsigned N>
+struct cast_mask_override<mask_float32<N>, mask_int32<N>> { static const unsigned value = CAST_MASK_UNMASK; };
+template<unsigned N>
+struct cast_mask_override<mask_int32<N>, mask_float32<N>> { static const unsigned value = CAST_MASK_REMASK; };
+#endif
+#if SIMDPP_USE_NEON && SIMDPP_32_BITS
+template<unsigned N>
+struct cast_mask_override<mask_int64<N>, mask_float64<N>> { static const unsigned value = CAST_MASK_UNMASK; };
+template<unsigned N>
+struct cast_mask_override<mask_float64<N>, mask_int64<N>> { static const unsigned value = CAST_MASK_REMASK; };
+#endif
+#if SIMDPP_USE_VSX_206 && !SIMDPP_USE_VSX_207
+template<unsigned N>
+struct cast_mask_override<mask_int64<N>, mask_float64<N>> { static const unsigned value = CAST_MASK_REMASK; };
+template<unsigned N>
+struct cast_mask_override<mask_float64<N>, mask_int64<N>> { static const unsigned value = CAST_MASK_UNMASK; };
+#endif
+
+// Generic bit_cast implementation for distinct source (T) and destination (R)
+// types. The kind of conversion is worked out at compile time from the
+// is_vector / is_mask traits and the mask override table, and the work is
+// delegated to the matching detail::cast_wrapper specialization.
+template<class R, class T> SIMDPP_INL
+void bit_cast_impl(const T& t, R& r)
+{
+    const bool dst_is_vector = is_vector<R>::value;
+    const bool src_is_vector = is_vector<T>::value;
+
+    static_assert(dst_is_vector == src_is_vector,
+                  "bit_cast can't convert between vector and non-vector types");
+
+    const bool dst_is_mask = is_mask<R>::value;
+    const bool src_is_mask = is_mask<T>::value;
+    const unsigned mask_override = detail::cast_mask_override<R,T>::value;
+
+    const unsigned cast_kind =
+            (!src_is_vector && !dst_is_vector) ? CAST_TYPE_OTHER :
+            (!src_is_mask && !dst_is_mask) ? CAST_TYPE_VECTOR_TO_VECTOR :
+            (src_is_mask && !dst_is_mask) ? CAST_TYPE_MASK_TO_VECTOR :
+            (!src_is_mask && dst_is_mask) ? CAST_TYPE_VECTOR_TO_MASK :
+            // from here on both the source and the destination are masks
+            (mask_override == CAST_MASK_REMASK) ? CAST_TYPE_MASK_TO_MASK_REMASK :
+            (mask_override == CAST_MASK_UNMASK) ? CAST_TYPE_MASK_TO_MASK_UNMASK :
+                                                  CAST_TYPE_MASK_TO_MASK_BITWISE;
+
+    detail::cast_wrapper<cast_kind>::run(t, r);
+}
+
+template<class T> SIMDPP_INL
+void bit_cast_impl(const T& t, T& r)
+{
+    // Source and destination have the same type: a plain assignment is enough
+    r = t;
+}
+
+} // namespace detail
+
+/** Reinterprets a value of type T as a value of type R. No changes to the
+    stored values are performed.
+
+    Conversions between vector and non-vector types are not allowed.
+
+    Conversion from non-mask type to mask type is not allowed.
+
+    Conversion from mask type to a non-mask type is not a costless operation
+    because masks may have different logical and physical layout (e.g., in
+    some implementations one bit represents entire element in a vector).
+
+    Conversions between mask types are only allowed if the element size is
+    the same.
+
+    @tparam R destination type; has to be given explicitly
+    @param t value to convert
+    @return a default-initialized R that detail::bit_cast_impl has filled in
+*/
+template<class R, class T> SIMDPP_INL
+R bit_cast(const T& t)
+{
+    R result;
+    detail::bit_cast_impl(t, result);
+    return result;
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif

+ 173 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/cmp_eq.h

@@ -0,0 +1,173 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_CMP_EQ_H
+#define LIBSIMDPP_SIMDPP_CORE_CMP_EQ_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/cmp_eq.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Tests the 8-bit elements of two vectors for equality.
+
+    @code
+    r0 = (a0 == b0) ? 0xff : 0x0
+    ...
+    rN = (aN == bN) ? 0xff : 0x0
+    @endcode
+
+    Both operands are evaluated and converted to the common type given by
+    detail::get_expr2_nosign before they are compared.
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_int8<N,expr_empty> cmp_eq(const any_int8<N,V1>& a,
+                               const any_int8<N,V2>& b)
+{
+    using operand_type = typename detail::get_expr2_nosign<V1, V2>::type;
+
+    operand_type lhs;
+    operand_type rhs;
+    lhs = a.wrapped().eval();
+    rhs = b.wrapped().eval();
+    return detail::insn::i_cmp_eq(lhs, rhs);
+}
+
+// Presumably expands to the scalar-argument overloads for 8-bit vectors (see scalar_arg_impl.h)
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_eq, mask_int8, any_int8)
+
+/** Tests the 16-bit elements of two vectors for equality.
+
+    @code
+    r0 = (a0 == b0) ? 0xffff : 0x0
+    ...
+    rN = (aN == bN) ? 0xffff : 0x0
+    @endcode
+
+    Both operands are evaluated and converted to the common type given by
+    detail::get_expr2_nosign before they are compared.
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_int16<N,expr_empty> cmp_eq(const any_int16<N,V1>& a,
+                                const any_int16<N,V2>& b)
+{
+    using operand_type = typename detail::get_expr2_nosign<V1, V2>::type;
+
+    operand_type lhs;
+    operand_type rhs;
+    lhs = a.wrapped().eval();
+    rhs = b.wrapped().eval();
+    return detail::insn::i_cmp_eq(lhs, rhs);
+}
+
+// Presumably expands to the scalar-argument overloads for 16-bit vectors (see scalar_arg_impl.h)
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_eq, mask_int16, any_int16)
+
+/** Tests the 32-bit integer elements of two vectors for equality.
+
+    @code
+    r0 = (a0 == b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN == bN) ? 0xffffffff : 0x0
+    @endcode
+
+    Both operands are evaluated and converted to the common type given by
+    detail::get_expr2_nosign before they are compared.
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_int32<N,expr_empty> cmp_eq(const any_int32<N,V1>& a,
+                                const any_int32<N,V2>& b)
+{
+    using operand_type = typename detail::get_expr2_nosign<V1, V2>::type;
+
+    operand_type lhs;
+    operand_type rhs;
+    lhs = a.wrapped().eval();
+    rhs = b.wrapped().eval();
+    return detail::insn::i_cmp_eq(lhs, rhs);
+}
+
+// Presumably expands to the scalar-argument overloads for 32-bit integer vectors (see scalar_arg_impl.h)
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_eq, mask_int32, any_int32)
+
+/** Tests the 64-bit integer elements of two vectors for equality.
+
+    @code
+    r0 = (a0 == b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN == bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    Both operands are evaluated and converted to the common type given by
+    detail::get_expr2_nosign before they are compared.
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 5}
+    @icost{XOP, 1}
+    @icost{NEON, 3}
+    @icost{ALTIVEC, 3-4}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, AVX, 10}
+    @icost{XOP, SSE4.1, 2}
+    @icost{NEON, 6}
+    @icost{ALTIVEC, 6-7}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_int64<N,expr_empty> cmp_eq(const any_int64<N,V1>& a,
+                                const any_int64<N,V2>& b)
+{
+    using operand_type = typename detail::get_expr2_nosign<V1, V2>::type;
+
+    operand_type lhs;
+    operand_type rhs;
+    lhs = a.wrapped().eval();
+    rhs = b.wrapped().eval();
+    return detail::insn::i_cmp_eq(lhs, rhs);
+}
+
+// Presumably expands to the scalar-argument overloads for 64-bit integer vectors (see scalar_arg_impl.h)
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_eq, mask_int64, any_int64)
+
+/** Compares the 32-bit floating-point elements of two vectors for equality.
+    The vector length N is a template parameter, so this is not limited to
+    four-element vectors.
+
+    @code
+    r0 = (a0 == b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN == bN) ? 0xffffffff : 0x0
+    @endcode
+
+    Both operands are evaluated and passed directly to
+    detail::insn::i_cmp_eq; the result is a mask_float32 vector.
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_float32<N,expr_empty> cmp_eq(const any_float32<N,V1>& a,
+                                  const any_float32<N,V2>& b)
+{
+    return detail::insn::i_cmp_eq(a.wrapped().eval(), b.wrapped().eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_eq, mask_float32, any_float32)
+
+/** Compares the 64-bit floating-point elements of two vectors for equality.
+    The vector length N is a template parameter, so this is not limited to
+    two-element vectors.
+
+    @code
+    r0 = (a0 == b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN == bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    Both operands are evaluated and passed directly to
+    detail::insn::i_cmp_eq; the result is a mask_float64 vector.
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_float64<N,expr_empty> cmp_eq(const any_float64<N,V1>& a,
+                                  const any_float64<N,V2>& b)
+{
+    return detail::insn::i_cmp_eq(a.wrapped().eval(), b.wrapped().eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_eq, mask_float64, any_float64)
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 149 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/cmp_ge.h

@@ -0,0 +1,149 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_CMP_GE_H
+#define LIBSIMDPP_SIMDPP_CORE_CMP_GE_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/cmp_ge.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Performs an element-wise greater-than or equal comparison of two signed
+    8-bit integer vectors.
+
+    @code
+    r0 = (a0 >= b0) ? ~0x0 : 0x0
+    ...
+    rN = (aN >= bN) ? ~0x0 : 0x0
+    @endcode
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int8<N,expr_empty> cmp_ge(const int8<N,E1>& a,
+                               const int8<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_ge(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_ge, mask_int8, int8)
+
+/** Performs an element-wise greater-than or equal comparison of two unsigned
+    8-bit integer vectors.
+
+    @code
+    r0 = (a0 >= b0) ? ~0x0 : 0x0
+    ...
+    rN = (aN >= bN) ? ~0x0 : 0x0
+    @endcode
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int8<N,expr_empty> cmp_ge(const uint8<N,E1>& a,
+                               const uint8<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_ge(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_ge, mask_int8, uint8)
+
+/** Performs an element-wise greater-than or equal comparison of two signed
+    16-bit integer vectors.
+
+    @code
+    r0 = (a0 >= b0) ? ~0x0 : 0x0
+    ...
+    rN = (aN >= bN) ? ~0x0 : 0x0
+    @endcode
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int16<N,expr_empty> cmp_ge(const int16<N,E1>& a,
+                                const int16<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_ge(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_ge, mask_int16, int16)
+
+/** Performs an element-wise greater-than or equal comparison of two unsigned
+    16-bit integer vectors.
+
+    @code
+    r0 = (a0 >= b0) ? ~0x0 : 0x0
+    ...
+    rN = (aN >= bN) ? ~0x0 : 0x0
+    @endcode
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int16<N,expr_empty> cmp_ge(const uint16<N,E1>& a,
+                                const uint16<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_ge(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_ge, mask_int16, uint16)
+
+/** Performs an element-wise greater-than or equal comparison of two signed
+    32-bit integer vectors.
+
+    @code
+    r0 = (a0 >= b0) ? ~0x0 : 0x0
+    ...
+    rN = (aN >= bN) ? ~0x0 : 0x0
+    @endcode
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int32<N,expr_empty> cmp_ge(const int32<N,E1>& a,
+                                const int32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_ge(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_ge, mask_int32, int32)
+
+/** Performs an element-wise greater-than or equal comparison of two unsigned
+    32-bit integer vectors.
+
+    @code
+    r0 = (a0 >= b0) ? ~0x0 : 0x0
+    ...
+    rN = (aN >= bN) ? ~0x0 : 0x0
+    @endcode
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int32<N,expr_empty> cmp_ge(const uint32<N,E1>& a,
+                                const uint32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_ge(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_ge, mask_int32, uint32)
+
+/** Performs an element-wise greater-than or equal comparison of two signed
+    64-bit integer vectors.
+
+    @code
+    r0 = (a0 >= b0) ? ~0x0 : 0x0
+    ...
+    rN = (aN >= bN) ? ~0x0 : 0x0
+    @endcode
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int64<N,expr_empty> cmp_ge(const int64<N,E1>& a,
+                                const int64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_ge(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_ge, mask_int64, int64)
+
+/** Performs an element-wise greater-than or equal comparison of two unsigned
+    64-bit integer vectors.
+
+    @code
+    r0 = (a0 >= b0) ? ~0x0 : 0x0
+    ...
+    rN = (aN >= bN) ? ~0x0 : 0x0
+    @endcode
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int64<N,expr_empty> cmp_ge(const uint64<N,E1>& a,
+                                const uint64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_ge(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_ge, mask_int64, uint64)
+
+/** Performs an element-wise greater-than or equal comparison of two float32
+    vectors.
+
+    @code
+    r0 = (a0 >= b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN >= bN) ? 0xffffffff : 0x0
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_float32<N,expr_empty> cmp_ge(const float32<N,E1>& a,
+                                  const float32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_ge(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_ge, mask_float32, float32)
+
+/** Performs an element-wise greater-than or equal comparison of two float64
+    vectors.
+
+    @code
+    r0 = (a0 >= b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN >= bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_float64<N,expr_empty> cmp_ge(const float64<N,E1>& a,
+                                  const float64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_ge(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_ge, mask_float64, float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 248 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/cmp_gt.h

@@ -0,0 +1,248 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_CMP_GT_H
+#define LIBSIMDPP_SIMDPP_CORE_CMP_GT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/cmp_gt.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Performs an element-wise greater-than comparison of two signed 8-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 > b0) ? 0xff : 0x0
+    ...
+    rN = (aN > bN) ? 0xff : 0x0
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int8<N,expr_empty> cmp_gt(const int8<N,E1>& a,
+                               const int8<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_gt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_gt, mask_int8, int8)
+
+
+/** Performs an element-wise greater-than comparison of two unsigned 8-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 > b0) ? 0xff : 0x0
+    ...
+    rN = (aN > bN) ? 0xff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 3-4}
+    @icost{XOP, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 6-7}
+    @icost{AVX2, 3-4}
+    @icost{XOP, 2}
+    @icost{NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int8<N,expr_empty> cmp_gt(const uint8<N,E1>& a,
+                               const uint8<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_gt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_gt, mask_int8, uint8)
+
+/** Performs an element-wise greater-than comparison of two signed 16-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 > b0) ? 0xffff : 0x0
+    ...
+    rN = (aN > bN) ? 0xffff : 0x0
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int16<N,expr_empty> cmp_gt(const int16<N,E1>& a,
+                                const int16<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_gt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_gt, mask_int16, int16)
+
+/** Performs an element-wise greater-than comparison of two unsigned 16-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 > b0) ? 0xffff : 0x0
+    ...
+    rN = (aN > bN) ? 0xffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 3-4}
+    @icost{XOP, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 6-7}
+    @icost{AVX2, 3-4}
+    @icost{XOP, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int16<N,expr_empty> cmp_gt(const uint16<N,E1>& a,
+                                const uint16<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_gt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_gt, mask_int16, uint16)
+
+/** Performs an element-wise greater-than comparison of two signed 32-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 > b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN > bN) ? 0xffffffff : 0x0
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int32<N,expr_empty> cmp_gt(const int32<N,E1>& a,
+                                const int32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_gt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_gt, mask_int32, int32)
+
+/** Performs an element-wise greater-than comparison of two unsigned 32-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 > b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN > bN) ? 0xffffffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 3-4}
+    @icost{XOP, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 6-7}
+    @icost{AVX2, 3-4}
+    @icost{XOP, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int32<N,expr_empty> cmp_gt(const uint32<N,E1>& a,
+                                const uint32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_gt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_gt, mask_int32, uint32)
+
+/** Performs an element-wise greater-than comparison of two signed 64-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 > b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN > bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    Supported since AVX2, NEON64. Not supported on ALTIVEC.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int64<N,expr_empty> cmp_gt(const int64<N,E1>& a,
+                                const int64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_gt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_gt, mask_int64, int64)
+
+/** Performs an element-wise greater-than comparison of two unsigned 64-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 > b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN > bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    Supported since AVX2, NEON64. Not supported on ALTIVEC.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int64<N,expr_empty> cmp_gt(const uint64<N,E1>& a,
+                                const uint64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_gt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_gt, mask_int64, uint64)
+
+/** Compares the values of two float32 vectors for greater-than.
+
+    Both operands are evaluated and forwarded to detail::insn::i_cmp_gt; the
+    resulting mask is returned.
+
+    @code
+    r0 = (a0 > b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN > bN) ? 0xffffffff : 0x0
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+// SIMDPP_INL added: every other cmp_gt overload in this header carries it
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_float32<N,expr_empty> cmp_gt(const float32<N,E1>& a,
+                                  const float32<N,E2>& b)
+{
+    return detail::insn::i_cmp_gt(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_gt, mask_float32, float32)
+
+/** Performs an element-wise greater-than comparison of two float64 vectors.
+
+    @code
+    r0 = (a0 > b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN > bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_float64<N,expr_empty> cmp_gt(const float64<N,E1>& a,
+                                  const float64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_gt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_gt, mask_float64, float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 141 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/cmp_le.h

@@ -0,0 +1,141 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_CMP_LE_H
+#define LIBSIMDPP_SIMDPP_CORE_CMP_LE_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/cmp_le.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Compares the values of two signed 8-bit integer vectors for less-than or
+    equal.
+
+    Both operands are evaluated and forwarded to detail::insn::i_cmp_le; the
+    resulting mask_int8 vector is returned.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int8<N,expr_empty> cmp_le(const int8<N,E1>& a,
+                               const int8<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_le(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_le, mask_int8, int8)
+
+/** Compares the values of two unsigned 8-bit integer vectors for less-than or
+    equal.
+
+    Both operands are evaluated and forwarded to detail::insn::i_cmp_le; the
+    resulting mask_int8 vector is returned.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int8<N,expr_empty> cmp_le(const uint8<N,E1>& a,
+                               const uint8<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_le(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_le, mask_int8, uint8)
+
+/** Compares the values of two signed 16-bit integer vectors for less-than or
+    equal.
+
+    Both operands are evaluated and forwarded to detail::insn::i_cmp_le; the
+    resulting mask_int16 vector is returned.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int16<N,expr_empty> cmp_le(const int16<N,E1>& a,
+                                const int16<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_le(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_le, mask_int16, int16)
+
+/** Compares the values of two unsigned 16-bit integer vectors for less-than
+    or equal.
+
+    Both operands are evaluated and forwarded to detail::insn::i_cmp_le; the
+    resulting mask_int16 vector is returned.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int16<N,expr_empty> cmp_le(const uint16<N,E1>& a,
+                                const uint16<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_le(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_le, mask_int16, uint16)
+
+/** Compares the values of two signed 32-bit integer vectors for less-than or
+    equal.
+
+    Both operands are evaluated and forwarded to detail::insn::i_cmp_le; the
+    resulting mask_int32 vector is returned.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int32<N,expr_empty> cmp_le(const int32<N,E1>& a,
+                                const int32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_le(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_le, mask_int32, int32)
+
+/** Compares the values of two unsigned 32-bit integer vectors for less-than
+    or equal.
+
+    Both operands are evaluated and forwarded to detail::insn::i_cmp_le; the
+    resulting mask_int32 vector is returned.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int32<N,expr_empty> cmp_le(const uint32<N,E1>& a,
+                                const uint32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_le(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_le, mask_int32, uint32)
+
+/** Compares the values of two signed 64-bit integer vectors for less-than or
+    equal.
+
+    Both operands are evaluated and forwarded to detail::insn::i_cmp_le; the
+    resulting mask_int64 vector is returned.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int64<N,expr_empty> cmp_le(const int64<N,E1>& a,
+                                const int64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_le(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_le, mask_int64, int64)
+
+/** Compares the values of two unsigned 64-bit integer vectors for less-than
+    or equal.
+
+    Both operands are evaluated and forwarded to detail::insn::i_cmp_le; the
+    resulting mask_int64 vector is returned.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int64<N,expr_empty> cmp_le(const uint64<N,E1>& a,
+                                const uint64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_le(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_le, mask_int64, uint64)
+
+/** Performs an element-wise less-than or equal comparison of two float32
+    vectors.
+
+    @code
+    r0 = (a0 <= b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN <= bN) ? 0xffffffff : 0x0
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_float32<N,expr_empty> cmp_le(const float32<N,E1>& a,
+                                  const float32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_le(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_le, mask_float32, float32)
+
+/** Performs an element-wise less-than or equal comparison of two float64
+    vectors.
+
+    @code
+    r0 = (a0 <= b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN <= bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_float64<N,expr_empty> cmp_le(const float64<N,E1>& a,
+                                  const float64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_le(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_le, mask_float64, float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 246 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/cmp_lt.h

@@ -0,0 +1,246 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_CMP_LT_H
+#define LIBSIMDPP_SIMDPP_CORE_CMP_LT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/cmp_lt.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Performs an element-wise less-than comparison of two signed 8-bit integer
+    vectors.
+
+    @code
+    r0 = (a0 < b0) ? 0xff : 0x0
+    ...
+    rN = (aN < bN) ? 0xff : 0x0
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int8<N,expr_empty> cmp_lt(const int8<N,E1>& a,
+                               const int8<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_lt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_lt, mask_int8, int8)
+
+/** Performs an element-wise less-than comparison of two unsigned 8-bit integer
+    vectors.
+
+    @code
+    r0 = (a0 < b0) ? 0xff : 0x0
+    ...
+    rN = (aN < bN) ? 0xff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 3-4}
+    @icost{XOP, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 6-7}
+    @icost{AVX2, 3-4}
+    @icost{XOP, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int8<N,expr_empty> cmp_lt(const uint8<N,E1>& a,
+                               const uint8<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_lt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_lt, mask_int8, uint8)
+
+/** Performs an element-wise less-than comparison of two signed 16-bit integer
+    vectors.
+
+    @code
+    r0 = (a0 < b0) ? 0xffff : 0x0
+    ...
+    rN = (aN < bN) ? 0xffff : 0x0
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int16<N,expr_empty> cmp_lt(const int16<N,E1>& a,
+                                const int16<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_lt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_lt, mask_int16, int16)
+
+/** Performs an element-wise less-than comparison of two unsigned 16-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 < b0) ? 0xffff : 0x0
+    ...
+    rN = (aN < bN) ? 0xffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 3-4}
+    @icost{XOP, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 6-7}
+    @icost{AVX2, 3-4}
+    @icost{XOP, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int16<N,expr_empty> cmp_lt(const uint16<N,E1>& a,
+                                const uint16<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_lt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_lt, mask_int16, uint16)
+
+/** Performs an element-wise less-than comparison of two signed 32-bit integer
+    vectors.
+
+    @code
+    r0 = (a0 < b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN < bN) ? 0xffffffff : 0x0
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int32<N,expr_empty> cmp_lt(const int32<N,E1>& a,
+                                const int32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_lt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_lt, mask_int32, int32)
+
+/** Performs an element-wise less-than comparison of two unsigned 32-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 < b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN < bN) ? 0xffffffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 3-4}
+    @icost{XOP, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 6-7}
+    @icost{AVX2, 3-4}
+    @icost{XOP, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int32<N,expr_empty> cmp_lt(const uint32<N,E1>& a,
+                                const uint32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_lt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_lt, mask_int32, uint32)
+
+/** Performs an element-wise less-than comparison of two signed 64-bit integer
+    vectors.
+
+    @code
+    r0 = (a0 < b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN < bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    Supported since AVX2, NEON64. Not supported on ALTIVEC.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int64<N,expr_empty> cmp_lt(const int64<N,E1>& a,
+                                const int64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_lt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_lt, mask_int64, int64)
+
+/** Performs an element-wise less-than comparison of two unsigned 64-bit
+    integer vectors.
+
+    @code
+    r0 = (a0 < b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN < bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    Supported since AVX2, NEON64. Not supported on ALTIVEC.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_int64<N,expr_empty> cmp_lt(const uint64<N,E1>& a,
+                                const uint64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_lt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_lt, mask_int64, uint64)
+
+/** Performs an element-wise less-than comparison of two float32 vectors.
+
+    @code
+    r0 = (a0 < b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN < bN) ? 0xffffffff : 0x0
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_float32<N,expr_empty> cmp_lt(const float32<N,E1>& a,
+                                  const float32<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_lt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_lt, mask_float32, float32)
+
+/** Performs an element-wise less-than comparison of two float64 vectors.
+
+    @code
+    r0 = (a0 < b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN < bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_float64<N,expr_empty> cmp_lt(const float64<N,E1>& a,
+                                  const float64<N,E2>& b)
+{
+    const auto lhs = a.eval();
+    const auto rhs = b.eval();
+    return detail::insn::i_cmp_lt(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(cmp_lt, mask_float64, float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 196 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/cmp_neq.h

@@ -0,0 +1,196 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_CMP_NEQ_H
+#define LIBSIMDPP_SIMDPP_CORE_CMP_NEQ_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/cmp_neq.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Performs an element-wise inequality comparison of two 8-bit integer
+    vectors.
+
+    @code
+    r0 = (a0 != b0) ? 0xff : 0x0
+    ...
+    rN = (aN != bN) ? 0xff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+    @icost{XOP, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 4}
+    @icost{AVX2, 2}
+    @icost{XOP, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_int8<N,expr_empty> cmp_neq(const any_int8<N,V1>& a,
+                                const any_int8<N,V2>& b)
+{
+    // Both operands are evaluated and converted to the operand type chosen by
+    // detail::get_expr2_nosign for the pair (V1, V2) before being compared.
+    using operand_type = typename detail::get_expr2_nosign<V1, V2>::type;
+    operand_type lhs, rhs;
+    lhs = a.wrapped().eval();
+    rhs = b.wrapped().eval();
+    return detail::insn::i_cmp_neq(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_neq, mask_int8, any_int8)
+
+/** Performs an element-wise inequality comparison of two 16-bit integer
+    vectors.
+
+    @code
+    r0 = (a0 != b0) ? 0xffff : 0x0
+    ...
+    rN = (aN != bN) ? 0xffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+    @icost{XOP, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 4}
+    @icost{AVX2, 2}
+    @icost{XOP, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_int16<N,expr_empty> cmp_neq(const any_int16<N,V1>& a,
+                                 const any_int16<N,V2>& b)
+{
+    // Both operands are evaluated and converted to the operand type chosen by
+    // detail::get_expr2_nosign for the pair (V1, V2) before being compared.
+    using operand_type = typename detail::get_expr2_nosign<V1, V2>::type;
+    operand_type lhs, rhs;
+    lhs = a.wrapped().eval();
+    rhs = b.wrapped().eval();
+    return detail::insn::i_cmp_neq(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_neq, mask_int16, any_int16)
+
+/** Performs an element-wise inequality comparison of two 32-bit integer
+    vectors.
+
+    @code
+    r0 = (a0 != b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN != bN) ? 0xffffffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+    @icost{XOP, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 4}
+    @icost{AVX2, 2}
+    @icost{XOP, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_int32<N,expr_empty> cmp_neq(const any_int32<N,V1>& a,
+                                 const any_int32<N,V2>& b)
+{
+    // Both operands are evaluated and converted to the operand type chosen by
+    // detail::get_expr2_nosign for the pair (V1, V2) before being compared.
+    using operand_type = typename detail::get_expr2_nosign<V1, V2>::type;
+    operand_type lhs, rhs;
+    lhs = a.wrapped().eval();
+    rhs = b.wrapped().eval();
+    return detail::insn::i_cmp_neq(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_neq, mask_int32, any_int32)
+
+/** Performs an element-wise inequality comparison of two 64-bit integer
+    vectors.
+
+    @code
+    r0 = (a0 != b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN != bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 5}
+    @icost{SSE4.1, AVX, 2}
+    @icost{XOP, 1}
+    @icost{NEON, 4}
+    @icost{ALTIVEC, 3-5}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, AVX, 10}
+    @icost{SSE4.1, 4}
+    @icost{AVX2, XOP, 2}
+    @icost{NEON, 8}
+    @icost{ALTIVEC, 6-8}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_int64<N,expr_empty> cmp_neq(const any_int64<N,V1>& a,
+                                 const any_int64<N,V2>& b)
+{
+    // Both operands are evaluated and converted to the operand type chosen by
+    // detail::get_expr2_nosign for the pair (V1, V2) before being compared.
+    using operand_type = typename detail::get_expr2_nosign<V1, V2>::type;
+    operand_type lhs, rhs;
+    lhs = a.wrapped().eval();
+    rhs = b.wrapped().eval();
+    return detail::insn::i_cmp_neq(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_neq, mask_int64, any_int64)
+
+/** Performs an element-wise inequality comparison of two float32 vectors.
+
+    @code
+    r0 = (a0 != b0) ? 0xffffffff : 0x0
+    ...
+    rN = (aN != bN) ? 0xffffffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @icost{NEON, ALTIVEC, 2}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @icost{NEON, ALTIVEC, 4}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_float32<N,expr_empty> cmp_neq(const any_float32<N,V1>& a,
+                                   const any_float32<N,V2>& b)
+{
+    // Evaluate both wrapped operands, then dispatch to the implementation
+    const auto lhs = a.wrapped().eval();
+    const auto rhs = b.wrapped().eval();
+    return detail::insn::i_cmp_neq(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_neq, mask_float32, any_float32)
+
+/** Performs an element-wise inequality comparison of two float64 vectors.
+
+    @code
+    r0 = (a0 != b0) ? 0xffffffffffffffff : 0x0
+    ...
+    rN = (aN != bN) ? 0xffffffffffffffff : 0x0
+    @endcode
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+mask_float64<N,expr_empty> cmp_neq(const any_float64<N,V1>& a,
+                                   const any_float64<N,V2>& b)
+{
+    // Evaluate both wrapped operands, then dispatch to the implementation
+    const auto lhs = a.wrapped().eval();
+    const auto rhs = b.wrapped().eval();
+    return detail::insn::i_cmp_neq(lhs, rhs);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(cmp_neq, mask_float64, any_float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 97 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/combine.h

@@ -0,0 +1,97 @@
+/*  Copyright (C) 2011-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_COMBINE_H
+#define LIBSIMDPP_SIMDPP_CORE_COMBINE_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/combine.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Joins two vectors into a single vector that is twice as large. This is
+    useful when the ISA supports several vector sizes and some operations are
+    performed on vectors narrower than the widest native vector.
+
+    For example, on AVX, two __m128 vectors can be combined into a __m256
+    vector.
+
+    @todo icost
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint8<N*2> combine(const uint8<N,E1>& a1, const uint8<N,E2>& a2)
+{
+    using result_type = uint8<N*2>;
+    return detail::insn::i_combine<result_type>(a1.eval(), a2.eval());
+}
+
+/// Combines two uint16 vectors of N elements into one uint16 vector of N*2
+/// elements.
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint16<N*2> combine(const uint16<N,E1>& a1, const uint16<N,E2>& a2)
+{
+    using result_type = uint16<N*2>;
+    return detail::insn::i_combine<result_type>(a1.eval(), a2.eval());
+}
+
+/// Combines two uint32 vectors of N elements into one uint32 vector of N*2
+/// elements.
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint32<N*2> combine(const uint32<N,E1>& a1, const uint32<N,E2>& a2)
+{
+    using result_type = uint32<N*2>;
+    return detail::insn::i_combine<result_type>(a1.eval(), a2.eval());
+}
+
+/// Combines two uint64 vectors of N elements into one uint64 vector of N*2
+/// elements.
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint64<N*2> combine(const uint64<N,E1>& a1, const uint64<N,E2>& a2)
+{
+    using result_type = uint64<N*2>;
+    return detail::insn::i_combine<result_type>(a1.eval(), a2.eval());
+}
+
+/// Combines two int8 vectors of N elements into one int8 vector of N*2
+/// elements. The operands are converted to uint8<N> and combined through the
+/// uint8 implementation; the result is converted to int8<N*2> on return.
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int8<N*2> combine(const int8<N,E1>& a1, const int8<N,E2>& a2)
+{
+    const uint8<N> first(a1.eval());
+    const uint8<N> second(a2.eval());
+    return detail::insn::i_combine<uint8<N*2>>(first, second);
+}
+
+/// Combines two int16 vectors of N elements into one int16 vector of N*2
+/// elements. The operands are converted to uint16<N> and combined through the
+/// uint16 implementation; the result is converted to int16<N*2> on return.
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int16<N*2> combine(const int16<N,E1>& a1, const int16<N,E2>& a2)
+{
+    const uint16<N> first(a1.eval());
+    const uint16<N> second(a2.eval());
+    return detail::insn::i_combine<uint16<N*2>>(first, second);
+}
+
+/// Combines two int32 vectors of N elements into one int32 vector of N*2
+/// elements. The operands are converted to uint32<N> and combined through the
+/// uint32 implementation; the result is converted to int32<N*2> on return.
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int32<N*2> combine(const int32<N,E1>& a1, const int32<N,E2>& a2)
+{
+    const uint32<N> first(a1.eval());
+    const uint32<N> second(a2.eval());
+    return detail::insn::i_combine<uint32<N*2>>(first, second);
+}
+
+/// Combines two int64 vectors of N elements into one int64 vector of N*2
+/// elements. The operands are converted to uint64<N> and combined through the
+/// uint64 implementation; the result is converted to int64<N*2> on return.
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int64<N*2> combine(const int64<N,E1>& a1, const int64<N,E2>& a2)
+{
+    const uint64<N> first(a1.eval());
+    const uint64<N> second(a2.eval());
+    return detail::insn::i_combine<uint64<N*2>>(first, second);
+}
+
+/// Combines two float32 vectors of N elements into one float32 vector of N*2
+/// elements.
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float32<N*2> combine(const float32<N,E1>& a1, const float32<N,E2>& a2)
+{
+    using result_type = float32<N*2>;
+    return detail::insn::i_combine<result_type>(a1.eval(), a2.eval());
+}
+
+/// Combines two float64 vectors of N elements into one float64 vector of N*2
+/// elements.
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float64<N*2> combine(const float64<N,E1>& a1, const float64<N,E2>& a2)
+{
+    using result_type = float64<N*2>;
+    return detail::insn::i_combine<result_type>(a1.eval(), a2.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif

+ 169 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/detail/get_expr_bitwise.h

@@ -0,0 +1,169 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_DETAIL_GET_EXPR_BITWISE_H
+#define LIBSIMDPP_SIMDPP_CORE_DETAIL_GET_EXPR_BITWISE_H
+
+#include <simdpp/detail/get_expr.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+namespace detail {
+
+
+/*  We want to reduce the number of overloads that need to be created in order
+    to match a specific case of an expression tree containing 'bit_and'
+    or 'bit_andnot' nodes. The following "optimizations"
+    are performed:
+
+      * If the parameter types have different element sizes then both
+        expression arguments have the same type as the expression itself, except
+        that signed vectors are converted to unsigned vectors.
+      * Otherwise if the expression is of a mask type then both types are the
+        same as the expression itself.
+      * Otherwise both types have the same type as the expression itself, except
+        that signed vectors are converted to unsigned vectors and if the second
+        type is a mask type then it is converted to floating-point mask if the
+        expression is floating-point expression and to integer mask otherwise.
+
+     So, as a result, the following tuples of types will appear as the arguments
+     of the returned expression:
+
+      * mask_int8, mask_int8
+      * uint8, mask_int8
+      * uint8, uint8
+      * mask_int16, mask_int16
+      * uint16, mask_int16
+      * uint16, uint16
+      * mask_int32, mask_int32
+      * uint32, mask_int32
+      * uint32, uint32
+      * mask_int64, mask_int64
+      * uint64, mask_int64
+      * uint64, uint64
+      * mask_float32, mask_float32
+      * float32, mask_float32
+      * float32, float32
+      * mask_float64, mask_float64
+      * float64, mask_float64
+      * float64, float64
+
+    The type of the returned expression is governed by the usual rules
+    (see simdpp/types/tag.h)
+*/
+/*  Tag computation used by get_expr_bitwise2_and. Starting from the tags
+    reported by expr2_maybe_scalar_tags<V1, V2> it derives the size tag and
+    type tag of the resulting expression (size_tag, type_tag) and the two
+    argument types (v1_final_type, v2_final_type), both of which are looked up
+    through type_of_tag with a void expression parameter.
+*/
+template<class V1, class V2>
+struct get_expr_bitwise2_and_impl {
+    using tags = expr2_maybe_scalar_tags<V1, V2>;
+
+    // (size_tag) get the size tag of the resulting expression
+    static const unsigned size_tag = tags::v1_size_tag > tags::v2_size_tag
+                                    ? tags::v1_size_tag : tags::v2_size_tag; // the larger of the two size tags
+
+    // (type_tag) get the type tag of the expression. We compute it in the same
+    // way get_expr2 computes them, i.e.
+    // type_tag == get_expr2<V1,V2>::type::type_tag
+    static const unsigned type_tag_t1 = tags::v1_type_tag > tags::v2_type_tag
+                                    ? tags::v1_type_tag : tags::v2_type_tag; // the larger of the two type tags
+    static const bool is_mask_op1 = type_tag_t1 == SIMDPP_TAG_MASK_INT ||
+                                    type_tag_t1 == SIMDPP_TAG_MASK_FLOAT;
+    static const unsigned type_tag = (is_mask_op1 && tags::v1_size_tag != tags::v2_size_tag)
+                                    ? SIMDPP_TAG_UINT : type_tag_t1; // a mask result with differing size tags falls back to uint
+
+    // strip signed integer types
+    static const unsigned v1_type_tag = type_tag == SIMDPP_TAG_INT ? SIMDPP_TAG_UINT : type_tag;
+
+
+    static const bool is_v2_mask = tags::v2_type_tag == SIMDPP_TAG_MASK_INT ||
+                                   tags::v2_type_tag == SIMDPP_TAG_MASK_FLOAT;
+    static const bool is_v1_float = type_tag == SIMDPP_TAG_FLOAT ||
+                                     type_tag == SIMDPP_TAG_MASK_FLOAT; // tests the expression's type_tag, not V1's own tag
+
+    // if second parameter is a mask, then:
+    //    - convert the mask to float mask if the expression is float
+    //    - convert the mask to integer mask otherwise
+    static const unsigned v2_type_tag = (!is_v2_mask) ? v1_type_tag :
+                                        is_v1_float ? SIMDPP_TAG_MASK_FLOAT :
+                                        SIMDPP_TAG_MASK_INT;
+
+    using v1_final_type = typename type_of_tag<v1_type_tag + size_tag,
+                                               tags::length_bytes, void>::type;
+    using v2_final_type = typename type_of_tag<v2_type_tag + size_tag,
+                                               tags::length_bytes, void>::type;
+};
+/*  Selects the result type of a binary bitwise expression E<V1, V2>: the type
+    is looked up through type_of_tag using the type tag, size tag and byte
+    length computed by get_expr_bitwise2_and_impl<V1, V2>, with E<V1, V2>
+    passed as the expression parameter.
+*/
+template<template<class, class> class E, class V1, class V2>
+struct get_expr_bitwise2_and {
+    using impl = get_expr_bitwise2_and_impl<V1, V2>;
+    using type = typename type_of_tag<impl::type_tag + impl::size_tag,
+                                      impl::tags::length_bytes,
+                                      E<V1, V2>>::type;
+};
+
+
+/*  The case with bit_or is similar to bit_and and bit_andnot except that the
+    expression arguments either both remain masks or neither does.
+
+      * Both expression arguments have the same type as the expression itself,
+        except that signed vectors are converted to unsigned vectors.
+
+     So, as a result, the following tuples of types will appear as the arguments
+     of the returned expression:
+
+      * mask_int8, mask_int8
+      * uint8, uint8
+      * mask_int16, mask_int16
+      * uint16, uint16
+      * mask_int32, mask_int32
+      * uint32, uint32
+      * mask_int64, mask_int64
+      * uint64, uint64
+      * mask_float32, mask_float32
+      * float32, float32
+      * mask_float64, mask_float64
+      * float64, float64
+
+    The type of the returned expression is governed by the usual rules
+    (see simdpp/types/tag.h)
+*/
+/*  Computes the types involved in a bit_or expression of V1 and V2. The tag
+    values are private; the public interface consists of v1_final_type and
+    v2_final_type (the argument types, always identical to each other) and
+    type (the result type carrying expr_bit_or<V1, V2>). Tags are read directly
+    from V1 and V2.
+*/
+template<class V1, class V2>
+class get_expr_bit_or {
+
+    // (size_tag) get the size tag of the resulting expression
+    static const unsigned size_tag = V1::size_tag > V2::size_tag ? V1::size_tag : V2::size_tag;
+
+    // (type_tag) get the type tag of the expression. We compute it in the same
+    // way get_expr2 computes them, i.e.
+    // type_tag == get_expr2<V1,V2>::type::type_tag
+    static const unsigned type_tag_t1 = V1::type_tag > V2::type_tag ? V1::type_tag : V2::type_tag;
+    static const bool is_mask_op1 = type_tag_t1 == SIMDPP_TAG_MASK_INT ||
+                                    type_tag_t1 == SIMDPP_TAG_MASK_FLOAT;
+    static const unsigned type_tag = (is_mask_op1 && V1::size_tag != V2::size_tag)
+                                    ? SIMDPP_TAG_UINT : type_tag_t1; // a mask result with differing size tags falls back to uint
+
+    // strip signed integer types
+    static const unsigned v12_type_tag = type_tag == SIMDPP_TAG_INT ? SIMDPP_TAG_UINT : type_tag;
+
+
+public:
+    using v1_final_type = typename type_of_tag<v12_type_tag + size_tag,
+                                               V1::length_bytes, void>::type;
+    using v2_final_type = typename type_of_tag<v12_type_tag + size_tag,
+                                               V1::length_bytes, void>::type; // NOTE(review): V1::length_bytes is used here too - presumably V1 and V2 always share it; confirm
+
+    using type = typename type_of_tag<type_tag + size_tag, V1::length_bytes,
+                                      expr_bit_or<V1, V2>>::type;
+};
+
+
+
+} // namespace detail
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif

+ 233 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/detail/get_expr_uint.h

@@ -0,0 +1,233 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_DETAIL_GET_EXPR_UINT_H
+#define LIBSIMDPP_SIMDPP_CORE_DETAIL_GET_EXPR_UINT_H
+
+#include <simdpp/detail/get_expr.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+namespace detail {
+
+
+/*  We want to reduce the number of overloads that need to be created in order
+    to match a specific case of an expression tree containing various integer
+    operation nodes, such as add(int), mul_lo(int), etc. For a particular
+    vector size each of these operations is equivalent regardless of the
+    argument types. Thus we simply convert the arguments of the expression to
+    uint expressions of certain configuration.
+
+    As a result, the following tuples of types will appear as the arguments
+     of the returned expression:
+
+      * uint8, uint8
+      * uint16, uint16
+      * uint32, uint32
+      * uint64, uint64
+*/
+/*  Collects the type tag and size tag of both arguments of a binary
+    expression, plus a common length_bytes. This primary template reads every
+    value directly from V1 and V2; length_bytes is taken from V1.
+*/
+template<class V1, class V2>
+struct expr2_uint_maybe_scalar_tags {
+    static const unsigned v1_type_tag = V1::type_tag;
+    static const unsigned v1_size_tag = V1::size_tag;
+    static const unsigned v2_type_tag = V2::type_tag;
+    static const unsigned v2_size_tag = V2::size_tag;
+    static const unsigned length_bytes = V1::length_bytes;
+};
+/*  Specializations for a built-in arithmetic type as the FIRST argument. The
+    scalar has no tags of its own, so v1_size_tag and length_bytes are taken
+    from V2, while v1_type_tag is SIMDPP_TAG_INT for int, long and long long
+    and SIMDPP_TAG_UINT for their unsigned counterparts.
+*/
+template<class V2>
+struct expr2_uint_maybe_scalar_tags<int, V2> {
+    static const unsigned v1_type_tag = SIMDPP_TAG_INT;
+    static const unsigned v1_size_tag = V2::size_tag;
+    static const unsigned v2_type_tag = V2::type_tag;
+    static const unsigned v2_size_tag = V2::size_tag;
+    static const unsigned length_bytes = V2::length_bytes;
+};
+
+template<class V2>
+struct expr2_uint_maybe_scalar_tags<long, V2> {
+    static const unsigned v1_type_tag = SIMDPP_TAG_INT;
+    static const unsigned v1_size_tag = V2::size_tag;
+    static const unsigned v2_type_tag = V2::type_tag;
+    static const unsigned v2_size_tag = V2::size_tag;
+    static const unsigned length_bytes = V2::length_bytes;
+};
+
+template<class V2>
+struct expr2_uint_maybe_scalar_tags<long long, V2> {
+    static const unsigned v1_type_tag = SIMDPP_TAG_INT;
+    static const unsigned v1_size_tag = V2::size_tag;
+    static const unsigned v2_type_tag = V2::type_tag;
+    static const unsigned v2_size_tag = V2::size_tag;
+    static const unsigned length_bytes = V2::length_bytes;
+};
+
+template<class V2>
+struct expr2_uint_maybe_scalar_tags<unsigned, V2> {
+    static const unsigned v1_type_tag = SIMDPP_TAG_UINT;
+    static const unsigned v1_size_tag = V2::size_tag;
+    static const unsigned v2_type_tag = V2::type_tag;
+    static const unsigned v2_size_tag = V2::size_tag;
+    static const unsigned length_bytes = V2::length_bytes;
+};
+
+template<class V2>
+struct expr2_uint_maybe_scalar_tags<unsigned long, V2> {
+    static const unsigned v1_type_tag = SIMDPP_TAG_UINT;
+    static const unsigned v1_size_tag = V2::size_tag;
+    static const unsigned v2_type_tag = V2::type_tag;
+    static const unsigned v2_size_tag = V2::size_tag;
+    static const unsigned length_bytes = V2::length_bytes;
+};
+
+template<class V2>
+struct expr2_uint_maybe_scalar_tags<unsigned long long, V2> {
+    static const unsigned v1_type_tag = SIMDPP_TAG_UINT;
+    static const unsigned v1_size_tag = V2::size_tag;
+    static const unsigned v2_type_tag = V2::type_tag;
+    static const unsigned v2_size_tag = V2::size_tag;
+    static const unsigned length_bytes = V2::length_bytes;
+};
+
+template<class V2>
+struct expr2_uint_maybe_scalar_tags<float, V2> {
+    static const unsigned v1_type_tag = SIMDPP_TAG_INT; // NOTE(review): float is tagged as INT - presumably because this trait only serves integer operations; confirm
+    static const unsigned v1_size_tag = V2::size_tag;
+    static const unsigned v2_type_tag = V2::type_tag;
+    static const unsigned v2_size_tag = V2::size_tag;
+    static const unsigned length_bytes = V2::length_bytes;
+};
+
+template<class V2>
+struct expr2_uint_maybe_scalar_tags<double, V2> {
+    static const unsigned v1_type_tag = SIMDPP_TAG_INT; // NOTE(review): double is tagged as INT as well - confirm this is intended
+    static const unsigned v1_size_tag = V2::size_tag;
+    static const unsigned v2_type_tag = V2::type_tag;
+    static const unsigned v2_size_tag = V2::size_tag;
+    static const unsigned length_bytes = V2::length_bytes;
+};
+/*  Specializations for a built-in arithmetic type as the SECOND argument. The
+    scalar has no tags of its own, so v2_size_tag and length_bytes are taken
+    from V1, while v2_type_tag is SIMDPP_TAG_INT for int, long and long long
+    and SIMDPP_TAG_UINT for their unsigned counterparts.
+*/
+
+template<class V1>
+struct expr2_uint_maybe_scalar_tags<V1, int> {
+    static const unsigned v1_type_tag = V1::type_tag;
+    static const unsigned v1_size_tag = V1::size_tag;
+    static const unsigned v2_type_tag = SIMDPP_TAG_INT;
+    static const unsigned v2_size_tag = V1::size_tag;
+    static const unsigned length_bytes = V1::length_bytes;
+};
+
+template<class V1>
+struct expr2_uint_maybe_scalar_tags<V1, long> {
+    static const unsigned v1_type_tag = V1::type_tag;
+    static const unsigned v1_size_tag = V1::size_tag;
+    static const unsigned v2_type_tag = SIMDPP_TAG_INT;
+    static const unsigned v2_size_tag = V1::size_tag;
+    static const unsigned length_bytes = V1::length_bytes;
+};
+
+template<class V1>
+struct expr2_uint_maybe_scalar_tags<V1, long long> {
+    static const unsigned v1_type_tag = V1::type_tag;
+    static const unsigned v1_size_tag = V1::size_tag;
+    static const unsigned v2_type_tag = SIMDPP_TAG_INT;
+    static const unsigned v2_size_tag = V1::size_tag;
+    static const unsigned length_bytes = V1::length_bytes;
+};
+
+template<class V1>
+struct expr2_uint_maybe_scalar_tags<V1, unsigned> {
+    static const unsigned v1_type_tag = V1::type_tag;
+    static const unsigned v1_size_tag = V1::size_tag;
+    static const unsigned v2_type_tag = SIMDPP_TAG_UINT;
+    static const unsigned v2_size_tag = V1::size_tag;
+    static const unsigned length_bytes = V1::length_bytes;
+};
+
+template<class V1>
+struct expr2_uint_maybe_scalar_tags<V1, unsigned long> {
+    static const unsigned v1_type_tag = V1::type_tag;
+    static const unsigned v1_size_tag = V1::size_tag;
+    static const unsigned v2_type_tag = SIMDPP_TAG_UINT;
+    static const unsigned v2_size_tag = V1::size_tag;
+    static const unsigned length_bytes = V1::length_bytes;
+};
+
+template<class V1>
+struct expr2_uint_maybe_scalar_tags<V1, unsigned long long> {
+    static const unsigned v1_type_tag = V1::type_tag;
+    static const unsigned v1_size_tag = V1::size_tag;
+    static const unsigned v2_type_tag = SIMDPP_TAG_UINT;
+    static const unsigned v2_size_tag = V1::size_tag;
+    static const unsigned length_bytes = V1::length_bytes;
+};
+
+template<class V1>
+struct expr2_uint_maybe_scalar_tags<V1, float> {
+    static const unsigned v1_type_tag = V1::type_tag;
+    static const unsigned v1_size_tag = V1::size_tag;
+    static const unsigned v2_type_tag = SIMDPP_TAG_INT; // NOTE(review): float is tagged as INT - presumably because this trait only serves integer operations; confirm
+    static const unsigned v2_size_tag = V1::size_tag;
+    static const unsigned length_bytes = V1::length_bytes;
+};
+
+template<class V1>
+struct expr2_uint_maybe_scalar_tags<V1, double> {
+    static const unsigned v1_type_tag = V1::type_tag;
+    static const unsigned v1_size_tag = V1::size_tag;
+    static const unsigned v2_type_tag = SIMDPP_TAG_INT; // NOTE(review): double is tagged as INT as well - confirm this is intended
+    static const unsigned v2_size_tag = V1::size_tag;
+    static const unsigned length_bytes = V1::length_bytes;
+};
+/*  Tag computation used by get_expr_uint. Reads the argument tags from
+    expr2_uint_maybe_scalar_tags<V1, V2>, derives the expression's size_tag and
+    type_tag, and exposes v1_final_type / v2_final_type, which are always built
+    from SIMDPP_TAG_UINT. When SIMDPP_EXPR_DEBUG is enabled, static assertions
+    additionally require equal size tags and integer (mask_int, uint or int)
+    type tags on both arguments.
+*/
+template<class V1, class V2>
+struct get_expr_uint_impl {
+    using tags = expr2_uint_maybe_scalar_tags<V1, V2>;
+
+#if SIMDPP_EXPR_DEBUG
+    static_assert(tags::v1_size_tag == tags::v2_size_tag, "Mismatching vector sizes");
+    static_assert(tags::v1_type_tag == SIMDPP_TAG_MASK_INT ||
+                  tags::v1_type_tag == SIMDPP_TAG_UINT ||
+                  tags::v1_type_tag == SIMDPP_TAG_INT, "Incorrect type parameter");
+    static_assert(tags::v2_type_tag == SIMDPP_TAG_MASK_INT ||
+                  tags::v2_type_tag == SIMDPP_TAG_UINT ||
+                  tags::v2_type_tag == SIMDPP_TAG_INT, "Incorrect type parameter");
+#endif
+
+    // the size tag of the expression
+    static const unsigned size_tag = tags::v1_size_tag;
+
+    // (type_tag) get the type tag of the expression. Pretty much the same as
+    // get_expr2_nomask does
+    static const unsigned type_tag_t1 = tags::v1_type_tag > tags::v2_type_tag ? tags::v1_type_tag : tags::v2_type_tag;
+    static const unsigned type_tag = (type_tag_t1 == SIMDPP_TAG_MASK_INT) ? SIMDPP_TAG_UINT : type_tag_t1; // an integer-mask result is reported as uint
+
+    // strip signed integer types and masks
+    static const unsigned v1_type_tag = SIMDPP_TAG_UINT;
+    static const unsigned v2_type_tag = SIMDPP_TAG_UINT;
+
+    using v1_final_type = typename type_of_tag<v1_type_tag + size_tag,
+                                               tags::length_bytes, void>::type;
+    using v2_final_type = typename type_of_tag<v2_type_tag + size_tag,
+                                               tags::length_bytes, void>::type;
+};
+/*  Selects the result type of a binary integer expression E<V1, V2>: the type
+    is looked up through type_of_tag using the type tag, size tag and byte
+    length computed by get_expr_uint_impl<V1, V2>, with E<V1, V2> passed as the
+    expression parameter.
+*/
+template<template<class, class> class E, class V1, class V2>
+struct get_expr_uint {
+    using impl = get_expr_uint_impl<V1, V2>;
+
+    using type = typename type_of_tag<impl::type_tag + impl::size_tag,
+                                      impl::tags::length_bytes,
+                                      E<V1, V2>>::type;
+};
+
+} // namespace detail
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif

+ 216 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/detail/scalar_arg_impl.h

@@ -0,0 +1,216 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_DETAIL_SCALAR_ARG_IMPL_H
+#define LIBSIMDPP_SIMDPP_CORE_DETAIL_SCALAR_ARG_IMPL_H
+
+#include <simdpp/types.h>
+#include <simdpp/expr.h>
+#include <simdpp/core/make_float.h>
+#include <simdpp/core/make_int.h>
+#include <simdpp/core/make_uint.h>
+#include <simdpp/detail/expr/scalar.h>
+
+/*  The following implements the boilerplate for binary function wrappers that
+    accept values as scalar arguments.
+*/
+
+/*  Simple implementation returning an empty expression. For the function FUNC
+    this generates sixteen overloads that take a built-in scalar as either the
+    first or the second argument next to an EXPR<N,V> vector. Each overload
+    converts the scalar to the vector type NEW_VEC with make_uint, make_int or
+    make_float (chosen by the scalar's type) and then calls FUNC again with the
+    two vectors. Every overload is declared to return RET_VEC<N,expr_empty>.
+*/
+#define SIMDPP_SCALAR_ARG_IMPL_VEC_IMPL(FUNC, RET_VEC, EXPR, NEW_VEC)                                                                   \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const unsigned& a,              const EXPR<N,V>& b) { return FUNC(make_uint<NEW_VEC>(a), b); }   \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const unsigned long& a,         const EXPR<N,V>& b) { return FUNC(make_uint<NEW_VEC>(a), b); }   \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const unsigned long long& a,    const EXPR<N,V>& b) { return FUNC(make_uint<NEW_VEC>(a), b); }   \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const int& a,                   const EXPR<N,V>& b) { return FUNC(make_int<NEW_VEC>(a), b); }    \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const long& a,                  const EXPR<N,V>& b) { return FUNC(make_int<NEW_VEC>(a), b); }    \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const long long& a,             const EXPR<N,V>& b) { return FUNC(make_int<NEW_VEC>(a), b); }    \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const float& a,                 const EXPR<N,V>& b) { return FUNC(make_float<NEW_VEC>(a), b); }  \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const double& a,                const EXPR<N,V>& b) { return FUNC(make_float<NEW_VEC>(a), b); }  \
+                                                                                                                                                        \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const EXPR<N,V>& a, const unsigned& b          ) { return FUNC(a, make_uint<NEW_VEC>(b)); }   \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const EXPR<N,V>& a, const unsigned long& b     ) { return FUNC(a, make_uint<NEW_VEC>(b)); }   \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const EXPR<N,V>& a, const unsigned long long& b) { return FUNC(a, make_uint<NEW_VEC>(b)); }   \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const EXPR<N,V>& a, const int& b               ) { return FUNC(a, make_int<NEW_VEC>(b)); }    \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const EXPR<N,V>& a, const long& b              ) { return FUNC(a, make_int<NEW_VEC>(b)); }    \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const EXPR<N,V>& a, const long long& b         ) { return FUNC(a, make_int<NEW_VEC>(b)); }    \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const EXPR<N,V>& a, const float& b             ) { return FUNC(a, make_float<NEW_VEC>(b)); }  \
+template<unsigned N, class V> SIMDPP_INL RET_VEC<N,expr_empty> FUNC(const EXPR<N,V>& a, const double& b            ) { return FUNC(a, make_float<NEW_VEC>(b)); }
+// end #define
+
+/*  Convenience wrappers around SIMDPP_SCALAR_ARG_IMPL_VEC_IMPL that differ
+    only in the vector type a scalar is converted to:
+      * SIMDPP_SCALAR_ARG_IMPL_VEC converts the scalar to VEC<N>;
+      * SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR converts it to
+        typename detail::get_expr<V>::type.
+*/
+#define SIMDPP_SCALAR_ARG_IMPL_VEC(FUNC, RET_VEC, VEC) \
+    SIMDPP_SCALAR_ARG_IMPL_VEC_IMPL(FUNC, RET_VEC, VEC, VEC<N>)
+#define SIMDPP_SCALAR_ARG_IMPL_VEC_EXPR(FUNC, RET_VEC, EXPR) \
+    SIMDPP_SCALAR_ARG_IMPL_VEC_IMPL(FUNC, RET_VEC, EXPR, typename detail::get_expr<V>::type)
+// end #define
+
+/*  Implementation returning an expression for vector arguments. For the
+    function FUNC this generates sixteen overloads that take a built-in scalar
+    as either the first or the second argument next to a VEC<N,V> vector. No
+    conversion happens here: each overload returns RET_VEC<N, EXPR<...>>
+    brace-initialized from the two arguments, and the scalar's C++ type appears
+    as the corresponding template argument of EXPR.
+*/
+#define SIMDPP_SCALAR_ARG_IMPL_EXPR(FUNC, EXPR, RET_VEC, VEC)                   \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<unsigned, VEC<N,V>>>                                            \
+    FUNC(const unsigned& a, const VEC<N,V>& b)                                  \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<unsigned long, VEC<N,V>>>                                       \
+    FUNC(const unsigned long& a, const VEC<N,V>& b)                             \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<unsigned long long, VEC<N,V>>>                                  \
+    FUNC(const unsigned long long& a, const VEC<N,V>& b)                        \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<int, VEC<N,V>>>                                                 \
+    FUNC(const int& a, const VEC<N,V>& b)                                       \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<long, VEC<N,V>>>                                                \
+    FUNC(const long& a, const VEC<N,V>& b)                                      \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<long long, VEC<N,V>>>                                           \
+    FUNC(const long long& a, const VEC<N,V>& b)                                 \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<float, VEC<N,V>>>                                               \
+    FUNC(const float& a, const VEC<N,V>& b)                                     \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<double, VEC<N,V>>>                                              \
+    FUNC(const double& a, const VEC<N,V>& b)                                    \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<VEC<N,V>, unsigned>>                                            \
+    FUNC(const VEC<N,V>& a, const unsigned& b)                                  \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<VEC<N,V>, unsigned long>>                                       \
+    FUNC(const VEC<N,V>& a, const unsigned long& b)                             \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<VEC<N,V>, unsigned long long>>                                  \
+    FUNC(const VEC<N,V>& a, const unsigned long long& b)                        \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<VEC<N,V>, int>>                                                 \
+    FUNC(const VEC<N,V>& a, const int& b)                                       \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<VEC<N,V>, long>>                                                \
+    FUNC(const VEC<N,V>& a, const long& b)                                      \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<VEC<N,V>, long long>>                                           \
+    FUNC(const VEC<N,V>& a, const long long& b)                                 \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<VEC<N,V>, float>>                                               \
+    FUNC(const VEC<N,V>& a, const float& b)                                     \
+{ return { { a, b } }; }                                                        \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+RET_VEC<N, EXPR<VEC<N,V>, double>>                                              \
+    FUNC(const VEC<N,V>& a, const double& b)                                    \
+{ return { { a, b } }; }
+// end #define
+
+/*  An implementation for integer operations that use get_expr_uint. For the
+    function FUNC this generates sixteen overloads that take a built-in scalar
+    as either the first or the second argument next to a VEC<N,V> vector. The
+    return type is detail::get_expr_uint<EXPR, ...>::type, instantiated with
+    the vector's expression parameter V and the scalar's C++ type, and the
+    result is brace-initialized from the vector's wrapped() value and the
+    scalar. NOTE: the INT_VEC parameter is not referenced in the expansion.
+*/
+#define SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(FUNC, EXPR, VEC, INT_VEC)           \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, V, unsigned>::type                         \
+        FUNC(const VEC<N,V>& a, const unsigned& b)                              \
+{ return { { a.wrapped(), b } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, V, unsigned long>::type                    \
+        FUNC(const VEC<N,V>& a, const unsigned long& b)                         \
+{ return { { a.wrapped(), b } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, V, unsigned long long>::type               \
+        FUNC(const VEC<N,V>& a, const unsigned long long& b)                    \
+{ return { { a.wrapped(), b } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, V, int>::type                              \
+        FUNC(const VEC<N,V>& a, const int& b)                                   \
+{ return { { a.wrapped(), b } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, V, long>::type                             \
+        FUNC(const VEC<N,V>& a, const long& b)                                  \
+{ return { { a.wrapped(), b } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, V, long long>::type                        \
+        FUNC(const VEC<N,V>& a, const long long& b)                             \
+{ return { { a.wrapped(), b } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, V, float>::type                            \
+        FUNC(const VEC<N,V>& a, const float& b)                                 \
+{ return { { a.wrapped(), b } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, V, double>::type                           \
+        FUNC(const VEC<N,V>& a, const double& b)                                \
+{ return { { a.wrapped(), b } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, unsigned, V>::type                         \
+        FUNC(const unsigned& a, const VEC<N,V>& b)                              \
+{ return { { a, b.wrapped() } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, unsigned long, V>::type                    \
+        FUNC(const unsigned long& a, const VEC<N,V>& b)                         \
+{ return { { a, b.wrapped() } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, unsigned long long, V>::type               \
+        FUNC(const unsigned long long& a, const VEC<N,V>& b)                    \
+{ return { { a, b.wrapped() } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, int, V>::type                              \
+        FUNC(const int& a, const VEC<N,V>& b)                                   \
+{ return { { a, b.wrapped() } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, long, V>::type                             \
+        FUNC(const long& a, const VEC<N,V>& b)                                  \
+{ return { { a, b.wrapped() } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, long long, V>::type                        \
+        FUNC(const long long& a, const VEC<N,V>& b)                             \
+{ return { { a, b.wrapped() } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, float, V>::type                            \
+        FUNC(const float& a, const VEC<N,V>& b)                                 \
+{ return { { a, b.wrapped() } }; }                                              \
+                                                                                \
+template<unsigned N, class V> SIMDPP_INL                                        \
+typename detail::get_expr_uint<EXPR, double, V>::type                           \
+        FUNC(const double& a, const VEC<N,V>& b)                                \
+{ return { { a, b.wrapped() } }; }
+// end #define
+
+#endif

+ 103 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/detail/subvec_extract.h

@@ -0,0 +1,103 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_DETAIL_VEC_EXTRACT_H
+#define LIBSIMDPP_SIMDPP_CORE_DETAIL_VEC_EXTRACT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/setup_arch.h>
+#include <simdpp/types.h>
+#include <simdpp/core/insert.h>
+#include <simdpp/core/extract.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+namespace detail {
+
+// Builds an R out of consecutive native sub-vectors of a. The first
+// sub-vector copied has index n * R::vec_length, i.e. the result is the n-th
+// R-sized slice of a.
+template<class R, class V> SIMDPP_INL
+R subvec_extract_impl(const V& a, unsigned n)
+{
+    static_assert(R::length >= V::base_length, "Too small vector to extract");
+
+    R result;
+    const auto count = result.vec_length;
+    const auto first = n * count;
+    for (unsigned k = 0; k != count; ++k) {
+        result.vec(k) = a.vec(first + k);
+    }
+    return result;
+}
+
+/* Extracts a sub-vector consisting of elements [M*n .. M*(n+1)) of @a a.
+   One overload is provided per element type; each forwards to
+   subvec_extract_impl with the matching M-element vector as result type. */
+template<unsigned M, unsigned N> SIMDPP_INL
+uint8<M> subvec_extract(const uint8<N>& a, unsigned n)
+{
+    return subvec_extract_impl<uint8<M>>(a, n);
+}
+
+template<unsigned M, unsigned N> SIMDPP_INL
+uint16<M> subvec_extract(const uint16<N>& a, unsigned n)
+{
+    return subvec_extract_impl<uint16<M>>(a, n);
+}
+
+template<unsigned M, unsigned N> SIMDPP_INL
+uint32<M> subvec_extract(const uint32<N>& a, unsigned n)
+{
+    return subvec_extract_impl<uint32<M>>(a, n);
+}
+
+template<unsigned M, unsigned N> SIMDPP_INL
+uint64<M> subvec_extract(const uint64<N>& a, unsigned n)
+{
+    return subvec_extract_impl<uint64<M>>(a, n);
+}
+
+template<unsigned M, unsigned N> SIMDPP_INL
+int8<M> subvec_extract(const int8<N>& a, unsigned n)
+{
+    return subvec_extract_impl<int8<M>>(a, n);
+}
+
+template<unsigned M, unsigned N> SIMDPP_INL
+int16<M> subvec_extract(const int16<N>& a, unsigned n)
+{
+    return subvec_extract_impl<int16<M>>(a, n);
+}
+
+template<unsigned M, unsigned N> SIMDPP_INL
+int32<M> subvec_extract(const int32<N>& a, unsigned n)
+{
+    return subvec_extract_impl<int32<M>>(a, n);
+}
+
+template<unsigned M, unsigned N> SIMDPP_INL
+int64<M> subvec_extract(const int64<N>& a, unsigned n)
+{
+    return subvec_extract_impl<int64<M>>(a, n);
+}
+
+template<unsigned M, unsigned N> SIMDPP_INL
+float32<M> subvec_extract(const float32<N>& a, unsigned n)
+{
+    return subvec_extract_impl<float32<M>>(a, n);
+}
+
+template<unsigned M, unsigned N> SIMDPP_INL
+float64<M> subvec_extract(const float64<N>& a, unsigned n)
+{
+    return subvec_extract_impl<float64<M>>(a, n);
+}
+
+
+} // namespace detail
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif

+ 62 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/detail/subvec_insert.h

@@ -0,0 +1,62 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_DETAIL_VEC_INSERT_H
+#define LIBSIMDPP_SIMDPP_CORE_DETAIL_VEC_INSERT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/setup_arch.h>
+#include <simdpp/types.h>
+
+#include <cstring>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+namespace detail {
+
+// Copies every native sub-vector of v into r. The first destination
+// sub-vector has index n * V::vec_length, i.e. v overwrites the n-th V-sized
+// slice of r.
+template<class R, class V> SIMDPP_INL
+void subvec_insert_impl(R& r, const V& v, unsigned n)
+{
+    static_assert(V::length >= R::base_length, "Too small vector to insert");
+
+    const auto count = V::vec_length;
+    const auto first = n * count;
+    for (unsigned k = 0; k != count; ++k) {
+        //TODO combine or split as needed
+        r.vec(first + k) = v.vec(k);
+    }
+}
+
+/* Sets the elements [M*n .. M*(n+1)) of @a a to the contents of @a x.
+   One overload is provided per element type; each forwards its arguments
+   unchanged to subvec_insert_impl. */
+template<unsigned N, unsigned M> SIMDPP_INL
+void subvec_insert(uint8<N>& a, const uint8<M>& x, unsigned n) { subvec_insert_impl(a, x, n); }
+template<unsigned N, unsigned M> SIMDPP_INL
+void subvec_insert(uint16<N>& a, const uint16<M>& x, unsigned n) { subvec_insert_impl(a, x, n); }
+template<unsigned N, unsigned M> SIMDPP_INL
+void subvec_insert(uint32<N>& a, const uint32<M>& x, unsigned n) { subvec_insert_impl(a, x, n); }
+template<unsigned N, unsigned M> SIMDPP_INL
+void subvec_insert(uint64<N>& a, const uint64<M>& x, unsigned n) { subvec_insert_impl(a, x, n); }
+
+template<unsigned N, unsigned M> SIMDPP_INL
+void subvec_insert(int8<N>& a, const int8<M>& x, unsigned n) { subvec_insert_impl(a, x, n); }
+template<unsigned N, unsigned M> SIMDPP_INL
+void subvec_insert(int16<N>& a, const int16<M>& x, unsigned n) { subvec_insert_impl(a, x, n); }
+template<unsigned N, unsigned M> SIMDPP_INL
+void subvec_insert(int32<N>& a, const int32<M>& x, unsigned n) { subvec_insert_impl(a, x, n); }
+template<unsigned N, unsigned M> SIMDPP_INL
+void subvec_insert(int64<N>& a, const int64<M>& x, unsigned n) { subvec_insert_impl(a, x, n); }
+
+template<unsigned N, unsigned M> SIMDPP_INL
+void subvec_insert(float32<N>& a, const float32<M>& x, unsigned n) { subvec_insert_impl(a, x, n); }
+template<unsigned N, unsigned M> SIMDPP_INL
+void subvec_insert(float64<N>& a, const float64<M>& x, unsigned n) { subvec_insert_impl(a, x, n); }
+
+} // namespace detail
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif

+ 103 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/extract.h

@@ -0,0 +1,103 @@
+/*  Copyright (C) 2011-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMD_EXTRACT_H
+#define LIBSIMDPP_SIMD_EXTRACT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/extract.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Extracts the @a id-th element from a vector.
+
+    @code
+    r = a[id]
+    @endcode
+
+    @a id is a compile-time index; a static_assert rejects @a id >= N. One
+    overload is provided per element type, each returning the corresponding
+    scalar type and forwarding to detail::insn::i_extract<id>.
+
+    This function may have very high latency.
+*/
+template<unsigned id, unsigned N> SIMDPP_INL
+uint8_t extract(const uint8<N>& a)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_extract<id>(a);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+int8_t extract(const int8<N>& a)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_extract<id>(a);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+uint16_t extract(const uint16<N>& a)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_extract<id>(a);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+int16_t extract(const int16<N>& a)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_extract<id>(a);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+uint32_t extract(const uint32<N>& a)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_extract<id>(a);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+int32_t extract(const int32<N>& a)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_extract<id>(a);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+uint64_t extract(const uint64<N>& a)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_extract<id>(a);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+int64_t extract(const int64<N>& a)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_extract<id>(a);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+float extract(const float32<N>& a)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_extract<id>(a);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+double extract(const float64<N>& a)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_extract<id>(a);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 67 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/extract_bits.h

@@ -0,0 +1,67 @@
+/*  Copyright (C) 2011-2017  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMD_CORE_EXTRACT_BITS_H
+#define LIBSIMDPP_SIMD_CORE_EXTRACT_BITS_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/extract_bits.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Extracts a bit from each byte of each element of a vector containing 8-bit
+    elements.
+
+    This operation is only sensible if each byte within the vector is either
+    0x00 or 0xff.
+
+    @code
+    r = ((a[0] & 0x??) ? 0x01 : 0) |
+        ((a[1] & 0x??) ? 0x02 : 0) |
+        ...
+        ((a[15] & 0x??) ? 0x80 : 0)
+    @endcode
+
+    Overloads exist for 16-element vectors (returning uint16_t) and
+    32-element vectors (returning uint32_t); both forward to
+    detail::insn::i_extract_bits_any.
+*/
+SIMDPP_INL uint16_t extract_bits_any(const uint8<16>& a)
+{
+    return detail::insn::i_extract_bits_any(a);
+}
+SIMDPP_INL uint32_t extract_bits_any(const uint8<32>& a)
+{
+    return detail::insn::i_extract_bits_any(a);
+}
+
+/** Extracts a specific bit from each byte of a vector of 8-bit elements.
+
+    @a id selects the bit and must be less than 8 (enforced by static_assert).
+    The pseudo-code below shows the top bit (0x80); presumably the mask is
+    (1 << id) in general - confirm against detail::insn::i_extract_bits.
+
+    @code
+    r = ((a[0] & 0x80) >> 7) | ((a[1] & 0x80) >> 6) | ... | ((a[15] & 0x80) << 8)
+    @endcode
+
+    Overloads exist for 16-element vectors (returning uint16_t) and
+    32-element vectors (returning uint32_t).
+*/
+template<unsigned id> SIMDPP_INL
+uint16_t extract_bits(const uint8<16>& a)
+{
+    static_assert(id < 8, "index out of bounds");
+    return detail::insn::i_extract_bits<id>(a);
+}
+template<unsigned id> SIMDPP_INL
+uint32_t extract_bits(const uint8<32>& a)
+{
+    static_assert(id < 8, "index out of bounds");
+    return detail::insn::i_extract_bits<id>(a);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+
+

+ 73 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_abs.h

@@ -0,0 +1,73 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_ABS_H
+#define LIBSIMDPP_SIMDPP_CORE_F_ABS_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/f_abs.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes absolute value of floating point values.
+
+    @code
+    r0 = abs(a0)
+    ...
+    rN = abs(aN)
+    @endcode
+
+    The return value is an expr_fabs expression node constructed from @a a;
+    this function itself only builds that node.
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 1-2}
+    @icost{ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2-3}
+    @icost{NEON, 2}
+    @icost{AVX-AVX2, 1-2}
+    @icost{ALTIVEC, 2-3}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float32<N, expr_fabs<float32<N,E>>> abs(const float32<N,E>& a)
+{
+    return { { a } };
+}
+
+/** Computes absolute value of double-precision floating point values.
+
+    @code
+    r0 = abs(a0)
+    ...
+    rN = abs(aN)
+    @endcode
+
+    The return value is an expr_fabs expression node constructed from @a a;
+    this function itself only builds that node.
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-AVX2, 1-2}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2-3}
+    @icost{AVX-AVX2, 1-2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float64<N, expr_fabs<float64<N,E>>> abs(const float64<N,E>& a)
+{
+    return { { a } };
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 71 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_add.h

@@ -0,0 +1,71 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_ADD_H
+#define LIBSIMDPP_SIMDPP_CORE_F_ADD_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/f_add.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Adds the values of two vectors
+
+    @code
+    r0 = a0 + b0
+    ...
+    rN = aN + bN
+    @endcode
+
+    The return value is an expr_fadd expression node constructed from @a a
+    and @a b; this function itself only builds that node. The macro
+    invocation that follows presumably adds the overloads taking one scalar
+    argument - confirm in core/detail/scalar_arg_impl.h.
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float32<N, expr_fadd<float32<N,E1>,
+                     float32<N,E2>>> add(const float32<N,E1>& a, const float32<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(add, expr_fadd, float32, float32)
+
+/** Adds the values of two double-precision vectors
+
+    @code
+    r0 = a0 + b0
+    ...
+    rN = aN + bN
+    @endcode
+
+    The return value is an expr_fadd expression node constructed from @a a
+    and @a b; this function itself only builds that node. The macro
+    invocation that follows presumably adds the overloads taking one scalar
+    argument - confirm in core/detail/scalar_arg_impl.h.
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float64<N, expr_fadd<float64<N,E1>,
+                     float64<N,E2>>> add(const float64<N,E1>& a, const float64<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(add, expr_fadd, float64, float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 54 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_ceil.h

@@ -0,0 +1,54 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_CEIL_H
+#define LIBSIMDPP_SIMDPP_CORE_F_CEIL_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_ceil.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Rounds the values of a vector towards positive infinity
+
+    @code
+    r0 = ceil(a0)
+    ...
+    rN = ceil(aN)
+    @endcode
+
+    Both the float32 and float64 overloads evaluate @a a with eval() and pass
+    the result to detail::insn::i_ceil; the result is a plain (expr_empty)
+    vector.
+
+    @par 128-bit version:
+    @icost{SSE2, SSE3, SSSE3, 13-15}
+    @icost{NEON, 11-13}
+
+    @par 256-bit version:
+    @icost{SSE2, SSE3, SSSE3, 26-28}
+    @icost{NEON, 22-24}
+    @icost{ALTIVEC, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float32<N,expr_empty> ceil(const float32<N,E>& a)
+{
+    return detail::insn::i_ceil(a.eval());
+}
+template<unsigned N, class E> SIMDPP_INL
+float64<N,expr_empty> ceil(const float64<N,E>& a)
+{
+    return detail::insn::i_ceil(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 73 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_div.h

@@ -0,0 +1,73 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_DIV_H
+#define LIBSIMDPP_SIMDPP_CORE_F_DIV_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_div.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Divides the values of two vectors.
+
+    @code
+    r0 = a0 / b0
+    ...
+    rN = aN / bN
+    @endcode
+
+    Both operands are evaluated with eval() and passed to
+    detail::insn::i_div; the result is a plain (expr_empty) vector.
+
+    @icost{NEON, 6}
+    @icost{ALTIVEC, 10}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @icost{NEON, 12}
+    @icost{ALTIVEC, 19}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float32<N,expr_empty> div(const float32<N,E1>& a, const float32<N,E2>& b)
+{
+    return detail::insn::i_div(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(div, float32, float32)
+
+/** Divides the values of two double-precision vectors
+
+    @code
+    r0 = a0 / b0
+    ...
+    rN = aN / bN
+    @endcode
+
+    Both operands are evaluated with eval() and passed to
+    detail::insn::i_div; the result is a plain (expr_empty) vector.
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @novec{NEON, ALTIVEC}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float64<N,expr_empty> div(const float64<N,E1>& a, const float64<N,E2>& b)
+{
+    return detail::insn::i_div(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(div, float64, float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 55 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_floor.h

@@ -0,0 +1,55 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_FLOOR_H
+#define LIBSIMDPP_SIMDPP_CORE_F_FLOOR_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <cmath>
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_floor.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Rounds the values of a vector towards negative infinity
+
+    @code
+    r0 = floor(a0)
+    ...
+    rN = floor(aN)
+    @endcode
+
+    Both the float32 and float64 overloads evaluate @a a with eval() and pass
+    the result to detail::insn::i_floor; the result is a plain (expr_empty)
+    vector.
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 12-14}
+    @icost{NEON, 10-11}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, 24-26}
+    @icost{NEON, 20-21}
+    @icost{ALTIVEC, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float32<N,expr_empty> floor(const float32<N,E>& a)
+{
+    return detail::insn::i_floor(a.eval());
+}
+template<unsigned N, class E> SIMDPP_INL
+float64<N,expr_empty> floor(const float64<N,E>& a)
+{
+    return detail::insn::i_floor(a.eval());
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 56 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_fmadd.h

@@ -0,0 +1,56 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_FMADD_H
+#define LIBSIMDPP_SIMDPP_CORE_F_FMADD_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/f_fmadd.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Performs a fused multiply-add operation
+
+    @code
+    r0 = a0 * b0 + c0
+    ...
+    rN = aN * bN + cN
+    @endcode
+
+    Implemented only on architectures with either @c X86_FMA3 or @c X86_FMA4
+    support.
+
+    Both the float32 and float64 overloads return an expr_fmadd expression
+    node constructed from @a a, @a b and @a c; the functions themselves only
+    build that node.
+*/
+template<unsigned N, class E1, class E2, class E3> SIMDPP_INL
+float32<N, expr_fmadd<float32<N,E1>,
+                      float32<N,E2>,
+                      float32<N,E3>>> fmadd(const float32<N,E1>& a,
+                                            const float32<N,E2>& b,
+                                            const float32<N,E3>& c)
+{
+    return { { a, b, c } };
+}
+
+template<unsigned N, class E1, class E2, class E3> SIMDPP_INL
+float64<N, expr_fmadd<float64<N,E1>,
+                      float64<N,E2>,
+                      float64<N,E3>>> fmadd(const float64<N,E1>& a,
+                                            const float64<N,E2>& b,
+                                            const float64<N,E3>& c)
+{
+    return { { a, b, c } };
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 56 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_fmsub.h

@@ -0,0 +1,56 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_FMSUB_H
+#define LIBSIMDPP_SIMDPP_CORE_F_FMSUB_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/f_fmsub.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Performs a fused multiply-subtract operation
+
+    @code
+    r0 = a0 * b0 - c0
+    ...
+    rN = aN * bN - cN
+    @endcode
+
+    Implemented only on architectures with either @c X86_FMA3 or @c X86_FMA4
+    support.
+
+    Both the float32 and float64 overloads return an expr_fmsub expression
+    node constructed from @a a, @a b and @a c; the functions themselves only
+    build that node.
+*/
+template<unsigned N, class E1, class E2, class E3> SIMDPP_INL
+float32<N, expr_fmsub<float32<N,E1>,
+                      float32<N,E2>,
+                      float32<N,E3>>> fmsub(const float32<N,E1>& a,
+                                            const float32<N,E2>& b,
+                                            const float32<N,E3>& c)
+{
+    return { { a, b, c } };
+}
+
+template<unsigned N, class E1, class E2, class E3> SIMDPP_INL
+float64<N, expr_fmsub<float64<N,E1>,
+                      float64<N,E2>,
+                      float64<N,E3>>> fmsub(const float64<N,E1>& a,
+                                            const float64<N,E2>& b,
+                                            const float64<N,E3>& c)
+{
+    return { { a, b, c } };
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 63 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_isnan.h

@@ -0,0 +1,63 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_ISNAN_H
+#define LIBSIMDPP_SIMDPP_CORE_ISNAN_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_isnan.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Checks whether elements in @a a are IEEE754 NaN.
+
+    @code
+    r0 = isnan(a0) ? 0xffffffff : 0
+    ...
+    rN = isnan(aN) ? 0xffffffff : 0
+    @endcode
+
+    @a a is evaluated with eval() and passed to detail::insn::i_isnan; the
+    result is returned as a mask_float32.
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+mask_float32<N,expr_empty> isnan(const float32<N,E>& a)
+{
+    return detail::insn::i_isnan(a.eval());
+}
+
+/** Checks whether elements in @a a are IEEE754 NaN.
+
+    @code
+    r0 = isnan(a0) ? 0xffffffffffffffff : 0
+    ...
+    rN = isnan(aN) ? 0xffffffffffffffff : 0
+    @endcode
+
+    @a a is evaluated with eval() and passed to detail::insn::i_isnan; the
+    result is returned as a mask_float64.
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+mask_float64<N,expr_empty> isnan(const float64<N,E>& a)
+{
+    return detail::insn::i_isnan(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 69 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_isnan2.h

@@ -0,0 +1,69 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_ISNAN2_H
+#define LIBSIMDPP_SIMDPP_CORE_F_ISNAN2_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_isnan2.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Checks whether corresponding elements in either @a a or @a b are IEEE754 NaN.
+
+    @code
+    r0 = isnan(a0) || isnan(b0) ? 0xffffffff : 0
+    ...
+    rN = isnan(aN) || isnan(bN) ? 0xffffffff : 0
+    @endcode
+
+    Both operands are evaluated with eval() and passed to
+    detail::insn::i_isnan2; the result is returned as a mask_float32.
+
+    @par 128-bit version:
+    @icost{NEON, ALTIVEC, 3}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @icost{NEON, ALTIVEC, 6}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_float32<N,expr_empty> isnan2(const float32<N,E1>& a, const float32<N,E2>& b)
+{
+    return detail::insn::i_isnan2(a.eval(), b.eval());
+}
+
+/** Checks whether corresponding elements in either @a a or @a b are IEEE754
+    NaN.
+
+    @code
+    r0 = isnan(a0) || isnan(b0) ? 0xffffffffffffffff : 0
+    ...
+    rN = isnan(aN) || isnan(bN) ? 0xffffffffffffffff : 0
+    @endcode
+
+    Both operands are evaluated with eval() and passed to
+    detail::insn::i_isnan2; the result is returned as a mask_float64.
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+mask_float64<N,expr_empty> isnan2(const float64<N,E1>& a, const float64<N,E2>& b)
+{
+    return detail::insn::i_isnan2(a.eval(), b.eval());
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 73 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_max.h

@@ -0,0 +1,73 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_MAX_H
+#define LIBSIMDPP_SIMDPP_CORE_F_MAX_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_max.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes maxima of the values of two vectors. If at least one of the values
+    is NaN, or both values are zeroes, it is unspecified which value will be
+    returned.
+
+    @code
+    r0 = max(a0, b0)
+    ...
+    rN = max(aN, bN)
+    @endcode
+
+    Both operands are evaluated with eval() and passed to
+    detail::insn::i_max; the result is a plain (expr_empty) vector.
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float32<N,expr_empty> max(const float32<N,E1>& a, const float32<N,E2>& b)
+{
+    return detail::insn::i_max(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(max, float32, float32)
+
+/** Computes maxima of the values of two double-precision vectors. If at least
+    one of the values is NaN, or both values are zeroes, it is unspecified
+    which value will be returned.
+
+    @code
+    r0 = max(a0, b0)
+    ...
+    rN = max(aN, bN)
+    @endcode
+
+    Both operands are evaluated with eval() and passed to
+    detail::insn::i_max; the result is a plain (expr_empty) vector.
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @novec{NEON, ALTIVEC}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float64<N,expr_empty> max(const float64<N,E1>& a, const float64<N,E2>& b)
+{
+    return detail::insn::i_max(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(max, float64, float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 74 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_min.h

@@ -0,0 +1,74 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_MIN_H
+#define LIBSIMDPP_SIMDPP_CORE_F_MIN_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_min.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+// note: SSE doesn't provide a way to propagate NaNs in min/max
+/** Computes minima of the values in two vectors. If at least one of the
+    values is NaN, or both values are zeroes, it is unspecified which value
+    will be returned.
+
+    @code
+    r0 = min(a0, b0)
+    ...
+    rN = min(aN, bN)
+    @endcode
+
+    Both operands are evaluated with eval() and passed to
+    detail::insn::i_min; the result is a plain (expr_empty) vector.
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float32<N,expr_empty> min(const float32<N,E1>& a, const float32<N,E2>& b)
+{
+    return detail::insn::i_min(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(min, float32, float32)
+
+/** Computes minima of the values in two double-precision vectors. If at least
+    one of the values is NaN, or both values are zeroes, it is unspecified
+    which value will be returned.
+
+    @code
+    r0 = min(a0, b0)
+    ...
+    rN = min(aN, bN)
+    @endcode
+
+    Both operands are evaluated with eval() and passed to
+    detail::insn::i_min; the result is a plain (expr_empty) vector.
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float64<N,expr_empty> min(const float64<N,E1>& a, const float64<N,E2>& b)
+{
+    return detail::insn::i_min(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(min, float64, float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 73 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_mul.h

@@ -0,0 +1,73 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_MUL_H
+#define LIBSIMDPP_SIMDPP_CORE_F_MUL_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/f_mul.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Multiplies the values of two vectors
+
+    @code
+    r0 = a0 * b0
+    ...
+    rN = aN * bN
+    @endcode
+
+    The return value is an expr_fmul expression node constructed from @a a
+    and @a b; this function itself only builds that node.
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float32<N, expr_fmul<float32<N,E1>,
+                     float32<N,E2>>> mul(const float32<N,E1>& a,
+                                         const float32<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(mul, expr_fmul, float32, float32)
+
+/** Multiplies the values of two double-precision vectors
+
+    @code
+    r0 = a0 * b0
+    ...
+    rN = aN * bN
+    @endcode
+
+    The return value is an expr_fmul expression node constructed from @a a
+    and @a b; this function itself only builds that node.
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float64<N, expr_fmul<float64<N,E1>,
+                     float64<N,E2>>> mul(const float64<N,E1>& a,
+                                         const float64<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(mul, expr_fmul, float64, float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 70 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_neg.h

@@ -0,0 +1,70 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_NEG_H
+#define LIBSIMDPP_SIMDPP_CORE_F_NEG_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/f_neg.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Negates the values of a vector
+
+    @code
+    r0 = -a0
+    ...
+    rN = -aN
+    @endcode
+
+    The return value is an expr_fneg expression node constructed from @a a;
+    this function itself only builds that node.
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, ALTIVEC, 2-3}
+    @icost{AVX-AVX2, NEON, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float32<N, expr_fneg<float32<N,E>>> neg(const float32<N,E>& a)
+{
+    return { { a } };
+}
+
+/** Negates the values of a double-precision vector
+
+    @code
+    r0 = -a0
+    ...
+    rN = -aN
+    @endcode
+
+    The return value is an expr_fneg expression node constructed from @a a;
+    this function itself only builds that node.
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 1-2}
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2-3}
+    @icost{AVX-AVX2, 1-2}
+    @novec{NEON, ALTIVEC}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float64<N, expr_fneg<float64<N,E>>> neg(const float64<N,E>& a)
+{
+    return { { a } };
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 50 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_rcp_e.h

@@ -0,0 +1,50 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_RCP_E_H
+#define LIBSIMDPP_SIMDPP_CORE_F_RCP_E_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_rcp_e.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Computes approximate reciprocal.
+
+    Relative error is as follows:
+     - 1/2 ULP for NULL and NEON
+     - ~1/2730 for SSE2
+     - 1/16384 for AVX512
+     - 1/4096 for ALTIVEC
+     - 1/256 for NEON_FLT_SP
+
+    @code
+    r0 = approx(1.0f / a0)
+    ...
+    rN = approx(1.0f / aN)
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float32<N,expr_empty> rcp_e(const float32<N,E>& a)
+{
+    return detail::insn::i_rcp_e(a.eval());
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 64 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_rcp_rh.h

@@ -0,0 +1,64 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_RCP_RH_H
+#define LIBSIMDPP_SIMDPP_CORE_F_RCP_RH_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_rcp_rh.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes one Newton-Raphson iteration for the reciprocal. @a x is the
+    current estimate, @a a are the values to estimate the reciprocal for.
+
+    @a x and @a a may be different vector expressions; each one is evaluated
+    with @c eval() before being handed to the implementation.
+
+    @code
+    r0 = x0 * (2 - x0*a0)
+    ...
+    rN = xN * (2 - xN*aN)
+    @endcode
+
+    Using this function, division can be implemented as follows:
+    @code
+    // a/b
+    float32x4 x;
+    x = rcp_e(b);
+    x = rcp_rh(x, b);
+    x = rcp_rh(x, b);
+    return mul(a, x);
+    @endcode
+
+    Precision can be controlled by selecting the number of @c rcp_rh steps.
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 3-4}
+    @icost{NEON, 2}
+    @icost{ALTIVEC, 2-3}
+
+    @par 256-bit version:
+    @icost{AVX-AVX2, 3-4}
+    @icost{SSE2-SSE4.1, 6-7}
+    @icost{NEON, 4}
+    @icost{ALTIVEC, 4-5}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float32<N,expr_empty> rcp_rh(const float32<N,E1>& x, const float32<N,E2>& a)
+{
+    return detail::insn::i_rcp_rh(x.eval(), a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 44 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_reduce_add.h

@@ -0,0 +1,44 @@
+/*  Copyright (C) 2016  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_REDUCE_ADD_H
+#define LIBSIMDPP_SIMDPP_CORE_F_REDUCE_ADD_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_reduce_add.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes the sum of the elements in the vector.
+
+    @code
+    r0 = a0 + a1 + a2 + ...
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+float reduce_add(const float32<N,E>& a)
+{
+    return detail::insn::i_reduce_add(a.eval());
+}
+
+template<unsigned N, class E> SIMDPP_INL
+double reduce_add(const float64<N,E>& a)
+{
+    return detail::insn::i_reduce_add(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 44 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_reduce_max.h

@@ -0,0 +1,44 @@
+/*  Copyright (C) 2016  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_REDUCE_MAX_H
+#define LIBSIMDPP_SIMDPP_CORE_F_REDUCE_MAX_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_reduce_max.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes the maximum of the elements in the vector.
+
+    @code
+    r0 = max(a0, a1, a2, ...)
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+float reduce_max(const float32<N,E>& a)
+{
+    return detail::insn::i_reduce_max(a.eval());
+}
+
+template<unsigned N, class E> SIMDPP_INL
+double reduce_max(const float64<N,E>& a)
+{
+    return detail::insn::i_reduce_max(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 44 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_reduce_min.h

@@ -0,0 +1,44 @@
+/*  Copyright (C) 2016  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_REDUCE_MIN_H
+#define LIBSIMDPP_SIMDPP_CORE_F_REDUCE_MIN_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_reduce_min.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes the minimum of the elements in the vector.
+
+    @code
+    r0 = min(a0, a1, a2, ...)
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+float reduce_min(const float32<N,E>& a)
+{
+    return detail::insn::i_reduce_min(a.eval());
+}
+
+template<unsigned N, class E> SIMDPP_INL
+double reduce_min(const float64<N,E>& a)
+{
+    return detail::insn::i_reduce_min(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 44 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_reduce_mul.h

@@ -0,0 +1,44 @@
+/*  Copyright (C) 2016  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_REDUCE_MUL_H
+#define LIBSIMDPP_SIMDPP_CORE_F_REDUCE_MUL_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_reduce_mul.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes the product of the elements in the vector.
+
+    @code
+    r0 = a0 * a1 * a2 * ...
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+float reduce_mul(const float32<N,E>& a)
+{
+    return detail::insn::i_reduce_mul(a.eval());
+}
+
+template<unsigned N, class E> SIMDPP_INL
+double reduce_mul(const float64<N,E>& a)
+{
+    return detail::insn::i_reduce_mul(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 50 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_rsqrt_e.h

@@ -0,0 +1,50 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_RSQRT_E_H
+#define LIBSIMDPP_SIMDPP_CORE_F_RSQRT_E_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_rsqrt_e.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes approximate reciprocal square root.
+
+    Relative error is as follows:
+     - 1/2 ULP for NULL and NEON
+     - ~1/2730 for SSE2
+     - 1/16384 for AVX512
+     - 1/4096 for ALTIVEC
+     - 1/256 for NEON_FLT_SP
+
+    @code
+    r0 = approx(1 / sqrt(a0))
+    ...
+    rN = approx(1 / sqrt(aN))
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float32<N,expr_empty> rsqrt_e(const float32<N,E>& a)
+{
+    return detail::insn::i_rsqrt_e(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 53 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_rsqrt_rh.h

@@ -0,0 +1,53 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_RSQRT_RH_H
+#define LIBSIMDPP_SIMDPP_CORE_F_RSQRT_RH_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_rsqrt_rh.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes one Newton-Raphson iteration for inverse of square root. @a x is
+    the current estimate, @a a are the values to estimate the inverse square
+    root for.
+
+    @a x and @a a may be different vector expressions; each one is evaluated
+    with @c eval() before being handed to the implementation.
+
+    @code
+    r0 = x0 * (3 - a0*x0*x0) * 0.5
+    ...
+    rN = xN * (3 - aN*xN*xN) * 0.5
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2, SSE3, SSSE3, SSE4.1, 5-7}
+    @icost{NEON, 3}
+    @icost{ALTIVEC, 4-6}
+
+    @par 256-bit version:
+    @icost{AVX-AVX2, 7}
+    @icost{SSE2, SSE3, SSSE3, SSE4.1, 10-12}
+    @icost{NEON, 6}
+    @icost{ALTIVEC, 8-10}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float32<N,expr_empty> rsqrt_rh(const float32<N,E1>& x, const float32<N,E2>& a)
+{
+    return detail::insn::i_rsqrt_rh(x.eval(), a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 71 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_sign.h

@@ -0,0 +1,71 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_SIGN_H
+#define LIBSIMDPP_SIMDPP_CORE_F_SIGN_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_sign.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Extracts sign bits from the values in float32x4 vector
+
+    @code
+    r0 = a0 & 0x80000000
+    ...
+    rN = aN & 0x80000000
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSE4.1, ALTIVEC, NEON, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, ALTIVEC, NEON, 2-3}
+    @icost{AVX-AVX2, 1-2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float32<N,expr_empty> sign(const float32<N,E>& a)
+{
+    return detail::insn::i_sign(a.eval());
+}
+
+/** Extracts sign bits from the values in float64x2 vector.
+
+    @code
+    r0 = a0 & 0x8000000000000000
+    ...
+    rN = aN & 0x8000000000000000
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 1-2}
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2-3}
+    @icost{AVX-AVX2, 1-2}
+    @novec{NEON, ALTIVEC}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float64<N,expr_empty> sign(const float64<N,E>& a)
+{
+    return detail::insn::i_sign(a.eval());
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 70 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_sqrt.h

@@ -0,0 +1,70 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_SQRT_H
+#define LIBSIMDPP_SIMDPP_CORE_F_SQRT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/f_sqrt.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes square root.
+
+    @code
+    r0 = sqrt(a0)
+    ...
+    rN = sqrt(aN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{NEON, 5}
+    @icost{ALTIVEC, 5-7}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @icost{NEON, 10}
+    @icost{ALTIVEC, 10-12}
+*/
+template<unsigned N, class E1> SIMDPP_INL
+float32<N,expr_empty> sqrt(const float32<N,E1>& a)
+{
+    return detail::insn::i_sqrt(a.eval());
+}
+
+/** Computes square root.
+
+    @code
+    r0 = sqrt(a0)
+    ...
+    rN = sqrt(aN)
+    @endcode
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @novec{NEON, ALTIVEC}
+*/
+template<unsigned N, class E1> SIMDPP_INL
+float64<N,expr_empty> sqrt(const float64<N,E1>& a)
+{
+    return detail::insn::i_sqrt(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 74 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_sub.h

@@ -0,0 +1,74 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_SUB_H
+#define LIBSIMDPP_SIMDPP_CORE_F_SUB_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/f_sub.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+#include <simdpp/core/detail/get_expr_uint.h>
+
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Subtracts the values of two vectors
+
+    @code
+    r0 = a0 - b0
+    ...
+    rN = aN - bN
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float32<N, expr_fsub<float32<N,E1>,
+                     float32<N,E2>>> sub(const float32<N,E1>& a,
+                                         const float32<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(sub, expr_fsub, float32, float32)
+
+/** Subtracts the values of two vectors
+
+    @code
+    r0 = a0 - b0
+    ...
+    rN = aN - bN
+    @endcode
+
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @novec{NEON, ALTIVEC}
+    @icost{SSE2-SSE4.1, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+float64<N, expr_fsub<float64<N,E1>,
+                     float64<N,E2>>> sub(const float64<N,E1>& a,
+                                         const float64<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(sub, expr_fsub, float64, float64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 53 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/f_trunc.h

@@ -0,0 +1,53 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_F_TRUNC_H
+#define LIBSIMDPP_SIMDPP_CORE_F_TRUNC_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <cmath>
+#include <simdpp/detail/insn/f_trunc.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Rounds the values of a vector towards zero
+    @code
+    r0 = trunc(a0)
+    ...
+    rN = trunc(aN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2, SSE3, SSSE3, 7-9}
+    @icost{NEON, 5-6}
+
+    @par 256-bit version:
+    @icost{SSE2, SSE3, SSSE3, 14-16}
+    @icost{NEON, 10-11}
+    @icost{SSE4.1, ALTIVEC, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+float32<N,expr_empty> trunc(const float32<N,E>& a)
+{
+    return detail::insn::i_trunc(a.eval());
+}
+template<unsigned N, class E> SIMDPP_INL
+float64<N,expr_empty> trunc(const float64<N,E>& a)
+{
+    return detail::insn::i_trunc(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 41 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/for_each.h

@@ -0,0 +1,41 @@
+/*  Copyright (C) 2017  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_FOR_EACH_H
+#define LIBSIMDPP_SIMDPP_CORE_FOR_EACH_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/for_each.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Executes the given function on all elements of the vector.
+
+    Equivalent to:
+    @code
+    function(extract<0>(v));
+    function(extract<1>(v));
+    ...
+    function(extract<N-1>(v));
+    @endcode
+*/
+template<unsigned N, class V, class F> SIMDPP_INL
+void for_each(const any_vec<N, V>& v, F function)
+{
+    detail::for_each(v.wrapped().eval(), function);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 117 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_abs.h

@@ -0,0 +1,117 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_ABS_H
+#define LIBSIMDPP_SIMDPP_CORE_I_ABS_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/i_abs.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Computes absolute value of 8-bit integer values.
+
+    @code
+    r0 = abs(a0)
+    ...
+    rN = abs(aN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSE3, 3}
+    @icost{ALTIVEC, 1-3}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE3, 6}
+    @icost{SSSE3-AVX, NEON, 2}
+    @icost{ALTIVEC, 2-4}
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint8<N, expr_iabs<int8<N,E>>> abs(const int8<N,E>& a)
+{
+    return { { a } };
+}
+
+
+/** Computes absolute value of 16-bit integer values.
+
+    @code
+    r0 = abs(a0)
+    ...
+    rN = abs(aN)
+    @endcode
+    @par 128-bit version:
+    @icost{SSE2-SSE3, 3}
+    @icost{ALTIVEC, 1-3}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE3, 6}
+    @icost{SSSE3-AVX, NEON, 2}
+    @icost{ALTIVEC, 2-5}
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint16<N, expr_iabs<int16<N,E>>> abs(const int16<N,E>& a)
+{
+    return { { a } };
+}
+
+/** Computes absolute value of 32-bit integer values.
+
+    @code
+    r0 = abs(a0)
+    ...
+    rN = abs(aN)
+    @endcode
+    @par 128-bit version:
+    @icost{SSE2-SSE3, 3}
+    @icost{ALTIVEC, 1-3}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE3, 6}
+    @icost{SSSE3-AVX, NEON, 2}
+    @icost{ALTIVEC, 2-4}
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint32<N, expr_iabs<int32<N,E>>> abs(const int32<N,E>& a)
+{
+    return { { a } };
+}
+
+/** Computes absolute value of 64-bit integer values.
+
+    @code
+    r0 = abs(a0)
+    ...
+    rN = abs(aN)
+    @endcode
+    @par 128-bit version:
+    @icost{SSE2-AVX, 5}
+    @icost{NEON, 6}
+    @novec{ALTIVEC}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 10}
+    @icost{NEON, 12}
+    @icost{AVX2, 4}
+    @novec{ALTIVEC}
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint64<N, expr_iabs<int64<N,E>>> abs(const int64<N,E>& a)
+{
+    return { { a } };
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 117 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_add.h

@@ -0,0 +1,117 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_ADD_H
+#define LIBSIMDPP_SIMDPP_CORE_I_ADD_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/i_add.h>
+#include <simdpp/core/detail/get_expr_uint.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+#include <simdpp/core/detail/get_expr_uint.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Adds 8-bit integer values.
+
+    @code
+    r0 = a0 + b0
+    ...
+    rN = aN + bN
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_uint<expr_iadd, V1, V2>::type
+        add(const any_int8<N,V1>& a,
+            const any_int8<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(add, expr_iadd, any_int8, int8)
+
+/** Adds 16-bit integer values.
+
+    @code
+    r0 = a0 + b0
+    ...
+    rN = aN + bN
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_uint<expr_iadd, V1, V2>::type
+        add(const any_int16<N,V1>& a,
+            const any_int16<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(add, expr_iadd, any_int16, int16)
+
+/** Adds 32-bit integer values.
+
+    @code
+    r0 = a0 + b0
+    ...
+    rN = aN + bN
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_uint<expr_iadd, V1, V2>::type
+        add(const any_int32<N,V1>& a,
+            const any_int32<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(add, expr_iadd, any_int32, int32)
+
+/** Adds 64-bit integer values.
+
+    @code
+    r0 = a0 + b0
+    ...
+    rN = aN + bN
+    @endcode
+
+    @par 128-bit version:
+    @icost{ALTIVEC, 5-6}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, 2}
+    @icost{ALTIVEC, 10-11}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_uint<expr_iadd, V1, V2>::type
+        add(const any_int64<N,V1>& a,
+            const any_int64<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(add, expr_iadd, any_int64, int64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 111 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_add_sat.h

@@ -0,0 +1,111 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_ADD_SAT_H
+#define LIBSIMDPP_SIMDPP_CORE_I_ADD_SAT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/i_add_sat.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Adds and saturates signed 8-bit integer values.
+
+    @code
+    r0 = signed_saturate(a0 + b0)
+    ...
+    rN = signed_saturate(aN + bN)
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int8<N, expr_iadd_sat<int8<N,E1>,
+                      int8<N,E2>>> add_sat(const int8<N,E1>& a,
+                                           const int8<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(add_sat, expr_iadd_sat, int8, int8)
+
+/** Adds and saturates signed 16-bit integer values.
+
+    @code
+    r0 = signed_saturate(a0 + b0)
+    ...
+    rN = signed_saturate(aN + bN)
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int16<N, expr_iadd_sat<int16<N,E1>,
+                       int16<N,E2>>> add_sat(const int16<N,E1>& a,
+                                             const int16<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(add_sat, expr_iadd_sat, int16, int16)
+
+/** Adds and saturates unsigned 8-bit integer values.
+
+    @code
+    r0 = unsigned_saturate(a0 + b0)
+    ...
+    rN = unsigned_saturate(aN + bN)
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint8<N, expr_iadd_sat<uint8<N,E1>,
+                       uint8<N,E2>>> add_sat(const uint8<N,E1>& a,
+                                             const uint8<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(add_sat, expr_iadd_sat, uint8, uint8)
+
+/** Adds and saturates unsigned 16-bit integer values.
+
+    @code
+    r0 = unsigned_saturate(a0 + b0)
+    ...
+    rN = unsigned_saturate(aN + bN)
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint16<N, expr_iadd_sat<uint16<N,E1>,
+                        uint16<N,E2>>> add_sat(const uint16<N,E1>& a,
+                                               const uint16<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(add_sat, expr_iadd_sat, uint16, uint16)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 162 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_avg.h

@@ -0,0 +1,162 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_AVG_H
+#define LIBSIMDPP_SIMDPP_CORE_I_AVG_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_avg.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes rounded average of the unsigned 8-bit values.
+
+    @code
+    r0 = (a0 + b0 + 1) / 2
+    ...
+    rN = (aN + bN + 1) / 2
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint8<N,expr_empty> avg(const uint8<N,E1>& a, const uint8<N,E2>& b)
+{
+    return detail::insn::i_avg(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg, uint8, uint8)
+
+/** Computes rounded average of signed 8-bit values.
+
+    @code
+    r0 = (a0 + b0 + 1) / 2
+    ...
+    rN = (aN + bN + 1) / 2
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 4-5}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 8-9}
+    @icost{AVX2, 4-5}
+    @icost{NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int8<N,expr_empty> avg(const int8<N,E1>& a, const int8<N,E2>& b)
+{
+    return detail::insn::i_avg(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg, int8, int8)
+
+/** Computes rounded average of unsigned 16-bit values.
+
+    @code
+    r0 = (a0 + b0 + 1) / 2
+    ...
+    rN = (aN + bN + 1) / 2
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint16<N,expr_empty> avg(const uint16<N,E1>& a, const uint16<N,E2>& b)
+{
+    return detail::insn::i_avg(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg, uint16, uint16)
+
+/** Computes rounded average of signed 16-bit values.
+
+    @code
+    r0 = (a0 + b0 + 1) / 2
+    ...
+    rN = (aN + bN + 1) / 2
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 4-5}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 8-9}
+    @icost{AVX2, 4-5}
+    @icost{NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int16<N,expr_empty> avg(const int16<N,E1>& a, const int16<N,E2>& b)
+{
+    return detail::insn::i_avg(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg, int16, int16)
+
+/** Computes rounded average of unsigned 32-bit values.
+
+    @code
+    r0 = (a0 + b0 + 1) / 2
+    ...
+    rN = (aN + bN + 1) / 2
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 6-7}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 12-13}
+    @icost{AVX2, 6-7}
+    @icost{NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint32<N,expr_empty> avg(const uint32<N,E1>& a, const uint32<N,E2>& b)
+{
+    return detail::insn::i_avg(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg, uint32, uint32)
+
+/** Computes rounded average of signed 32-bit values.
+
+    @code
+    r0 = (a0 + b0 + 1) / 2
+    ...
+    rN = (aN + bN + 1) / 2
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 9-10}
+    @icost{NEON, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 18-19}
+    @icost{AVX2, 9-10}
+    @icost{NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int32<N,expr_empty> avg(const int32<N,E1>& a, const int32<N,E2>& b)
+{
+    return detail::insn::i_avg(a.eval(), b.eval());
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg, int32, int32)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 177 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_avg_trunc.h

@@ -0,0 +1,177 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_AVG_TRUNC_H
+#define LIBSIMDPP_SIMDPP_CORE_I_AVG_TRUNC_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_avg_trunc.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Computes truncated average of the unsigned 8-bit values.
+
+    @code
+    r0 = (a0 + b0) / 2
+    ...
+    rN = (aN + bN) / 2
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 4}
+    @icost{NEON, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 8}
+    @icost{AVX2, 4}
+    @icost{NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint8<N,expr_empty> avg_trunc(const uint8<N,E1>& a, const uint8<N,E2>& b)
+{
+    // Evaluate the two inputs first; detail::insn::i_avg_trunc does the work.
+    const auto x = a.eval();
+    const auto y = b.eval();
+    return detail::insn::i_avg_trunc(x, y);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg_trunc, uint8, uint8)
+
+/** Computes truncated average of signed 8-bit values.
+
+    @code
+    r0 = (a0 + b0) / 2
+    ...
+    rN = (aN + bN) / 2
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 7-8}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 14-15}
+    @icost{AVX2, 7-8}
+    @icost{NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int8<N,expr_empty> avg_trunc(const int8<N,E1>& a, const int8<N,E2>& b)
+{
+    // Evaluate the two inputs first; detail::insn::i_avg_trunc does the work.
+    const auto x = a.eval();
+    const auto y = b.eval();
+    return detail::insn::i_avg_trunc(x, y);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg_trunc, int8, int8)
+
+/** Computes truncated average of unsigned 16-bit values.
+
+    @code
+    r0 = (a0 + b0) / 2
+    ...
+    rN = (aN + bN) / 2
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 4}
+    @icost{NEON, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 8}
+    @icost{AVX2, 4}
+    @icost{NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint16<N,expr_empty> avg_trunc(const uint16<N,E1>& a, const uint16<N,E2>& b)
+{
+    // Evaluate the two inputs first; detail::insn::i_avg_trunc does the work.
+    const auto x = a.eval();
+    const auto y = b.eval();
+    return detail::insn::i_avg_trunc(x, y);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg_trunc, uint16, uint16)
+
+/** Computes truncated average of signed 16-bit values.
+
+    @code
+    r0 = (a0 + b0) / 2
+    ...
+    rN = (aN + bN) / 2
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 7-8}
+    @icost{NEON, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 14-15}
+    @icost{AVX2, 7-8}
+    @icost{NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int16<N,expr_empty> avg_trunc(const int16<N,E1>& a, const int16<N,E2>& b)
+{
+    // Evaluate the two inputs first; detail::insn::i_avg_trunc does the work.
+    const auto x = a.eval();
+    const auto y = b.eval();
+    return detail::insn::i_avg_trunc(x, y);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg_trunc, int16, int16)
+
+/** Computes truncated average of unsigned 32-bit values.
+
+    @code
+    r0 = (a0 + b0) / 2
+    ...
+    rN = (aN + bN) / 2
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 4}
+    @icost{NEON, 1}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 8}
+    @icost{AVX2, 4}
+    @icost{NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint32<N,expr_empty> avg_trunc(const uint32<N,E1>& a, const uint32<N,E2>& b)
+{
+    // Evaluate the two inputs first; detail::insn::i_avg_trunc does the work.
+    const auto x = a.eval();
+    const auto y = b.eval();
+    return detail::insn::i_avg_trunc(x, y);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg_trunc, uint32, uint32)
+
+/** Computes truncated average of signed 32-bit values.
+
+    @code
+    r0 = (a0 + b0) / 2
+    ...
+    rN = (aN + bN) / 2
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 7-8}
+    @icost{ALTIVEC, 4}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 14-15}
+    @icost{AVX2, 7-8}
+    @icost{ALTIVEC, 8}
+    @icost{NEON, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int32<N,expr_empty> avg_trunc(const int32<N,E1>& a, const int32<N,E2>& b)
+{
+    // Evaluate the two inputs first; detail::insn::i_avg_trunc does the work.
+    const auto x = a.eval();
+    const auto y = b.eval();
+    return detail::insn::i_avg_trunc(x, y);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(avg_trunc, int32, int32)
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 131 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_div_p.h

@@ -0,0 +1,131 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_DIV_P_H
+#define LIBSIMDPP_SIMDPP_CORE_I_DIV_P_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/core/bit_and.h>
+#include <simdpp/core/bit_andnot.h>
+#include <simdpp/core/bit_or.h>
+#include <simdpp/core/cmp_lt.h>
+#include <simdpp/core/i_sub.h>
+#include <simdpp/detail/null/math.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+// FIXME: move to adv
+/** Divides one 8-bit unsigned number by another. The precision of the operation
+    is configurable: only the P least significant bits of both numerator and
+    denominator are considered.
+
+    @code
+    r0 = num0 / den0
+    ...
+    rN = numN / denN
+    @endcode
+
+    @tparam P number of bits of precision. Must satisfy 1 <= P <= 8; this is
+        checked at compile time on the non-NULL code path.
+    @param num vector of numerators
+    @param den vector of denominators
+    @return vector of quotients
+
+    @par 128-bit version:
+    The operation costs at least 9 instructions per bit of precision.
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, 10}
+    @icost{AVX2, 4}
+*/
+template<unsigned P> SIMDPP_INL
+uint8x16 div_p(const uint8x16& num, const uint8x16& den)
+{
+#if SIMDPP_USE_NULL
+    return detail::null::div_p<P>(num, den);
+#else
+    // P == 0 would turn the initial mask computation below into a shift by
+    // (unsigned)-1 bits, which is undefined behaviour, so reject it up front.
+    static_assert(P >= 1, "Precision must be at least 1 bit");
+    static_assert(P <= 8, "Precision too large");
+    uint8x16 r, q, bit_mask;
+    r = q = make_zero();
+    // Mask selecting the most significant bit that takes part in the division.
+    bit_mask = make_uint(1 << (P-1));
+
+    // Restoring long division, one quotient bit per iteration (MSB first).
+    for (unsigned i = P; i > 0; i--) {
+        unsigned bit = i-1;
+        uint8x16 n_bit;
+        // we'll never shift out any bits, so larger shift doesn't matter
+        r = shift_l<1>((uint16x8)r);
+
+        // Bring the current numerator bit into the lowest bit of the remainder.
+        n_bit = bit_and(num, bit_mask);
+        n_bit = shift_r((uint16x8)n_bit, bit);
+        r = bit_or(r, n_bit);
+
+        uint8x16 cmp, csub, cbit;
+        cmp = cmp_lt(r, den);
+
+        // Where r >= den (cmp is clear): subtract den and set the quotient bit.
+        csub = bit_andnot(den, cmp);
+        cbit = bit_andnot(bit_mask, cmp);
+        r = sub(r, csub);
+        q = bit_or(q, cbit);
+
+        bit_mask = shift_r<1>((uint16x8)bit_mask);
+    }
+    return q;
+
+    /*
+    The actual algorithm is as follows:
+    N - numerator, D - denominator, R - remainder, Q - quotient
+    R = 0; Q = 0;
+    for (unsigned i = P; i > 0; i--) {
+        unsigned bit = i-1;
+        R <<= 1;
+        R |= (N >> bit) & 1;
+        if (R >= D) {
+            R = R - D;
+            Q |= 1 << bit;
+        }
+    }*/
+#endif
+}
+
+/** Divides one 16-bit unsigned number by another using the same bit-serial
+    scheme as the 8-bit overload: one quotient bit is produced per iteration,
+    starting from bit P-1.
+
+    @code
+    r0 = num0 / den0
+    ...
+    rN = numN / denN
+    @endcode
+
+    @tparam P number of bits of precision. Must satisfy 1 <= P <= 16; this is
+        checked at compile time on the non-NULL code path.
+    @param num vector of numerators
+    @param den vector of denominators
+    @return vector of quotients
+*/
+template<unsigned P> SIMDPP_INL
+uint16x8 div_p(const uint16x8& num, const uint16x8& den)
+{
+#if SIMDPP_USE_NULL
+    return detail::null::div_p<P>(num, den);
+#else
+    // P == 0 would turn the initial mask computation below into a shift by
+    // (unsigned)-1 bits, which is undefined behaviour, so reject it up front.
+    static_assert(P >= 1, "Precision must be at least 1 bit");
+    static_assert(P <= 16, "Precision too large");
+    uint16x8 r, q, bit_mask;
+
+    r = q = make_zero();
+    // Mask selecting the most significant bit that takes part in the division.
+    bit_mask = make_uint(1 << (P-1));
+
+    for (unsigned i = P; i > 0; i--) {
+        unsigned bit = i-1; // TODO precision
+        uint16x8 n_bit;
+        r = shift_l<1>(r);
+
+        // Bring the current numerator bit into the lowest bit of the remainder.
+        n_bit = bit_and(num, bit_mask);
+        n_bit = shift_r(n_bit, bit);
+        r = bit_or(r, n_bit);
+
+        uint16x8 cmp, csub, cbit;
+        cmp = cmp_lt(r, den);
+
+        // Where r >= den (cmp is clear): subtract den and set the quotient bit.
+        csub = bit_andnot(den, cmp);
+        cbit = bit_andnot(bit_mask, cmp);
+        r = sub(r, csub);
+        q = bit_or(q, cbit);
+
+        bit_mask = shift_r<1>(bit_mask);
+    }
+    return q;
+#endif
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 193 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_max.h

@@ -0,0 +1,193 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_MAX_H
+#define LIBSIMDPP_SIMDPP_CORE_I_MAX_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_max.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes maximum of the signed 8-bit values.
+
+    @code
+    r0 = max(a0, b0)
+    ...
+    rN = max(aN, bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 4}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, 8}
+    @icost{SSE4.1-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int8<N,expr_empty> max(const int8<N,E1>& a, const int8<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_max pick the maxima.
+    const auto first = a.eval();
+    const auto second = b.eval();
+    return detail::insn::i_max(first, second);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(max, int8, int8)
+
+/** Computes maximum of the unsigned 8-bit values.
+
+    @code
+    r0 = max(a0, b0)
+    ...
+    rN = max(aN, bN)
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint8<N,expr_empty> max(const uint8<N,E1>& a, const uint8<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_max pick the maxima.
+    const auto first = a.eval();
+    const auto second = b.eval();
+    return detail::insn::i_max(first, second);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(max, uint8, uint8)
+
+/** Computes maximum of the signed 16-bit values.
+
+    @code
+    r0 = max(a0, b0)
+    ...
+    rN = max(aN, bN)
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int16<N,expr_empty> max(const int16<N,E1>& a, const int16<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_max pick the maxima.
+    const auto first = a.eval();
+    const auto second = b.eval();
+    return detail::insn::i_max(first, second);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(max, int16, int16)
+
+/** Computes maximum of the unsigned 16-bit values.
+
+    @code
+    r0 = max(a0, b0)
+    ...
+    rN = max(aN, bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 6-7}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, 12-13}
+    @icost{SSE4.1-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint16<N,expr_empty> max(const uint16<N,E1>& a, const uint16<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_max pick the maxima.
+    const auto first = a.eval();
+    const auto second = b.eval();
+    return detail::insn::i_max(first, second);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(max, uint16, uint16)
+
+/** Computes maximum of the signed 32-bit values.
+
+    @code
+    r0 = max(a0, b0)
+    ...
+    rN = max(aN, bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 4}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, 8}
+    @icost{SSE4.1-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int32<N,expr_empty> max(const int32<N,E1>& a, const int32<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_max pick the maxima.
+    const auto first = a.eval();
+    const auto second = b.eval();
+    return detail::insn::i_max(first, second);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(max, int32, int32)
+
+/** Computes maximum of the unsigned 32-bit values.
+
+    @code
+    r0 = max(a0, b0)
+    ...
+    rN = max(aN, bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 6-7}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, 12-13}
+    @icost{SSE4.1-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint32<N,expr_empty> max(const uint32<N,E1>& a, const uint32<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_max pick the maxima.
+    const auto first = a.eval();
+    const auto second = b.eval();
+    return detail::insn::i_max(first, second);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(max, uint32, uint32)
+
+/** Computes maximum of the signed 64-bit values.
+
+    @code
+    r0 = max(a0, b0)
+    ...
+    rN = max(aN, bN)
+    @endcode
+
+    Supported since AVX2, NEON64. Not supported on ALTIVEC.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int64<N,expr_empty> max(const int64<N,E1>& a, const int64<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_max pick the maxima.
+    const auto first = a.eval();
+    const auto second = b.eval();
+    return detail::insn::i_max(first, second);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(max, int64, int64)
+
+/** Computes maximum of the unsigned 64-bit values.
+
+    @code
+    r0 = max(a0, b0)
+    ...
+    rN = max(aN, bN)
+    @endcode
+
+    Supported since AVX2, NEON64. Not supported on ALTIVEC.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint64<N,expr_empty> max(const uint64<N,E1>& a, const uint64<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_max pick the maxima.
+    const auto first = a.eval();
+    const auto second = b.eval();
+    return detail::insn::i_max(first, second);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(max, uint64, uint64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 194 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_min.h

@@ -0,0 +1,194 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_MIN_H
+#define LIBSIMDPP_SIMDPP_CORE_I_MIN_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_min.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes minimum of signed 8-bit values.
+
+    @code
+    r0 = min(a0, b0)
+    ...
+    rN = min(aN, bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 4}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, 8}
+    @icost{SSE4.1-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int8<N,expr_empty> min(const int8<N,E1>& a, const int8<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_min pick the minima.
+    const auto left = a.eval();
+    const auto right = b.eval();
+    return detail::insn::i_min(left, right);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(min, int8, int8)
+
+/** Computes minimum of the unsigned 8-bit values.
+
+    @code
+    r0 = min(a0, b0)
+    ...
+    rN = min(aN, bN)
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint8<N,expr_empty> min(const uint8<N,E1>& a, const uint8<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_min pick the minima.
+    const auto left = a.eval();
+    const auto right = b.eval();
+    return detail::insn::i_min(left, right);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(min, uint8, uint8)
+
+/** Computes minimum of the signed 16-bit values.
+
+    @code
+    r0 = min(a0, b0)
+    ...
+    rN = min(aN, bN)
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int16<N,expr_empty> min(const int16<N,E1>& a, const int16<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_min pick the minima.
+    const auto left = a.eval();
+    const auto right = b.eval();
+    return detail::insn::i_min(left, right);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(min, int16, int16)
+
+/** Computes minimum of the unsigned 16-bit values.
+
+    @code
+    r0 = min(a0, b0)
+    ...
+    rN = min(aN, bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 6-7}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, 12-13}
+    @icost{SSE4.1-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint16<N,expr_empty> min(const uint16<N,E1>& a, const uint16<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_min pick the minima.
+    const auto left = a.eval();
+    const auto right = b.eval();
+    return detail::insn::i_min(left, right);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(min, uint16, uint16)
+
+/** Computes minimum of the signed 32-bit values.
+
+    @code
+    r0 = min(a0, b0)
+    ...
+    rN = min(aN, bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 4}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, 8}
+    @icost{SSE4.1-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int32<N,expr_empty> min(const int32<N,E1>& a, const int32<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_min pick the minima.
+    const auto left = a.eval();
+    const auto right = b.eval();
+    return detail::insn::i_min(left, right);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(min, int32, int32)
+
+
+/** Computes minimum of the unsigned 32-bit values.
+
+    @code
+    r0 = min(a0, b0)
+    ...
+    rN = min(aN, bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 6-7}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, 12-13}
+    @icost{SSE4.1-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint32<N,expr_empty> min(const uint32<N,E1>& a, const uint32<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_min pick the minima.
+    const auto left = a.eval();
+    const auto right = b.eval();
+    return detail::insn::i_min(left, right);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(min, uint32, uint32)
+
+/** Computes minimum of the signed 64-bit values.
+
+    @code
+    r0 = min(a0, b0)
+    ...
+    rN = min(aN, bN)
+    @endcode
+
+    Supported since AVX2, NEON64. Not supported on ALTIVEC.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int64<N,expr_empty> min(const int64<N,E1>& a, const int64<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_min pick the minima.
+    const auto left = a.eval();
+    const auto right = b.eval();
+    return detail::insn::i_min(left, right);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(min, int64, int64)
+
+/** Computes minimum of the unsigned 64-bit values.
+
+    @code
+    r0 = min(a0, b0)
+    ...
+    rN = min(aN, bN)
+    @endcode
+
+    Supported since AVX2, NEON64. Not supported on ALTIVEC.
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint64<N,expr_empty> min(const uint64<N,E1>& a, const uint64<N,E2>& b)
+{
+    // Evaluate each argument, then let detail::insn::i_min pick the minima.
+    const auto left = a.eval();
+    const auto right = b.eval();
+    return detail::insn::i_min(left, right);
+}
+
+SIMDPP_SCALAR_ARG_IMPL_VEC(min, uint64, uint64)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 129 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_mul.h

@@ -0,0 +1,129 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_MUL_H
+#define LIBSIMDPP_SIMDPP_CORE_I_MUL_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/i_mul.h>
+#include <simdpp/core/detail/get_expr_uint.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+#include <simdpp/core/detail/get_expr_uint.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+// no 8 bit multiplications in SSE
+/** Multiplies 16-bit values and returns the lower part of the multiplication
+
+    @code
+    r0 = low(a0 * b0)
+    ...
+    rN = low(aN * bN)
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_uint<expr_mul_lo, V1, V2>::type
+        mul_lo(const any_int16<N,V1>& a,
+               const any_int16<N,V2>& b)
+{
+    // No multiplication is carried out in this body: the wrapped() operands
+    // are only packed into the expr_mul_lo-based type chosen by get_expr_uint.
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+// NOTE(review): presumably generates the overloads taking a scalar argument;
+// confirm against core/detail/scalar_arg_impl.h.
+SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(mul_lo, expr_mul_lo, any_int16, int16)
+
+/** Multiplies signed 16-bit values and returns the higher half of the result.
+
+    @code
+    r0 = high(a0 * b0)
+    ...
+    rN = high(aN * bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{NEON, ALTIVEC, 3}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 2}
+    @icost{NEON, ALTIVEC, 6}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int16<N, expr_mul_hi<int16<N,E1>,
+                     int16<N,E2>>> mul_hi(const int16<N,E1>& a,
+                                          const int16<N,E2>& b)
+{
+    // No multiplication is carried out in this body: both operands are only
+    // stored in an expr_mul_hi node carried by the returned vector type.
+    return { { a, b } };
+}
+
+// NOTE(review): presumably generates the overloads taking a scalar argument;
+// confirm against core/detail/scalar_arg_impl.h.
+SIMDPP_SCALAR_ARG_IMPL_EXPR(mul_hi, expr_mul_hi, int16, int16)
+
+/** Multiplies unsigned 16-bit values and returns the higher half of the result.
+
+    @code
+    r0 = high(a0 * b0)
+    ...
+    rN = high(aN * bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{NEON, ALTIVEC, 3}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 2}
+    @icost{NEON, ALTIVEC, 6}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint16<N, expr_mul_hi<uint16<N,E1>,
+                      uint16<N,E2>>> mul_hi(const uint16<N,E1>& a,
+                                            const uint16<N,E2>& b)
+{
+    // No multiplication is carried out in this body: both operands are only
+    // stored in an expr_mul_hi node carried by the returned vector type.
+    return { { a, b } };
+}
+
+// NOTE(review): presumably generates the overloads taking a scalar argument;
+// confirm against core/detail/scalar_arg_impl.h.
+SIMDPP_SCALAR_ARG_IMPL_EXPR(mul_hi, expr_mul_hi, uint16, uint16)
+
+
+/** Multiplies 32-bit values and returns the lower half of the result.
+
+    @code
+    r0 = low(a0 * b0)
+    ...
+    rN = low(aN * bN)
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-SSSE3, 6}
+    @icost{ALTIVEC, 8}
+
+    @par 256-bit version:
+    @icost{SSE2-SSSE3, 12}
+    @icost{SSE4.1, AVX, NEON, 2}
+    @icost{ALTIVEC, 16}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_uint<expr_mul_lo, V1, V2>::type
+        mul_lo(const any_int32<N,V1>& a,
+               const any_int32<N,V2>& b)
+{
+    // No multiplication is carried out in this body: the wrapped() operands
+    // are only packed into the expr_mul_lo-based type chosen by get_expr_uint.
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+// NOTE(review): presumably generates the overloads taking a scalar argument;
+// confirm against core/detail/scalar_arg_impl.h.
+SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(mul_lo, expr_mul_lo, any_int32, int32)
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 156 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_mull.h

@@ -0,0 +1,156 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_MULL_H
+#define LIBSIMDPP_SIMDPP_CORE_I_MULL_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/i_mull.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/*  Note: widening integer multiplication instructions are very different among
+    instruction sets. The main difference is in which half of the elements are
+    selected for multiplication. Trying to abstract this incurs definite
+    overhead.
+
+     - SSE2-SSE4.1 and AVX2 provide only instructions with interfaces similar
+        to mul_lo and mul_hi. The result vectors must be interleaved to obtain
+        contiguous result values. Multiplying 2 vectors always incurs
+        overhead of at least two interleaving instructions.
+
+     - AVX512 only provides 32-bit integer support. Widening multiplication
+        can be done only by using PMULDQ, which takes odd elements and produces
+        widened multiplication results. Multiplication of two whole vectors
+        always incurs overhead of at least two shifts or interleaving
+        instructions.
+
+     - NEON, NEONv2 provide instructions that take elements of either the lower
+        or higher halves of two 128-bit vectors and multiply them. No
+        additional overhead is incurred to obtain contiguous result values.
+
+     - ALTIVEC has multiply odd and multiply even instructions. No additional
+        overhead is incurred to obtain contiguous result values.
+
+    The abstraction below uses the NEON model. No additional overhead is
+    incurred on SSE/AVX and NEON. On ALTIVEC, a single additional permute
+    instruction is needed for each vector multiplication on average.
+*/
+
+/** Multiplies signed 16-bit values and expands the results to 32 bits.
+
+    @par 128-bit version:
+    @code
+    r0 = a0 * b0
+    ...
+    rN = aN * bN
+    @endcode
+
+    @icost{SSE2-AVX, ALTIVEC, 2-3}
+
+    @par 256-bit version:
+
+    @icost{SSE2-AVX, ALTIVEC, 4-6}
+    @icost{AVX2, NEON, 2-3}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int32<N, expr_mull<int16<N,E1>,
+                   int16<N,E2>>> mull(const int16<N,E1>& a, const int16<N,E2>& b)
+{
+    // No multiplication is carried out in this body: both 16-bit operands are
+    // only stored in an expr_mull node carried by the 32-bit result type.
+    return { { a, b } };
+}
+
+// NOTE(review): presumably generates the overloads taking a scalar argument;
+// confirm against core/detail/scalar_arg_impl.h.
+SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, int32, int16)
+
+/** Multiplies unsigned 16-bit values and expands the results to 32 bits.
+
+    @par 128-bit version:
+    @code
+    r0 = a0 * b0
+    ...
+    rN = aN * bN
+    @endcode
+
+    @icost{SSE2-AVX2, ALTIVEC, 2-3}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, ALTIVEC, 4-6}
+    @icost{AVX2, 2-3}
+    @icost{NEON, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint32<N, expr_mull<uint16<N,E1>,
+                    uint16<N,E2>>> mull(const uint16<N,E1>& a, const uint16<N,E2>& b)
+{
+    // No multiplication is carried out in this body: both 16-bit operands are
+    // only stored in an expr_mull node carried by the 32-bit result type.
+    return { { a, b } };
+}
+
+// NOTE(review): presumably generates the overloads taking a scalar argument;
+// confirm against core/detail/scalar_arg_impl.h.
+SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, uint32, uint16)
+
+/** Multiplies signed 32-bit values and expands the results to 64 bits.
+
+    @code
+    r0 = a0 * b0
+    ...
+    rN = aN * bN
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE4.1-AVX, 3}
+    @unimp{SSE2-SSSE3, ALTIVEC}
+
+    @par 256-bit version:
+    @icost{SSE4.1-AVX, 6}
+    @icost{AVX2, 3}
+    @icost{NEON, 2}
+    @unimp{SSE2-SSSE3, ALTIVEC}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int64<N, expr_mull<int32<N,E1>,
+                   int32<N,E2>>> mull(const int32<N,E1>& a, const int32<N,E2>& b)
+{
+    // No multiplication is carried out in this body: both 32-bit operands are
+    // only stored in an expr_mull node carried by the 64-bit result type.
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, int64, int32)
+
+/** Multiplies unsigned 32-bit values in the lower halves of the vectors and
+    expands the results to 64 bits.
+
+    @par 128-bit version:
+    @code
+    r0 = a0 * b0
+    r1 = a1 * b1
+    @endcode
+    @icost{SSE2-AVX, 3}
+    @unimp{ALTIVEC}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 6}
+    @icost{AVX2, 3}
+    @icost{NEON, 2}
+    @unimp{ALTIVEC}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint64<N, expr_mull<uint32<N,E1>,
+                    uint32<N,E2>>> mull(const uint32<N,E1>& a, const uint32<N,E2>& b)
+{
+    // No multiplication is carried out in this body: both 32-bit operands are
+    // only stored in an expr_mull node carried by the 64-bit result type.
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, uint64, uint32)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 97 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_neg.h

@@ -0,0 +1,97 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_NEG_H
+#define LIBSIMDPP_SIMDPP_CORE_I_NEG_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/i_neg.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Negates signed 8-bit values.
+
+    @code
+    r0 = -a0
+    ...
+    rN = -aN
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+int8<N, expr_ineg<int8<N,E>>> neg(const int8<N,E>& a)
+{
+    // No negation is carried out in this body: the operand is only stored in
+    // an expr_ineg node carried by the returned vector type.
+    return { { a } };
+}
+
+/** Negates signed 16-bit values.
+
+    @code
+    r0 = -a0
+    ...
+    rN = -aN
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+int16<N, expr_ineg<int16<N,E>>> neg(const int16<N,E>& a)
+{
+    // No negation is carried out in this body: the operand is only stored in
+    // an expr_ineg node carried by the returned vector type.
+    return { { a } };
+}
+
+/** Negates signed 32-bit values.
+
+    @code
+    r0 = -a0
+    ...
+    rN = -aN
+    @endcode
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E> SIMDPP_INL
+int32<N, expr_ineg<int32<N,E>>> neg(const int32<N,E>& a)
+{
+    // No negation is carried out in this body: the operand is only stored in
+    // an expr_ineg node carried by the returned vector type.
+    return { { a } };
+}
+
+/** Negates signed 64-bit values.
+
+    @code
+    r0 = -a0
+    ...
+    rN = -aN
+    @endcode
+
+    @par 128-bit version:
+    @icost{ALTIVEC, 4-5}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, 2}
+    @icost{ALTIVEC, 8-9}
+*/
+template<unsigned N, class E> SIMDPP_INL
+int64<N, expr_ineg<int64<N,E>>> neg(const int64<N,E>& a)
+{
+    // No negation is carried out in this body: the operand is only stored in
+    // an expr_ineg node carried by the returned vector type.
+    return { { a } };
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 82 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_popcnt.h

@@ -0,0 +1,82 @@
+/*  Copyright (C) 2017  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_POPCNT_H
+#define LIBSIMDPP_SIMDPP_CORE_I_POPCNT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_popcnt.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Computes the population count of elements in the vector.
+
+    @code
+    r0 = popcnt(a0)
+    r1 = popcnt(a1)
+    ...
+    rN = popcnt(aN)
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int8<N,expr_empty> popcnt(const int8<N,E>& a)
+{
+    // Convert the evaluated vector to its unsigned counterpart before
+    // handing it to detail::insn::i_popcnt.
+    const uint8<N> bits(a.eval());
+    return detail::insn::i_popcnt(bits);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint8<N,expr_empty> popcnt(const uint8<N,E>& a)
+{
+    // Evaluate the operand, then forward it to detail::insn::i_popcnt.
+    const auto bits = a.eval();
+    return detail::insn::i_popcnt(bits);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+int16<N,expr_empty> popcnt(const int16<N,E>& a)
+{
+    // Convert the evaluated vector to its unsigned counterpart before
+    // handing it to detail::insn::i_popcnt.
+    const uint16<N> bits(a.eval());
+    return detail::insn::i_popcnt(bits);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint16<N,expr_empty> popcnt(const uint16<N,E>& a)
+{
+    // Evaluate the operand, then forward it to detail::insn::i_popcnt.
+    const auto bits = a.eval();
+    return detail::insn::i_popcnt(bits);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+int32<N,expr_empty> popcnt(const int32<N,E>& a)
+{
+    // Convert the evaluated vector to its unsigned counterpart before
+    // handing it to detail::insn::i_popcnt.
+    const uint32<N> bits(a.eval());
+    return detail::insn::i_popcnt(bits);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint32<N,expr_empty> popcnt(const uint32<N,E>& a)
+{
+    // Evaluate the operand, then forward it to detail::insn::i_popcnt.
+    const auto bits = a.eval();
+    return detail::insn::i_popcnt(bits);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+int64<N,expr_empty> popcnt(const int64<N,E>& a)
+{
+    // Convert the evaluated vector to its unsigned counterpart before
+    // handing it to detail::insn::i_popcnt.
+    const uint64<N> bits(a.eval());
+    return detail::insn::i_popcnt(bits);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint64<N,expr_empty> popcnt(const uint64<N,E>& a)
+{
+    // Evaluate the operand, then forward it to detail::insn::i_popcnt.
+    const auto bits = a.eval();
+    return detail::insn::i_popcnt(bits);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 82 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_add.h

@@ -0,0 +1,82 @@
+/*  Copyright (C) 2016  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_REDUCE_ADD_H
+#define LIBSIMDPP_SIMDPP_CORE_I_REDUCE_ADD_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_reduce_add.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes the sum of the elements in the vector. Intermediate and the
+    final result has twice as many bits as the input element size in 8 and 16
+    bit cases.
+
+    @code
+    r0 = a0 + a1 + a2 + ...
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int16_t reduce_add(const int8<N,E>& a)
+{
+    // Evaluate the operand; detail::insn::i_reduce_add produces the sum.
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_add(lanes);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint16_t reduce_add(const uint8<N,E>& a)
+{
+    // Evaluate the operand; detail::insn::i_reduce_add produces the sum.
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_add(lanes);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+int32_t reduce_add(const int16<N,E>& a)
+{
+    // Evaluate the operand; detail::insn::i_reduce_add produces the sum.
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_add(lanes);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_add(const uint16<N,E>& a)
+{
+    // Evaluate the operand; detail::insn::i_reduce_add produces the sum.
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_add(lanes);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+int32_t reduce_add(const int32<N,E>& a)
+{
+    // The sum is computed on the unsigned counterpart of the evaluated vector.
+    const uint32<N> lanes(a.eval());
+    return detail::insn::i_reduce_add(lanes);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_add(const uint32<N,E>& a)
+{
+    // Evaluate the operand; detail::insn::i_reduce_add produces the sum.
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_add(lanes);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+int64_t reduce_add(const int64<N,E>& a)
+{
+    // The sum is computed on the unsigned counterpart of the evaluated vector.
+    const uint64<N> lanes(a.eval());
+    return detail::insn::i_reduce_add(lanes);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint64_t reduce_add(const uint64<N,E>& a)
+{
+    // Evaluate the operand; detail::insn::i_reduce_add produces the sum.
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_add(lanes);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 80 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_and.h

@@ -0,0 +1,80 @@
+/*  Copyright (C) 2016  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_REDUCE_AND_H
+#define LIBSIMDPP_SIMDPP_CORE_I_REDUCE_AND_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_reduce_and.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Horizontal bitwise AND: combines all elements of the vector with `&`.
+
+    Signed overloads pass the operand to the unsigned implementation of the
+    same element width; the scalar result has the element width of the input.
+
+    @code
+    r0 = a0 & a1 & a2 & ...
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int8_t reduce_and(const int8<N,E>& a)
+{
+    const uint8<N> as_unsigned(a.eval());
+    return detail::insn::i_reduce_and(as_unsigned);
+}
+
+/// Unsigned 8-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint8_t reduce_and(const uint8<N,E>& a)
+{
+    const auto elems = a.eval();
+    return detail::insn::i_reduce_and(elems);
+}
+
+/// Signed 16-bit overload (computed on the unsigned 16-bit vector type).
+template<unsigned N, class E> SIMDPP_INL
+int16_t reduce_and(const int16<N,E>& a)
+{
+    const uint16<N> as_unsigned(a.eval());
+    return detail::insn::i_reduce_and(as_unsigned);
+}
+
+/// Unsigned 16-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint16_t reduce_and(const uint16<N,E>& a)
+{
+    const auto elems = a.eval();
+    return detail::insn::i_reduce_and(elems);
+}
+
+/// Signed 32-bit overload (computed on the unsigned 32-bit vector type).
+template<unsigned N, class E> SIMDPP_INL
+int32_t reduce_and(const int32<N,E>& a)
+{
+    const uint32<N> as_unsigned(a.eval());
+    return detail::insn::i_reduce_and(as_unsigned);
+}
+
+/// Unsigned 32-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_and(const uint32<N,E>& a)
+{
+    const auto elems = a.eval();
+    return detail::insn::i_reduce_and(elems);
+}
+
+/// Signed 64-bit overload (computed on the unsigned 64-bit vector type).
+template<unsigned N, class E> SIMDPP_INL
+int64_t reduce_and(const int64<N,E>& a)
+{
+    const uint64<N> as_unsigned(a.eval());
+    return detail::insn::i_reduce_and(as_unsigned);
+}
+
+/// Unsigned 64-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint64_t reduce_and(const uint64<N,E>& a)
+{
+    const auto elems = a.eval();
+    return detail::insn::i_reduce_and(elems);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 80 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_max.h

@@ -0,0 +1,80 @@
+/*  Copyright (C) 2016  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_REDUCE_MAX_H
+#define LIBSIMDPP_SIMDPP_CORE_I_REDUCE_MAX_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_reduce_max.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Horizontal maximum: returns the largest element of the vector.
+
+    Each overload evaluates its operand and forwards it unchanged to
+    detail::insn::i_reduce_max; the scalar result has the element type of
+    the input vector.
+
+    @code
+    r0 = max(a0, a1, a2, ...)
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int8_t reduce_max(const int8<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_max(lanes);
+}
+
+/// Unsigned 8-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint8_t reduce_max(const uint8<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_max(lanes);
+}
+
+/// Signed 16-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+int16_t reduce_max(const int16<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_max(lanes);
+}
+
+/// Unsigned 16-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint16_t reduce_max(const uint16<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_max(lanes);
+}
+
+/// Signed 32-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+int32_t reduce_max(const int32<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_max(lanes);
+}
+
+/// Unsigned 32-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_max(const uint32<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_max(lanes);
+}
+
+/// Signed 64-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+int64_t reduce_max(const int64<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_max(lanes);
+}
+
+/// Unsigned 64-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint64_t reduce_max(const uint64<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_max(lanes);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 80 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_min.h

@@ -0,0 +1,80 @@
+/*  Copyright (C) 2016  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_REDUCE_MIN_H
+#define LIBSIMDPP_SIMDPP_CORE_I_REDUCE_MIN_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_reduce_min.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Horizontal minimum: returns the smallest element of the vector.
+
+    Each overload evaluates its operand and forwards it unchanged to
+    detail::insn::i_reduce_min; the scalar result has the element type of
+    the input vector.
+
+    @code
+    r0 = min(a0, a1, a2, ...)
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int8_t reduce_min(const int8<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_min(lanes);
+}
+
+/// Unsigned 8-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint8_t reduce_min(const uint8<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_min(lanes);
+}
+
+/// Signed 16-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+int16_t reduce_min(const int16<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_min(lanes);
+}
+
+/// Unsigned 16-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint16_t reduce_min(const uint16<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_min(lanes);
+}
+
+/// Signed 32-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+int32_t reduce_min(const int32<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_min(lanes);
+}
+
+/// Unsigned 32-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_min(const uint32<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_min(lanes);
+}
+
+/// Signed 64-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+int64_t reduce_min(const int64<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_min(lanes);
+}
+
+/// Unsigned 64-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint64_t reduce_min(const uint64<N,E>& a)
+{
+    const auto lanes = a.eval();
+    return detail::insn::i_reduce_min(lanes);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 58 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_mul.h

@@ -0,0 +1,58 @@
+/*  Copyright (C) 2016  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_REDUCE_MUL_H
+#define LIBSIMDPP_SIMDPP_CORE_I_REDUCE_MUL_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_reduce_mul.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Horizontal product: multiplies all elements of the vector together.
+
+    For 16-bit elements the intermediate and final results are computed in
+    32-bit precision. The behavior is undefined if the product overflows.
+    Only 16-bit and 32-bit element types are provided here.
+
+    @code
+    r0 = a0 * a1 * a2 * ...
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int32_t reduce_mul(const int16<N,E>& a)
+{
+    const auto factors = a.eval();
+    return detail::insn::i_reduce_mul(factors);
+}
+
+/// Unsigned 16-bit overload; the product is returned as uint32_t.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_mul(const uint16<N,E>& a)
+{
+    const auto factors = a.eval();
+    return detail::insn::i_reduce_mul(factors);
+}
+
+/// Signed 32-bit overload; the operand is handed to the unsigned 32-bit
+/// implementation and the result is returned as int32_t.
+template<unsigned N, class E> SIMDPP_INL
+int32_t reduce_mul(const int32<N,E>& a)
+{
+    const uint32<N> as_unsigned(a.eval());
+    return detail::insn::i_reduce_mul(as_unsigned);
+}
+
+/// Unsigned 32-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_mul(const uint32<N,E>& a)
+{
+    const auto factors = a.eval();
+    return detail::insn::i_reduce_mul(factors);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 80 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_or.h

@@ -0,0 +1,80 @@
+/*  Copyright (C) 2016  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_REDUCE_OR_H
+#define LIBSIMDPP_SIMDPP_CORE_I_REDUCE_OR_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_reduce_or.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Computes the bitwise OR of the elements in the vector.
+
+    Signed overloads convert the evaluated operand to the unsigned vector
+    type of the same element width before calling detail::insn::i_reduce_or.
+
+    @code
+    r0 = a0 | a1 | a2 | ...
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int8_t reduce_or(const int8<N,E>& a)
+{
+    return detail::insn::i_reduce_or(uint8<N>(a.eval()));
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint8_t reduce_or(const uint8<N,E>& a)
+{
+    return detail::insn::i_reduce_or(a.eval());
+}
+
+template<unsigned N, class E> SIMDPP_INL
+int16_t reduce_or(const int16<N,E>& a)
+{
+    return detail::insn::i_reduce_or(uint16<N>(a.eval()));
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint16_t reduce_or(const uint16<N,E>& a)
+{
+    return detail::insn::i_reduce_or(a.eval());
+}
+
+template<unsigned N, class E> SIMDPP_INL
+int32_t reduce_or(const int32<N,E>& a)
+{
+    return detail::insn::i_reduce_or(uint32<N>(a.eval()));
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_or(const uint32<N,E>& a)
+{
+    return detail::insn::i_reduce_or(a.eval());
+}
+
+template<unsigned N, class E> SIMDPP_INL
+int64_t reduce_or(const int64<N,E>& a)
+{
+    return detail::insn::i_reduce_or(uint64<N>(a.eval()));
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint64_t reduce_or(const uint64<N,E>& a)
+{
+    return detail::insn::i_reduce_or(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 80 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_reduce_popcnt.h

@@ -0,0 +1,80 @@
+/*  Copyright (C) 2017  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_REDUCE_POPCNT_H
+#define LIBSIMDPP_SIMDPP_CORE_I_REDUCE_POPCNT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/i_reduce_popcnt.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Population count over the whole vector: the number of set bits summed
+    across all elements.
+
+    Every overload except the uint32 one first converts the evaluated
+    operand to a uint32 vector with a matching total element count scaled by
+    the width ratio (N/4 for 8-bit, N/2 for 16-bit, N for 32-bit, N*2 for
+    64-bit input) and then calls detail::insn::i_reduce_popcnt on it.
+
+    @code
+    result = popcnt(a0) + popcnt(a1) + ...
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_popcnt(const int8<N,E>& a)
+{
+    const uint32<N/4> words(a.eval());
+    return detail::insn::i_reduce_popcnt(words);
+}
+
+/// Unsigned 8-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_popcnt(const uint8<N,E>& a)
+{
+    const uint32<N/4> words(a.eval());
+    return detail::insn::i_reduce_popcnt(words);
+}
+
+/// Signed 16-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_popcnt(const int16<N,E>& a)
+{
+    const uint32<N/2> words(a.eval());
+    return detail::insn::i_reduce_popcnt(words);
+}
+
+/// Unsigned 16-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_popcnt(const uint16<N,E>& a)
+{
+    const uint32<N/2> words(a.eval());
+    return detail::insn::i_reduce_popcnt(words);
+}
+
+/// Signed 32-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_popcnt(const int32<N,E>& a)
+{
+    const uint32<N> words(a.eval());
+    return detail::insn::i_reduce_popcnt(words);
+}
+
+/// Unsigned 32-bit overload; the evaluated operand is used directly.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_popcnt(const uint32<N,E>& a)
+{
+    const auto words = a.eval();
+    return detail::insn::i_reduce_popcnt(words);
+}
+
+/// Signed 64-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_popcnt(const int64<N,E>& a)
+{
+    const uint32<N*2> words(a.eval());
+    return detail::insn::i_reduce_popcnt(words);
+}
+
+/// Unsigned 64-bit overload.
+template<unsigned N, class E> SIMDPP_INL
+uint32_t reduce_popcnt(const uint64<N,E>& a)
+{
+    const uint32<N*2> words(a.eval());
+    return detail::insn::i_reduce_popcnt(words);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 303 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_shift_l.h

@@ -0,0 +1,303 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_H
+#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/capabilities.h>
+#include <simdpp/detail/insn/i_shift_l.h>
+#include <simdpp/detail/insn/i_shift_l_v.h>
+#include <simdpp/detail/not_implemented.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+// -----------------------------------------------------------------------------
+// shift by scalar
+
+/** Shifts 8-bit values left by @a count bits while shifting in zeros.
+
+    The signed overload converts the evaluated operand to the unsigned
+    vector type of the same width before calling detail::insn::i_shift_l.
+    The run-time @a count is passed through without a range check here.
+
+    @code
+    r0 = a0 << count
+    ...
+    rN = aN << count
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int8<N,expr_empty> shift_l(const int8<N,E>& a, unsigned count)
+{
+    uint8<N> qa = a.eval();
+    return detail::insn::i_shift_l(qa, count);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint8<N,expr_empty> shift_l(const uint8<N,E>& a, unsigned count)
+{
+    return detail::insn::i_shift_l(a.eval(), count);
+}
+
+/** Shifts 16-bit values left by @a count bits while shifting in zeros.
+
+    The signed overload converts the evaluated operand to uint16 first.
+
+    @code
+    r0 = a0 << count
+    ...
+    rN = aN << count
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int16<N,expr_empty> shift_l(const int16<N,E>& a, unsigned count)
+{
+    uint16<N> qa = a.eval();
+    return detail::insn::i_shift_l(qa, count);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint16<N,expr_empty> shift_l(const uint16<N,E>& a, unsigned count)
+{
+    return detail::insn::i_shift_l(a.eval(), count);
+}
+
+/** Shifts 32-bit values left by @a count bits while shifting in zeros.
+
+    The signed overload converts the evaluated operand to uint32 first.
+
+    @code
+    r0 = a0 << count
+    ...
+    rN = aN << count
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int32<N,expr_empty> shift_l(const int32<N,E>& a, unsigned count)
+{
+    uint32<N> qa = a.eval();
+    return detail::insn::i_shift_l(qa, count);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint32<N,expr_empty> shift_l(const uint32<N,E>& a, unsigned count)
+{
+    return detail::insn::i_shift_l(a.eval(), count);
+}
+
+/** Shifts 64-bit values left by @a count bits while shifting in zeros.
+
+    The signed overload converts the evaluated operand to uint64 first.
+
+    @code
+    r0 = a0 << count
+    ...
+    rN = aN << count
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int64<N,expr_empty> shift_l(const int64<N,E>& a, unsigned count)
+{
+    uint64<N> qa = a.eval();
+    return detail::insn::i_shift_l(qa, count);
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint64<N,expr_empty> shift_l(const uint64<N,E>& a, unsigned count)
+{
+    return detail::insn::i_shift_l(a.eval(), count);
+}
+
+// -----------------------------------------------------------------------------
+// shift by vector
+
+/** Shifts 8-bit values left by the number of bits in corresponding element
+    in the given count vector. Zero bits are shifted in.
+
+    When the guarding capability macro is false the overload returns
+    SIMDPP_NOT_IMPLEMENTED_TEMPLATE2 instead of performing a shift. Only
+    8-, 16- and 32-bit by-vector overloads are declared in this file.
+
+    NOTE(review): all by-vector shift_l overloads below are guarded by the
+    SIMDPP_HAS_..._SHIFT_R_BY_VECTOR capability macros although these are
+    left shifts. If capabilities.h defines SHIFT_L_BY_VECTOR counterparts
+    they were probably intended here -- confirm before changing.
+
+    @code
+    r0 = a0 << count0
+    ...
+    rN = aN << countN
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int8<N,expr_empty> shift_l(const int8<N,E>& a, const uint8<N,E>& count)
+{
+#if SIMDPP_HAS_INT8_SHIFT_R_BY_VECTOR
+    uint8<N> qa = a.eval();
+    return detail::insn::i_shift_l_v(qa, count.eval());
+#else
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#endif
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint8<N,expr_empty> shift_l(const uint8<N,E>& a, const uint8<N,E>& count)
+{
+#if SIMDPP_HAS_UINT8_SHIFT_R_BY_VECTOR
+    return detail::insn::i_shift_l_v(a.eval(), count.eval());
+#else
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#endif
+}
+
+/** Shifts 16-bit values left by the number of bits in corresponding element
+    in the given count vector. Zero bits are shifted in.
+
+    NOTE(review): guarded by the SHIFT_R_BY_VECTOR capability macros; see
+    whether SHIFT_L_BY_VECTOR was intended -- confirm.
+
+    @code
+    r0 = a0 << count0
+    ...
+    rN = aN << countN
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int16<N,expr_empty> shift_l(const int16<N,E>& a, const uint16<N,E>& count)
+{
+#if SIMDPP_HAS_INT16_SHIFT_R_BY_VECTOR
+    uint16<N> qa = a.eval();
+    return detail::insn::i_shift_l_v(qa, count.eval());
+#else
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#endif
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint16<N,expr_empty> shift_l(const uint16<N,E>& a, const uint16<N,E>& count)
+{
+#if SIMDPP_HAS_UINT16_SHIFT_R_BY_VECTOR
+    return detail::insn::i_shift_l_v(a.eval(), count.eval());
+#else
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#endif
+}
+
+/** Shifts 32-bit values left by the number of bits in corresponding element
+    in the given count vector. Zero bits are shifted in.
+
+    NOTE(review): guarded by the SHIFT_R_BY_VECTOR capability macros; see
+    whether SHIFT_L_BY_VECTOR was intended -- confirm.
+
+    @code
+    r0 = a0 << count0
+    ...
+    rN = aN << countN
+    @endcode
+*/
+template<unsigned N, class E> SIMDPP_INL
+int32<N,expr_empty> shift_l(const int32<N,E>& a, const uint32<N,E>& count)
+{
+#if SIMDPP_HAS_INT32_SHIFT_R_BY_VECTOR
+    uint32<N> qa = a.eval();
+    return detail::insn::i_shift_l_v(qa, count.eval());
+#else
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#endif
+}
+
+template<unsigned N, class E> SIMDPP_INL
+uint32<N,expr_empty> shift_l(const uint32<N,E>& a, const uint32<N,E>& count)
+{
+#if SIMDPP_HAS_UINT32_SHIFT_R_BY_VECTOR
+    return detail::insn::i_shift_l_v(a.eval(), count.eval());
+#else
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#endif
+}
+
+// -----------------------------------------------------------------------------
+// shift by compile-time constant
+
+/** Shifts 8-bit values left by @a count bits while shifting in zeros.
+
+    @a count is a template argument and must be less than 8; larger values
+    are rejected at compile time by the static_assert. The implementation is
+    selected through i_shift_l_wrapper, parameterized on whether @a count is
+    zero.
+
+    @code
+    r0 = a0 << count
+    ...
+    rN = aN << count
+    @endcode
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+int8<N,expr_empty> shift_l(const int8<N,E>& a)
+{
+    static_assert(count < 8, "Shift out of bounds");
+    uint8<N> qa = a.eval();
+    return detail::insn::i_shift_l_wrapper<count == 0>::template run<count>(qa);
+}
+
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+uint8<N,expr_empty> shift_l(const uint8<N,E>& a)
+{
+    static_assert(count < 8, "Shift out of bounds");
+    return detail::insn::i_shift_l_wrapper<count == 0>::template run<count>(a.eval());
+}
+
+/** Shifts 16-bit values left by @a count bits while shifting in zeros.
+
+    @a count is a template argument and must be less than 16.
+
+    @code
+    r0 = a0 << count
+    ...
+    rN = aN << count
+    @endcode
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+int16<N,expr_empty> shift_l(const int16<N,E>& a)
+{
+    static_assert(count < 16, "Shift out of bounds");
+    uint16<N> qa = a.eval();
+    return detail::insn::i_shift_l_wrapper<count == 0>::template run<count>(qa);
+}
+
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+uint16<N,expr_empty> shift_l(const uint16<N,E>& a)
+{
+    static_assert(count < 16, "Shift out of bounds");
+    return detail::insn::i_shift_l_wrapper<count == 0>::template run<count>(a.eval());
+}
+
+/** Shifts 32-bit values left by @a count bits while shifting in zeros.
+
+    @a count is a template argument and must be less than 32.
+
+    @code
+    r0 = a0 << count
+    ...
+    rN = aN << count
+    @endcode
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+int32<N,expr_empty> shift_l(const int32<N,E>& a)
+{
+    static_assert(count < 32, "Shift out of bounds");
+    uint32<N> qa = a.eval();
+    return detail::insn::i_shift_l_wrapper<count == 0>::template run<count>(qa);
+}
+
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+uint32<N,expr_empty> shift_l(const uint32<N,E>& a)
+{
+    static_assert(count < 32, "Shift out of bounds");
+    return detail::insn::i_shift_l_wrapper<count == 0>::template run<count>(a.eval());
+}
+
+/** Shifts 64-bit values left by @a count bits while shifting in zeros.
+
+    @a count is a template argument and must be less than 64.
+
+    @code
+    r0 = a0 << count
+    ...
+    rN = aN << count
+    @endcode
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+int64<N,expr_empty> shift_l(const int64<N,E>& a)
+{
+    static_assert(count < 64, "Shift out of bounds");
+    uint64<N> qa = a.eval();
+    return detail::insn::i_shift_l_wrapper<count == 0>::template run<count>(qa);
+}
+
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+uint64<N,expr_empty> shift_l(const uint64<N,E>& a)
+{
+    static_assert(count < 64, "Shift out of bounds");
+    return detail::insn::i_shift_l_wrapper<count == 0>::template run<count>(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 398 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_shift_r.h

@@ -0,0 +1,398 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_R_H
+#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_R_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/capabilities.h>
+#include <simdpp/detail/insn/i_shift_r.h>
+#include <simdpp/detail/insn/i_shift_r_v.h>
+#include <simdpp/detail/not_implemented.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+// -----------------------------------------------------------------------------
+// Right shift by a run-time scalar count.
+//
+// For every overload in this section:
+//
+//     r0 = a0 >> count
+//     ...
+//     rN = aN >> count
+//
+// Signed element types shift in the sign bit, unsigned element types shift
+// in zeros.
+
+/** Arithmetic right shift of signed 8-bit values by @a count bits (the sign
+    bit is shifted in).
+*/
+template<unsigned N, class E> SIMDPP_INL
+int8<N,expr_empty> shift_r(const int8<N,E>& a, unsigned count)
+{
+    const auto value = a.eval();
+    return detail::insn::i_shift_r(value, count);
+}
+
+/** Logical right shift of unsigned 8-bit values by @a count bits (zeros are
+    shifted in).
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint8<N,expr_empty> shift_r(const uint8<N,E>& a, unsigned count)
+{
+    const auto value = a.eval();
+    return detail::insn::i_shift_r(value, count);
+}
+
+/** Arithmetic right shift of signed 16-bit values by @a count bits (the sign
+    bit is shifted in).
+*/
+template<unsigned N, class E> SIMDPP_INL
+int16<N,expr_empty> shift_r(const int16<N,E>& a, unsigned count)
+{
+    const auto value = a.eval();
+    return detail::insn::i_shift_r(value, count);
+}
+
+/** Logical right shift of unsigned 16-bit values by @a count bits (zeros are
+    shifted in).
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint16<N,expr_empty> shift_r(const uint16<N,E>& a, unsigned count)
+{
+    const auto value = a.eval();
+    return detail::insn::i_shift_r(value, count);
+}
+
+/** Arithmetic right shift of signed 32-bit values by @a count bits (the sign
+    bit is shifted in).
+*/
+template<unsigned N, class E> SIMDPP_INL
+int32<N,expr_empty> shift_r(const int32<N,E>& a, unsigned count)
+{
+    const auto value = a.eval();
+    return detail::insn::i_shift_r(value, count);
+}
+
+/** Logical right shift of unsigned 32-bit values by @a count bits (zeros are
+    shifted in).
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint32<N,expr_empty> shift_r(const uint32<N,E>& a, unsigned count)
+{
+    const auto value = a.eval();
+    return detail::insn::i_shift_r(value, count);
+}
+
+/** Arithmetic right shift of signed 64-bit values by @a count bits (the sign
+    bit is shifted in).
+*/
+template<unsigned N, class E> SIMDPP_INL
+int64<N,expr_empty> shift_r(const int64<N,E>& a, unsigned count)
+{
+    const auto value = a.eval();
+    return detail::insn::i_shift_r(value, count);
+}
+
+/** Logical right shift of unsigned 64-bit values by @a count bits (zeros are
+    shifted in).
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint64<N,expr_empty> shift_r(const uint64<N,E>& a, unsigned count)
+{
+    const auto value = a.eval();
+    return detail::insn::i_shift_r(value, count);
+}
+
+// -----------------------------------------------------------------------------
+// Right shift by a per-element count vector.
+//
+// For every overload in this section:
+//
+//     r0 = a0 >> count0
+//     ...
+//     rN = aN >> countN
+//
+// Each overload is available only when the matching
+// SIMDPP_HAS_*_SHIFT_R_BY_VECTOR capability is set; otherwise it yields
+// SIMDPP_NOT_IMPLEMENTED_TEMPLATE2.
+
+/** Shifts each signed 8-bit element right by the corresponding element of
+    @a count. Sign bits are shifted in.
+*/
+template<unsigned N, class E> SIMDPP_INL
+int8<N,expr_empty> shift_r(const int8<N,E>& a, const uint8<N,E>& count)
+{
+#if !SIMDPP_HAS_INT8_SHIFT_R_BY_VECTOR
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#else
+    const auto value = a.eval();
+    const auto amounts = count.eval();
+    return detail::insn::i_shift_r_v(value, amounts);
+#endif
+}
+
+/** Shifts each unsigned 8-bit element right by the corresponding element of
+    @a count. Zero bits are shifted in.
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint8<N,expr_empty> shift_r(const uint8<N,E>& a, const uint8<N,E>& count)
+{
+#if !SIMDPP_HAS_UINT8_SHIFT_R_BY_VECTOR
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#else
+    const auto value = a.eval();
+    const auto amounts = count.eval();
+    return detail::insn::i_shift_r_v(value, amounts);
+#endif
+}
+
+/** Shifts each signed 16-bit element right by the corresponding element of
+    @a count. Sign bits are shifted in.
+*/
+template<unsigned N, class E> SIMDPP_INL
+int16<N,expr_empty> shift_r(const int16<N,E>& a, const uint16<N,E>& count)
+{
+#if !SIMDPP_HAS_INT16_SHIFT_R_BY_VECTOR
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#else
+    const auto value = a.eval();
+    const auto amounts = count.eval();
+    return detail::insn::i_shift_r_v(value, amounts);
+#endif
+}
+
+/** Shifts each unsigned 16-bit element right by the corresponding element of
+    @a count. Zero bits are shifted in.
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint16<N,expr_empty> shift_r(const uint16<N,E>& a, const uint16<N,E>& count)
+{
+#if !SIMDPP_HAS_UINT16_SHIFT_R_BY_VECTOR
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#else
+    const auto value = a.eval();
+    const auto amounts = count.eval();
+    return detail::insn::i_shift_r_v(value, amounts);
+#endif
+}
+
+/** Shifts each signed 32-bit element right by the corresponding element of
+    @a count. Sign bits are shifted in.
+*/
+template<unsigned N, class E> SIMDPP_INL
+int32<N,expr_empty> shift_r(const int32<N,E>& a, const uint32<N,E>& count)
+{
+#if !SIMDPP_HAS_INT32_SHIFT_R_BY_VECTOR
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#else
+    const auto value = a.eval();
+    const auto amounts = count.eval();
+    return detail::insn::i_shift_r_v(value, amounts);
+#endif
+}
+
+/** Shifts each unsigned 32-bit element right by the corresponding element of
+    @a count. Zero bits are shifted in.
+*/
+template<unsigned N, class E> SIMDPP_INL
+uint32<N,expr_empty> shift_r(const uint32<N,E>& a, const uint32<N,E>& count)
+{
+#if !SIMDPP_HAS_UINT32_SHIFT_R_BY_VECTOR
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(E, a, count);
+#else
+    const auto value = a.eval();
+    const auto amounts = count.eval();
+    return detail::insn::i_shift_r_v(value, amounts);
+#endif
+}
+
+// -----------------------------------------------------------------------------
+// Right shift by a compile-time constant.
+//
+// For every overload in this section:
+//
+//     r0 = a0 >> count
+//     ...
+//     rN = aN >> count
+//
+// @a count is a template argument that must be smaller than the element
+// width; this is enforced by a static_assert. The implementation is picked
+// through i_shift_r_wrapper, parameterized on whether the count is zero.
+
+/** Arithmetic right shift of signed 8-bit values by the constant @a count
+    (the sign bit is shifted in).
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+int8<N,expr_empty> shift_r(const int8<N,E>& a)
+{
+    static_assert(count < 8, "Shift out of bounds");
+    typedef detail::insn::i_shift_r_wrapper<count == 0> impl;
+    return impl::template run<count>(a.eval());
+}
+
+/** Logical right shift of unsigned 8-bit values by the constant @a count
+    (zeros are shifted in).
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+uint8<N,expr_empty> shift_r(const uint8<N,E>& a)
+{
+    static_assert(count < 8, "Shift out of bounds");
+    typedef detail::insn::i_shift_r_wrapper<count == 0> impl;
+    return impl::template run<count>(a.eval());
+}
+
+/** Arithmetic right shift of signed 16-bit values by the constant @a count
+    (the sign bit is shifted in).
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+int16<N,expr_empty> shift_r(const int16<N,E>& a)
+{
+    static_assert(count < 16, "Shift out of bounds");
+    typedef detail::insn::i_shift_r_wrapper<count == 0> impl;
+    return impl::template run<count>(a.eval());
+}
+
+/** Logical right shift of unsigned 16-bit values by the constant @a count
+    (zeros are shifted in).
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+uint16<N,expr_empty> shift_r(const uint16<N,E>& a)
+{
+    static_assert(count < 16, "Shift out of bounds");
+    typedef detail::insn::i_shift_r_wrapper<count == 0> impl;
+    return impl::template run<count>(a.eval());
+}
+
+/** Arithmetic right shift of signed 32-bit values by the constant @a count
+    (the sign bit is shifted in).
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+int32<N,expr_empty> shift_r(const int32<N,E>& a)
+{
+    static_assert(count < 32, "Shift out of bounds");
+    typedef detail::insn::i_shift_r_wrapper<count == 0> impl;
+    return impl::template run<count>(a.eval());
+}
+
+/** Logical right shift of unsigned 32-bit values by the constant @a count
+    (zeros are shifted in).
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+uint32<N,expr_empty> shift_r(const uint32<N,E>& a)
+{
+    static_assert(count < 32, "Shift out of bounds");
+    typedef detail::insn::i_shift_r_wrapper<count == 0> impl;
+    return impl::template run<count>(a.eval());
+}
+
+/** Arithmetic right shift of signed 64-bit values by the constant @a count
+    (the sign bit is shifted in).
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+int64<N,expr_empty> shift_r(const int64<N,E>& a)
+{
+    static_assert(count < 64, "Shift out of bounds");
+    typedef detail::insn::i_shift_r_wrapper<count == 0> impl;
+    return impl::template run<count>(a.eval());
+}
+
+/** Logical right shift of unsigned 64-bit values by the constant @a count
+    (zeros are shifted in).
+*/
+template<unsigned count, unsigned N, class E> SIMDPP_INL
+uint64<N,expr_empty> shift_r(const uint64<N,E>& a)
+{
+    static_assert(count < 64, "Shift out of bounds");
+    typedef detail::insn::i_shift_r_wrapper<count == 0> impl;
+    return impl::template run<count>(a.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 117 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_sub.h

@@ -0,0 +1,117 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_SUB_H
+#define LIBSIMDPP_SIMDPP_CORE_I_SUB_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/i_sub.h>
+#include <simdpp/core/detail/get_expr_uint.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+#include <simdpp/core/detail/get_expr_uint.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Subtracts 8-bit integer values.
+
+    @code
+    r0 = a0 - b0
+    ...
+    rN = aN - bN
+    @endcode
+
+    Both operands are any_int8 vectors or expressions of the same length N.
+    The function only wraps the two operands into an expr_isub expression
+    whose vector type is selected by detail::get_expr_uint; presumably the
+    subtraction itself is emitted when that expression is evaluated (see
+    detail/expr/i_sub.h). The SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED invocation
+    that follows presumably adds overloads taking a scalar operand (see
+    core/detail/scalar_arg_impl.h).
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_uint<expr_isub, V1, V2>::type
+        sub(const any_int8<N,V1>& a,
+            const any_int8<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(sub, expr_isub, any_int8, int8)
+
+/** Subtracts 16-bit integer values.
+
+    @code
+    r0 = a0 - b0
+    ...
+    rN = aN - bN
+    @endcode
+
+    Both operands are any_int16 vectors or expressions of the same length N;
+    they are wrapped into an expr_isub expression whose vector type is
+    selected by detail::get_expr_uint.
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_uint<expr_isub, V1, V2>::type
+        sub(const any_int16<N,V1>& a,
+            const any_int16<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(sub, expr_isub, any_int16, int16)
+
+/** Subtracts 32-bit integer values.
+
+    @code
+    r0 = a0 - b0
+    ...
+    rN = aN - bN
+    @endcode
+
+    Both operands are any_int32 vectors or expressions of the same length N;
+    they are wrapped into an expr_isub expression whose vector type is
+    selected by detail::get_expr_uint.
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_uint<expr_isub, V1, V2>::type
+        sub(const any_int32<N,V1>& a,
+            const any_int32<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(sub, expr_isub, any_int32, int32)
+
+/** Subtracts 64-bit integer values.
+
+    @code
+    r0 = a0 - b0
+    ...
+    rN = aN - bN
+    @endcode
+
+    Both operands are any_int64 vectors or expressions of the same length N;
+    they are wrapped into an expr_isub expression whose vector type is
+    selected by detail::get_expr_uint.
+
+    @par 128-bit version:
+    @icost{ALTIVEC, 5-6}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, 2}
+    @icost{ALTIVEC, 10-11}
+*/
+template<unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr_uint<expr_isub, V1, V2>::type
+        sub(const any_int64<N,V1>& a,
+            const any_int64<N,V2>& b)
+{
+    return { { a.wrapped(), b.wrapped() } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_INT_UNSIGNED(sub, expr_isub, any_int64, int64)
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 110 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/i_sub_sat.h

@@ -0,0 +1,110 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_I_SUBS_H
+#define LIBSIMDPP_SIMDPP_CORE_I_SUBS_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/expr/i_sub_sat.h>
+#include <simdpp/core/detail/scalar_arg_impl.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Subtracts and saturates signed 8-bit integer values.
+
+    @code
+    r0 = saturated(a0 - b0)
+    ...
+    rN = saturated(aN - bN)
+    @endcode
+
+    The function only wraps both operands into an expr_isub_sat expression;
+    presumably the instruction itself is emitted when that expression is
+    evaluated (see detail/expr/i_sub_sat.h). The SIMDPP_SCALAR_ARG_IMPL_EXPR
+    invocation that follows presumably adds overloads taking a scalar operand
+    (see core/detail/scalar_arg_impl.h).
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int8<N, expr_isub_sat<int8<N,E1>,
+                      int8<N,E2>>> sub_sat(const int8<N,E1>& a,
+                                           const int8<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(sub_sat, expr_isub_sat, int8, int8)
+
+/** Subtracts and saturates signed 16-bit integer values.
+
+    @code
+    r0 = saturated(a0 - b0)
+    ...
+    rN = saturated(aN - bN)
+    @endcode
+
+    Both operands are wrapped into an expr_isub_sat expression.
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+int16<N, expr_isub_sat<int16<N,E1>,
+                       int16<N,E2>>> sub_sat(const int16<N,E1>& a,
+                                             const int16<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(sub_sat, expr_isub_sat, int16, int16)
+
+/** Subtracts and saturates unsigned 8-bit integer values.
+
+    @code
+    r0 = saturated(a0 - b0)
+    ...
+    rN = saturated(aN - bN)
+    @endcode
+
+    Both operands are wrapped into an expr_isub_sat expression.
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint8<N, expr_isub_sat<uint8<N,E1>,
+                       uint8<N,E2>>> sub_sat(const uint8<N,E1>& a,
+                                             const uint8<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(sub_sat, expr_isub_sat, uint8, uint8)
+
+/** Subtracts and saturates unsigned 16-bit integer values.
+
+    @code
+    r0 = saturated(a0 - b0)
+    ...
+    rN = saturated(aN - bN)
+    @endcode
+
+    Both operands are wrapped into an expr_isub_sat expression.
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned N, class E1, class E2> SIMDPP_INL
+uint16<N, expr_isub_sat<uint16<N,E1>,
+                        uint16<N,E2>>> sub_sat(const uint16<N,E1>& a,
+                                               const uint16<N,E2>& b)
+{
+    return { { a, b } };
+}
+
+SIMDPP_SCALAR_ARG_IMPL_EXPR(sub_sat, expr_isub_sat, uint16, uint16)
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 107 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/insert.h

@@ -0,0 +1,107 @@
+/*  Copyright (C) 2011-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMD_INSERT_H
+#define LIBSIMDPP_SIMD_INSERT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/insert.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Inserts an element into a vector at the position identified by @a id.
+
+    @code
+    r0 = (id == 0) ? x : a0
+    ...
+    rN = (id == N) ? x : aN
+    @endcode
+
+    This function may have very high latency.
+    It is expected that the argument comes from a general-purpose register.
+
+    @a id is a compile-time index and must be less than the number of elements
+    in the vector; an out-of-range index is rejected by a static_assert.
+
+    Overloads are provided for 8, 16, 32 and 64-bit signed and unsigned
+    integer vectors and for float32 and float64 vectors. The unsigned and
+    floating-point overloads forward directly to detail::insn::i_insert. The
+    signed overloads convert both the vector and the value to their unsigned
+    counterparts, call the same implementation and convert the result back to
+    the signed vector type.
+*/
+template<unsigned id, unsigned N> SIMDPP_INL
+uint8<N> insert(const uint8<N>& a, uint8_t x)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_insert<id>(a, x);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+int8<N> insert(const int8<N>& a, int8_t x)
+{
+    static_assert(id < N, "index out of bounds");
+    return (int8<N>) detail::insn::i_insert<id>((uint8<N>) a, (uint8_t)x);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+uint16<N> insert(const uint16<N>& a, uint16_t x)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_insert<id>(a, x);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+int16<N> insert(const int16<N>& a, int16_t x)
+{
+    static_assert(id < N, "index out of bounds");
+    return (int16<N>) detail::insn::i_insert<id>((uint16<N>) a, (uint16_t)x);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+uint32<N> insert(const uint32<N>& a, uint32_t x)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_insert<id>(a, x);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+int32<N> insert(const int32<N>& a, int32_t x)
+{
+    static_assert(id < N, "index out of bounds");
+    return (int32<N>) detail::insn::i_insert<id>((uint32<N>)a, (uint32_t)x);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+uint64<N> insert(const uint64<N>& a, uint64_t x)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_insert<id>(a, x);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+int64<N> insert(const int64<N>& a, int64_t x)
+{
+    static_assert(id < N, "index out of bounds");
+    return (int64<N>) detail::insn::i_insert<id>((uint64<N>)a, (uint64_t)x);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+float32<N> insert(const float32<N>& a, float x)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_insert<id>(a, x);
+}
+
+template<unsigned id, unsigned N> SIMDPP_INL
+float64<N> insert(const float64<N>& a, double x)
+{
+    static_assert(id < N, "index out of bounds");
+    return detail::insn::i_insert<id>(a, x);
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif

+ 62 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/load.h

@@ -0,0 +1,62 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_LOAD_H
+#define LIBSIMDPP_SIMDPP_CORE_LOAD_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/load.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector
+    from an aligned memory location.
+
+    @par 128-bit version:
+
+    @code
+    a[0..127] = *(p)
+    @endcode
+    @a p must be aligned to 16 bytes.
+
+    @par 256-bit version:
+
+    @code
+    a[0..255] = *(p)
+    @endcode
+    @a p must be aligned to 32 bytes.
+
+    @icost{SSE2-SSE4.1, NEON, ALTIVEC, 2}
+    @icost{AVX (integer vectors), 2}
+
+    Two overloads are provided. The one without an explicit vector type
+    performs no load itself: it returns an expr_vec_load that stores @a p as a
+    const char pointer, and the target vector type is presumably chosen when
+    that expression is converted to a vector. The one taking the vector type
+    @a V as its first template argument loads directly through
+    detail::insn::i_load_any<V>; @a V must be a vector type that is not a mask,
+    otherwise a static_assert fires.
+*/
+// FIXME: return empty expression
+template<class T>
+SIMDPP_INL expr_vec_load load(const T* p)
+{
+    expr_vec_load r;
+    r.a = reinterpret_cast<const char*>(p);
+    return r;
+}
+
+template<class V, class T> SIMDPP_INL
+V load(const T* p)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    return detail::insn::i_load_any<V>(reinterpret_cast<const char*>(p));
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 49 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/load_packed2.h

@@ -0,0 +1,49 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_LOAD_PACKED2_H
+#define LIBSIMDPP_SIMDPP_CORE_LOAD_PACKED2_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/load_packed2.h>
+#include <simdpp/detail/traits.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Loads values packed in pairs, de-interleaves them and stores the result
+    into two vectors.
+
+    @code
+    a = [ *(p),   *(p+2), *(p+4), ... , *(p+M*2-2) ]
+    b = [ *(p+1), *(p+3), *(p+5), ... , *(p+M*2-1) ]
+    @endcode
+
+    Here M is the number of elements in the vector
+
+    @a p must be aligned to the vector size in bytes
+
+    Both output vectors must have the same type @a V, which must not be a mask
+    type (checked by a static_assert). The data is first loaded into
+    temporaries of type detail::get_expr_nosign<V>::type by
+    detail::insn::i_load_packed2 and then assigned to @a a and @a b.
+*/
+template<unsigned N, class V, class T> SIMDPP_INL
+void load_packed2(any_vec<N,V>& a, any_vec<N,V>& b, const T* p)
+{
+    static_assert(!is_mask<V>::value, "Mask types can not be loaded");
+    typename detail::get_expr_nosign<V>::type ra, rb;
+    detail::insn::i_load_packed2(ra, rb, reinterpret_cast<const char*>(p));
+    a.wrapped() = ra;
+    b.wrapped() = rb;
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 52 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/load_packed3.h

@@ -0,0 +1,52 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_LOAD_PACKED3_H
+#define LIBSIMDPP_SIMDPP_CORE_LOAD_PACKED3_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/load_packed3.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Loads values packed in triplets, de-interleaves them and stores the result
+    into three vectors.
+
+    @code
+    a = [ *(p),   *(p+3), *(p+6), ... , *(p+M*3-3) ]
+    b = [ *(p+1), *(p+4), *(p+7), ... , *(p+M*3-2) ]
+    c = [ *(p+2), *(p+5), *(p+8), ... , *(p+M*3-1) ]
+    @endcode
+
+    Here M is the number of elements in the vector
+
+    @a p must be aligned to the vector size in bytes
+
+    All output vectors must have the same type @a V, which must not be a mask
+    type (checked by a static_assert). The data is first loaded into
+    temporaries of type detail::get_expr_nosign<V>::type by
+    detail::insn::i_load_packed3 and then assigned to @a a, @a b and @a c.
+*/
+template<unsigned N, class V, class T> SIMDPP_INL
+void load_packed3(any_vec<N,V>& a, any_vec<N,V>& b, any_vec<N,V>& c,
+                  const T* p)
+{
+    static_assert(!is_mask<V>::value, "Mask types can not be loaded");
+    typename detail::get_expr_nosign<V>::type ra, rb, rc;
+    detail::insn::i_load_packed3(ra, rb, rc, reinterpret_cast<const char*>(p));
+    a.wrapped() = ra;
+    b.wrapped() = rb;
+    c.wrapped() = rc;
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 55 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/load_packed4.h

@@ -0,0 +1,55 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_LOAD_PACKED4_H
+#define LIBSIMDPP_SIMDPP_CORE_LOAD_PACKED4_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/load_packed4.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+
+/** Loads values packed in quartets, de-interleaves them and stores the result
+    into four vectors.
+
+    @code
+    a = [ *(p),   *(p+4), *(p+8),  ... , *(p+M*4-4) ]
+    b = [ *(p+1), *(p+5), *(p+9),  ... , *(p+M*4-3) ]
+    c = [ *(p+2), *(p+6), *(p+10), ... , *(p+M*4-2) ]
+    d = [ *(p+3), *(p+7), *(p+11), ... , *(p+M*4-1) ]
+    @endcode
+
+    Here M is the number of elements in the vector
+
+    @a p must be aligned to the vector size in bytes
+
+    All output vectors must have the same type @a V, which must not be a mask
+    type (checked by a static_assert). The data is first loaded into
+    temporaries of type detail::get_expr_nosign<V>::type by
+    detail::insn::i_load_packed4 and then assigned to @a a, @a b, @a c and
+    @a d.
+*/
+template<unsigned N, class V, class T> SIMDPP_INL
+void load_packed4(any_vec<N,V>& a, any_vec<N,V>& b,
+                  any_vec<N,V>& c, any_vec<N,V>& d,
+                  const T* p)
+{
+    static_assert(!is_mask<V>::value, "Mask types can not be loaded");
+    typename detail::get_expr_nosign<V>::type ra, rb, rc, rd;
+    detail::insn::i_load_packed4(ra, rb, rc, rd, reinterpret_cast<const char*>(p));
+    a.wrapped() = ra;
+    b.wrapped() = rb;
+    c.wrapped() = rc;
+    d.wrapped() = rd;
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 51 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/load_splat.h

@@ -0,0 +1,51 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_LOAD_SPLAT_H
+#define LIBSIMDPP_SIMDPP_CORE_LOAD_SPLAT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/load_splat.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Loads a value from a memory location and broadcasts it to all elements of a
+    vector.
+
+    @code
+    r0 = *p
+    ...
+    rN = *p
+    @endcode
+
+    @a p must have the alignment of the element of the target vector.
+
+    Two overloads are provided. The one without an explicit vector type
+    performs no load itself: it returns an expr_vec_load_splat constructed
+    from @a p cast to a const char pointer, and the target vector type is
+    presumably chosen when that expression is converted to a vector. The one
+    taking the vector type @a V as its first template argument loads directly
+    through detail::insn::i_load_splat_any<V>; @a V must be a vector type that
+    is not a mask, otherwise a static_assert fires.
+*/
+// FIXME: return empty expression
+template<class T>
+SIMDPP_INL expr_vec_load_splat load_splat(const T* p)
+{
+    return expr_vec_load_splat(reinterpret_cast<const char*>(p));
+}
+
+template<class V, class T> SIMDPP_INL
+V load_splat(const T* p)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    return detail::insn::i_load_splat_any<V>(reinterpret_cast<const char*>(p));
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 67 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/load_u.h

@@ -0,0 +1,67 @@
+/*  Copyright (C) 2013  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_LOAD_U_H
+#define LIBSIMDPP_SIMDPP_CORE_LOAD_U_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/load_u.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an
+    unaligned memory location.
+
+    @par 128-bit version:
+
+    @code
+    a[0..127] = *(p)
+    @endcode
+
+    @a p must be aligned to the element size. If @a p is aligned to 16 bytes
+    only the referenced 16 byte block is accessed. Otherwise, memory within the
+    smallest 16-byte aligned 32-byte block may be accessed.
+
+    @icost{ALTIVEC, 4}
+
+    @par 256-bit version:
+
+    @code
+    a[0..255] = *(p)
+    @endcode
+
+    @a p must be aligned to the element size. If @a p is aligned to 32 bytes
+    only the referenced 32 byte block is accessed. Otherwise, memory within the
+    smallest 32-byte aligned 64-byte block may be accessed.
+
+    @icost{SSE2-SSE4.1, NEON, 2}
+    @icost{ALTIVEC, 6}
+
+    Two overloads are provided. The one without an explicit vector type
+    performs no load itself: it returns an expr_vec_load_u initialised from
+    @a p cast to a const char pointer, and the target vector type is presumably
+    chosen when that expression is converted to a vector. The one taking the
+    vector type @a V as its first template argument loads directly through
+    detail::insn::i_load_u_any<V>; @a V must be a vector type that is not a
+    mask, otherwise a static_assert fires.
+*/
+// FIXME: return empty expression
+template<class T>
+SIMDPP_INL expr_vec_load_u load_u(const T* p)
+{
+    return { reinterpret_cast<const char*>(p) };
+}
+
+template<class V, class T> SIMDPP_INL
+V load_u(const T* p)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    return detail::insn::i_load_u_any<V>(reinterpret_cast<const char*>(p));
+}
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 166 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/make_float.h

@@ -0,0 +1,166 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_MAKE_FLOAT_H
+#define LIBSIMDPP_SIMDPP_CORE_MAKE_FLOAT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/make_const.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Creates a vector from floating-point values known at compile-time.
+    The result of this function may be assigned or converted to a vector of any
+    type: standard conversions are used to convert the arguments. All
+    conversions and other overhead are performed at compile-time, thus even if
+    the minimal optimization level is selected, the function results in a
+    simple load from memory.
+
+    The function is not guaranteed to have adequate performance if the
+    arguments are not known at compile-time.
+
+    If the vector has fewer elements than the number of the parameters this
+    function accepts then the extra values are discarded.
+
+    @par 1 parameter version
+    @code
+        | 0  1  2  3  ... n  |
+    r = [ v0 v0 v0 v0 ... v0 ]
+    @endcode
+
+    @par 2 parameters version
+    @code
+        | 0  1  2  3  ... n  |
+    r = [ v0 v1 v0 v1 ... v1 ]
+    @endcode
+
+    @par 4 parameters version
+    @code
+        | 0  1  2  3  ... n  |
+    r = [ v0 v1 v2 v3 ... v3 ]
+    @endcode
+
+    @par 8 parameters version
+    @code
+        | 0  1  ..  7  8  ... n  |
+    r = [ v0 v1 .. v7 v0  ... v7 ]
+    @endcode
+
+    @par 16 parameters version
+    Presumably follows the same repeating scheme as the versions above, with
+    v0 .. v15 forming the repeated group.
+
+    Each parameter count comes in two forms. The non-template form only stores
+    the arguments in an expr_vec_make_const<double,K> and returns it; the
+    conversion to a concrete vector happens when that expression is assigned or
+    converted. The form taking the vector type @a V as an explicit template
+    argument fills the same expression and immediately converts it through
+    detail::insn::i_make_const_any<V>; @a V must be a vector type that is not a
+    mask, otherwise a static_assert fires.
+*/
+SIMDPP_INL expr_vec_make_const<double,1> make_float(double v0)
+{
+    expr_vec_make_const<double,1> a;
+    a.a[0] = v0;
+    return a;
+}
+
+SIMDPP_INL expr_vec_make_const<double,2> make_float(double v0, double v1)
+{
+    expr_vec_make_const<double,2> a;
+    a.a[0] = v0;  a.a[1] = v1;
+    return a;
+}
+
+SIMDPP_INL expr_vec_make_const<double,4>
+    make_float(double v0, double v1, double v2, double v3)
+{
+    expr_vec_make_const<double,4> a;
+    a.a[0] = v0;  a.a[1] = v1;  a.a[2] = v2;  a.a[3] = v3;
+    return a;
+}
+
+SIMDPP_INL expr_vec_make_const<double,8>
+    make_float(double v0, double v1, double v2, double v3,
+               double v4, double v5, double v6, double v7)
+{
+    expr_vec_make_const<double,8> a;
+    a.a[0] = v0;  a.a[1] = v1;  a.a[2] = v2;  a.a[3] = v3;
+    a.a[4] = v4;  a.a[5] = v5;  a.a[6] = v6;  a.a[7] = v7;
+    return a;
+}
+
+SIMDPP_INL expr_vec_make_const<double,16>
+    make_float(double v0,  double v1,  double v2,  double v3,
+               double v4,  double v5,  double v6,  double v7,
+               double v8,  double v9,  double v10, double v11,
+               double v12, double v13, double v14, double v15)
+{
+    expr_vec_make_const<double,16> a;
+    a.a[0] = v0;    a.a[1] = v1;    a.a[2] = v2;    a.a[3] = v3;
+    a.a[4] = v4;    a.a[5] = v5;    a.a[6] = v6;    a.a[7] = v7;
+    a.a[8] = v8;    a.a[9] = v9;    a.a[10] = v10;  a.a[11] = v11;
+    a.a[12] = v12;  a.a[13] = v13;  a.a[14] = v14;  a.a[15] = v15;
+    return a;
+}
+
+template<class V> SIMDPP_INL
+V make_float(double v0)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    expr_vec_make_const<double,1> a;
+    a.a[0] = v0;
+    return detail::insn::i_make_const_any<V>(a);
+}
+
+template<class V> SIMDPP_INL
+V make_float(double v0, double v1)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    expr_vec_make_const<double,2> a;
+    a.a[0] = v0;  a.a[1] = v1;
+    return detail::insn::i_make_const_any<V>(a);
+}
+
+template<class V> SIMDPP_INL
+V make_float(double v0, double v1, double v2, double v3)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    expr_vec_make_const<double,4> a;
+    a.a[0] = v0;  a.a[1] = v1;  a.a[2] = v2;  a.a[3] = v3;
+    return detail::insn::i_make_const_any<V>(a);
+}
+
+template<class V> SIMDPP_INL
+V make_float(double v0, double v1, double v2, double v3,
+             double v4, double v5, double v6, double v7)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    expr_vec_make_const<double,8> a;
+    a.a[0] = v0;  a.a[1] = v1;  a.a[2] = v2;  a.a[3] = v3;
+    a.a[4] = v4;  a.a[5] = v5;  a.a[6] = v6;  a.a[7] = v7;
+    return detail::insn::i_make_const_any<V>(a);
+}
+
+template<class V> SIMDPP_INL
+V make_float(double v0,  double v1,  double v2,  double v3,
+             double v4,  double v5,  double v6,  double v7,
+             double v8,  double v9,  double v10, double v11,
+             double v12, double v13, double v14, double v15)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    expr_vec_make_const<double,16> a;
+    a.a[0] = v0;    a.a[1] = v1;    a.a[2] = v2;    a.a[3] = v3;
+    a.a[4] = v4;    a.a[5] = v5;    a.a[6] = v6;    a.a[7] = v7;
+    a.a[8] = v8;    a.a[9] = v9;    a.a[10] = v10;  a.a[11] = v11;
+    a.a[12] = v12;  a.a[13] = v13;  a.a[14] = v14;  a.a[15] = v15;
+    return detail::insn::i_make_const_any<V>(a);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 171 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/make_int.h

@@ -0,0 +1,171 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_MAKE_INT_H
+#define LIBSIMDPP_SIMDPP_CORE_MAKE_INT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/make_const.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Creates a vector from signed integer values known at compile-time.
+    The result of this function may be assigned or converted to a vector of any
+    type: standard conversions are used to convert the arguments. All
+    conversions and other overhead are performed at compile-time, thus even if
+    the minimal optimization level is selected, the function results in a
+    simple load from memory.
+
+    The function is not guaranteed to have adequate performance if the
+    arguments are not known at compile-time.
+
+    If the vector has fewer elements than the number of the parameters this
+    function accepts then the extra values are discarded.
+
+    @par 1 parameter version
+    @code
+        | 0  1  2  3  ... n  |
+    r = [ v0 v0 v0 v0 ... v0 ]
+    @endcode
+
+    @par 2 parameters version
+    @code
+        | 0  1  2  3  ... n  |
+    r = [ v0 v1 v0 v1 ... v1 ]
+    @endcode
+
+    @par 4 parameters version
+    @code
+        | 0  1  2  3  ... n  |
+    r = [ v0 v1 v2 v3 ... v3 ]
+    @endcode
+
+    @par 8 parameters version
+    @code
+        | 0  1  ..  7  8  ... n  |
+    r = [ v0 v1 .. v7 v0  ... v7 ]
+    @endcode
+
+    @par 16 parameters version
+    Presumably follows the same repeating scheme as the versions above, with
+    v0 .. v15 forming the repeated group.
+
+    Each parameter count comes in two forms. The non-template form only stores
+    the arguments in an expr_vec_make_const<int64_t,K> and returns it; the
+    conversion to a concrete vector happens when that expression is assigned or
+    converted. The form taking the vector type @a V as an explicit template
+    argument fills the same expression and immediately converts it through
+    detail::insn::i_make_const_any<V>; @a V must be a vector type that is not a
+    mask, otherwise a static_assert fires.
+*/
+/*  FIXME: return empty expr
+    NOTE(review): unlike the other non-template overloads below, this one is
+    not declared static -- confirm whether the difference is intentional.
+*/
+SIMDPP_INL expr_vec_make_const<int64_t,1> make_int(int64_t v0)
+{
+    expr_vec_make_const<int64_t,1> a;
+    a.a[0] = v0;
+    return a;
+}
+
+static SIMDPP_INL
+expr_vec_make_const<int64_t,2> make_int(int64_t v0, int64_t v1)
+{
+    expr_vec_make_const<int64_t,2> a;
+    a.a[0] = v0;  a.a[1] = v1;
+    return a;
+}
+
+static SIMDPP_INL
+expr_vec_make_const<int64_t,4>
+    make_int(int64_t v0, int64_t v1, int64_t v2, int64_t v3)
+{
+    expr_vec_make_const<int64_t,4> a;
+    a.a[0] = v0;  a.a[1] = v1;  a.a[2] = v2;  a.a[3] = v3;
+    return a;
+}
+
+static SIMDPP_INL
+expr_vec_make_const<int64_t,8>
+    make_int(int64_t v0, int64_t v1, int64_t v2, int64_t v3,
+             int64_t v4, int64_t v5, int64_t v6, int64_t v7)
+{
+    expr_vec_make_const<int64_t,8> a;
+    a.a[0] = v0;  a.a[1] = v1;  a.a[2] = v2;  a.a[3] = v3;
+    a.a[4] = v4;  a.a[5] = v5;  a.a[6] = v6;  a.a[7] = v7;
+    return a;
+}
+
+static SIMDPP_INL
+expr_vec_make_const<int64_t,16>
+    make_int(int64_t v0,  int64_t v1,  int64_t v2,  int64_t v3,
+             int64_t v4,  int64_t v5,  int64_t v6,  int64_t v7,
+             int64_t v8,  int64_t v9,  int64_t v10, int64_t v11,
+             int64_t v12, int64_t v13, int64_t v14, int64_t v15)
+{
+    expr_vec_make_const<int64_t,16> a;
+    a.a[0] = v0;    a.a[1] = v1;    a.a[2] = v2;    a.a[3] = v3;
+    a.a[4] = v4;    a.a[5] = v5;    a.a[6] = v6;    a.a[7] = v7;
+    a.a[8] = v8;    a.a[9] = v9;    a.a[10] = v10;  a.a[11] = v11;
+    a.a[12] = v12;  a.a[13] = v13;  a.a[14] = v14;  a.a[15] = v15;
+    return a;
+}
+
+template<class V> SIMDPP_INL
+V make_int(int64_t v0)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    expr_vec_make_const<int64_t,1> a;
+    a.a[0] = v0;
+    return detail::insn::i_make_const_any<V>(a);
+}
+
+template<class V> SIMDPP_INL
+V make_int(int64_t v0, int64_t v1)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    expr_vec_make_const<int64_t,2> a;
+    a.a[0] = v0;  a.a[1] = v1;
+    return detail::insn::i_make_const_any<V>(a);
+}
+
+template<class V> SIMDPP_INL
+V make_int(int64_t v0, int64_t v1, int64_t v2, int64_t v3)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    expr_vec_make_const<int64_t,4> a;
+    a.a[0] = v0;  a.a[1] = v1;  a.a[2] = v2;  a.a[3] = v3;
+    return detail::insn::i_make_const_any<V>(a);
+}
+
+template<class V> SIMDPP_INL
+V make_int(int64_t v0, int64_t v1, int64_t v2, int64_t v3,
+           int64_t v4, int64_t v5, int64_t v6, int64_t v7)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    expr_vec_make_const<int64_t,8> a;
+    a.a[0] = v0;  a.a[1] = v1;  a.a[2] = v2;  a.a[3] = v3;
+    a.a[4] = v4;  a.a[5] = v5;  a.a[6] = v6;  a.a[7] = v7;
+    return detail::insn::i_make_const_any<V>(a);
+}
+
+template<class V> SIMDPP_INL
+V make_int(int64_t v0,  int64_t v1,  int64_t v2,  int64_t v3,
+           int64_t v4,  int64_t v5,  int64_t v6,  int64_t v7,
+           int64_t v8,  int64_t v9,  int64_t v10, int64_t v11,
+           int64_t v12, int64_t v13, int64_t v14, int64_t v15)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+    expr_vec_make_const<int64_t,16> a;
+    a.a[0] = v0;    a.a[1] = v1;    a.a[2] = v2;    a.a[3] = v3;
+    a.a[4] = v4;    a.a[5] = v5;    a.a[6] = v6;    a.a[7] = v7;
+    a.a[8] = v8;    a.a[9] = v9;    a.a[10] = v10;  a.a[11] = v11;
+    a.a[12] = v12;  a.a[13] = v13;  a.a[14] = v14;  a.a[15] = v15;
+    return detail::insn::i_make_const_any<V>(a);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 559 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/make_shuffle_bytes_mask.h

@@ -0,0 +1,559 @@
+/*  Copyright (C) 2012-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_SHUFFLE_BYTES_MASK_H
+#define LIBSIMDPP_SIMDPP_SHUFFLE_BYTES_MASK_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <cstdint>
+#include <simdpp/types.h>
+#include <simdpp/core/make_uint.h>
+#include <simdpp/detail/array.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+namespace detail {
+
+/// s - selector, u - number of elements per group
+/// Compile-time check that the selector is either -1 or an index in [0, u*2).
+template<int s, unsigned u> SIMDPP_INL
+void assert_selector_range()
+{
+    // The upper bound is compared as a signed value. With an unsigned right
+    // hand side the selector would be converted to unsigned, turning the valid
+    // value -1 into UINT_MAX and making the assertion fail for it.
+    static_assert(-1 <= s && s < static_cast<int>(u)*2, "Selector out of range");
+}
+
+/// Two-selector form: checks at compile time that both s0 and s1 lie in
+/// [-1, u*2), where u is the number of elements per group.
+template<int s0, int s1, int u> SIMDPP_INL
+void assert_selector_range()
+{
+    static_assert(s0 >= -1 && s0 < u*2, "Selector out of range");
+    static_assert(s1 >= -1 && s1 < u*2, "Selector out of range");
+}
+
+/// Four-selector form: checks at compile time that each of s0..s3 lies in
+/// [-1, u*2), where u is the number of elements per group.
+template<int s0, int s1, int s2, int s3, int u> SIMDPP_INL
+void assert_selector_range()
+{
+    static_assert(s0 >= -1 && s0 < u*2, "Selector out of range");
+    static_assert(s1 >= -1 && s1 < u*2, "Selector out of range");
+    static_assert(s2 >= -1 && s2 < u*2, "Selector out of range");
+    static_assert(s3 >= -1 && s3 < u*2, "Selector out of range");
+}
+
+/// s - selector, u - the number of elements per group
+/// r0 is the byte index for a one-byte element: 0x80 when s == -1, s itself
+/// when s < u, and (s-u)+16 otherwise.
+template<int s, int u>
+struct get_shuffle_bytex1_16 {
+    static const unsigned r0 = (s != -1) ? (s >= u ? (s-u)+16 : s) : 0x80;
+};
+
+/// s - selector, u - the number of elements per group
+/// r0, r1 are the byte indices of a two-byte element: both 0x80 when s == -1,
+/// otherwise consecutive indices starting at s*2 (s < u) or (s-u)*2+16.
+template<int s, int u>
+struct get_shuffle_bytex2_16 {
+    static const unsigned r0 = (s != -1) ? (s >= u ? (s-u)*2+16 : s*2) : 0x80;
+    static const unsigned r1 = (s != -1) ? r0+1 : 0x80;
+};
+
+/// s - selector, u - the number of elements per group
+/// r0..r3 are the byte indices of a four-byte element: all 0x80 when s == -1,
+/// otherwise consecutive indices starting at s*4 (s < u) or (s-u)*4+16.
+template<int s, int u>
+struct get_shuffle_bytex4_16 {
+    static const unsigned r0 = (s != -1) ? (s >= u ? (s-u)*4+16 : s*4) : 0x80;
+    static const unsigned r1 = (s != -1) ? r0+1 : 0x80;
+    static const unsigned r2 = (s != -1) ? r0+2 : 0x80;
+    static const unsigned r3 = (s != -1) ? r0+3 : 0x80;
+};
+
+/// s - selector, u - the number of elements per group
+/// r0..r7 are the byte indices of an eight-byte element: all 0x80 when
+/// s == -1, otherwise consecutive indices starting at s*8 (s < u) or
+/// (s-u)*8+16.
+template<int s, int u>
+struct get_shuffle_bytex8_16 {
+    static const unsigned r0 = (s != -1) ? (s >= u ? (s-u)*8+16 : s*8) : 0x80;
+    static const unsigned r1 = (s != -1) ? r0+1 : 0x80;
+    static const unsigned r2 = (s != -1) ? r0+2 : 0x80;
+    static const unsigned r3 = (s != -1) ? r0+3 : 0x80;
+    static const unsigned r4 = (s != -1) ? r0+4 : 0x80;
+    static const unsigned r5 = (s != -1) ? r0+5 : 0x80;
+    static const unsigned r6 = (s != -1) ? r0+6 : 0x80;
+    static const unsigned r7 = (s != -1) ? r0+7 : 0x80;
+};
+
+} // namespace detail
+
+/** Makes a mask to shuffle an int8x16 vector using @c permute_bytes16,
+    @c shuffle_bytes16, @c permute_zbytes16 or @c shuffle_zbytes16 functions.
+
+    All elements within vectors are grouped into sets of two adjacent elements.
+    Elements within each set of the resulting vector can be selected only from
+    corresponding sets of the source vectors.
+
+    The template arguments define which elements to select from each element
+    group:
+     * Values [0,1] select elements from the first vector.
+     * Values [2,3] select elements from the second vector. The mask can only be
+       used in @c shuffle_bytes16 or @c shuffle_zbytes16
+     * Value [-1] sets the corresponding element to zero. The mask can only be
+       used in @c permute_zbytes16 or @c shuffle_zbytes16
+
+    @par 128-bit version:
+
+    The created mask will cause @c shuffle_bytes16 to perform as follows:
+    @code
+    r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
+    r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
+    r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
+    r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
+    ...
+    r14 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+14] : b[s0+12])
+    r15 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+14] : b[s1+12])
+    @endcode
+
+    @par 256-bit version:
+
+    The vectors will be shuffled as if the 128-bit version was applied to the
+    lower and higher halves of the vectors separately.
+*/
+template<int s0, int s1, unsigned N> SIMDPP_INL
+uint8<N> make_shuffle_bytes16_mask(uint8<N> &mask)
+{
+    detail::assert_selector_range<s0,s1,2>();
+
+    // Byte indices selected for the first pair of elements.
+    using sel0 = detail::get_shuffle_bytex1_16<s0,2>;
+    using sel1 = detail::get_shuffle_bytex1_16<s1,2>;
+
+    // One pair per row; every following pair is offset by two more bytes.
+    mask = make_uint(sel0::r0,    sel1::r0,
+                     sel0::r0+2,  sel1::r0+2,
+                     sel0::r0+4,  sel1::r0+4,
+                     sel0::r0+6,  sel1::r0+6,
+                     sel0::r0+8,  sel1::r0+8,
+                     sel0::r0+10, sel1::r0+10,
+                     sel0::r0+12, sel1::r0+12,
+                     sel0::r0+14, sel1::r0+14);
+    return mask;
+}
+
+/** Makes a mask to shuffle an int8x16 vector using @c permute_bytes16,
+    @c shuffle_bytes16, @c permute_zbytes16 or @c shuffle_zbytes16 functions.
+
+    All elements within vectors are grouped into sets of four adjacent
+    elements. Elements within each set of the resulting vector can be selected
+    only from corresponding sets of the source vectors.
+
+    The template arguments define which elements to select from each element
+    group:
+     * Values [0,3] select elements from the first vector.
+     * Values [4,7] select elements from the second vector. The mask can only be
+       used in @c shuffle_bytes16 or @c shuffle_zbytes16
+     * Value [-1] sets the corresponding element to zero. The mask can only be
+       used in @c permute_zbytes16 or @c shuffle_zbytes16
+
+    @par 128-bit version:
+
+    The created mask will cause @c shuffle_bytes16 to perform as follows:
+    @code
+    r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
+    r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
+    r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
+    r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
+    ...
+    r12 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+12] : b[s0+8])
+    r13 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+12] : b[s1+8])
+    r14 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+12] : b[s2+8])
+    r15 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+12] : b[s3+8])
+    @endcode
+
+    @par 256-bit version:
+
+    The vectors will be shuffled as if the 128-bit version was applied to the
+    lower and higher halves of the vectors separately.
+*/
+template<int s0, int s1, int s2, int s3, unsigned N> SIMDPP_INL
+uint8<N> make_shuffle_bytes16_mask(uint8<N> &mask)
+{
+    detail::assert_selector_range<s0,s1,s2,s3,4>();
+
+    // Byte indices selected for the first group of four elements.
+    using sel0 = detail::get_shuffle_bytex1_16<s0,4>;
+    using sel1 = detail::get_shuffle_bytex1_16<s1,4>;
+    using sel2 = detail::get_shuffle_bytex1_16<s2,4>;
+    using sel3 = detail::get_shuffle_bytex1_16<s3,4>;
+
+    // One group per row; every following group is offset by four more bytes.
+    mask = make_uint(sel0::r0,    sel1::r0,    sel2::r0,    sel3::r0,
+                     sel0::r0+4,  sel1::r0+4,  sel2::r0+4,  sel3::r0+4,
+                     sel0::r0+8,  sel1::r0+8,  sel2::r0+8,  sel3::r0+8,
+                     sel0::r0+12, sel1::r0+12, sel2::r0+12, sel3::r0+12);
+    return mask;
+}
+
+/** Makes a mask to shuffle an int8x16 vector using @c permute_bytes16,
+    @c shuffle_bytes16, @c permute_zbytes16 or @c shuffle_zbytes16 functions.
+
+    All elements within vectors are grouped into sets of eight adjacent
+    elements. Elements within each set of the resulting vector can be selected
+    only from corresponding sets of the source vectors.
+
+    The template arguments define which elements to select from each element
+    group:
+     * Values [0,7] select elements from the first vector.
+     * Values [8,15] select elements from the second vector. The mask can only be
+       used in @c shuffle_bytes16 or @c shuffle_zbytes16
+     * Value [-1] sets the corresponding element to zero. The mask can only be
+       used in @c permute_zbytes16 or @c shuffle_zbytes16
+
+    @par 128-bit version:
+
+    The created mask will cause @c shuffle_bytes16 to perform as follows:
+    @code
+    r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
+    ...
+    r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
+    r8 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0+8] : b[s0])
+    ...
+    r15 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7+8] : b[s7])
+    @endcode
+
+    @par 256-bit version:
+
+    The vectors will be shuffled as if the 128-bit version was applied to the
+    lower and higher halves of the vectors separately.
+*/
+template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, unsigned N> SIMDPP_INL
+uint8<N> make_shuffle_bytes16_mask(uint8<N> &mask)
+{
+    detail::assert_selector_range<s0,s1,s2,s3,8>();
+    detail::assert_selector_range<s4,s5,s6,s7,8>();
+
+    // Byte indices selected for the first group of eight elements.
+    using sel0 = detail::get_shuffle_bytex1_16<s0,8>;
+    using sel1 = detail::get_shuffle_bytex1_16<s1,8>;
+    using sel2 = detail::get_shuffle_bytex1_16<s2,8>;
+    using sel3 = detail::get_shuffle_bytex1_16<s3,8>;
+    using sel4 = detail::get_shuffle_bytex1_16<s4,8>;
+    using sel5 = detail::get_shuffle_bytex1_16<s5,8>;
+    using sel6 = detail::get_shuffle_bytex1_16<s6,8>;
+    using sel7 = detail::get_shuffle_bytex1_16<s7,8>;
+
+    // The second group repeats the same pattern, offset by eight bytes.
+    mask = make_uint(sel0::r0,   sel1::r0,   sel2::r0,   sel3::r0,
+                     sel4::r0,   sel5::r0,   sel6::r0,   sel7::r0,
+                     sel0::r0+8, sel1::r0+8, sel2::r0+8, sel3::r0+8,
+                     sel4::r0+8, sel5::r0+8, sel6::r0+8, sel7::r0+8);
+    return mask;
+}
+
+/** Makes a mask to shuffle an int8x16 vector using @c permute_bytes16,
+    @c shuffle_bytes16, @c permute_zbytes16 or @c shuffle_zbytes16 functions.
+
+    The template arguments define which elements to select from each element
+    group:
+     * Values [0,15] select elements from the first vector.
+     * Values [16,31] select elements from the second vector. The mask can only be
+       used in @c shuffle_bytes16 or @c shuffle_zbytes16
+     * Value [-1] sets the corresponding element to zero. The mask can only be
+       used in @c permute_zbytes16 or @c shuffle_zbytes16
+
+    @par 128-bit version:
+
+    The created mask will cause @c shuffle_bytes16 to perform as follows:
+    @code
+    r0 = (s0 == -1) ? 0 : (s0 < 16 ? a[s0] : b[s0-16])
+    r1 = (s1 == -1) ? 0 : (s1 < 16 ? a[s1] : b[s1-16])
+    ...
+    r15 = (s15 == -1) ? 0 : (s15 < 16 ? a[s15] : b[s15-16])
+    @endcode
+
+    @par 256-bit version:
+
+    The vectors will be shuffled as if the 128-bit version was applied to the
+    lower and higher halves of the vectors separately.
+*/
+template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7,
+         int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15, unsigned N> SIMDPP_INL
+uint8<N> make_shuffle_bytes16_mask(uint8<N> &mask)
+{
+    detail::assert_selector_range<s0,s1,s2,s3,16>();
+    detail::assert_selector_range<s4,s5,s6,s7,16>();
+    detail::assert_selector_range<s8,s9,s10,s11,16>();
+    detail::assert_selector_range<s12,s13,s14,s15,16>();
+
+    // Every selector maps directly to one byte index; nothing is repeated.
+    using sel0  = detail::get_shuffle_bytex1_16<s0,16>;
+    using sel1  = detail::get_shuffle_bytex1_16<s1,16>;
+    using sel2  = detail::get_shuffle_bytex1_16<s2,16>;
+    using sel3  = detail::get_shuffle_bytex1_16<s3,16>;
+    using sel4  = detail::get_shuffle_bytex1_16<s4,16>;
+    using sel5  = detail::get_shuffle_bytex1_16<s5,16>;
+    using sel6  = detail::get_shuffle_bytex1_16<s6,16>;
+    using sel7  = detail::get_shuffle_bytex1_16<s7,16>;
+    using sel8  = detail::get_shuffle_bytex1_16<s8,16>;
+    using sel9  = detail::get_shuffle_bytex1_16<s9,16>;
+    using sel10 = detail::get_shuffle_bytex1_16<s10,16>;
+    using sel11 = detail::get_shuffle_bytex1_16<s11,16>;
+    using sel12 = detail::get_shuffle_bytex1_16<s12,16>;
+    using sel13 = detail::get_shuffle_bytex1_16<s13,16>;
+    using sel14 = detail::get_shuffle_bytex1_16<s14,16>;
+    using sel15 = detail::get_shuffle_bytex1_16<s15,16>;
+
+    mask = make_uint(sel0::r0,  sel1::r0,  sel2::r0,  sel3::r0,
+                     sel4::r0,  sel5::r0,  sel6::r0,  sel7::r0,
+                     sel8::r0,  sel9::r0,  sel10::r0, sel11::r0,
+                     sel12::r0, sel13::r0, sel14::r0, sel15::r0);
+    return mask;
+}
+
+/** Makes a mask to shuffle an int16x8 vector using @c permute_bytes16,
+    @c shuffle_bytes16, @c permute_zbytes16 or @c shuffle_zbytes16 functions.
+
+    All elements within vectors are grouped into sets of two adjacent elements.
+    Elements within each set of the resulting vector can be selected only from
+    corresponding sets of the source vectors.
+
+    The template arguments define which elements to select from each element
+    group:
+     * Values [0,1] select elements from the first vector.
+     * Values [2,3] select elements from the second vector. The mask can only be
+       used in @c shuffle_bytes16 or @c shuffle_zbytes16
+     * Value [-1] sets the corresponding element to zero. The mask can only be
+       used in @c permute_zbytes16 or @c shuffle_zbytes16
+
+    @par 128-bit version:
+
+    The created mask will cause @c shuffle_bytes16 to perform as follows:
+    @code
+    r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
+    r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
+    r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
+    r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
+    ...
+    r6 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+6] : b[s0+4])
+    r7 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+6] : b[s1+4])
+    @endcode
+
+    @par 256-bit version:
+
+    The vectors will be shuffled as if the 128-bit version was applied to the
+    lower and higher halves of the vectors separately.
+*/
+template<int s0, int s1, unsigned N> SIMDPP_INL
+uint16<N> make_shuffle_bytes16_mask(uint16<N> &mask)
+{
+    detail::assert_selector_range<s0,s1,2>();
+
+    // Each 16-bit element expands to two byte indices (r0, r1).
+    using sel0 = detail::get_shuffle_bytex2_16<s0,2>;
+    using sel1 = detail::get_shuffle_bytex2_16<s1,2>;
+
+    // One pair of elements per row; every following pair is offset by four
+    // more bytes.
+    mask = static_cast< uint8<N*2> >(
+            make_uint(sel0::r0,    sel0::r1,    sel1::r0,    sel1::r1,
+                      sel0::r0+4,  sel0::r1+4,  sel1::r0+4,  sel1::r1+4,
+                      sel0::r0+8,  sel0::r1+8,  sel1::r0+8,  sel1::r1+8,
+                      sel0::r0+12, sel0::r1+12, sel1::r0+12, sel1::r1+12));
+    return mask;
+}
+
+/** Makes a mask to shuffle an int16x8 vector using @c permute_bytes16,
+    @c shuffle_bytes16, @c permute_zbytes16 or @c shuffle_zbytes16 functions.
+
+    All elements within vectors are grouped into sets of four adjacent
+    elements. Elements within each set of the resulting vector can be selected
+    only from corresponding sets of the source vectors.
+
+    The template arguments define which elements to select from each element
+    group:
+     * Values [0,3] select elements from the first vector.
+     * Values [4,7] select elements from the second vector. The mask can only be
+       used in @c shuffle_bytes16 or @c shuffle_zbytes16
+     * Value [-1] sets the corresponding element to zero. The mask can only be
+       used in @c permute_zbytes16 or @c shuffle_zbytes16
+
+    @par 128-bit version:
+
+    The created mask will cause @c shuffle_bytes16 to perform as follows:
+    @code
+    r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
+    r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
+    r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
+    r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
+    r4 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+4] : b[s0])
+    r5 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+4] : b[s1])
+    r6 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+4] : b[s2])
+    r7 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+4] : b[s3])
+    @endcode
+
+    @par 256-bit version:
+
+    The vectors will be shuffled as if the 128-bit version was applied to the
+    lower and higher halves of the vectors separately.
+*/
+template<int s0, int s1, int s2, int s3, unsigned N> SIMDPP_INL
+uint16<N> make_shuffle_bytes16_mask(uint16<N> &mask)
+{
+    detail::assert_selector_range<s0,s1,s2,s3,4>();
+
+    // Each 16-bit element expands to two byte indices (r0, r1).
+    using sel0 = detail::get_shuffle_bytex2_16<s0,4>;
+    using sel1 = detail::get_shuffle_bytex2_16<s1,4>;
+    using sel2 = detail::get_shuffle_bytex2_16<s2,4>;
+    using sel3 = detail::get_shuffle_bytex2_16<s3,4>;
+
+    // First group of four elements, then the same pattern offset by eight
+    // bytes for the second group.
+    mask = static_cast< uint8<N*2> >(
+            make_uint(sel0::r0,   sel0::r1,   sel1::r0,   sel1::r1,
+                      sel2::r0,   sel2::r1,   sel3::r0,   sel3::r1,
+                      sel0::r0+8, sel0::r1+8, sel1::r0+8, sel1::r1+8,
+                      sel2::r0+8, sel2::r1+8, sel3::r0+8, sel3::r1+8));
+    return mask;
+}
+
+/** Makes a mask to shuffle an int16x8 vector using @c permute_bytes16,
+    @c shuffle_bytes16, @c permute_zbytes16 or @c shuffle_zbytes16 functions.
+
+    The template arguments define which elements to select from each element
+    group:
+     * Values [0,7] select elements from the first vector.
+     * Values [8,15] select elements from the second vector. The mask can only be
+       used in @c shuffle_bytes16 or @c shuffle_zbytes16
+     * Value [-1] sets the corresponding element to zero. The mask can only be
+       used in @c permute_zbytes16 or @c shuffle_zbytes16
+
+    @par 128-bit version:
+
+    The created mask will cause @c shuffle_bytes16 to perform as follows:
+    @code
+    r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
+    ...
+    r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
+    @endcode
+
+    @par 256-bit version:
+
+    The vectors will be shuffled as if the 128-bit version was applied to the
+    lower and higher halves of the vectors separately.
+*/
+template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7,
+         unsigned N> SIMDPP_INL
+uint16<N> make_shuffle_bytes16_mask(uint16<N> &mask)
+{
+    detail::assert_selector_range<s0,s1,s2,s3,8>();
+    detail::assert_selector_range<s4,s5,s6,s7,8>();
+
+    // Each 16-bit element expands to two byte indices (r0, r1); every
+    // selector is used exactly once.
+    using sel0 = detail::get_shuffle_bytex2_16<s0,8>;
+    using sel1 = detail::get_shuffle_bytex2_16<s1,8>;
+    using sel2 = detail::get_shuffle_bytex2_16<s2,8>;
+    using sel3 = detail::get_shuffle_bytex2_16<s3,8>;
+    using sel4 = detail::get_shuffle_bytex2_16<s4,8>;
+    using sel5 = detail::get_shuffle_bytex2_16<s5,8>;
+    using sel6 = detail::get_shuffle_bytex2_16<s6,8>;
+    using sel7 = detail::get_shuffle_bytex2_16<s7,8>;
+
+    mask = static_cast< uint8<N*2> >(
+            make_uint(sel0::r0, sel0::r1, sel1::r0, sel1::r1,
+                      sel2::r0, sel2::r1, sel3::r0, sel3::r1,
+                      sel4::r0, sel4::r1, sel5::r0, sel5::r1,
+                      sel6::r0, sel6::r1, sel7::r0, sel7::r1));
+    return mask;
+}
+
+/** Makes a mask to shuffle an int32x4 vector using @c permute_bytes16,
+    @c shuffle_bytes16, @c permute_zbytes16 or @c shuffle_zbytes16 functions.
+
+    All elements within vectors are grouped into sets of two adjacent elements.
+    Elements within each set of the resulting vector can be selected only from
+    corresponding sets of the source vectors.
+
+    The template arguments define which elements to select from each element
+    group:
+     * Values [0,1] select elements from the first vector.
+     * Values [2,3] select elements from the second vector. The mask can only be
+       used in @c shuffle_bytes16 or @c shuffle_zbytes16
+     * Value [-1] sets the corresponding element to zero. The mask can only be
+       used in @c permute_zbytes16 or @c shuffle_zbytes16
+
+    @par 128-bit version:
+
+    The created mask will cause @c shuffle_bytes16 to perform as follows:
+    @code
+    r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
+    r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
+    r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
+    r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
+    @endcode
+
+    @par 256-bit version:
+
+    The vectors will be shuffled as if the 128-bit version was applied to the
+    lower and higher halves of the vectors separately.
+*/
+template<int s0, int s1, unsigned N> SIMDPP_INL
+uint32<N> make_shuffle_bytes16_mask(uint32<N> &mask)
+{
+    detail::assert_selector_range<s0,s1,2>();
+
+    // Each 32-bit element expands to four byte indices (r0..r3).
+    using sel0 = detail::get_shuffle_bytex4_16<s0,2>;
+    using sel1 = detail::get_shuffle_bytex4_16<s1,2>;
+
+    // First pair of elements, then the same pattern offset by eight bytes
+    // for the second pair.
+    mask = static_cast< uint8<N*4> >(
+            make_uint(sel0::r0,   sel0::r1,   sel0::r2,   sel0::r3,
+                      sel1::r0,   sel1::r1,   sel1::r2,   sel1::r3,
+                      sel0::r0+8, sel0::r1+8, sel0::r2+8, sel0::r3+8,
+                      sel1::r0+8, sel1::r1+8, sel1::r2+8, sel1::r3+8));
+    return mask;
+}
+
+/** Makes a mask to shuffle an int32x4 vector using @c permute_bytes16,
+    @c shuffle_bytes16, @c permute_zbytes16 or @c shuffle_zbytes16 functions.
+
+    The template arguments define which elements to select from each element
+    group:
+     * Values [0,3] select elements from the first vector.
+     * Values [4,7] select elements from the second vector. The mask can only be
+       used in @c shuffle_bytes16 or @c shuffle_zbytes16
+     * Value [-1] sets the corresponding element to zero. The mask can only be
+       used in @c permute_zbytes16 or @c shuffle_zbytes16
+
+    @par 128-bit version:
+
+    The created mask will cause @c shuffle_bytes16 to perform as follows:
+    @code
+    r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
+    r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
+    r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
+    r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
+    @endcode
+
+    @par 256-bit version:
+
+    The vectors will be shuffled as if the 128-bit version was applied to the
+    lower and higher halves of the vectors separately.
+*/
+template<int s0, int s1, int s2, int s3, unsigned N> SIMDPP_INL
+uint32<N> make_shuffle_bytes16_mask(uint32<N> &mask)
+{
+    detail::assert_selector_range<s0,s1,s2,s3,4>();
+
+    // Each 32-bit element expands to four byte indices (r0..r3); every
+    // selector is used exactly once.
+    using sel0 = detail::get_shuffle_bytex4_16<s0,4>;
+    using sel1 = detail::get_shuffle_bytex4_16<s1,4>;
+    using sel2 = detail::get_shuffle_bytex4_16<s2,4>;
+    using sel3 = detail::get_shuffle_bytex4_16<s3,4>;
+
+    mask = static_cast< uint8<N*4> >(
+            make_uint(sel0::r0, sel0::r1, sel0::r2, sel0::r3,
+                      sel1::r0, sel1::r1, sel1::r2, sel1::r3,
+                      sel2::r0, sel2::r1, sel2::r2, sel2::r3,
+                      sel3::r0, sel3::r1, sel3::r2, sel3::r3));
+    return mask;
+}
+
+/** Makes a mask to shuffle an int64x2 vector using @c permute_bytes16,
+    @c shuffle_bytes16, @c permute_zbytes16 or @c shuffle_zbytes16 functions.
+
+    The template arguments define which elements to select from each element
+    group:
+     * Values [0,1] select elements from the first vector.
+     * Values [2,3] select elements from the second vector. The mask can only be
+       used in @c shuffle_bytes16 or @c shuffle_zbytes16
+     * Value [-1] sets the corresponding element to zero. The mask can only be
+       used in @c permute_zbytes16 or @c shuffle_zbytes16
+
+    @par 128-bit version:
+
+    The created mask will cause @c shuffle_bytes16 to perform as follows:
+    @code
+    r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
+    r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
+    @endcode
+
+    @par 256-bit version:
+
+    The vectors will be shuffled as if the 128-bit version was applied to the
+    lower and higher halves of the vectors separately.
+*/
+template<int s0, int s1, unsigned N> SIMDPP_INL
+uint64<N> make_shuffle_bytes16_mask(uint64<N> &mask)
+{
+    detail::assert_selector_range<s0,s1,2>();
+
+    // Each 64-bit element expands to eight byte indices (r0..r7).
+    using sel0 = detail::get_shuffle_bytex8_16<s0,2>;
+    using sel1 = detail::get_shuffle_bytex8_16<s1,2>;
+
+    mask = static_cast< uint8<N*8> >(
+            make_uint(sel0::r0, sel0::r1, sel0::r2, sel0::r3,
+                      sel0::r4, sel0::r5, sel0::r6, sel0::r7,
+                      sel1::r0, sel1::r1, sel1::r2, sel1::r3,
+                      sel1::r4, sel1::r5, sel1::r6, sel1::r7));
+    return mask;
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif

+ 199 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/make_uint.h

@@ -0,0 +1,199 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_MAKE_UINT_H
+#define LIBSIMDPP_SIMDPP_CORE_MAKE_UINT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/make_const.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Creates a vector from unsigned integer values known at compile-time.
+    The result of this function may be assigned or converted to a vector of any
+    type: standard conversions are used to convert the arguments. All
+    conversions and other overhead is performed at compile-time thus even if the
+    minimal optimization level is selected, the function results in a simple
+    load from memory.
+
+    The function is not guaranteed to have adequate performance if the
+    arguments are not known at compile-time.
+
+    If the vector has fewer elements than the number of the parameters this
+    function accepts then the extra values are discarded.
+
+    Note that per C++ rules negative values are sign-extended to fill entire
+    element before being converted to unsigned type thus e.g. it's safe to use
+    -1 to fill element with ones.
+
+    @par 1 parameter version
+    @code
+        | 0  1  2  3  ... n  |
+    r = [ v0 v0 v0 v0 ... v0 ]
+    @endcode
+
+    @par 2 parameters version
+    @code
+        | 0  1  2  3  ... n  |
+    r = [ v0 v1 v0 v1 ... v1 ]
+    @endcode
+
+    @par 4 parameters version
+    @code
+        | 0  1  2  3  ... n  |
+    r = [ v0 v1 v2 v3 ... v3 ]
+    @endcode
+
+    @par 8 parameters version
+    @code
+        | 0  1  ..  7  8  ... n  |
+    r = [ v0 v1 .. v7 v0  ... v7 ]
+    @endcode
+*/
+// FIXME: return empty expr
+SIMDPP_INL expr_vec_make_const<uint64_t,1> make_uint(uint64_t v0)
+{
+    // Wrap the single value in a constant expression object; the conversion
+    // to a concrete vector type is not done here.
+    expr_vec_make_const<uint64_t,1> expr;
+    expr.a[0] = v0;
+    return expr;
+}
+
+static SIMDPP_INL
+expr_vec_make_const<uint64_t,2> make_uint(uint64_t v0, uint64_t v1)
+{
+    // Store both values, in argument order, in a constant expression object.
+    expr_vec_make_const<uint64_t,2> expr;
+    expr.a[0] = v0;
+    expr.a[1] = v1;
+    return expr;
+}
+
+static SIMDPP_INL
+expr_vec_make_const<uint64_t,4>
+    make_uint(uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3)
+{
+    // Store the four values, in argument order, in a constant expression
+    // object.
+    expr_vec_make_const<uint64_t,4> expr;
+    expr.a[0] = v0;
+    expr.a[1] = v1;
+    expr.a[2] = v2;
+    expr.a[3] = v3;
+    return expr;
+}
+
+static SIMDPP_INL
+expr_vec_make_const<uint64_t,8>
+    make_uint(uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3,
+              uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7)
+{
+    // Store the eight values, in argument order, in a constant expression
+    // object.
+    expr_vec_make_const<uint64_t,8> expr;
+    expr.a[0] = v0;
+    expr.a[1] = v1;
+    expr.a[2] = v2;
+    expr.a[3] = v3;
+    expr.a[4] = v4;
+    expr.a[5] = v5;
+    expr.a[6] = v6;
+    expr.a[7] = v7;
+    return expr;
+}
+
+static SIMDPP_INL
+expr_vec_make_const<uint64_t,16>
+    make_uint(uint64_t v0,  uint64_t v1,  uint64_t v2,  uint64_t v3,
+              uint64_t v4,  uint64_t v5,  uint64_t v6,  uint64_t v7,
+              uint64_t v8,  uint64_t v9,  uint64_t v10, uint64_t v11,
+              uint64_t v12, uint64_t v13, uint64_t v14, uint64_t v15)
+{
+    // Store the sixteen values, in argument order, in a constant expression
+    // object.
+    expr_vec_make_const<uint64_t,16> expr;
+    expr.a[0]  = v0;
+    expr.a[1]  = v1;
+    expr.a[2]  = v2;
+    expr.a[3]  = v3;
+    expr.a[4]  = v4;
+    expr.a[5]  = v5;
+    expr.a[6]  = v6;
+    expr.a[7]  = v7;
+    expr.a[8]  = v8;
+    expr.a[9]  = v9;
+    expr.a[10] = v10;
+    expr.a[11] = v11;
+    expr.a[12] = v12;
+    expr.a[13] = v13;
+    expr.a[14] = v14;
+    expr.a[15] = v15;
+    return expr;
+}
+
+/// Builds a vector of type V from one unsigned 64-bit value. The argument is
+/// stored in an expr_vec_make_const<uint64_t,1> which is then handed to
+/// detail::insn::i_make_const_any<V> to produce the result.
+template<class V> SIMDPP_INL
+V make_uint(uint64_t v0)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+
+    expr_vec_make_const<uint64_t,1> values;
+    values.a[0] = v0;
+    return detail::insn::i_make_const_any<V>(values);
+}
+
+/// Builds a vector of type V from two unsigned 64-bit values. The arguments
+/// are stored in an expr_vec_make_const<uint64_t,2> which is then handed to
+/// detail::insn::i_make_const_any<V> to produce the result.
+template<class V> SIMDPP_INL
+V make_uint(uint64_t v0, uint64_t v1)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+
+    expr_vec_make_const<uint64_t,2> values;
+    values.a[0] = v0;
+    values.a[1] = v1;
+    return detail::insn::i_make_const_any<V>(values);
+}
+
+/// Builds a vector of type V from four unsigned 64-bit values. The arguments
+/// are stored in an expr_vec_make_const<uint64_t,4> which is then handed to
+/// detail::insn::i_make_const_any<V> to produce the result.
+template<class V> SIMDPP_INL
+V make_uint(uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+
+    expr_vec_make_const<uint64_t,4> values;
+    values.a[0] = v0;
+    values.a[1] = v1;
+    values.a[2] = v2;
+    values.a[3] = v3;
+    return detail::insn::i_make_const_any<V>(values);
+}
+
+/// Builds a vector of type V from eight unsigned 64-bit values. The arguments
+/// are stored in an expr_vec_make_const<uint64_t,8> which is then handed to
+/// detail::insn::i_make_const_any<V> to produce the result.
+template<class V> SIMDPP_INL
+V make_uint(uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3,
+            uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+
+    expr_vec_make_const<uint64_t,8> values;
+    values.a[0] = v0;
+    values.a[1] = v1;
+    values.a[2] = v2;
+    values.a[3] = v3;
+    values.a[4] = v4;
+    values.a[5] = v5;
+    values.a[6] = v6;
+    values.a[7] = v7;
+    return detail::insn::i_make_const_any<V>(values);
+}
+
+/// Builds a vector of type V from sixteen unsigned 64-bit values. The
+/// arguments are stored in an expr_vec_make_const<uint64_t,16> which is then
+/// handed to detail::insn::i_make_const_any<V> to produce the result.
+template<class V> SIMDPP_INL
+V make_uint(uint64_t v0,  uint64_t v1,  uint64_t v2,  uint64_t v3,
+            uint64_t v4,  uint64_t v5,  uint64_t v6,  uint64_t v7,
+            uint64_t v8,  uint64_t v9,  uint64_t v10, uint64_t v11,
+            uint64_t v12, uint64_t v13, uint64_t v14, uint64_t v15)
+{
+    static_assert(is_vector<V>::value && !is_mask<V>::value,
+                  "V must be a non-mask vector");
+
+    expr_vec_make_const<uint64_t,16> values;
+    values.a[0]  = v0;
+    values.a[1]  = v1;
+    values.a[2]  = v2;
+    values.a[3]  = v3;
+    values.a[4]  = v4;
+    values.a[5]  = v5;
+    values.a[6]  = v6;
+    values.a[7]  = v7;
+    values.a[8]  = v8;
+    values.a[9]  = v9;
+    values.a[10] = v10;
+    values.a[11] = v11;
+    values.a[12] = v12;
+    values.a[13] = v13;
+    values.a[14] = v14;
+    values.a[15] = v15;
+    return detail::insn::i_make_const_any<V>(values);
+}
+
+/// Creates a vector initialized to zero.
+/// Returns the single-value constant expression produced by make_uint(0).
+SIMDPP_INL expr_vec_make_const<uint64_t,1> make_zero()
+{
+    expr_vec_make_const<uint64_t,1> zero = make_uint(0);
+    return zero;
+}
+
+/// Creates a vector of type V initialized to zero by forwarding to
+/// make_uint<V>(0).
+template<class V> SIMDPP_INL
+V make_zero()
+{
+    V zero = make_uint<V>(0);
+    return zero;
+}
+
+/// Creates a vector initialized to ones.
+/// Returns a value-initialized expr_vec_make_ones expression object.
+SIMDPP_INL expr_vec_make_ones make_ones()
+{
+    expr_vec_make_ones ones = expr_vec_make_ones();
+    return ones;
+}
+
+/// Creates a vector of type V initialized to ones by converting the
+/// expression returned by the non-template make_ones() to V.
+template<class V> SIMDPP_INL
+V make_ones()
+{
+    return static_cast<V>(make_ones());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 139 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/move_l.h

@@ -0,0 +1,139 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_MOVE_L_H
+#define LIBSIMDPP_SIMDPP_CORE_MOVE_L_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/move_l.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Moves the 8-bit elements in a vector to the left by @a shift positions.
+    Zeros are shifted in at the upper end; @a shift must not exceed 16.
+    @code
+    shift:  pos:| 0   1    .  14  15  |
+     0      r = [ a0  a1   .  a14 a15 ]
+     1      r = [ a1  a2   .  a15  0  ]
+     2      r = [ a2  a3   .   0   0  ]
+      ...    ..   .. ..   ...  ..  .. ..
+     14     r = [ a14 a15  .   0   0  ]
+     15     r = [ a15  0   .   0   0  ]
+     16     r = [  0   0   .   0   0  ]
+    @endcode
+
+    @par 256-bit version:
+    The lower and higher 128-bit halves are processed as if 128-bit instruction
+    was applied to each of them separately.
+
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        move16_l(const any_vec8<N,V>& a)
+{
+    static_assert(shift <= 16, "Shift out of bounds");
+    typename detail::get_expr_nomask_nosign<V>::type ra;
+    ra = a.wrapped().eval();
+    return detail::insn::i_move16_l_wrapper<shift>::run(ra);
+}
+
+/** Shifts the 16-bit elements of a vector to the left (towards position 0)
+    by @a shift positions. Vacated positions at the top are zero-filled.
+    @code
+    shift:  pos:| 0  1   . 6  7  |
+     0      r = [ a0 a1  . a6 a7 ]
+     1      r = [ a1 a2  . a7  0 ]
+     2      r = [ a2 a3  .  0  0 ]
+      ...    ..   .. .. ... .. ..
+     6      r = [ a6 a7  .  0  0 ]
+     7      r = [ a7  0  .  0  0 ]
+     8      r = [  0  0  .  0  0 ]
+    @endcode
+
+    @par 256-bit version:
+    Each 128-bit half is treated on its own, as though the 128-bit operation
+    had been applied to the lower and the upper half separately.
+
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        move8_l(const any_vec16<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask_nosign<V>::type;
+    static_assert(shift <= 8, "Shift out of bounds");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_move8_l_wrapper<shift>::run(src);
+}
+
+/** Shifts the 32-bit elements of a vector to the left (towards position 0)
+    by @a shift positions. Vacated positions at the top are zero-filled.
+    @code
+    shift:  pos:| 0  1  2  3  |
+     0      r = [ a0 a1 a2 a3 ]
+     1      r = [ a1 a2 a3  0 ]
+     2      r = [ a2 a3  0  0 ]
+     3      r = [ a3  0  0  0 ]
+     4      r = [  0  0  0  0 ]
+    @endcode
+
+    @par 256-bit version:
+    Each 128-bit half is treated on its own, as though the 128-bit operation
+    had been applied to the lower and the upper half separately.
+
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        move4_l(const any_vec32<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask_nosign<V>::type;
+    static_assert(shift <= 4, "Shift out of bounds");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_move4_l_wrapper<shift>::run(src);
+}
+
+
+/** Shifts the 64-bit elements of a vector to the left (towards position 0)
+    by @a shift positions. Vacated positions at the top are zero-filled.
+    @code
+    shift:  pos:| 0  1  |
+     0      r = [ a0 a1 ]
+     1      r = [ a1  0 ]
+     2      r = [  0  0 ]
+    @endcode
+
+    @par 256-bit version:
+    Each 128-bit half is treated on its own, as though the 128-bit operation
+    had been applied to the lower and the upper half separately.
+
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        move2_l(const any_vec64<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask_nosign<V>::type;
+    static_assert(shift <= 2, "Shift out of bounds");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_move2_l_wrapper<shift>::run(src);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 141 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/move_r.h

@@ -0,0 +1,141 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_MOVE_R_H
+#define LIBSIMDPP_SIMDPP_CORE_MOVE_R_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/move_r.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Shifts the 8-bit elements of a vector to the right (away from position 0)
+    by @a shift positions. Vacated positions at the bottom are zero-filled.
+    @code
+    shift:  pos:| 0   1    .  14  15  |
+     0      r = [ a0  a1   .  a14 a15 ]
+     1      r = [  0  a0   .  a13 a14 ]
+     2      r = [  0   0   .  a12 a13 ]
+      ...    ..   .. ..   ...  ..  .. ..
+     14     r = [  0   0   .  a0  a1  ]
+     15     r = [  0   0   .   0  a0  ]
+     16     r = [  0   0   .   0   0  ]
+    @endcode
+
+    @par 256-bit version:
+    Each 128-bit half is treated on its own, as though the 128-bit operation
+    had been applied to the lower and the upper half separately.
+
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        move16_r(const any_vec8<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask_nosign<V>::type;
+    static_assert(shift <= 16, "Shift out of bounds");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_move16_r_wrapper<shift>::run(src);
+}
+
+/** Shifts the 16-bit elements of a vector to the right (away from position 0)
+    by @a shift positions. Vacated positions at the bottom are zero-filled.
+    @code
+    shift:  pos:| 0  1   . 6  7  |
+     0      r = [ a0 a1  . a6 a7 ]
+     1      r = [  0 a0  . a5 a6 ]
+     2      r = [  0  0  . a4 a5 ]
+      ...    ..   .. .. ... .. ..
+     6      r = [  0  0  . a0 a1 ]
+     7      r = [  0  0  .  0 a0 ]
+     8      r = [  0  0  .  0  0 ]
+    @endcode
+
+    @par 256-bit version:
+    Each 128-bit half is treated on its own, as though the 128-bit operation
+    had been applied to the lower and the upper half separately.
+
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        move8_r(const any_vec16<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask_nosign<V>::type;
+    static_assert(shift <= 8, "Shift out of bounds");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_move8_r_wrapper<shift>::run(src);
+}
+
+/** Shifts the 32-bit elements of a vector to the right (away from position 0)
+    by @a shift positions. Vacated positions at the bottom are zero-filled.
+    @code
+    shift:  pos:| 0  1  2  3  |
+     0      r = [ a0 a1 a2 a3 ]
+     1      r = [  0 a0 a1 a2 ]
+     2      r = [  0  0 a0 a1 ]
+     3      r = [  0  0  0 a0 ]
+     4      r = [  0  0  0  0 ]
+    @endcode
+
+    @par 256-bit version:
+    Each 128-bit half is treated on its own, as though the 128-bit operation
+    had been applied to the lower and the upper half separately.
+
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        move4_r(const any_vec32<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask_nosign<V>::type;
+    static_assert(shift <= 4, "Shift out of bounds");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_move4_r_wrapper<shift>::run(src);
+}
+
+
+/** Shifts the 64-bit elements of a vector to the right (away from position 0)
+    by @a shift positions. Vacated positions at the bottom are zero-filled.
+    @code
+    shift:  pos:| 0  1  |
+     0      r = [ a0 a1 ]
+     1      r = [  0 a0 ]
+     2      r = [  0  0 ]
+    @endcode
+
+    @par 256-bit version:
+    Each 128-bit half is treated on its own, as though the 128-bit operation
+    had been applied to the lower and the upper half separately.
+
+    @icost{SSE2-AVX, NEON, ALTIVEC, 2}
+*/
+template<unsigned shift, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        move2_r(const any_vec64<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask_nosign<V>::type;
+    static_assert(shift <= 2, "Shift out of bounds");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_move2_r_wrapper<shift>::run(src);
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 135 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/permute2.h

@@ -0,0 +1,135 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_PERMUTE2_H
+#define LIBSIMDPP_SIMDPP_CORE_PERMUTE2_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/permute2.h>
+#include <simdpp/detail/get_expr.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Rearranges the 16-bit elements inside every pair of adjacent elements of
+    the vector. Both selectors must lie in the range [0; 1].
+    @code
+    r0 = a[s0]
+    r1 = a[s1]
+    r2 = a[s0+2]
+    r3 = a[s1+2]
+    r4 = a[s0+4]
+    r5 = a[s1+4]
+    ...
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 2}
+    @icost{NEON, ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 4}
+    @icost{AVX2, 2}
+    @icost{NEON, 2-4}
+    @icost{ALTIVEC, 2-3}
+*/
+template<unsigned s0, unsigned s1, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        permute2(const any_vec16<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask<V>::type;
+    static_assert(s0 < 2 && s1 < 2, "Selector out of range");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_permute2<s0, s1>(src);
+}
+
+/** Permutes the values of each set of two consecutive 32-bit values. The
+    selector values must be in range [0; 1].
+
+    @code
+    r0 = a[s0]
+    r1 = a[s1]
+    r2 = a[s0+2]
+    r3 = a[s1+2]
+    256-bit version:
+    r4 = a[s0+4]
+    r5 = a[s1+4]
+    r6 = a[s0+6]
+    r7 = a[s1+6]
+    @endcode
+
+    @par integer
+    @par 128-bit version:
+    @icost{NEON, 2-4}
+    @icost{ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 2}
+    @icost{NEON, 4-8}
+    @icost{ALTIVEC, 2-3}
+
+    @par floating-point
+    @par 128-bit version:
+    @icost{NEON, 2-4}
+    @icost{ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 2}
+    @icost{NEON, 4-8}
+    @icost{ALTIVEC, 2-3}
+*/
+template<unsigned s0, unsigned s1, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        permute2(const any_vec32<N,V>& a)
+{
+    static_assert(s0 < 2 && s1 < 2, "Selector out of range");
+    typename detail::get_expr_nomask<V>::type ra;
+    ra = a.wrapped().eval();
+    return detail::insn::i_permute2<s0,s1>(ra);
+}
+
+/** Permutes the values of each set of two consecutive 64-bit values. The
+    selector values must be in range [0; 1].
+
+    @code
+    r0 = a[s0]
+    r1 = a[s1]
+
+    256-bit version:
+    r2 = a[s0+2]
+    r3 = a[s1+2]
+    @endcode
+
+    @par 128-bit version:
+    @icost{NEON, 1-2}
+    @icost{ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 2}
+    @icost{NEON, 2-4}
+    @icost{ALTIVEC, 2-4}
+*/
+template<unsigned s0, unsigned s1, unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        permute2(const any_vec64<N,V>& a)
+{
+    static_assert(s0 < 2 && s1 < 2, "Selector out of range");
+    typename detail::get_expr_nomask<V>::type ra;
+    ra = a.wrapped().eval();
+    return detail::insn::i_permute2<s0,s1>(ra);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 142 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/permute4.h

@@ -0,0 +1,142 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_PERMUTE4_H
+#define LIBSIMDPP_SIMDPP_CORE_PERMUTE4_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/permute4.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Rearranges the 16-bit elements inside every group of four adjacent
+    elements of the vector. All selectors must lie in the range [0; 3].
+    @code
+    r0 = a[s0]
+    ...
+    r3 = a[s3]
+    r4 = a[s0+4]
+    ...
+    r7 = a[s3+4]
+
+    256-bit version:
+
+    r8 = a[s0+8]
+    ...
+    r11 = a[s3+8]
+    r12 = a[s0+12]
+    ...
+    r15 = a[s3+12]
+    @endcode
+
+    @par 128-bit version:
+    @icost{SSE2-AVX2, 2}
+    @icost{NEON, 1-5}
+    @icost{ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 4}
+    @icost{AVX2, 2}
+    @icost{NEON, 2-10}
+    @icost{ALTIVEC, 2-3}
+*/
+template<unsigned s0, unsigned s1, unsigned s2, unsigned s3,
+         unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        permute4(const any_vec16<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask<V>::type;
+    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_permute4<s0, s1, s2, s3>(src);
+}
+
+/** Rearranges the 32-bit elements inside every group of four adjacent
+    elements of the vector. All selectors must lie in the range [0; 3].
+    @code
+    r0 = a[s0]
+    ...
+    r3 = a[s3]
+
+    256-bit version:
+    r4 = a[s0+4]
+    ...
+    r7 = a[s3+4]
+    @endcode
+
+    @par integer
+    @par 128-bit version:
+    @icost{NEON, 1-4}
+    @icost{ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 2}
+    @icost{NEON, 2-8}
+    @icost{ALTIVEC, 2-3}
+
+    @par floating-point
+    @par 128-bit version:
+    @icost{NEON, 1-4}
+    @icost{ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @icost{NEON, 2-8}
+    @icost{ALTIVEC, 2-3}
+*/
+template<unsigned s0, unsigned s1, unsigned s2, unsigned s3,
+         unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        permute4(const any_vec32<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask<V>::type;
+    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_permute4<s0, s1, s2, s3>(src);
+}
+
+/** Rearranges the 64-bit elements inside every group of four adjacent
+    elements of the vector. All selectors must lie in the range [0; 3].
+    @code
+    r0 = a[s0]
+    r1 = a[s1]
+    r2 = a[s2]
+    r3 = a[s3]
+    @endcode
+
+    @par integer
+    @icost{SSE2-AVX, 2}
+
+    @par floating-point
+    @icost{SSE2-AVX, 1-2}
+    @icost{NEON, 1-4}
+    @icost{ALTIVEC, 1-4}
+*/
+template<unsigned s0, unsigned s1, unsigned s2, unsigned s3,
+         unsigned N, class V> SIMDPP_INL
+typename detail::get_expr_nomask<V>::empty
+        permute4(const any_vec64<N,V>& a)
+{
+    using eval_type = typename detail::get_expr_nomask<V>::type;
+    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
+    eval_type src;
+    src = a.wrapped().eval();
+    return detail::insn::i_permute4<s0, s1, s2, s3>(src);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 62 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/permute_bytes16.h

@@ -0,0 +1,62 @@
+/*  Copyright (C) 2013-2017  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_PERMUTE_BYTES16_H
+#define LIBSIMDPP_SIMDPP_CORE_PERMUTE_BYTES16_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/get_expr.h>
+#include <simdpp/detail/insn/permute_bytes16.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Picks bytes out of a vector as directed by a mask. Every byte of the mask
+    names the element to take:
+     * Bits 7-4 must be zero, otherwise the behavior is undefined
+     * Bits 3-0 give the index of the element within the given vector.
+*/
+template<unsigned N, class V1, class E2> SIMDPP_INL
+typename detail::get_expr_nomask<V1>::empty
+    permute_bytes16(const any_vec8<N,V1>& a, const uint8<N,E2>& mask)
+{
+    using eval_type = typename detail::get_expr_nomask<V1>::type;
+    eval_type src = a.wrapped().eval();
+    return detail::insn::i_permute_bytes16(src, mask.eval());
+}
+
+/// Overload for vectors of 16-bit elements; the mask is passed as a uint16 vector.
+template<unsigned N, class V1, class E2> SIMDPP_INL
+typename detail::get_expr_nomask<V1>::empty
+    permute_bytes16(const any_vec16<N,V1>& a, const uint16<N,E2>& mask)
+{
+    using eval_type = typename detail::get_expr_nomask<V1>::type;
+    eval_type src = a.wrapped().eval();
+    return detail::insn::i_permute_bytes16(src, mask.eval());
+}
+
+/// Overload for vectors of 32-bit elements; the mask is passed as a uint32 vector.
+template<unsigned N, class V1, class E2> SIMDPP_INL
+typename detail::get_expr_nomask<V1>::empty
+    permute_bytes16(const any_vec32<N,V1>& a, const uint32<N,E2>& mask)
+{
+    using eval_type = typename detail::get_expr_nomask<V1>::type;
+    eval_type src = a.wrapped().eval();
+    return detail::insn::i_permute_bytes16(src, mask.eval());
+}
+
+/// Overload for vectors of 64-bit elements; the mask is passed as a uint64 vector.
+template<unsigned N, class V1, class E2> SIMDPP_INL
+typename detail::get_expr_nomask<V1>::empty
+    permute_bytes16(const any_vec64<N,V1>& a, const uint64<N,E2>& mask)
+{
+    using eval_type = typename detail::get_expr_nomask<V1>::type;
+    eval_type src = a.wrapped().eval();
+    return detail::insn::i_permute_bytes16(src, mask.eval());
+}
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 64 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/permute_zbytes16.h

@@ -0,0 +1,64 @@
+/*  Copyright (C) 2013-2017  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_PERMUTE_ZBYTES16_H
+#define LIBSIMDPP_SIMDPP_CORE_PERMUTE_ZBYTES16_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/get_expr.h>
+#include <simdpp/detail/insn/permute_zbytes16.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Picks bytes out of a vector as directed by a mask, with the option of
+    producing zero instead. Every byte of the mask controls one result byte:
+     * Bit 7, when set, zeroes the result byte.
+     * Bits 6-4 must be zero, otherwise the behavior is undefined
+     * Bits 3-0 give the index of the element within the given vector.
+*/
+template<unsigned N, class V1, class E2> SIMDPP_INL
+typename detail::get_expr_nomask<V1>::empty
+    permute_zbytes16(const any_vec8<N,V1>& a, const uint8<N,E2>& mask)
+{
+    using eval_type = typename detail::get_expr_nomask<V1>::type;
+    eval_type src = a.wrapped().eval();
+    return detail::insn::i_permute_zbytes16(src, mask.eval());
+}
+
+/// Overload for vectors of 16-bit elements; the mask is passed as a uint16 vector.
+template<unsigned N, class V1, class E2> SIMDPP_INL
+typename detail::get_expr_nomask<V1>::empty
+    permute_zbytes16(const any_vec16<N,V1>& a, const uint16<N,E2>& mask)
+{
+    using eval_type = typename detail::get_expr_nomask<V1>::type;
+    eval_type src = a.wrapped().eval();
+    return detail::insn::i_permute_zbytes16(src, mask.eval());
+}
+
+/// Overload for vectors of 32-bit elements; the mask is passed as a uint32 vector.
+template<unsigned N, class V1, class E2> SIMDPP_INL
+typename detail::get_expr_nomask<V1>::empty
+    permute_zbytes16(const any_vec32<N,V1>& a, const uint32<N,E2>& mask)
+{
+    using eval_type = typename detail::get_expr_nomask<V1>::type;
+    eval_type src = a.wrapped().eval();
+    return detail::insn::i_permute_zbytes16(src, mask.eval());
+}
+
+/// Overload for vectors of 64-bit elements; the mask is passed as a uint64 vector.
+template<unsigned N, class V1, class E2> SIMDPP_INL
+typename detail::get_expr_nomask<V1>::empty
+    permute_zbytes16(const any_vec64<N,V1>& a, const uint64<N,E2>& mask)
+{
+    using eval_type = typename detail::get_expr_nomask<V1>::type;
+    eval_type src = a.wrapped().eval();
+    return detail::insn::i_permute_zbytes16(src, mask.eval());
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 54 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/set_splat.h

@@ -0,0 +1,54 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_SET_SPLAT_H
+#define LIBSIMDPP_SIMDPP_CORE_SET_SPLAT_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/set_splat.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Loads a value from a register and broadcasts it to all elements of a vector.
+    The argument value is converted to the element of the resulting vector using
+    standard conversions.
+
+    The non-template overloads wrap the scalar in an expr_vec_set_splat
+    expression; `long` and `unsigned long` arguments are carried as 64-bit
+    values there. The overloads templated on V forward the scalar to
+    detail::splat_impl<V> and return a V.
+
+    @code
+    r0 = a
+    ...
+    rN = a
+    @endcode
+*/
+// FIXME: return empty expression
+SIMDPP_INL expr_vec_set_splat<uint32_t> splat(unsigned x) { return { x }; }
+SIMDPP_INL expr_vec_set_splat<uint64_t> splat(unsigned long x) { return { x }; }
+SIMDPP_INL expr_vec_set_splat<uint64_t> splat(unsigned long long x) { return { x }; }
+SIMDPP_INL expr_vec_set_splat<int32_t>  splat(int x) { return { x }; }
+SIMDPP_INL expr_vec_set_splat<int64_t>  splat(long x) { return { x }; }
+SIMDPP_INL expr_vec_set_splat<int64_t>  splat(long long x) { return { x }; }
+SIMDPP_INL expr_vec_set_splat<float>    splat(float x) { return { x }; }
+SIMDPP_INL expr_vec_set_splat<double>   splat(double x) { return { x }; }
+
+template<class V> SIMDPP_INL V splat(unsigned x) { return detail::splat_impl<V>(x); }
+template<class V> SIMDPP_INL V splat(unsigned long x) { return detail::splat_impl<V>(x); }
+template<class V> SIMDPP_INL V splat(unsigned long long x) { return detail::splat_impl<V>(x); }
+template<class V> SIMDPP_INL V splat(int x) { return detail::splat_impl<V>(x); }
+template<class V> SIMDPP_INL V splat(long x) { return detail::splat_impl<V>(x); }
+template<class V> SIMDPP_INL V splat(long long x) { return detail::splat_impl<V>(x); }
+template<class V> SIMDPP_INL V splat(float x) { return detail::splat_impl<V>(x); }
+template<class V> SIMDPP_INL V splat(double x) { return detail::splat_impl<V>(x); }
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 65 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/shuffle1.h

@@ -0,0 +1,65 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_SHUFFLV1_H
+#define LIBSIMDPP_SIMDPP_CORE_SHUFFLV1_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/shuffle2x2.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Builds each pair of 64-bit elements from two source vectors: the first
+    element of the pair is picked from @a a, the second one from @a b. Both
+    selectors must lie in the range [0; 1].
+
+    @code
+    r0 = a[s0]
+    r1 = b[s1]
+
+    256-bit version:
+    r2 = a[s0+2]
+    r3 = b[s1+2]
+    @endcode
+
+    @par floating-point
+    @par 128-bit version:
+    @novec{NEON, ALTIVEC}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @novec{NEON, ALTIVEC}
+
+    @par integer
+    @par 128-bit version:
+    @icost{ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 2}
+    @icost{NEON, 1-2}
+    @icost{ALTIVEC, 2-3}
+*/
+template<unsigned s0, unsigned s1, unsigned N, class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+        shuffle1(const any_vec64<N,V1>& a, const any_vec64<N,V2>& b)
+{
+    using eval_type = typename detail::get_expr2_nomask<V1, V2>::type;
+    static_assert(s0 < 2 && s1 < 2, "Selector out of range");
+
+    eval_type lhs = a.wrapped().eval();
+    eval_type rhs = b.wrapped().eval();
+    // The selector that refers to b is handed on with an offset of 2
+    return detail::insn::i_shuffle2x2<s0, s1 + 2>(lhs, rhs);
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 122 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/shuffle2.h

@@ -0,0 +1,122 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_SHUFFLE2_H
+#define LIBSIMDPP_SIMDPP_CORE_SHUFFLE2_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/shuffle4x2.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Builds each group of four 32-bit elements from two source vectors: the
+    first two elements are picked from @a a, the last two from @a b. All
+    selectors must lie in the range [0; 3].
+
+    @code
+    r0 = a[sa0]
+    r1 = a[sa1]
+    r2 = b[sb0]
+    r3 = b[sb1]
+
+    256-bit version:
+    r4 = a[sa0+4]
+    r5 = a[sa1+4]
+    r6 = b[sb0+4]
+    r7 = b[sb1+4]
+    @endcode
+
+    @par floating-point
+    @par 128-bit version:
+    @icost{ALTIVEC, 1-2}
+    @icost{NEON, 1-4}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @icost{NEON, 2-8}
+    @icost{ALTIVEC, 2-3}
+
+    @par integer
+    @par 128-bit version:
+    @icost{NEON, 1-4}
+    @icost{ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 2}
+    @icost{NEON, 2-8}
+    @icost{ALTIVEC, 2-3}
+*/
+template<unsigned sa0, unsigned sa1, unsigned sb0, unsigned sb1, unsigned N,
+         class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+    shuffle2(const any_vec32<N,V1>& a, const any_vec32<N,V2>& b)
+{
+    using eval_type = typename detail::get_expr2_nomask<V1,V2,void>::type;
+    static_assert(sa0 < 4 && sa1 < 4 && sb0 < 4 && sb1 < 4, "Selector out of range");
+
+    eval_type lhs = a.wrapped().eval();
+    eval_type rhs = b.wrapped().eval();
+    // The selectors that refer to b are handed on with an offset of 4
+    return detail::insn::i_shuffle4x2<sa0, sa1, sb0 + 4, sb1 + 4>(lhs, rhs);
+}
+
+/** Builds each group of four 32-bit elements from two source vectors using
+    the same pair of selectors for both: the first two elements are picked
+    from @a a, the last two from @a b. Both selectors must lie in the range
+    [0; 3].
+
+    @code
+    r0 = a[s0]
+    r1 = a[s1]
+    r2 = b[s0]
+    r3 = b[s1]
+
+    256-bit version:
+    r4 = a[s0+4]
+    r5 = a[s1+4]
+    r6 = b[s0+4]
+    r7 = b[s1+4]
+    @endcode
+
+    @par floating-point
+    @par 128-bit version:
+    @icost{ALTIVEC, 1-2}
+    @icost{NEON, 2-4}
+
+    @par 256-bit version:
+    @icost{SSE2-SSE4.1, 2}
+    @icost{NEON, 4-8}
+    @icost{ALTIVEC, 2-3}
+
+    @par integer
+    @par 128-bit version:
+    @icost{NEON, 2-4}
+    @icost{ALTIVEC, 1-2}
+
+    @par 256-bit version:
+    @icost{SSE2-AVX, 2}
+    @icost{NEON, 4-8}
+    @icost{ALTIVEC, 2-3}
+*/
+template<unsigned s0, unsigned s1, unsigned N,
+         class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+    shuffle2(const any_vec32<N,V1>& a, const any_vec32<N,V2>& b)
+{
+    using eval_type = typename detail::get_expr2_nomask<V1,V2,void>::type;
+    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
+
+    eval_type lhs = a.wrapped().eval();
+    eval_type rhs = b.wrapped().eval();
+    // Each selector is used twice: as is for a, and offset by 4 for b
+    return detail::insn::i_shuffle4x2<s0, s1, s0 + 4, s1 + 4>(lhs, rhs);
+}
+
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 75 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/shuffle2x2.h

@@ -0,0 +1,75 @@
+/*  Copyright (C) 2014  Povilas Kanapickas <[email protected]>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_SHUFFLE2x2_H
+#define LIBSIMDPP_SIMDPP_CORE_SHUFFLE2x2_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/shuffle4x2.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Picks 32-bit elements out of two vectors.
+    Both selectors must lie in the range [0; 3].
+
+    @code
+    For each 64-bit segment:
+    ab = [ a..b ]
+    r0 = ab[s0]
+    r1 = ab[s1]
+    @endcode
+
+    Only compiled in when SIMDPP_USE_NULL, SIMDPP_USE_SSE2 or SIMDPP_USE_NEON
+    is set; otherwise the call expands to SIMDPP_NOT_IMPLEMENTED_TEMPLATE2.
+*/
+template<unsigned s0, unsigned s1, unsigned N,
+         class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+    shuffle2x2(const any_vec32<N,V1>& a, const any_vec32<N,V2>& b)
+{
+#if SIMDPP_USE_NULL || SIMDPP_USE_SSE2 || SIMDPP_USE_NEON
+    using eval_type = typename detail::get_expr2_nomask_nosign<V1,V2,void>::type;
+    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
+
+    eval_type lhs = a.wrapped().eval();
+    eval_type rhs = b.wrapped().eval();
+    return detail::insn::i_shuffle2x2<s0, s1>(lhs, rhs);
+#else
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(V1, a, b);
+#endif
+}
+
+/** Picks 64-bit elements out of two vectors.
+    Both selectors must lie in the range [0; 3].
+
+    @code
+    For each 128-bit segment:
+    ab = [ a..b ]
+    r0 = ab[s0]
+    r1 = ab[s1]
+    @endcode
+
+    Only compiled in when SIMDPP_USE_NULL, SIMDPP_USE_SSE2 or SIMDPP_USE_NEON
+    is set; otherwise the call expands to SIMDPP_NOT_IMPLEMENTED_TEMPLATE2.
+*/
+template<unsigned s0, unsigned s1, unsigned N,
+         class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+    shuffle2x2(const any_vec64<N,V1>& a, const any_vec64<N,V2>& b)
+{
+#if SIMDPP_USE_NULL || SIMDPP_USE_SSE2 || SIMDPP_USE_NEON
+    using eval_type = typename detail::get_expr2_nomask_nosign<V1,V2,void>::type;
+    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
+
+    eval_type lhs = a.wrapped().eval();
+    eval_type rhs = b.wrapped().eval();
+    return detail::insn::i_shuffle2x2<s0, s1>(lhs, rhs);
+#else
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(V1, a, b);
+#endif
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

+ 79 - 0
Source/BansheeUtility/ThirdParty/simdpp/core/shuffle4x2.h

@@ -0,0 +1,79 @@
+/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>
+
+    Distributed under the Boost Software License, Version 1.0.
+        (See accompanying file LICENSE_1_0.txt or copy at
+            http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+#ifndef LIBSIMDPP_SIMDPP_CORE_SHUFFLE4x2_H
+#define LIBSIMDPP_SIMDPP_CORE_SHUFFLE4x2_H
+
+#ifndef LIBSIMDPP_SIMD_H
+    #error "This file must be included through simd.h"
+#endif
+
+#include <simdpp/types.h>
+#include <simdpp/detail/insn/shuffle4x2.h>
+
+namespace simdpp {
+namespace SIMDPP_ARCH_NAMESPACE {
+
+/** Builds a vector by picking four 32-bit elements per 128-bit segment out of
+    the concatenation of @a a and @a b.
+
+    @a s0 .. @a s3 are compile-time selectors and must lie in the range
+    [0; 7]; this is enforced by a static_assert on supported targets.
+
+    @code
+    For each 128-bit segment:
+    ab = [ a..b ]
+    r0 = ab[s0]
+    r1 = ab[s1]
+    r2 = ab[s2]
+    r3 = ab[s3]
+    @endcode
+
+    The operation is only provided when SIMDPP_USE_NULL, SIMDPP_USE_SSE2 or
+    SIMDPP_USE_NEON is enabled. On any other configuration the body expands to
+    SIMDPP_NOT_IMPLEMENTED_TEMPLATE2.
+*/
+template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N,
+         class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+    shuffle4x2(const any_vec32<N,V1>& a, const any_vec32<N,V2>& b)
+{
+#if !(SIMDPP_USE_NULL || SIMDPP_USE_SSE2 || SIMDPP_USE_NEON)
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(V1, a, b);
+#else
+    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
+
+    // Evaluate both operands into the common mask-free, sign-agnostic type
+    // before handing them to the instruction-level implementation.
+    using operand_type = typename detail::get_expr2_nomask_nosign<V1,V2,void>::type;
+    operand_type lhs = a.wrapped().eval();
+    operand_type rhs = b.wrapped().eval();
+
+    return detail::insn::i_shuffle4x2<s0,s1,s2,s3>(lhs, rhs);
+#endif
+}
+
+/** Builds a vector by picking four 64-bit elements per 256-bit segment out of
+    the concatenation of @a a and @a b.
+
+    @a s0 .. @a s3 are compile-time selectors and must lie in the range
+    [0; 7]; this is enforced by a static_assert on supported targets.
+
+    @code
+    For each 256-bit segment:
+    ab = [ a..b ]
+    r0 = ab[s0]
+    r1 = ab[s1]
+    r2 = ab[s2]
+    r3 = ab[s3]
+    @endcode
+
+    The operation is only provided when SIMDPP_USE_NULL, SIMDPP_USE_SSE2 or
+    SIMDPP_USE_NEON is enabled. On any other configuration the body expands to
+    SIMDPP_NOT_IMPLEMENTED_TEMPLATE2.
+*/
+template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N,
+         class V1, class V2> SIMDPP_INL
+typename detail::get_expr2_nomask<V1, V2>::empty
+    shuffle4x2(const any_vec64<N,V1>& a, const any_vec64<N,V2>& b)
+{
+#if !(SIMDPP_USE_NULL || SIMDPP_USE_SSE2 || SIMDPP_USE_NEON)
+    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(V1, a, b);
+#else
+    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
+
+    // Evaluate both operands into the common mask-free, sign-agnostic type
+    // before handing them to the instruction-level implementation.
+    using operand_type = typename detail::get_expr2_nomask_nosign<V1,V2,void>::type;
+    operand_type lhs = a.wrapped().eval();
+    operand_type rhs = b.wrapped().eval();
+
+    return detail::insn::i_shuffle4x2<s0,s1,s2,s3>(lhs, rhs);
+#endif
+}
+
+} // namespace SIMDPP_ARCH_NAMESPACE
+} // namespace simdpp
+
+#endif
+

Некоторые файлы не были показаны из-за большого количества измененных файлов