11 years ago · 0bffce4f4b
--- a/glm/detail/func_integer.inl
+++ b/glm/detail/func_integer.inl
@@ -87,6 +87,117 @@ namespace detail
 
				 			return (v & Mask) + ((v >> Shift) & Mask);
			
 
				 		}
			
 
				 	};
			
 
				+
			
 
				+	template <typename genIUType, size_t Bits>
			
 
				+	struct compute_findLSB
			
 
				+	{
			
 
				+		GLM_FUNC_QUALIFIER static int call(genIUType Value)
			
 
				+		{
			
 
				+			if(Value == 0)
			
 
				+				return -1;
			
 
				+
			
 
				+			return glm::bitCount(~Value & (Value - static_cast<genIUType>(1)));
			
 
				+		}
			
 
				+	};
			
 
				+
			
 
				+#	if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & (GLM_COMPILER_VC | GLM_COMPILER_APPLE_CLANG | GLM_COMPILER_LLVM))
			
 
				+
			
 
				+	template <typename genIUType>
			
 
				+	struct compute_findLSB<genIUType, 32>
			
 
				+	{
			
 
				+		GLM_FUNC_QUALIFIER static int call(genIUType Value)
			
 
				+		{
			
 
				+			unsigned long Result(0);
			
 
				+			unsigned char IsNotNull = _BitScanForward(&Result, *reinterpret_cast<unsigned long*>(&Value));
			
 
				+			return IsNotNull ? int(Result) : -1;
			
 
				+		}
			
 
				+	};
			
 
				+
			
 
				+	template <typename genIUType>
			
 
				+	struct compute_findLSB<genIUType, 64>
			
 
				+	{
			
 
				+		GLM_FUNC_QUALIFIER static int call(genIUType Value)
			
 
				+		{
			
 
				+			unsigned long Result(0);
			
 
				+			unsigned char IsNotNull = _BitScanForward64(&Result, *reinterpret_cast<unsigned __int64*>(&Value));
			
 
				+			return IsNotNull ? int(Result) : -1;
			
 
				+		}
			
 
				+	};
			
 
				+
			
 
				+#	endif
			
 
				+
			
 
				+	template <typename T, glm::precision P, template <class, glm::precision> class vecType, bool EXEC = true>
			
 
				+	struct compute_findMSB_step_vec
			
 
				+	{
			
 
				+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & x, T Shift)
			
 
				+		{
			
 
				+			return x | (x >> Shift);
			
 
				+		}
			
 
				+	};
			
 
				+
			
 
				+	template <typename T, glm::precision P, template <class, glm::precision> class vecType>
			
 
				+	struct compute_findMSB_step_vec<T, P, vecType, false>
			
 
				+	{
			
 
				+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & x, T)
			
 
				+		{
			
 
				+			return x;
			
 
				+		}
			
 
				+	};
			
 
				+
			
 
				+	template <typename T, glm::precision P, template <class, glm::precision> class vecType, std::size_t>
			
 
				+	struct compute_findMSB_vec
			
 
				+	{
			
 
				+		GLM_FUNC_QUALIFIER static vecType<int, P> call(vecType<T, P> const & vec)
			
 
				+		{
			
 
				+			vecType<T, P> x(vec);
			
 
				+			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >=  8>::call(x, static_cast<T>( 1));
			
 
				+			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >=  8>::call(x, static_cast<T>( 2));
			
 
				+			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >=  8>::call(x, static_cast<T>( 4));
			
 
				+			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 16>::call(x, static_cast<T>( 8));
			
 
				+			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 32>::call(x, static_cast<T>(16));
			
 
				+			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 64>::call(x, static_cast<T>(32));
			
 
				+			return vecType<int, P>(sizeof(T) * 8 - 1) - glm::bitCount(~x);
			
 
				+		}
			
 
				+	};
			
 
				+
			
 
				+#	if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & (GLM_COMPILER_VC | GLM_COMPILER_APPLE_CLANG | GLM_COMPILER_LLVM))
			
 
				+
			
 
				+	template <typename genIUType>
			
 
				+	GLM_FUNC_QUALIFIER int compute_findMSB_32(genIUType Value)
			
 
				+	{
			
 
				+		unsigned long Result(0);
			
 
				+		unsigned char IsNotNull = _BitScanReverse(&Result, *reinterpret_cast<unsigned long*>(&Value));
			
 
				+		return IsNotNull ? int(Result) : -1;
			
 
				+	}
			
 
				+
			
 
				+	template <typename genIUType>
			
 
				+	GLM_FUNC_QUALIFIER int compute_findMSB_64(genIUType Value)
			
 
				+	{
			
 
				+		unsigned long Result(0);
			
 
				+		unsigned char IsNotNull = _BitScanReverse64(&Result, *reinterpret_cast<unsigned __int64*>(&Value));
			
 
				+		return IsNotNull ? int(Result) : -1;
			
 
				+	}
			
 
				+
			
 
				+	template <typename T, glm::precision P, template <class, glm::precision> class vecType>
			
 
				+	struct compute_findMSB_vec<T, P, vecType, 32>
			
 
				+	{
			
 
				+		GLM_FUNC_QUALIFIER static int call(vecType<T, P> const & x)
			
 
				+		{
			
 
				+			return detail::functor1<int, T, P, vecType>::call(compute_findMSB_32, x);
			
 
				+		}
			
 
				+	};
			
 
				+
			
 
				+	template <typename T, glm::precision P, template <class, glm::precision> class vecType>
			
 
				+	struct compute_findMSB_vec<T, P, vecType, 64>
			
 
				+	{
			
 
				+		GLM_FUNC_QUALIFIER static int call(vecType<T, P> const & x)
			
 
				+		{
			
 
				+			return detail::functor1<int, T, P, vecType>::call(compute_findMSB_64, x);
			
 
				+		}
			
 
				+	};
			
 
				+
			
 
				+#	endif
			
 
				+
			
 
				 }//namespace detail
			
 
				 
			
 
				 	// uaddCarry
			
@@ -248,12 +359,8 @@ namespace detail
 
				 	GLM_FUNC_QUALIFIER int findLSB(genIUType Value)
			
 
				 	{
			
 
				 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findLSB' only accept integer values");
			
 
				-		if(Value == 0)
			
 
				-			return -1;
			
 
				 
			
 
				-		genIUType Bit;
			
 
				-		for(Bit = genIUType(0); !(Value & (1 << Bit)); ++Bit){}
			
 
				-		return Bit;
			
 
				+		return detail::compute_findLSB<genIUType, sizeof(genIUType) * 8>::call(Value);
			
 
				 	}
			
 
				 
			
 
				 	template <typename T, precision P, template <typename, precision> class vecType>
			
@@ -265,89 +372,19 @@ namespace detail
 
				 	}
			
 
				 
			
 
				 	// findMSB
			
 
				-#if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & GLM_COMPILER_VC)
			
 
				-
			
 
				 	template <typename genIUType>
			
 
				-	GLM_FUNC_QUALIFIER int findMSB(genIUType Value)
			
 
				+	GLM_FUNC_QUALIFIER int findMSB(genIUType x)
			
 
				 	{
			
 
				 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
			
 
				-		if(Value == 0)
			
 
				-			return -1;
			
 
				 
			
 
				-		unsigned long Result(0);
			
 
				-		_BitScanReverse(&Result, Value);
			
 
				-		return int(Result);
			
 
				+		return findMSB(tvec1<genIUType>(x)).x;
			
 
				 	}
			
 
				-/*
			
 
				-// __builtin_clz seems to be buggy as it crasks for some values, from 0x00200000 to 80000000
			
 
				-#elif((GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & GLM_COMPILER_GCC) && (GLM_COMPILER >= GLM_COMPILER_GCC40))
			
 
				-
			
 
				-	template <typename genIUType>
			
 
				-	GLM_FUNC_QUALIFIER int findMSB
			
 
				-	(
			
 
				-		genIUType const & Value
			
 
				-	)
			
 
				-	{
			
 
				-		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
			
 
				-		if(Value == 0)
			
 
				-			return -1;
			
 
				-
			
 
				-		// clz returns the number or trailing 0-bits; see
			
 
				-		// http://gcc.gnu.org/onlinedocs/gcc-4.7.1/gcc/Other-Builtins.html
			
 
				-		//
			
 
				-		// NoteBecause __builtin_clz only works for unsigned ints, this
			
 
				-		// implementation will not work for 64-bit integers.
			
 
				-		//
			
 
				-		return 31 - __builtin_clzl(Value);
			
 
				-	}
			
 
				-*/
			
 
				-#else
			
 
				-
			
 
				-/* SSE implementation idea
			
 
				-
			
 
				-		__m128i const Zero = _mm_set_epi32( 0,  0,  0,  0);
			
 
				-		__m128i const One = _mm_set_epi32( 1,  1,  1,  1);
			
 
				-		__m128i Bit = _mm_set_epi32(-1, -1, -1, -1);
			
 
				-		__m128i Tmp = _mm_set_epi32(Value, Value, Value, Value);
			
 
				-		__m128i Mmi = Zero;
			
 
				-		for(int i = 0; i < 32; ++i)
			
 
				-		{
			
 
				-			__m128i Shilt = _mm_and_si128(_mm_cmpgt_epi32(Tmp, One), One);
			
 
				-			Tmp = _mm_srai_epi32(Tmp, One);
			
 
				-			Bit = _mm_add_epi32(Bit, _mm_and_si128(Shilt, i));
			
 
				-			Mmi = _mm_and_si128(Mmi, One);
			
 
				-		}
			
 
				-		return Bit;
			
 
				-*/
			
 
				-
			
 
				-	template <typename genIUType>
			
 
				-	GLM_FUNC_QUALIFIER int findMSB(genIUType Value)
			
 
				-	{
			
 
				-		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
			
 
				-		
			
 
				-		if(Value == genIUType(0) || Value == genIUType(-1))
			
 
				-			return -1;
			
 
				-		else if(Value > 0)
			
 
				-		{
			
 
				-			genIUType Bit = genIUType(-1);
			
 
				-			for(genIUType tmp = Value; tmp > 0; tmp >>= 1, ++Bit){}
			
 
				-			return Bit;
			
 
				-		}
			
 
				-		else //if(Value < 0)
			
 
				-		{
			
 
				-			int const BitCount(sizeof(genIUType) * 8);
			
 
				-			int MostSignificantBit(-1);
			
 
				-			for(int BitIndex(0); BitIndex < BitCount; ++BitIndex)
			
 
				-				MostSignificantBit = (Value & (1 << BitIndex)) ? MostSignificantBit : BitIndex;
			
 
				-			assert(MostSignificantBit >= 0);
			
 
				-			return MostSignificantBit;
			
 
				-		}
			
 
				-	}
			
 
				-#endif//(GLM_COMPILER)
			
 
				 
			
 
				 	template <typename T, precision P, template <typename, precision> class vecType>
			
 
				 	GLM_FUNC_QUALIFIER vecType<int, P> findMSB(vecType<T, P> const & x)
			
 
				 	{
			
 
				-		return detail::functor1<int, T, P, vecType>::call(findMSB, x);
			
 
				+		GLM_STATIC_ASSERT(std::numeric_limits<T>::is_integer, "'findMSB' only accept integer values");
			
 
				+
			
 
				+		return detail::compute_findMSB_vec<T, P, vecType, sizeof(T) * 8>::call(x);
			
 
				 	}
			
 
				 }//namespace glm
			
--- a/readme.txt
+++ b/readme.txt
@@ -67,16 +67,16 @@ Improvements:
 
				 - Undetected C++ compiler automatically compile with GLM_FORCE_CXX98 and 
			
 
				   GLM_FORCE_PURE
			
 
				 - Added not function (from GLSL specification) on VC12
			
 
				-- Optimized bitfield operations
			
 
				 - Optimized bitfieldReverse and bitCount functions
			
 
				+- Optimized findLSB and findMSB functions.
			
 
				 - Optimized matrix-vector multiple performance with Cuda #257, #258
			
 
				 - Reduced integer type redifinitions #233
			
 
				 - Rewrited of GTX_fast_trigonometry #264 #265
			
 
				 - Made types trivially copyable #263
			
 
				 - Removed <iostream> in GLM tests
			
 
				 - Used std features within GLM without redeclaring
			
 
				-- Optimized glm::cot #272
			
 
				-- Optimized glm::sign #272
			
 
				+- Optimized cot function #272
			
 
				+- Optimized sign function #272
			
 
				 
			
 
				 Fixes:
			
 
				 - Fixed std::nextafter not supported with C++11 on Android #217
			
--- a/test/core/CMakeLists.txt
+++ b/test/core/CMakeLists.txt
@@ -22,6 +22,7 @@ glmCreateTestGTC(core_func_geometric)
 
				 glmCreateTestGTC(core_func_integer)
			
 
				 glmCreateTestGTC(core_func_integer_bit_count)
			
 
				 glmCreateTestGTC(core_func_integer_find_lsb)
			
 
				+glmCreateTestGTC(core_func_integer_find_msb)
			
 
				 glmCreateTestGTC(core_func_matrix)
			
 
				 glmCreateTestGTC(core_func_noise)
			
 
				 glmCreateTestGTC(core_func_packing)
			
--- a/test/core/core_func_integer.cpp
+++ b/test/core/core_func_integer.cpp
@@ -8,6 +8,7 @@
 
				 ///////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				 
			
 
				 #include <glm/integer.hpp>
			
 
				+#include <glm/vector_relational.hpp>
			
 
				 #include <glm/gtc/vec1.hpp>
			
 
				 #include <vector>
			
 
				 #include <ctime>
			
@@ -555,6 +556,19 @@ namespace findMSB
 
				 		genType		Return;
			
 
				 	};
			
 
				 
			
 
				+	template <typename genIUType>
			
 
				+	GLM_FUNC_QUALIFIER int findMSB_intrinsic(genIUType Value)
			
 
				+	{
			
 
				+		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
			
 
				+
			
 
				+		if(Value == 0)
			
 
				+			return -1;
			
 
				+
			
 
				+		unsigned long Result(0);
			
 
				+		_BitScanReverse(&Result, Value);
			
 
				+		return int(Result);
			
 
				+	}
			
 
				+
			
 
				 	template <typename genIUType>
			
 
				 	GLM_FUNC_QUALIFIER int findMSB_095(genIUType Value)
			
 
				 	{
			
@@ -583,27 +597,17 @@ namespace findMSB
 
				 	GLM_FUNC_QUALIFIER int findMSB_nlz1(genIUType x)
			
 
				 	{
			
 
				 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
			
 
				-/*
			
 
				-		int Result = 0;
			
 
				-		for(std::size_t i = 0, n = sizeof(genIUType) * 8; i < n; ++i)
			
 
				-			Result = Value & static_cast<genIUType>(1 << i) ? static_cast<int>(i) : Result;
			
 
				-		return Result;
			
 
				-*/
			
 
				-/*
			
 
				-		genIUType Bit = genIUType(-1);
			
 
				-		for(genIUType tmp = Value; tmp > 0; tmp >>= 1, ++Bit){}
			
 
				-		return Bit;
			
 
				-*/
			
 
				-		int n;
			
 
				 
			
 
				-		if (x == 0) return(32);
			
 
				-		n = 0;
			
 
				+		if (x == 0)
			
 
				+			return -1;
			
 
				+
			
 
				+		int n = 0;
			
 
				 		if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
			
 
				 		if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
			
 
				 		if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
			
 
				 		if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
			
 
				 		if (x <= 0x7FFFFFFF) {n = n + 1;}
			
 
				-		return n;
			
 
				+		return 31 - n;
			
 
				 	}
			
 
				 
			
 
				 	int findMSB_nlz2(unsigned int x)
			
@@ -617,14 +621,24 @@ namespace findMSB
 
				 		y = x >> 4;  if (y != 0) {n = n - 4;  x = y;}
			
 
				 		y = x >> 2;  if (y != 0) {n = n - 2;  x = y;}
			
 
				 		y = x >> 1;  if (y != 0) return n - 2;
			
 
				-		return n - x;
			
 
				+		return 32 - (n - x);
			
 
				 	}
			
 
				 
			
 
				-	int perf_950()
			
 
				+	int findMSB_pop(unsigned int x)
			
 
				 	{
			
 
				-		type<glm::uint> const Data[] =
			
 
				+		x = x | (x >> 1);
			
 
				+		x = x | (x >> 2);
			
 
				+		x = x | (x >> 4);
			
 
				+		x = x | (x >> 8);
			
 
				+		x = x | (x >>16);
			
 
				+		return 31 - glm::bitCount(~x);
			
 
				+	}
			
 
				+
			
 
				+	int perf_int()
			
 
				+	{
			
 
				+		type<int> const Data[] =
			
 
				 		{
			
 
				-			//{0x00000000, -1},
			
 
				+			{0x00000000, -1},
			
 
				 			{0x00000001,  0},
			
 
				 			{0x00000002,  1},
			
 
				 			{0x00000003,  1},
			
@@ -662,141 +676,131 @@ namespace findMSB
 
				 		};
			
 
				 
			
 
				 		int Error(0);
			
 
				+		std::size_t const Count(1000000);
			
 
				+
			
 
				+		std::clock_t Timestamps0 = std::clock();
			
 
				+
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = glm::findMSB(Data[i].Value);
			
 
				+			Error += Data[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				 
			
 
				 		std::clock_t Timestamps1 = std::clock();
			
 
				 
			
 
				-		for(std::size_t k = 0; k < 1000000; ++k)
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				 		{
			
 
				-			int Result = findMSB_095(Data[i].Value);
			
 
				+			int Result = findMSB_nlz1(Data[i].Value);
			
 
				 			Error += Data[i].Return == Result ? 0 : 1;
			
 
				 		}
			
 
				 
			
 
				 		std::clock_t Timestamps2 = std::clock();
			
 
				 
			
 
				-		std::printf("findMSB - 0.9.5: %d clocks\n", static_cast<unsigned int>(Timestamps2 - Timestamps1));
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = findMSB_nlz2(Data[i].Value);
			
 
				+			Error += Data[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				 
			
 
				-		return Error;
			
 
				-	}
			
 
				+		std::clock_t Timestamps3 = std::clock();
			
 
				 
			
 
				-	int perf_ops()
			
 
				-	{
			
 
				-		type<int> const Data[] =
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				 		{
			
 
				-			{0x00000000, -1},
			
 
				-			{0x00000001,  0},
			
 
				-			{0x00000002,  1},
			
 
				-			{0x00000003,  1},
			
 
				-			{0x00000004,  2},
			
 
				-			{0x00000005,  2},
			
 
				-			{0x00000007,  2},
			
 
				-			{0x00000008,  3},
			
 
				-			{0x00000010,  4},
			
 
				-			{0x00000020,  5},
			
 
				-			{0x00000040,  6},
			
 
				-			{0x00000080,  7},
			
 
				-			{0x00000100,  8},
			
 
				-			{0x00000200,  9},
			
 
				-			{0x00000400, 10},
			
 
				-			{0x00000800, 11},
			
 
				-			{0x00001000, 12},
			
 
				-			{0x00002000, 13},
			
 
				-			{0x00004000, 14},
			
 
				-			{0x00008000, 15},
			
 
				-			{0x00010000, 16},
			
 
				-			{0x00020000, 17},
			
 
				-			{0x00040000, 18},
			
 
				-			{0x00080000, 19},
			
 
				-			{0x00100000, 20},
			
 
				-			{0x00200000, 21},
			
 
				-			{0x00400000, 22},
			
 
				-			{0x00800000, 23},
			
 
				-			{0x01000000, 24},
			
 
				-			{0x02000000, 25},
			
 
				-			{0x04000000, 26},
			
 
				-			{0x08000000, 27},
			
 
				-			{0x10000000, 28},
			
 
				-			{0x20000000, 29},
			
 
				-			{0x40000000, 30}
			
 
				-		};
			
 
				+			int Result = findMSB_095(Data[i].Value);
			
 
				+			Error += Data[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				 
			
 
				-		int Error(0);
			
 
				+		std::clock_t Timestamps4 = std::clock();
			
 
				 
			
 
				-		std::clock_t Timestamps1 = std::clock();
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = findMSB_intrinsic(Data[i].Value);
			
 
				+			Error += Data[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		std::clock_t Timestamps5 = std::clock();
			
 
				 
			
 
				-		for(std::size_t k = 0; k < 1000000; ++k)
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				 		{
			
 
				-			int Result = findMSB_nlz1(Data[i].Value);
			
 
				+			int Result = findMSB_pop(Data[i].Value);
			
 
				 			Error += Data[i].Return == Result ? 0 : 1;
			
 
				 		}
			
 
				 
			
 
				-		std::clock_t Timestamps2 = std::clock();
			
 
				+		std::clock_t Timestamps6 = std::clock();
			
 
				 
			
 
				+		std::printf("glm::findMSB: %d clocks\n", static_cast<unsigned int>(Timestamps1 - Timestamps0));
			
 
				 		std::printf("findMSB - nlz1: %d clocks\n", static_cast<unsigned int>(Timestamps2 - Timestamps1));
			
 
				+		std::printf("findMSB - nlz2: %d clocks\n", static_cast<unsigned int>(Timestamps3 - Timestamps2));
			
 
				+		std::printf("findMSB - 0.9.5: %d clocks\n", static_cast<unsigned int>(Timestamps4 - Timestamps3));
			
 
				+		std::printf("findMSB - intrinsics: %d clocks\n", static_cast<unsigned int>(Timestamps5 - Timestamps4));
			
 
				+		std::printf("findMSB - pop: %d clocks\n", static_cast<unsigned int>(Timestamps6 - Timestamps5));
			
 
				 
			
 
				 		return Error;
			
 
				 	}
			
 
				 
			
 
				-
			
 
				-	int test_findMSB()
			
 
				+	int test_ivec4()
			
 
				 	{
			
 
				-		type<glm::uint> const Data[] =
			
 
				+		type<glm::ivec4> const Data[] =
			
 
				 		{
			
 
				-			//{0x00000000, -1},
			
 
				-			{0x00000001,  0},
			
 
				-			{0x00000002,  1},
			
 
				-			{0x00000003,  1},
			
 
				-			{0x00000004,  2},
			
 
				-			{0x00000005,  2},
			
 
				-			{0x00000007,  2},
			
 
				-			{0x00000008,  3},
			
 
				-			{0x00000010,  4},
			
 
				-			{0x00000020,  5},
			
 
				-			{0x00000040,  6},
			
 
				-			{0x00000080,  7},
			
 
				-			{0x00000100,  8},
			
 
				-			{0x00000200,  9},
			
 
				-			{0x00000400, 10},
			
 
				-			{0x00000800, 11},
			
 
				-			{0x00001000, 12},
			
 
				-			{0x00002000, 13},
			
 
				-			{0x00004000, 14},
			
 
				-			{0x00008000, 15},
			
 
				-			{0x00010000, 16},
			
 
				-			{0x00020000, 17},
			
 
				-			{0x00040000, 18},
			
 
				-			{0x00080000, 19},
			
 
				-			{0x00100000, 20},
			
 
				-			{0x00200000, 21},
			
 
				-			{0x00400000, 22},
			
 
				-			{0x00800000, 23},
			
 
				-			{0x01000000, 24},
			
 
				-			{0x02000000, 25},
			
 
				-			{0x04000000, 26},
			
 
				-			{0x08000000, 27},
			
 
				-			{0x10000000, 28},
			
 
				-			{0x20000000, 29},
			
 
				-			{0x40000000, 30}
			
 
				+			{glm::ivec4(0x00000000), glm::ivec4(-1)},
			
 
				+			{glm::ivec4(0x00000001), glm::ivec4( 0)},
			
 
				+			{glm::ivec4(0x00000002), glm::ivec4( 1)},
			
 
				+			{glm::ivec4(0x00000003), glm::ivec4( 1)},
			
 
				+			{glm::ivec4(0x00000004), glm::ivec4( 2)},
			
 
				+			{glm::ivec4(0x00000005), glm::ivec4( 2)},
			
 
				+			{glm::ivec4(0x00000007), glm::ivec4( 2)},
			
 
				+			{glm::ivec4(0x00000008), glm::ivec4( 3)},
			
 
				+			{glm::ivec4(0x00000010), glm::ivec4( 4)},
			
 
				+			{glm::ivec4(0x00000020), glm::ivec4( 5)},
			
 
				+			{glm::ivec4(0x00000040), glm::ivec4( 6)},
			
 
				+			{glm::ivec4(0x00000080), glm::ivec4( 7)},
			
 
				+			{glm::ivec4(0x00000100), glm::ivec4( 8)},
			
 
				+			{glm::ivec4(0x00000200), glm::ivec4( 9)},
			
 
				+			{glm::ivec4(0x00000400), glm::ivec4(10)},
			
 
				+			{glm::ivec4(0x00000800), glm::ivec4(11)},
			
 
				+			{glm::ivec4(0x00001000), glm::ivec4(12)},
			
 
				+			{glm::ivec4(0x00002000), glm::ivec4(13)},
			
 
				+			{glm::ivec4(0x00004000), glm::ivec4(14)},
			
 
				+			{glm::ivec4(0x00008000), glm::ivec4(15)},
			
 
				+			{glm::ivec4(0x00010000), glm::ivec4(16)},
			
 
				+			{glm::ivec4(0x00020000), glm::ivec4(17)},
			
 
				+			{glm::ivec4(0x00040000), glm::ivec4(18)},
			
 
				+			{glm::ivec4(0x00080000), glm::ivec4(19)},
			
 
				+			{glm::ivec4(0x00100000), glm::ivec4(20)},
			
 
				+			{glm::ivec4(0x00200000), glm::ivec4(21)},
			
 
				+			{glm::ivec4(0x00400000), glm::ivec4(22)},
			
 
				+			{glm::ivec4(0x00800000), glm::ivec4(23)},
			
 
				+			{glm::ivec4(0x01000000), glm::ivec4(24)},
			
 
				+			{glm::ivec4(0x02000000), glm::ivec4(25)},
			
 
				+			{glm::ivec4(0x04000000), glm::ivec4(26)},
			
 
				+			{glm::ivec4(0x08000000), glm::ivec4(27)},
			
 
				+			{glm::ivec4(0x10000000), glm::ivec4(28)},
			
 
				+			{glm::ivec4(0x20000000), glm::ivec4(29)},
			
 
				+			{glm::ivec4(0x40000000), glm::ivec4(30)}
			
 
				 		};
			
 
				 
			
 
				 		int Error(0);
			
 
				 
			
 
				-		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<glm::ivec4>); ++i)
			
 
				 		{
			
 
				-			int Result = glm::findMSB(Data[i].Value);
			
 
				-			Error += Data[i].Return == Result ? 0 : 1;
			
 
				-			assert(!Error);
			
 
				+			glm::ivec4 Result0 = glm::findMSB(Data[i].Value);
			
 
				+			Error += glm::all(glm::equal(Data[i].Return, Result0)) ? 0 : 1;
			
 
				 		}
			
 
				 
			
 
				 		return Error;
			
 
				 	}
			
 
				 
			
 
				-	int test_nlz1()
			
 
				+	int test_int()
			
 
				 	{
			
 
				 		type<glm::uint> const Data[] =
			
 
				 		{
			
 
				-			//{0x00000000, -1},
			
 
				+			{0x00000000, -1},
			
 
				 			{0x00000001,  0},
			
 
				 			{0x00000002,  1},
			
 
				 			{0x00000003,  1},
			
@@ -837,8 +841,38 @@ namespace findMSB
 
				 
			
 
				 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				 		{
			
 
				-			int Result = findMSB_nlz2(Data[i].Value);
			
 
				-			Error += Data[i].Return == Result ? 0 : 1;
			
 
				+			int Result0 = glm::findMSB(Data[i].Value);
			
 
				+			Error += Data[i].Return == Result0 ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result0 = findMSB_nlz1(Data[i].Value);
			
 
				+			Error += Data[i].Return == Result0 ? 0 : 1;
			
 
				+		}
			
 
				+/*
			
 
				+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result0 = findMSB_nlz2(Data[i].Value);
			
 
				+			Error += Data[i].Return == Result0 ? 0 : 1;
			
 
				+		}
			
 
				+*/
			
 
				+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result0 = findMSB_095(Data[i].Value);
			
 
				+			Error += Data[i].Return == Result0 ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result0 = findMSB_intrinsic(Data[i].Value);
			
 
				+			Error += Data[i].Return == Result0 ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result0 = findMSB_pop(Data[i].Value);
			
 
				+			Error += Data[i].Return == Result0 ? 0 : 1;
			
 
				 		}
			
 
				 
			
 
				 		return Error;
			
@@ -848,8 +882,8 @@ namespace findMSB
 
				 	{
			
 
				 		int Error(0);
			
 
				 
			
 
				-		Error += test_findMSB();
			
 
				-		//Error += test_nlz1();
			
 
				+		Error += test_ivec4();
			
 
				+		Error += test_int();
			
 
				 
			
 
				 		return Error;
			
 
				 	}
			
@@ -858,8 +892,7 @@ namespace findMSB
 
				 	{
			
 
				 		int Error(0);
			
 
				 
			
 
				-		Error += perf_950();
			
 
				-		//Error += perf_ops();
			
 
				+		Error += perf_int();
			
 
				 
			
 
				 		return Error;
			
 
				 	}
			
@@ -878,20 +911,172 @@ namespace findLSB
 
				 	{
			
 
				 		{0x00000001,  0},
			
 
				 		{0x00000003,  0},
			
 
				-		{0x00000002,  1}
			
 
				+		{0x00000002,  1},
			
 
				+		{0x80000000, 31},
			
 
				+		{0x00010000, 16},
			
 
				+		{0xFFFF0000, 16},
			
 
				+		{0xFF000000, 24},
			
 
				+		{0xFF00FF00,  8},
			
 
				+		{0x00000000, -1}
			
 
				 	};
			
 
				 
			
 
				+	template <typename genIUType>
			
 
				+	GLM_FUNC_QUALIFIER int findLSB_intrinsic(genIUType Value)
			
 
				+	{
			
 
				+		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findLSB' only accept integer values");
			
 
				+
			
 
				+		if(Value == 0)
			
 
				+			return -1;
			
 
				+
			
 
				+		unsigned long Result(0);
			
 
				+		_BitScanForward(&Result, Value);
			
 
				+		return int(Result);
			
 
				+	}
			
 
				+
			
 
				+	template <typename genIUType>
			
 
				+	GLM_FUNC_QUALIFIER int findLSB_095(genIUType Value)
			
 
				+	{
			
 
				+		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findLSB' only accept integer values");
			
 
				+		if(Value == 0)
			
 
				+			return -1;
			
 
				+
			
 
				+		genIUType Bit;
			
 
				+		for(Bit = genIUType(0); !(Value & (1 << Bit)); ++Bit){}
			
 
				+		return Bit;
			
 
				+	}
			
 
				+
			
 
				+	template <typename genIUType>
			
 
				+	GLM_FUNC_QUALIFIER int findLSB_ntz2(genIUType x)
			
 
				+	{
			
 
				+		if(x == 0)
			
 
				+			return -1;
			
 
				+
			
 
				+		return glm::bitCount(~x & (x - static_cast<genIUType>(1)));
			
 
				+	}
			
 
				+
			
 
				+	template <typename genIUType>
			
 
				+	GLM_FUNC_QUALIFIER int findLSB_branchfree(genIUType x)
			
 
				+	{
			
 
				+		bool IsNull(x == 0);
			
 
				+		int const Keep(!IsNull);
			
 
				+		int const Discard(IsNull);
			
 
				+
			
 
				+		return static_cast<int>(glm::bitCount(~x & (x - static_cast<genIUType>(1)))) * Keep + Discard * -1;
			
 
				+	}
			
 
				+
			
 
				+	int test_int()
			
 
				+	{
			
 
				+		int Error(0);
			
 
				+
			
 
				+		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = glm::findLSB(DataI32[i].Value);
			
 
				+			Error += DataI32[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = findLSB_095(DataI32[i].Value);
			
 
				+			Error += DataI32[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = findLSB_intrinsic(DataI32[i].Value);
			
 
				+			Error += DataI32[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = findLSB_ntz2(DataI32[i].Value);
			
 
				+			Error += DataI32[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = findLSB_branchfree(DataI32[i].Value);
			
 
				+			Error += DataI32[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		return Error;
			
 
				+	}
			
 
				+
			
 
				 	int test()
			
 
				 	{
			
 
				 		int Error(0);
			
 
				 
			
 
				+		Error += test_int();
			
 
				+
			
 
				+		return Error;
			
 
				+	}
			
 
				+
			
 
				+	int perf_int()
			
 
				+	{
			
 
				+		int Error(0);
			
 
				+		std::size_t const Count(10000000);
			
 
				+
			
 
				+		std::clock_t Timestamps0 = std::clock();
			
 
				+
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				 		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
			
 
				 		{
			
 
				 			int Result = glm::findLSB(DataI32[i].Value);
			
 
				 			Error += DataI32[i].Return == Result ? 0 : 1;
			
 
				-			assert(!Error);
			
 
				 		}
			
 
				 
			
 
				+		std::clock_t Timestamps1 = std::clock();
			
 
				+
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				+		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = findLSB_095(DataI32[i].Value);
			
 
				+			Error += DataI32[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		std::clock_t Timestamps2 = std::clock();
			
 
				+
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				+		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = findLSB_intrinsic(DataI32[i].Value);
			
 
				+			Error += DataI32[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		std::clock_t Timestamps3 = std::clock();
			
 
				+
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				+		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = findLSB_ntz2(DataI32[i].Value);
			
 
				+			Error += DataI32[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		std::clock_t Timestamps4 = std::clock();
			
 
				+
			
 
				+		for(std::size_t k = 0; k < Count; ++k)
			
 
				+		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
			
 
				+		{
			
 
				+			int Result = findLSB_branchfree(DataI32[i].Value);
			
 
				+			Error += DataI32[i].Return == Result ? 0 : 1;
			
 
				+		}
			
 
				+
			
 
				+		std::clock_t Timestamps5 = std::clock();
			
 
				+
			
 
				+		std::printf("glm::findLSB: %d clocks\n", static_cast<unsigned int>(Timestamps1 - Timestamps0));
			
 
				+		std::printf("findLSB - 0.9.5: %d clocks\n", static_cast<unsigned int>(Timestamps2 - Timestamps1));
			
 
				+		std::printf("findLSB - intrinsics: %d clocks\n", static_cast<unsigned int>(Timestamps3 - Timestamps2));
			
 
				+		std::printf("findLSB - ntz2: %d clocks\n", static_cast<unsigned int>(Timestamps4 - Timestamps3));
			
 
				+		std::printf("findLSB - branchfree: %d clocks\n", static_cast<unsigned int>(Timestamps5 - Timestamps4));
			
 
				+
			
 
				+		return Error;
			
 
				+	}
			
 
				+
			
 
				+	int perf()
			
 
				+	{
			
 
				+		int Error(0);
			
 
				+
			
 
				+		Error += perf_int();
			
 
				+
			
 
				 		return Error;
			
 
				 	}
			
 
				 }//findLSB
			
@@ -1324,6 +1509,7 @@ int main()
 
				 		Error += ::bitCount::perf();
			
 
				 		Error += ::bitfieldReverse::perf();
			
 
				 		Error += ::findMSB::perf();
			
 
				+		Error += ::findLSB::perf();
			
 
				 #	endif
			
 
				 
			
 
				 	return Error;
			
--- a/test/core/core_func_integer_find_lsb.cpp
+++ b/test/core/core_func_integer_find_lsb.cpp
@@ -7,15 +7,23 @@
 
				 // File    : test/core/func_integer_find_lsb.cpp
			
 
				 ///////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				 
			
 
				-// This has the programs for computing the number of leading zeros
			
 
				+// This has the programs for computing the number of trailing zeros
			
 
				 // in a word.
			
 
				 // Max line length is 57, to fit in hacker.book.
			
 
				-// Compile with g++, not gcc.
			
 
				 #include <cstdio>
			
 
				-#include <cstdlib>     // To define "exit", req'd by XLC.
			
 
				+#include <cstdlib>     //To define "exit", req'd by XLC.
			
 
				 #include <ctime>
			
 
				 
			
 
				-#define LE 1            // 1 for little-endian, 0 for big-endian.
			
 
				+int nlz(unsigned x) {
			
 
				+   int pop(unsigned x);
			
 
				+
			
 
				+   x = x | (x >> 1);
			
 
				+   x = x | (x >> 2);
			
 
				+   x = x | (x >> 4);
			
 
				+   x = x | (x >> 8);
			
 
				+   x = x | (x >>16);
			
 
				+   return pop(~x);
			
 
				+}
			
 
				 
			
 
				 int pop(unsigned x) {
			
 
				    x = x - ((x >> 1) & 0x55555555);
			
@@ -26,280 +34,230 @@ int pop(unsigned x) {
 
				    return x >> 24;
			
 
				 }
			
 
				 
			
 
				-int nlz1(unsigned x) {
			
 
				-   int n;
			
 
				+int ntz1(unsigned x) {
			
 
				+   return 32 - nlz(~x & (x-1));
			
 
				+}
			
 
				 
			
 
				-   if (x == 0) return(32);
			
 
				-   n = 0;
			
 
				-   if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
			
 
				-   if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
			
 
				-   if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
			
 
				-   if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
			
 
				-   if (x <= 0x7FFFFFFF) {n = n + 1;}
			
 
				-   return n;
			
 
				+int ntz2(unsigned x) {
			
 
				+   return pop(~x & (x - 1));
			
 
				 }
			
 
				 
			
 
				-int nlz1a(unsigned x) {
			
 
				+int ntz3(unsigned x) {
			
 
				    int n;
			
 
				 
			
 
				-/* if (x == 0) return(32); */
			
 
				-   if ((int)x <= 0) return (~x >> 26) & 32;
			
 
				+   if (x == 0) return(32);
			
 
				    n = 1;
			
 
				-   if ((x >> 16) == 0) {n = n +16; x = x <<16;}
			
 
				-   if ((x >> 24) == 0) {n = n + 8; x = x << 8;}
			
 
				-   if ((x >> 28) == 0) {n = n + 4; x = x << 4;}
			
 
				-   if ((x >> 30) == 0) {n = n + 2; x = x << 2;}
			
 
				-   n = n - (x >> 31);
			
 
				-   return n;
			
 
				+   if ((x & 0x0000FFFF) == 0) {n = n +16; x = x >>16;}
			
 
				+   if ((x & 0x000000FF) == 0) {n = n + 8; x = x >> 8;}
			
 
				+   if ((x & 0x0000000F) == 0) {n = n + 4; x = x >> 4;}
			
 
				+   if ((x & 0x00000003) == 0) {n = n + 2; x = x >> 2;}
			
 
				+   return n - (x & 1);
			
 
				 }
			
 
				-// On basic Risc, 12 to 20 instructions.
			
 
				 
			
 
				-int nlz2(unsigned x) {
			
 
				+int ntz4(unsigned x) {
			
 
				    unsigned y;
			
 
				    int n;
			
 
				 
			
 
				-   n = 32;
			
 
				-   y = x >>16;  if (y != 0) {n = n -16;  x = y;}
			
 
				-   y = x >> 8;  if (y != 0) {n = n - 8;  x = y;}
			
 
				-   y = x >> 4;  if (y != 0) {n = n - 4;  x = y;}
			
 
				-   y = x >> 2;  if (y != 0) {n = n - 2;  x = y;}
			
 
				-   y = x >> 1;  if (y != 0) return n - 2;
			
 
				-   return n - x;
			
 
				+   if (x == 0) return 32;
			
 
				+   n = 31;
			
 
				+   y = x <<16;  if (y != 0) {n = n -16;  x = y;}
			
 
				+   y = x << 8;  if (y != 0) {n = n - 8;  x = y;}
			
 
				+   y = x << 4;  if (y != 0) {n = n - 4;  x = y;}
			
 
				+   y = x << 2;  if (y != 0) {n = n - 2;  x = y;}
			
 
				+   y = x << 1;  if (y != 0) {n = n - 1;}
			
 
				+   return n;
			
 
				 }
			
 
				 
			
 
				-// As above but coded as a loop for compactness:
			
 
				-// 23 to 33 basic Risc instructions.
			
 
				-int nlz2a(unsigned x) {
			
 
				+int ntz4a(unsigned x) {
			
 
				    unsigned y;
			
 
				-   int n, c;
			
 
				-
			
 
				-   n = 32;
			
 
				-   c = 16;
			
 
				-   do {
			
 
				-      y = x >> c;  if (y != 0) {n = n - c;  x = y;}
			
 
				-      c = c >> 1;
			
 
				-   } while (c != 0);
			
 
				-   return n - x;
			
 
				-}
			
 
				+   int n;
			
 
				 
			
 
				-int nlz3(int x) {
			
 
				-   int y, n;
			
 
				-
			
 
				-   n = 0;
			
 
				-   y = x;
			
 
				-L: if (x < 0) return n;
			
 
				-   if (y == 0) return 32 - n;
			
 
				-   n = n + 1;
			
 
				-   x = x << 1;
			
 
				-   y = y >> 1;
			
 
				-   goto L;
			
 
				+   if (x == 0) return 32;
			
 
				+   n = 31;
			
 
				+   y = x <<16;  if (y != 0) {n = n -16;  x = y;}
			
 
				+   y = x << 8;  if (y != 0) {n = n - 8;  x = y;}
			
 
				+   y = x << 4;  if (y != 0) {n = n - 4;  x = y;}
			
 
				+   y = x << 2;  if (y != 0) {n = n - 2;  x = y;}
			
 
				+   n = n - ((x << 1) >> 31);
			
 
				+   return n;
			
 
				 }
			
 
				 
			
 
				-int nlz4(unsigned x) {
			
 
				-   int y, m, n;
			
 
				-
			
 
				-   y = -(x >> 16);      // If left half of x is 0,
			
 
				-   m = (y >> 16) & 16;  // set n = 16.  If left half
			
 
				-   n = 16 - m;          // is nonzero, set n = 0 and
			
 
				-   x = x >> m;          // shift x right 16.
			
 
				-                        // Now x is of the form 0000xxxx.
			
 
				-   y = x - 0x100;       // If positions 8-15 are 0,
			
 
				-   m = (y >> 16) & 8;   // add 8 to n and shift x left 8.
			
 
				-   n = n + m;
			
 
				-   x = x << m;
			
 
				-
			
 
				-   y = x - 0x1000;      // If positions 12-15 are 0,
			
 
				-   m = (y >> 16) & 4;   // add 4 to n and shift x left 4.
			
 
				-   n = n + m;
			
 
				-   x = x << m;
			
 
				-
			
 
				-   y = x - 0x4000;      // If positions 14-15 are 0,
			
 
				-   m = (y >> 16) & 2;   // add 2 to n and shift x left 2.
			
 
				-   n = n + m;
			
 
				-   x = x << m;
			
 
				-
			
 
				-   y = x >> 14;         // Set y = 0, 1, 2, or 3.
			
 
				-   m = y & ~(y >> 1);   // Set m = 0, 1, 2, or 2 resp.
			
 
				-   return n + 2 - m;
			
 
				+int ntz5(char x)
			
 
				+{
			
 
				+	if (x & 15) {
			
 
				+		if (x & 3) {
			
 
				+			if (x & 1) return 0;
			
 
				+			else return 1;
			
 
				+		}
			
 
				+		else if (x & 4) return 2;
			
 
				+		else return 3;
			
 
				+	}
			
 
				+	else if (x & 0x30) {
			
 
				+		if (x & 0x10) return 4;
			
 
				+		else return 5;
			
 
				+	}
			
 
				+	else if (x & 0x40) return 6;
			
 
				+	else if (x) return 7;
			
 
				+	else return 8;
			
 
				 }
			
 
				 
			
 
				-int nlz5(unsigned x) {
			
 
				-   int pop(unsigned x);
			
 
				+int ntz6(unsigned x) {
			
 
				+   int n;
			
 
				 
			
 
				-   x = x | (x >> 1);
			
 
				-   x = x | (x >> 2);
			
 
				-   x = x | (x >> 4);
			
 
				-   x = x | (x >> 8);
			
 
				-   x = x | (x >>16);
			
 
				-   return pop(~x);
			
 
				+   x = ~x & (x - 1);
			
 
				+   n = 0;                       // n = 32;
			
 
				+   while(x != 0) {              // while (x != 0) {
			
 
				+      n = n + 1;                //    n = n - 1;
			
 
				+      x = x >> 1;               //    x = x + x;
			
 
				+   }                            // }
			
 
				+   return n;                    // return n;
			
 
				 }
			
 
				 
			
 
				-/* The four programs below are not valid ANSI C programs.  This is
			
 
				-because they refer to the same storage locations as two different types.
			
 
				-However, they work with xlc/AIX, gcc/AIX, and gcc/NT.  If you try to
			
 
				-code them more compactly by declaring a variable xx to be "double," and
			
 
				-then using
			
 
				-
			
 
				-   n = 1054 - (*((unsigned *)&xx + LE) >> 20);
			
 
				-
			
 
				-then you are violating not only the rule above, but also the ANSI C
			
 
				-rule that pointer arithmetic can be performed only on pointers to
			
 
				-array elements.
			
 
				-   When coded with the above statement, the program fails with xlc,
			
 
				-gcc/AIX, and gcc/NT, at some optimization levels.
			
 
				-   BTW, these programs use the "anonymous union" feature of C++, not
			
 
				-available in C. */
			
 
				-
			
 
				-int nlz6(unsigned k) {
			
 
				-   union {
			
 
				-      unsigned asInt[2];
			
 
				-      double asDouble;
			
 
				-   };
			
 
				-   int n;
			
 
				+int ntz6a(unsigned x)
			
 
				+{
			
 
				+	int n = 32;
			
 
				 
			
 
				-   asDouble = (double)k + 0.5;
			
 
				-   n = 1054 - (asInt[LE] >> 20);
			
 
				-   return n;
			
 
				+	while (x != 0) {
			
 
				+		n = n - 1;
			
 
				+		x = x + x;
			
 
				+	}
			
 
				+	return n;
			
 
				 }
			
 
				 
			
 
				-int nlz7(unsigned k) {
			
 
				-   union {
			
 
				-      unsigned asInt[2];
			
 
				-      double asDouble;
			
 
				-   };
			
 
				-   int n;
			
 
				+/* Dean Gaudet's algorithm. To be most useful there must be a good way
			
 
				+to evaluate the C "conditional expression" (a?b:c construction) without
			
 
				+branching. The result of a?b:c is b if a is true (nonzero), and c if a
			
 
				+is false (0).
			
 
				+   For example, a compare to zero op that sets a target GPR to 1 if the
			
 
				+operand is 0, and to 0 if the operand is nonzero, will do it. With this
			
 
				+instruction, the algorithm is entirely branch-free. But the most
			
 
				+interesting thing about it is the high degree of parallelism. All six
			
 
				+lines with conditional expressions can be executed in parallel (on a
			
 
				+machine with sufficient computational units).
			
 
				+   Although the instruction count is 30 measured statically, it could
			
 
				+execute in only 10 cycles on a machine with sufficient parallelism.
			
 
				+   The first two uses of y can instead be x, which would increase the
			
 
				+useful parallelism on most machines (the assignments to y, bz, and b4
			
 
				+could then all run in parallel). */
			
 
				+
			
 
				+int ntz7(unsigned x)
			
 
				+{
			
 
				+	unsigned y, bz, b4, b3, b2, b1, b0;
			
 
				+
			
 
				+	y = x & -x;               // Isolate rightmost 1-bit.
			
 
				+	bz = y ? 0 : 1;           // 1 if y = 0.
			
 
				+	b4 = (y & 0x0000FFFF) ? 0 : 16;
			
 
				+	b3 = (y & 0x00FF00FF) ? 0 : 8;
			
 
				+	b2 = (y & 0x0F0F0F0F) ? 0 : 4;
			
 
				+	b1 = (y & 0x33333333) ? 0 : 2;
			
 
				+	b0 = (y & 0x55555555) ? 0 : 1;
			
 
				+	return bz + b4 + b3 + b2 + b1 + b0;
			
 
				+}
			
 
				 
			
 
				-   asDouble = (double)k;
			
 
				-   n = 1054 - (asInt[LE] >> 20);
			
 
				-   n = (n & 31) + (n >> 9);
			
 
				-   return n;
			
 
				+int ntz7_christophe(unsigned x)
			
 
				+{
			
 
				+	unsigned y, bz, b4, b3, b2, b1, b0;
			
 
				+
			
 
				+	y = x & -x;               // Isolate rightmost 1-bit.
			
 
				+	bz = unsigned(!bool(y));           // 1 if y = 0.
			
 
				+	b4 = unsigned(!bool(y & 0x0000FFFF)) * 16;
			
 
				+	b3 = unsigned(!bool(y & 0x00FF00FF)) * 8;
			
 
				+	b2 = unsigned(!bool(y & 0x0F0F0F0F)) * 4;
			
 
				+	b1 = unsigned(!bool(y & 0x33333333)) * 2;
			
 
				+	b0 = unsigned(!bool(y & 0x55555555)) * 1;
			
 
				+	return bz + b4 + b3 + b2 + b1 + b0;
			
 
				 }
			
 
				 
			
 
				-   /* In single precision, round-to-nearest mode, the basic method fails for:
			
 
				-   k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF,
			
 
				-                        07FFFFFC <= k <= 07FFFFFF,
			
 
				-                        0FFFFFF8 <= k <= 0FFFFFFF,
			
 
				-                                   ...
			
 
				-                        7FFFFFC0 <= k <= 7FFFFFFF.
			
 
				-                        FFFFFF80 <= k <= FFFFFFFF.
			
 
				-   For k = 0 it gives 158, and for the other values it is too low by 1. */
			
 
				-
			
 
				-int nlz8(unsigned k) {
			
 
				-   union {
			
 
				-      unsigned asInt;
			
 
				-      float asFloat;
			
 
				-   };
			
 
				-   int n;
			
 
				+/* Below is David Seal's algorithm, found at
			
 
				+http://www.ciphersbyritter.com/NEWS4/BITCT.HTM Table
			
 
				+entries marked "u" are unused. 6 ops including a
			
 
				+multiply, plus an indexed load. */
			
 
				 
			
 
				-   k = k & ~(k >> 1);           /* Fix problem with rounding. */
			
 
				-   asFloat = (float)k + 0.5f;
			
 
				-   n = 158 - (asInt >> 23);
			
 
				-   return n;
			
 
				+#define u 99
			
 
				+int ntz8(unsigned x)
			
 
				+{
			
 
				+	static char table[64] =
			
 
				+		{32, 0, 1,12, 2, 6, u,13,   3, u, 7, u, u, u, u,14,
			
 
				+		10, 4, u, u, 8, u, u,25,   u, u, u, u, u,21,27,15,
			
 
				+		31,11, 5, u, u, u, u, u,   9, u, u,24, u, u,20,26,
			
 
				+		30, u, u, u, u,23, u,19,  29, u,22,18,28,17,16, u};
			
 
				+
			
 
				+	x = (x & -x)*0x0450FBAF;
			
 
				+	return table[x >> 26];
			
 
				 }
			
 
				 
			
 
				-/* The example below shows how to make a macro for nlz.  It uses an
			
 
				-extension to the C and C++ languages that is provided by the GNU C/C++
			
 
				-compiler, namely, that of allowing statements and declarations in
			
 
				-expressions (see "Using and Porting GNU CC", by Richard M. Stallman
			
 
				-(1998).  The underscores are necessary to protect against the
			
 
				-possibility that the macro argument will conflict with one of its local
			
 
				-variables, e.g., NLZ(k). */
			
 
				-
			
 
				-int nlz9(unsigned k) {
			
 
				-   union {
			
 
				-      unsigned asInt;
			
 
				-      float asFloat;
			
 
				-   };
			
 
				-   int n;
			
 
				+/* Seal's algorithm with multiply expanded.
			
 
				+9 elementary ops plus an indexed load. */
			
 
				 
			
 
				-   k = k & ~(k >> 1);           /* Fix problem with rounding. */
			
 
				-   asFloat = (float)k;
			
 
				-   n = 158 - (asInt >> 23);
			
 
				-   n = (n & 31) + (n >> 6);     /* Fix problem with k = 0. */
			
 
				-   return n;
			
 
				+int ntz8a(unsigned x)
			
 
				+{
			
 
				+	static char table[64] =
			
 
				+		{32, 0, 1,12, 2, 6, u,13,   3, u, 7, u, u, u, u,14,
			
 
				+		10, 4, u, u, 8, u, u,25,   u, u, u, u, u,21,27,15,
			
 
				+		31,11, 5, u, u, u, u, u,   9, u, u,24, u, u,20,26,
			
 
				+		30, u, u, u, u,23, u,19,  29, u,22,18,28,17,16, u};
			
 
				+
			
 
				+	x = (x & -x);
			
 
				+	x = (x << 4) + x;    // x = x*17.
			
 
				+	x = (x << 6) + x;    // x = x*65.
			
 
				+	x = (x << 16) - x;   // x = x*65535.
			
 
				+	return table[x >> 26];
			
 
				 }
			
 
				 
			
 
				-/* Below are three nearly equivalent programs for computing the number
			
 
				-of leading zeros in a word. This material is not in HD, but may be in a
			
 
				-future edition.
			
 
				-   Immediately below is Robert Harley's algorithm, found at the
			
 
				-comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert
			
 
				-Juffa.
			
 
				-   Table entries marked "u" are unused. 14 ops including a multiply,
			
 
				-plus an indexed load.
			
 
				-   The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all
			
 
				-of form 2**k + 1). There are no multipliers of three terms of the form
			
 
				-2**k +- 1 that work, with a table size of 64 or 128. There are some,
			
 
				-with a table size of 64, if you precede the multiplication with x = x -
			
 
				-(x >> 1), but that seems less elegant. There are also some if you use a
			
 
				-table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would
			
 
				-save two instructions in the form of this algorithm with the
			
 
				-multiplication expanded into shifts and adds, but the table size is
			
 
				-getting a bit large). */
			
 
				+/* Reiser's algorithm. Three ops including a "remainder,"
			
 
				+plus an indexed load. */
			
 
				 
			
 
				-#define u 99
			
 
				-int nlz10(unsigned x) {
			
 
				+int ntz9(unsigned x) {
			
 
				 
			
 
				-   static char table[64] =
			
 
				-     {32,31, u,16, u,30, 3, u,  15, u, u, u,29,10, 2, u,
			
 
				-       u, u,12,14,21, u,19, u,   u,28, u,25, u, 9, 1, u,
			
 
				-      17, u, 4, u, u, u,11, u,  13,22,20, u,26, u, u,18,
			
 
				-       5, u, u,23, u,27, u, 6,   u,24, 7, u, 8, u, 0, u};
			
 
				+   static char table[37] = {32,  0,  1, 26,  2, 23, 27,
			
 
				+                 u,  3, 16, 24, 30, 28, 11,  u, 13,  4,
			
 
				+                 7, 17,  u, 25, 22, 31, 15, 29, 10, 12,
			
 
				+                 6,  u, 21, 14,  9,  5, 20,  8, 19, 18};
			
 
				 
			
 
				-   x = x | (x >> 1);    // Propagate leftmost
			
 
				-   x = x | (x >> 2);    // 1-bit to the right.
			
 
				-   x = x | (x >> 4);
			
 
				-   x = x | (x >> 8);
			
 
				-   x = x | (x >>16);
			
 
				-   x = x*0x06EB14F9;    // Multiplier is 7*255**3.
			
 
				-   return table[x >> 26];
			
 
				+   x = (x & -x)%37;
			
 
				+   return table[x];
			
 
				 }
			
 
				 
			
 
				-/* Harley's algorithm with multiply expanded.
			
 
				-19 elementary ops plus an indexed load. */
			
 
				+/* Using a de Bruijn sequence. This is a table lookup with a 32-entry
			
 
				+table. The de Bruijn sequence used here is
			
 
				+                0000 0100 1101 0111 0110 0101 0001 1111,
			
 
				+obtained from Danny Dube's October 3, 1997, posting in
			
 
				+comp.compression.research. Thanks to Norbert Juffa for this reference. */
			
 
				 
			
 
				-int nlz10a(unsigned x) {
			
 
				+int ntz10(unsigned x) {
			
 
				 
			
 
				-   static char table[64] =
			
 
				-     {32,31, u,16, u,30, 3, u,  15, u, u, u,29,10, 2, u,
			
 
				-       u, u,12,14,21, u,19, u,   u,28, u,25, u, 9, 1, u,
			
 
				-      17, u, 4, u, u, u,11, u,  13,22,20, u,26, u, u,18,
			
 
				-       5, u, u,23, u,27, u, 6,   u,24, 7, u, 8, u, 0, u};
			
 
				+   static char table[32] =
			
 
				+     { 0, 1, 2,24, 3,19, 6,25,  22, 4,20,10,16, 7,12,26,
			
 
				+      31,23,18, 5,21, 9,15,11,  30,17, 8,14,29,13,28,27};
			
 
				 
			
 
				-   x = x | (x >> 1);    // Propagate leftmost
			
 
				-   x = x | (x >> 2);    // 1-bit to the right.
			
 
				-   x = x | (x >> 4);
			
 
				-   x = x | (x >> 8);
			
 
				-   x = x | (x >> 16);
			
 
				-   x = (x << 3) - x;    // Multiply by 7.
			
 
				-   x = (x << 8) - x;    // Multiply by 255.
			
 
				-   x = (x << 8) - x;    // Again.
			
 
				-   x = (x << 8) - x;    // Again.
			
 
				-   return table[x >> 26];
			
 
				+   if (x == 0) return 32;
			
 
				+   x = (x & -x)*0x04D7651F;
			
 
				+   return table[x >> 27];
			
 
				 }
			
 
				 
			
 
				-/* Julius Goryavsky's version of Harley's algorithm.
			
 
				-17 elementary ops plus an indexed load, if the machine
			
 
				-has "and not." */
			
 
				-
			
 
				-int nlz10b(unsigned x) {
			
 
				+/* Norbert Juffa's code, answer to exercise 1 of Chapter 5 (2nd ed). */
			
 
				 
			
 
				-   static char table[64] =
			
 
				-     {32,20,19, u, u,18, u, 7,  10,17, u, u,14, u, 6, u,
			
 
				-       u, 9, u,16, u, u, 1,26,   u,13, u, u,24, 5, u, u,
			
 
				-       u,21, u, 8,11, u,15, u,   u, u, u, 2,27, 0,25, u,
			
 
				-      22, u,12, u, u, 3,28, u,  23, u, 4,29, u, u,30,31};
			
 
				+#define SLOW_MUL
			
 
				+int ntz11 (unsigned int n) {
			
 
				 
			
 
				-   x = x | (x >> 1);    // Propagate leftmost
			
 
				-   x = x | (x >> 2);    // 1-bit to the right.
			
 
				-   x = x | (x >> 4);
			
 
				-   x = x | (x >> 8);
			
 
				-   x = x & ~(x >> 16);
			
 
				-   x = x*0xFD7049FF;    // Activate this line or the following 3.
			
 
				-// x = (x << 9) - x;    // Multiply by 511.
			
 
				-// x = (x << 11) - x;   // Multiply by 2047.
			
 
				-// x = (x << 14) - x;   // Multiply by 16383.
			
 
				-   return table[x >> 26];
			
 
				+   static unsigned char tab[32] =
			
 
				+   {   0,  1,  2, 24,  3, 19, 6,  25,
			
 
				+      22,  4, 20, 10, 16,  7, 12, 26,
			
 
				+      31, 23, 18,  5, 21,  9, 15, 11,
			
 
				+      30, 17,  8, 14, 29, 13, 28, 27
			
 
				+   };
			
 
				+   unsigned int k;
			
 
				+   n = n & (-n);        /* isolate lsb */
			
 
				+   printf("n = %d\n", n);
			
 
				+#if defined(SLOW_MUL)
			
 
				+   k = (n << 11) - n;
			
 
				+   k = (k <<  2) + k;
			
 
				+   k = (k <<  8) + n;
			
 
				+   k = (k <<  5) - k;
			
 
				+#else
			
 
				+   k = n * 0x4d7651f;
			
 
				+#endif
			
 
				+   return n ? tab[k>>27] : 32;
			
 
				 }
			
 
				 
			
 
				 int errors;
			
@@ -308,19 +266,22 @@ void error(int x, int y) {
 
				    printf("Error for x = %08x, got %d\n", x, y);
			
 
				 }
			
 
				 
			
 
				+/* ------------------------------ main ------------------------------ */
			
 
				+
			
 
				 int main()
			
 
				 {
			
 
				 #	ifdef GLM_TEST_ENABLE_PERF
			
 
				 
			
 
				-	int i, n;
			
 
				-	static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29,
			
 
				-		7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23,
			
 
				-		512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16,
			
 
				-		65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11,
			
 
				-		0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6,
			
 
				-		0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3,
			
 
				-		0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0,
			
 
				-		0xFFFFFFFF,0};
			
 
				+	int i, m, n;
			
 
				+	static unsigned test[] = {0,32, 1,0, 2,1, 3,0, 4,2, 5,0, 6,1,  7,0,
			
 
				+		8,3, 9,0, 16,4, 32,5, 64,6, 128,7, 255,0, 256,8, 512,9, 1024,10,
			
 
				+		2048,11, 4096,12, 8192,13, 16384,14, 32768,15, 65536,16,
			
 
				+		0x20000,17, 0x40000,18, 0x80000,19, 0x100000,20, 0x200000,21,
			
 
				+		0x400000,22, 0x800000,23, 0x1000000,24, 0x2000000,25,
			
 
				+		0x4000000,26, 0x8000000,27, 0x10000000,28, 0x20000000,29,
			
 
				+		0x40000000,30, 0x80000000,31, 0xFFFFFFF0,4, 0x3000FF00,8,
			
 
				+		0xC0000000,30, 0x60000000,29, 0x00011000, 12};
			
 
				+
			
 
				 	std::size_t const Count = 10000000;
			
 
				 
			
 
				 	n = sizeof(test)/4;
			
@@ -331,114 +292,115 @@ int main()
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));}
			
 
				+		if (ntz1(test[i]) != test[i+1]) error(test[i], ntz1(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz1: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz1: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));}
			
 
				+		if (ntz2(test[i]) != test[i+1]) error(test[i], ntz2(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz1a: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz2: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));}
			
 
				+		if (ntz3(test[i]) != test[i+1]) error(test[i], ntz3(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz2: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz3: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));}
			
 
				+		if (ntz4(test[i]) != test[i+1]) error(test[i], ntz4(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz2a: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz4: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));}
			
 
				+		if (ntz4a(test[i]) != test[i+1]) error(test[i], ntz4a(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz3: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz4a: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));}
			
 
				+		m = test[i+1]; if (m > 8) m = 8;
			
 
				+		if (ntz5(test[i]) != m) error(test[i], ntz5(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz4: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz5: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));}
			
 
				+		if (ntz6(test[i]) != test[i+1]) error(test[i], ntz6(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz5: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz6: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));}
			
 
				+		if (ntz6a(test[i]) != test[i+1]) error(test[i], ntz6a(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz6: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz6a: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));}
			
 
				+		if (ntz7(test[i]) != test[i+1]) error(test[i], ntz7(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz7: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz7: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));}
			
 
				+		if (ntz7_christophe(test[i]) != test[i+1]) error(test[i], ntz7(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz8: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz7_christophe: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));}
			
 
				+		if (ntz8(test[i]) != test[i+1]) error(test[i], ntz8(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz9: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz8: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));}
			
 
				+		if (ntz8a(test[i]) != test[i+1]) error(test[i], ntz8a(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz10: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz8a: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));}
			
 
				+		if (ntz9(test[i]) != test[i+1]) error(test[i], ntz9(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz10a: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz9: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	TimestampBeg = std::clock();
			
 
				 	for (std::size_t k = 0; k < Count; ++k)
			
 
				 	for (i = 0; i < n; i += 2) {
			
 
				-		if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));}
			
 
				+		if (ntz10(test[i]) != test[i+1]) error(test[i], ntz10(test[i]));}
			
 
				 	TimestampEnd = std::clock();
			
 
				 
			
 
				-	printf("nlz10b: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+	printf("ntz10: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				 
			
 
				 	if (errors == 0)
			
 
				 		printf("Passed all %d cases.\n", sizeof(test)/8);
			
--- a/test/core/core_func_integer_find_msb.cpp
+++ b/test/core/core_func_integer_find_msb.cpp
@@ -0,0 +1,447 @@
 
				+///////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				+// OpenGL Mathematics Copyright (c) 2005 - 2014 G-Truc Creation (www.g-truc.net)
			
 
				+///////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				+// Created : 2014-10-27
			
 
				+// Updated : 2014-10-27
			
 
				+// Licence : This source is under MIT licence
			
 
				+// File    : test/core/func_integer_find_lsb.cpp
			
 
				+///////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				+
			
 
				+// This has the programs for computing the number of leading zeros
			
 
				+// in a word.
			
 
				+// Max line length is 57, to fit in hacker.book.
			
 
				+// Compile with g++, not gcc.
			
 
				+#include <cstdio>
			
 
				+#include <cstdlib>     // To define "exit", req'd by XLC.
			
 
				+#include <ctime>
			
 
				+
			
 
				+#define LE 1            // 1 for little-endian, 0 for big-endian.
			
 
				+
			
 
				+int pop(unsigned x) {
			
 
				+   x = x - ((x >> 1) & 0x55555555);
			
 
				+   x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
			
 
				+   x = (x + (x >> 4)) & 0x0F0F0F0F;
			
 
				+   x = x + (x << 8);
			
 
				+   x = x + (x << 16);
			
 
				+   return x >> 24;
			
 
				+}
			
 
				+
			
 
				+int nlz1(unsigned x) {
			
 
				+   int n;
			
 
				+
			
 
				+   if (x == 0) return(32);
			
 
				+   n = 0;
			
 
				+   if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
			
 
				+   if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
			
 
				+   if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
			
 
				+   if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
			
 
				+   if (x <= 0x7FFFFFFF) {n = n + 1;}
			
 
				+   return n;
			
 
				+}
			
 
				+
			
 
				+int nlz1a(unsigned x) {
			
 
				+   int n;
			
 
				+
			
 
				+/* if (x == 0) return(32); */
			
 
				+   if ((int)x <= 0) return (~x >> 26) & 32;
			
 
				+   n = 1;
			
 
				+   if ((x >> 16) == 0) {n = n +16; x = x <<16;}
			
 
				+   if ((x >> 24) == 0) {n = n + 8; x = x << 8;}
			
 
				+   if ((x >> 28) == 0) {n = n + 4; x = x << 4;}
			
 
				+   if ((x >> 30) == 0) {n = n + 2; x = x << 2;}
			
 
				+   n = n - (x >> 31);
			
 
				+   return n;
			
 
				+}
			
 
				+// On basic Risc, 12 to 20 instructions.
			
 
				+
			
 
				+int nlz2(unsigned x) {
			
 
				+   unsigned y;
			
 
				+   int n;
			
 
				+
			
 
				+   n = 32;
			
 
				+   y = x >>16;  if (y != 0) {n = n -16;  x = y;}
			
 
				+   y = x >> 8;  if (y != 0) {n = n - 8;  x = y;}
			
 
				+   y = x >> 4;  if (y != 0) {n = n - 4;  x = y;}
			
 
				+   y = x >> 2;  if (y != 0) {n = n - 2;  x = y;}
			
 
				+   y = x >> 1;  if (y != 0) return n - 2;
			
 
				+   return n - x;
			
 
				+}
			
 
				+
			
 
				+// As above but coded as a loop for compactness:
			
 
				+// 23 to 33 basic Risc instructions.
			
 
				+int nlz2a(unsigned x) {
			
 
				+   unsigned y;
			
 
				+   int n, c;
			
 
				+
			
 
				+   n = 32;
			
 
				+   c = 16;
			
 
				+   do {
			
 
				+      y = x >> c;  if (y != 0) {n = n - c;  x = y;}
			
 
				+      c = c >> 1;
			
 
				+   } while (c != 0);
			
 
				+   return n - x;
			
 
				+}
			
 
				+
			
 
				+int nlz3(int x) {
			
 
				+   int y, n;
			
 
				+
			
 
				+   n = 0;
			
 
				+   y = x;
			
 
				+L: if (x < 0) return n;
			
 
				+   if (y == 0) return 32 - n;
			
 
				+   n = n + 1;
			
 
				+   x = x << 1;
			
 
				+   y = y >> 1;
			
 
				+   goto L;
			
 
				+}
			
 
				+
			
 
				+int nlz4(unsigned x) {
			
 
				+   int y, m, n;
			
 
				+
			
 
				+   y = -(x >> 16);      // If left half of x is 0,
			
 
				+   m = (y >> 16) & 16;  // set n = 16.  If left half
			
 
				+   n = 16 - m;          // is nonzero, set n = 0 and
			
 
				+   x = x >> m;          // shift x right 16.
			
 
				+                        // Now x is of the form 0000xxxx.
			
 
				+   y = x - 0x100;       // If positions 8-15 are 0,
			
 
				+   m = (y >> 16) & 8;   // add 8 to n and shift x left 8.
			
 
				+   n = n + m;
			
 
				+   x = x << m;
			
 
				+
			
 
				+   y = x - 0x1000;      // If positions 12-15 are 0,
			
 
				+   m = (y >> 16) & 4;   // add 4 to n and shift x left 4.
			
 
				+   n = n + m;
			
 
				+   x = x << m;
			
 
				+
			
 
				+   y = x - 0x4000;      // If positions 14-15 are 0,
			
 
				+   m = (y >> 16) & 2;   // add 2 to n and shift x left 2.
			
 
				+   n = n + m;
			
 
				+   x = x << m;
			
 
				+
			
 
				+   y = x >> 14;         // Set y = 0, 1, 2, or 3.
			
 
				+   m = y & ~(y >> 1);   // Set m = 0, 1, 2, or 2 resp.
			
 
				+   return n + 2 - m;
			
 
				+}
			
 
				+
			
 
				+int nlz5(unsigned x) {
			
 
				+   int pop(unsigned x);
			
 
				+
			
 
				+   x = x | (x >> 1);
			
 
				+   x = x | (x >> 2);
			
 
				+   x = x | (x >> 4);
			
 
				+   x = x | (x >> 8);
			
 
				+   x = x | (x >>16);
			
 
				+   return pop(~x);
			
 
				+}
			
 
				+
			
 
				+/* The four programs below are not valid ANSI C programs.  This is
			
 
				+because they refer to the same storage locations as two different types.
			
 
				+However, they work with xlc/AIX, gcc/AIX, and gcc/NT.  If you try to
			
 
				+code them more compactly by declaring a variable xx to be "double," and
			
 
				+then using
			
 
				+
			
 
				+   n = 1054 - (*((unsigned *)&xx + LE) >> 20);
			
 
				+
			
 
				+then you are violating not only the rule above, but also the ANSI C
			
 
				+rule that pointer arithmetic can be performed only on pointers to
			
 
				+array elements.
			
 
				+   When coded with the above statement, the program fails with xlc,
			
 
				+gcc/AIX, and gcc/NT, at some optimization levels.
			
 
				+   BTW, these programs use the "anonymous union" feature of C++, not
			
 
				+available in C. */
			
 
				+
			
 
				+int nlz6(unsigned k) {
			
 
				+   union {
			
 
				+      unsigned asInt[2];
			
 
				+      double asDouble;
			
 
				+   };
			
 
				+   int n;
			
 
				+
			
 
				+   asDouble = (double)k + 0.5;
			
 
				+   n = 1054 - (asInt[LE] >> 20);
			
 
				+   return n;
			
 
				+}
			
 
				+
			
 
				+int nlz7(unsigned k) {
			
 
				+   union {
			
 
				+      unsigned asInt[2];
			
 
				+      double asDouble;
			
 
				+   };
			
 
				+   int n;
			
 
				+
			
 
				+   asDouble = (double)k;
			
 
				+   n = 1054 - (asInt[LE] >> 20);
			
 
				+   n = (n & 31) + (n >> 9);
			
 
				+   return n;
			
 
				+}
			
 
				+
			
 
				+   /* In single precision, round-to-nearest mode, the basic method fails for:
			
 
				+   k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF,
			
 
				+                        07FFFFFC <= k <= 07FFFFFF,
			
 
				+                        0FFFFFF8 <= k <= 0FFFFFFF,
			
 
				+                                   ...
			
 
				+                        7FFFFFC0 <= k <= 7FFFFFFF.
			
 
				+                        FFFFFF80 <= k <= FFFFFFFF.
			
 
				+   For k = 0 it gives 158, and for the other values it is too low by 1. */
			
 
				+
			
 
				+int nlz8(unsigned k) {
			
 
				+   union {
			
 
				+      unsigned asInt;
			
 
				+      float asFloat;
			
 
				+   };
			
 
				+   int n;
			
 
				+
			
 
				+   k = k & ~(k >> 1);           /* Fix problem with rounding. */
			
 
				+   asFloat = (float)k + 0.5f;
			
 
				+   n = 158 - (asInt >> 23);
			
 
				+   return n;
			
 
				+}
			
 
				+
			
 
				+/* The example below shows how to make a macro for nlz.  It uses an
			
 
				+extension to the C and C++ languages that is provided by the GNU C/C++
			
 
				+compiler, namely, that of allowing statements and declarations in
			
 
				+expressions (see "Using and Porting GNU CC", by Richard M. Stallman
			
 
				+(1998).  The underscores are necessary to protect against the
			
 
				+possibility that the macro argument will conflict with one of its local
			
 
				+variables, e.g., NLZ(k). */
			
 
				+
			
 
				+int nlz9(unsigned k) {
			
 
				+   union {
			
 
				+      unsigned asInt;
			
 
				+      float asFloat;
			
 
				+   };
			
 
				+   int n;
			
 
				+
			
 
				+   k = k & ~(k >> 1);           /* Fix problem with rounding. */
			
 
				+   asFloat = (float)k;
			
 
				+   n = 158 - (asInt >> 23);
			
 
				+   n = (n & 31) + (n >> 6);     /* Fix problem with k = 0. */
			
 
				+   return n;
			
 
				+}
			
 
				+
			
 
				+/* Below are three nearly equivalent programs for computing the number
			
 
				+of leading zeros in a word. This material is not in HD, but may be in a
			
 
				+future edition.
			
 
				+   Immediately below is Robert Harley's algorithm, found at the
			
 
				+comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert
			
 
				+Juffa.
			
 
				+   Table entries marked "u" are unused. 14 ops including a multiply,
			
 
				+plus an indexed load.
			
 
				+   The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all
			
 
				+of form 2**k + 1). There are no multipliers of three terms of the form
			
 
				+2**k +- 1 that work, with a table size of 64 or 128. There are some,
			
 
				+with a table size of 64, if you precede the multiplication with x = x -
			
 
				+(x >> 1), but that seems less elegant. There are also some if you use a
			
 
				+table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would
			
 
				+save two instructions in the form of this algorithm with the
			
 
				+multiplication expanded into shifts and adds, but the table size is
			
 
				+getting a bit large). */
			
 
				+
			
 
				+#define u 99
			
 
				+int nlz10(unsigned x) {
			
 
				+
			
 
				+   static char table[64] =
			
 
				+     {32,31, u,16, u,30, 3, u,  15, u, u, u,29,10, 2, u,
			
 
				+       u, u,12,14,21, u,19, u,   u,28, u,25, u, 9, 1, u,
			
 
				+      17, u, 4, u, u, u,11, u,  13,22,20, u,26, u, u,18,
			
 
				+       5, u, u,23, u,27, u, 6,   u,24, 7, u, 8, u, 0, u};
			
 
				+
			
 
				+   x = x | (x >> 1);    // Propagate leftmost
			
 
				+   x = x | (x >> 2);    // 1-bit to the right.
			
 
				+   x = x | (x >> 4);
			
 
				+   x = x | (x >> 8);
			
 
				+   x = x | (x >>16);
			
 
				+   x = x*0x06EB14F9;    // Multiplier is 7*255**3.
			
 
				+   return table[x >> 26];
			
 
				+}
			
 
				+
			
 
				+/* Harley's algorithm with multiply expanded.
			
 
				+19 elementary ops plus an indexed load. */
			
 
				+
			
 
				+int nlz10a(unsigned x) {
			
 
				+
			
 
				+   static char table[64] =
			
 
				+     {32,31, u,16, u,30, 3, u,  15, u, u, u,29,10, 2, u,
			
 
				+       u, u,12,14,21, u,19, u,   u,28, u,25, u, 9, 1, u,
			
 
				+      17, u, 4, u, u, u,11, u,  13,22,20, u,26, u, u,18,
			
 
				+       5, u, u,23, u,27, u, 6,   u,24, 7, u, 8, u, 0, u};
			
 
				+
			
 
				+   x = x | (x >> 1);    // Propagate leftmost
			
 
				+   x = x | (x >> 2);    // 1-bit to the right.
			
 
				+   x = x | (x >> 4);
			
 
				+   x = x | (x >> 8);
			
 
				+   x = x | (x >> 16);
			
 
				+   x = (x << 3) - x;    // Multiply by 7.
			
 
				+   x = (x << 8) - x;    // Multiply by 255.
			
 
				+   x = (x << 8) - x;    // Again.
			
 
				+   x = (x << 8) - x;    // Again.
			
 
				+   return table[x >> 26];
			
 
				+}
			
 
				+
			
 
				+/* Julius Goryavsky's version of Harley's algorithm.
			
 
				+17 elementary ops plus an indexed load, if the machine
			
 
				+has "and not." */
			
 
				+
			
 
				+int nlz10b(unsigned x) {
			
 
				+
			
 
				+   static char table[64] =
			
 
				+     {32,20,19, u, u,18, u, 7,  10,17, u, u,14, u, 6, u,
			
 
				+       u, 9, u,16, u, u, 1,26,   u,13, u, u,24, 5, u, u,
			
 
				+       u,21, u, 8,11, u,15, u,   u, u, u, 2,27, 0,25, u,
			
 
				+      22, u,12, u, u, 3,28, u,  23, u, 4,29, u, u,30,31};
			
 
				+
			
 
				+   x = x | (x >> 1);    // Propagate leftmost
			
 
				+   x = x | (x >> 2);    // 1-bit to the right.
			
 
				+   x = x | (x >> 4);
			
 
				+   x = x | (x >> 8);
			
 
				+   x = x & ~(x >> 16);
			
 
				+   x = x*0xFD7049FF;    // Activate this line or the following 3.
			
 
				+// x = (x << 9) - x;    // Multiply by 511.
			
 
				+// x = (x << 11) - x;   // Multiply by 2047.
			
 
				+// x = (x << 14) - x;   // Multiply by 16383.
			
 
				+   return table[x >> 26];
			
 
				+}
			
 
				+
			
 
				+int errors;
			
 
				+void error(int x, int y) {
			
 
				+   errors = errors + 1;
			
 
				+   printf("Error for x = %08x, got %d\n", x, y);
			
 
				+}
			
 
				+
			
 
				+int main()
			
 
				+{
			
 
				+#	ifdef GLM_TEST_ENABLE_PERF
			
 
				+
			
 
				+	int i, n;
			
 
				+	static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29,
			
 
				+		7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23,
			
 
				+		512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16,
			
 
				+		65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11,
			
 
				+		0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6,
			
 
				+		0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3,
			
 
				+		0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0,
			
 
				+		0xFFFFFFFF,0};
			
 
				+	std::size_t const Count = 10000000;
			
 
				+
			
 
				+	n = sizeof(test)/4;
			
 
				+
			
 
				+	std::clock_t TimestampBeg = 0;
			
 
				+	std::clock_t TimestampEnd = 0;
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz1: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz1a: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz2: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz2a: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz3: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz4: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz5: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz6: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz7: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz8: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz9: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz10: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz10a: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	TimestampBeg = std::clock();
			
 
				+	for (std::size_t k = 0; k < Count; ++k)
			
 
				+	for (i = 0; i < n; i += 2) {
			
 
				+		if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));}
			
 
				+	TimestampEnd = std::clock();
			
 
				+
			
 
				+	printf("nlz10b: %d clocks\n", TimestampEnd - TimestampBeg);
			
 
				+
			
 
				+	if (errors == 0)
			
 
				+		printf("Passed all %d cases.\n", sizeof(test)/8);
			
 
				+
			
 
				+#	endif//GLM_TEST_ENABLE_PERF
			
 
				+}