Quellcode durchsuchen

Added early out if data is already sorted. Fixed radixSort32 outputting results into temp buffers.

bkaradzic vor 12 Jahren
Ursprung
Commit
e1db65cd4b
1 geänderte Dateien mit 80 neuen und 26 gelöschten Zeilen
  1. 80 26
      include/bx/radixsort.h

+ 80 - 26
include/bx/radixsort.h

@@ -15,18 +15,34 @@ namespace bx
 #define BX_RADIXSORT_BIT_MASK (BX_RADIXSORT_HISTOGRAM_SIZE-1)
 #define BX_RADIXSORT_BIT_MASK (BX_RADIXSORT_HISTOGRAM_SIZE-1)
 
 
 	template <typename Ty>
 	template <typename Ty>
-	void radixSort32(uint32_t* _keys, uint32_t* _tempKeys, Ty* _values, Ty* _tempValues, uint32_t _size)
+	void radixSort32(uint32_t* __restrict _keys, uint32_t* __restrict _tempKeys, Ty* __restrict _values, Ty* __restrict _tempValues, uint32_t _size)
 	{
 	{
+		uint32_t* __restrict keys = _keys;
+		uint32_t* __restrict tempKeys = _tempKeys;
+		Ty* __restrict values = _values;
+		Ty* __restrict tempValues = _tempValues;
+
 		uint16_t histogram[BX_RADIXSORT_HISTOGRAM_SIZE];
 		uint16_t histogram[BX_RADIXSORT_HISTOGRAM_SIZE];
 		uint16_t shift = 0;
 		uint16_t shift = 0;
-		for (uint32_t pass = 0; pass < 3; ++pass)
+		uint32_t pass = 0;
+		for (; pass < 3; ++pass)
 		{
 		{
 			memset(histogram, 0, sizeof(uint16_t)*BX_RADIXSORT_HISTOGRAM_SIZE);
 			memset(histogram, 0, sizeof(uint16_t)*BX_RADIXSORT_HISTOGRAM_SIZE);
-			for (uint32_t ii = 0; ii < _size; ++ii)
+
+			bool sorted = true;
+			uint32_t key = keys[0];
+			uint32_t prevKey = key;
+			for (uint32_t ii = 0; ii < _size; ++ii, prevKey = key)
 			{
 			{
-				uint32_t key = _keys[ii];
+				key = keys[ii];
 				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
 				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
 				++histogram[index];
 				++histogram[index];
+				sorted &= prevKey <= key;
+			}
+
+			if (sorted)
+			{
+				goto done;
 			}
 			}
 
 
 			uint16_t offset = 0;
 			uint16_t offset = 0;
@@ -39,38 +55,65 @@ namespace bx
 
 
 			for (uint32_t ii = 0; ii < _size; ++ii)
 			for (uint32_t ii = 0; ii < _size; ++ii)
 			{
 			{
-				uint32_t key = _keys[ii];
+				uint32_t key = keys[ii];
 				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
 				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
 				uint16_t dest = histogram[index]++;
 				uint16_t dest = histogram[index]++;
-				_tempKeys[dest] = key;
-				_tempValues[dest] = _values[ii];
+				tempKeys[dest] = key;
+				tempValues[dest] = values[ii];
 			}
 			}
 
 
-			uint32_t* swapKeys = _tempKeys;
-			_tempKeys = _keys;
-			_keys = swapKeys;
+			uint32_t* swapKeys = tempKeys;
+			tempKeys = keys;
+			keys = swapKeys;
 
 
-			Ty* swapValues = _tempValues;
-			_tempValues = _values;
-			_values = swapValues;
+			Ty* swapValues = tempValues;
+			tempValues = values;
+			values = swapValues;
 
 
 			shift += BX_RADIXSORT_BITS;
 			shift += BX_RADIXSORT_BITS;
 		}
 		}
+
+done:
+		if (0 != (pass&1) )
+		{
+			// Odd number of passes needs to do copy to the destination.
+			memcpy(_keys, _tempKeys, _size*sizeof(uint32_t) );
+			for (uint32_t ii = 0; ii < _size; ++ii)
+			{
+				_values[ii] = _tempValues[ii];
+			}
+		}
 	}
 	}
 
 
 	template <typename Ty>
 	template <typename Ty>
-	void radixSort64(uint64_t* _keys, uint64_t* _tempKeys, Ty* _values, Ty* _tempValues, uint32_t _size)
+	void radixSort64(uint64_t* __restrict _keys, uint64_t* __restrict _tempKeys, Ty* __restrict _values, Ty* __restrict _tempValues, uint32_t _size)
 	{
 	{
+		uint64_t* __restrict keys = _keys;
+		uint64_t* __restrict tempKeys = _tempKeys;
+		Ty* __restrict values = _values;
+		Ty* __restrict tempValues = _tempValues;
+
 		uint16_t histogram[BX_RADIXSORT_HISTOGRAM_SIZE];
 		uint16_t histogram[BX_RADIXSORT_HISTOGRAM_SIZE];
 		uint16_t shift = 0;
 		uint16_t shift = 0;
-		for (uint32_t pass = 0; pass < 6; ++pass)
+		uint32_t pass = 0;
+		for (; pass < 6; ++pass)
 		{
 		{
 			memset(histogram, 0, sizeof(uint16_t)*BX_RADIXSORT_HISTOGRAM_SIZE);
 			memset(histogram, 0, sizeof(uint16_t)*BX_RADIXSORT_HISTOGRAM_SIZE);
-			for (uint32_t ii = 0; ii < _size; ++ii)
+
+			bool sorted = true;
+			uint64_t key = keys[0];
+			uint64_t prevKey = key;
+			for (uint32_t ii = 0; ii < _size; ++ii, prevKey = key)
 			{
 			{
-				uint64_t key = _keys[ii];
+				key = keys[ii];
 				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
 				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
 				++histogram[index];
 				++histogram[index];
+				sorted &= prevKey <= key;
+			}
+
+			if (sorted)
+			{
+				goto done;
 			}
 			}
 
 
 			uint16_t offset = 0;
 			uint16_t offset = 0;
@@ -83,23 +126,34 @@ namespace bx
 
 
 			for (uint32_t ii = 0; ii < _size; ++ii)
 			for (uint32_t ii = 0; ii < _size; ++ii)
 			{
 			{
-				uint64_t key = _keys[ii];
+				uint64_t key = keys[ii];
 				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
 				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
 				uint16_t dest = histogram[index]++;
 				uint16_t dest = histogram[index]++;
-				_tempKeys[dest] = key;
-				_tempValues[dest] = _values[ii];
+				tempKeys[dest] = key;
+				tempValues[dest] = values[ii];
 			}
 			}
 
 
-			uint64_t* swapKeys = _tempKeys;
-			_tempKeys = _keys;
-			_keys = swapKeys;
+			uint64_t* swapKeys = tempKeys;
+			tempKeys = keys;
+			keys = swapKeys;
 
 
-			Ty* swapValues = _tempValues;
-			_tempValues = _values;
-			_values = swapValues;
+			Ty* swapValues = tempValues;
+			tempValues = values;
+			values = swapValues;
 
 
 			shift += BX_RADIXSORT_BITS;
 			shift += BX_RADIXSORT_BITS;
 		}
 		}
+
+done:
+		if (0 != (pass&1) )
+		{
+			// Odd number of passes needs to do copy to the destination.
+			memcpy(_keys, _tempKeys, _size*sizeof(uint64_t) );
+			for (uint32_t ii = 0; ii < _size; ++ii)
+			{
+				_values[ii] = _tempValues[ii];
+			}
+		}
 	}
 	}
 
 
 #undef BX_RADIXSORT_BITS
 #undef BX_RADIXSORT_BITS