Browse Source

Optimize HashMap/HashSet using fastmod

Hendrik Brucker 3 years ago
parent
commit
fddafed919
3 changed files with 98 additions and 30 deletions
  1. 18 15
      core/templates/hash_map.h
  2. 18 15
      core/templates/hash_set.h
  3. 62 0
      core/templates/hashfuncs.h

+ 18 - 15
core/templates/hash_map.h

@@ -91,9 +91,9 @@ private:
 		return hash;
 	}
 
-	_FORCE_INLINE_ uint32_t _get_probe_length(uint32_t p_pos, uint32_t p_hash, uint32_t p_capacity) const {
-		uint32_t original_pos = p_hash % p_capacity;
-		return (p_pos - original_pos + p_capacity) % p_capacity;
+	static _FORCE_INLINE_ uint32_t _get_probe_length(const uint32_t p_pos, const uint32_t p_hash, const uint32_t p_capacity, const uint64_t p_capacity_inv) { // Returns how far p_pos is from the slot p_hash maps to, wrapping around the table. Callers in this class pass p_capacity_inv = hash_table_size_primes_inv[capacity_index].
+		const uint32_t original_pos = fastmod(p_hash, p_capacity_inv, p_capacity); // Home slot of the hash: p_hash mod p_capacity.
+		return fastmod(p_pos - original_pos + p_capacity, p_capacity_inv, p_capacity); // Adding p_capacity before reducing undoes the unsigned wrap-around when p_pos < original_pos.
 	}
 
 	bool _lookup_pos(const TKey &p_key, uint32_t &r_pos) const {
@@ -101,9 +101,10 @@ private:
 			return false; // Failed lookups, no elements
 		}
 
-		uint32_t capacity = hash_table_size_primes[capacity_index];
+		const uint32_t capacity = hash_table_size_primes[capacity_index];
+		const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index];
 		uint32_t hash = _hash(p_key);
-		uint32_t pos = hash % capacity;
+		uint32_t pos = fastmod(hash, capacity_inv, capacity);
 		uint32_t distance = 0;
 
 		while (true) {
@@ -111,7 +112,7 @@ private:
 				return false;
 			}
 
-			if (distance > _get_probe_length(pos, hashes[pos], capacity)) {
+			if (distance > _get_probe_length(pos, hashes[pos], capacity, capacity_inv)) {
 				return false;
 			}
 
@@ -120,17 +121,18 @@ private:
 				return true;
 			}
 
-			pos = (pos + 1) % capacity;
+			pos = fastmod((pos + 1), capacity_inv, capacity);
 			distance++;
 		}
 	}
 
 	void _insert_with_hash(uint32_t p_hash, HashMapElement<TKey, TValue> *p_value) {
-		uint32_t capacity = hash_table_size_primes[capacity_index];
+		const uint32_t capacity = hash_table_size_primes[capacity_index];
+		const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index];
 		uint32_t hash = p_hash;
 		HashMapElement<TKey, TValue> *value = p_value;
 		uint32_t distance = 0;
-		uint32_t pos = hash % capacity;
+		uint32_t pos = fastmod(hash, capacity_inv, capacity);
 
 		while (true) {
 			if (hashes[pos] == EMPTY_HASH) {
@@ -143,14 +145,14 @@ private:
 			}
 
 			// Not an empty slot, let's check the probing length of the existing one.
-			uint32_t existing_probe_len = _get_probe_length(pos, hashes[pos], capacity);
+			uint32_t existing_probe_len = _get_probe_length(pos, hashes[pos], capacity, capacity_inv);
 			if (existing_probe_len < distance) {
 				SWAP(hash, hashes[pos]);
 				SWAP(value, elements[pos]);
 				distance = existing_probe_len;
 			}
 
-			pos = (pos + 1) % capacity;
+			pos = fastmod((pos + 1), capacity_inv, capacity);
 			distance++;
 		}
 	}
@@ -316,13 +318,14 @@ public:
 			return false;
 		}
 
-		uint32_t capacity = hash_table_size_primes[capacity_index];
-		uint32_t next_pos = (pos + 1) % capacity;
-		while (hashes[next_pos] != EMPTY_HASH && _get_probe_length(next_pos, hashes[next_pos], capacity) != 0) {
+		const uint32_t capacity = hash_table_size_primes[capacity_index];
+		const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index];
+		uint32_t next_pos = fastmod((pos + 1), capacity_inv, capacity);
+		while (hashes[next_pos] != EMPTY_HASH && _get_probe_length(next_pos, hashes[next_pos], capacity, capacity_inv) != 0) {
 			SWAP(hashes[next_pos], hashes[pos]);
 			SWAP(elements[next_pos], elements[pos]);
 			pos = next_pos;
-			next_pos = (pos + 1) % capacity;
+			next_pos = fastmod((pos + 1), capacity_inv, capacity);
 		}
 
 		hashes[pos] = EMPTY_HASH;

+ 18 - 15
core/templates/hash_set.h

@@ -74,9 +74,9 @@ private:
 		return hash;
 	}
 
-	_FORCE_INLINE_ uint32_t _get_probe_length(uint32_t p_pos, uint32_t p_hash, uint32_t p_capacity) const {
-		uint32_t original_pos = p_hash % p_capacity;
-		return (p_pos - original_pos + p_capacity) % p_capacity;
+	static _FORCE_INLINE_ uint32_t _get_probe_length(const uint32_t p_pos, const uint32_t p_hash, const uint32_t p_capacity, const uint64_t p_capacity_inv) { // Returns how far p_pos is from the slot p_hash maps to, wrapping around the table. Callers in this class pass p_capacity_inv = hash_table_size_primes_inv[capacity_index].
+		const uint32_t original_pos = fastmod(p_hash, p_capacity_inv, p_capacity); // Home slot of the hash: p_hash mod p_capacity.
+		return fastmod(p_pos - original_pos + p_capacity, p_capacity_inv, p_capacity); // Adding p_capacity before reducing undoes the unsigned wrap-around when p_pos < original_pos.
 	}
 
 	bool _lookup_pos(const TKey &p_key, uint32_t &r_pos) const {
@@ -84,9 +84,10 @@ private:
 			return false; // Failed lookups, no elements
 		}
 
-		uint32_t capacity = hash_table_size_primes[capacity_index];
+		const uint32_t capacity = hash_table_size_primes[capacity_index];
+		const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index];
 		uint32_t hash = _hash(p_key);
-		uint32_t pos = hash % capacity;
+		uint32_t pos = fastmod(hash, capacity_inv, capacity);
 		uint32_t distance = 0;
 
 		while (true) {
@@ -94,7 +95,7 @@ private:
 				return false;
 			}
 
-			if (distance > _get_probe_length(pos, hashes[pos], capacity)) {
+			if (distance > _get_probe_length(pos, hashes[pos], capacity, capacity_inv)) {
 				return false;
 			}
 
@@ -103,17 +104,18 @@ private:
 				return true;
 			}
 
-			pos = (pos + 1) % capacity;
+			pos = fastmod(pos + 1, capacity_inv, capacity);
 			distance++;
 		}
 	}
 
 	uint32_t _insert_with_hash(uint32_t p_hash, uint32_t p_index) {
-		uint32_t capacity = hash_table_size_primes[capacity_index];
+		const uint32_t capacity = hash_table_size_primes[capacity_index];
+		const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index];
 		uint32_t hash = p_hash;
 		uint32_t index = p_index;
 		uint32_t distance = 0;
-		uint32_t pos = hash % capacity;
+		uint32_t pos = fastmod(hash, capacity_inv, capacity);
 
 		while (true) {
 			if (hashes[pos] == EMPTY_HASH) {
@@ -124,7 +126,7 @@ private:
 			}
 
 			// Not an empty slot, let's check the probing length of the existing one.
-			uint32_t existing_probe_len = _get_probe_length(pos, hashes[pos], capacity);
+			uint32_t existing_probe_len = _get_probe_length(pos, hashes[pos], capacity, capacity_inv);
 			if (existing_probe_len < distance) {
 				key_to_hash[index] = pos;
 				SWAP(hash, hashes[pos]);
@@ -132,7 +134,7 @@ private:
 				distance = existing_probe_len;
 			}
 
-			pos = (pos + 1) % capacity;
+			pos = fastmod(pos + 1, capacity_inv, capacity);
 			distance++;
 		}
 	}
@@ -265,9 +267,10 @@ public:
 		uint32_t key_pos = pos;
 		pos = key_to_hash[pos]; //make hash pos
 
-		uint32_t capacity = hash_table_size_primes[capacity_index];
-		uint32_t next_pos = (pos + 1) % capacity;
-		while (hashes[next_pos] != EMPTY_HASH && _get_probe_length(next_pos, hashes[next_pos], capacity) != 0) {
+		const uint32_t capacity = hash_table_size_primes[capacity_index];
+		const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index];
+		uint32_t next_pos = fastmod(pos + 1, capacity_inv, capacity);
+		while (hashes[next_pos] != EMPTY_HASH && _get_probe_length(next_pos, hashes[next_pos], capacity, capacity_inv) != 0) {
 			uint32_t kpos = hash_to_key[pos];
 			uint32_t kpos_next = hash_to_key[next_pos];
 			SWAP(key_to_hash[kpos], key_to_hash[kpos_next]);
@@ -275,7 +278,7 @@ public:
 			SWAP(hash_to_key[next_pos], hash_to_key[pos]);
 
 			pos = next_pos;
-			next_pos = (pos + 1) % capacity;
+			next_pos = fastmod(pos + 1, capacity_inv, capacity);
 		}
 
 		hashes[pos] = EMPTY_HASH;

+ 62 - 0
core/templates/hashfuncs.h

@@ -437,4 +437,66 @@ const uint32_t hash_table_size_primes[HASH_TABLE_SIZE_MAX] = {
 	1610612741,
 };
 
+// Precomputed fastmod constants, one per table size: elem_i = UINT64_C(0xFFFFFFFFFFFFFFFF) / d_i + 1, where d_i is the i-th element of the above array (hash_table_size_primes). Each entry is meant to be passed as the `c` argument of fastmod() together with the matching prime as `d`.
+const uint64_t hash_table_size_primes_inv[HASH_TABLE_SIZE_MAX] = {
+	3689348814741910324,
+	1418980313362273202,
+	802032351030850071,
+	392483916461905354,
+	190172619316593316,
+	95578984837873325,
+	47420935922132524,
+	23987963684927896,
+	11955116055547344,
+	5991147799191151,
+	2998982941588287,
+	1501077717772769,
+	750081082979285,
+	375261795343686,
+	187625172388393,
+	93822606204624,
+	46909513691883,
+	23456218233098,
+	11728086747027,
+	5864041509391,
+	2932024948977,
+	1466014921160,
+	733007198436,
+	366503839517,
+	183251896093,
+	91625960335,
+	45812983922,
+	22906489714,
+	11453246088
+};
+
+/**
+ * Fastmod computes ( n mod d ) given the precomputed c much faster than n % d.
+ * The implementation of fastmod is based on the following paper by Daniel Lemire et al.
+ * Faster Remainder by Direct Computation: Applications to Compilers and Software Libraries
+ * https://arxiv.org/abs/1902.01961
+ *
+ * n is the value to reduce and d is the divisor. c is the constant precomputed for d;
+ * the table hash_table_size_primes_inv holds such constants, computed as
+ * UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1. NOTE(review): a c that was not derived from the
+ * same d presumably yields a wrong remainder on the fast paths - confirm against the paper.
+ *
+ * Where a 64x64 -> 128-bit multiplication is available, the result is the upper 64 bits of
+ * ((c * n) mod 2^64) * d. Otherwise the function falls back to a plain n % d.
+ */
+static _FORCE_INLINE_ uint32_t fastmod(const uint32_t n, const uint64_t c, const uint32_t d) {
+#if defined(_MSC_VER)
+	// Returns the upper 64 bits of the product of two 64-bit unsigned integers.
+	// This intrinsic function is required since MSVC does not support unsigned 128-bit integers.
+#if defined(_M_X64) || defined(_M_ARM64)
+	return __umulh(c * n, d); // c * n wraps modulo 2^64 (uint64_t arithmetic); __umulh then yields the high half of that value times d.
+#else
+	// Fallback to the slower method for 32-bit platforms.
+	return n % d;
+#endif // _M_X64 || _M_ARM64
+#else
+#ifdef __SIZEOF_INT128__
+	// NOTE(review): the original "prevent compiler warning" remark presumably refers to the __extension__ marker on the __int128 typedef below - confirm.
+	uint64_t lowbits = c * n; // Low 64 bits of c * n; the multiplication wraps modulo 2^64.
+	__extension__ typedef unsigned __int128 uint128;
+	return static_cast<uint64_t>(((uint128)lowbits * d) >> 64); // Upper 64 bits of lowbits * d; implicitly narrowed to the uint32_t return type.
+#else
+	// Fallback to the slower method if no 128-bit unsigned integer type is available.
+	return n % d;
+#endif // __SIZEOF_INT128__
+#endif // _MSC_VER
+}
+
 #endif // HASHFUNCS_H