tessellation_cache.h

// ======================================================================== //
// Copyright 2009-2017 Intel Corporation                                    //
//                                                                          //
// Licensed under the Apache License, Version 2.0 (the "License");          //
// you may not use this file except in compliance with the License.         //
// You may obtain a copy of the License at                                  //
//                                                                          //
//     http://www.apache.org/licenses/LICENSE-2.0                           //
//                                                                          //
// Unless required by applicable law or agreed to in writing, software      //
// distributed under the License is distributed on an "AS IS" BASIS,        //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and      //
// limitations under the License.                                           //
// ======================================================================== //

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

#define THREAD_BLOCK_ATOMIC_ADD 4

#if defined(DEBUG)
#define CACHE_STATS(x)
#else
#define CACHE_STATS(x)
#endif
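
/* NOTE (added for clarity): CACHE_STATS currently expands to nothing in both
   debug and release builds, so the statistics counters below are never
   updated. To collect statistics one would presumably let the macro expand
   its argument, e.g.:

     #if defined(DEBUG)
     #define CACHE_STATS(x) x
     #else
     #define CACHE_STATS(x)
     #endif

   (illustrative sketch only; not part of the original header) */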
namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };
  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////

  struct __aligned(64) ThreadWorkState
  {
    ALIGNED_STRUCT;

    std::atomic<size_t> counter;
    ThreadWorkState* next;
    bool allocated;

    __forceinline ThreadWorkState(bool allocated = false)
      : counter(0), next(nullptr), allocated(allocated)
    {
      assert( ((size_t)this % 64) == 0 );
    }
  };

  class __aligned(64) SharedLazyTessellationCache
  {
  public:
    static const size_t NUM_CACHE_SEGMENTS              = 8;
    static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
    static const size_t COMMIT_INDEX_SHIFT              = 32+8;
#if defined(__X86_64__)
    static const size_t REF_TAG_MASK = 0xffffffffff;
#else
    static const size_t REF_TAG_MASK = 0x7FFFFFFF;
#endif
    static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1;
    static const size_t BLOCK_SIZE = 64;
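
    /* Layout note (descriptive comment, added for clarity): the low bits of a
       cache reference (REF_TAG_MASK, 40 bits on x86-64) hold the byte offset
       of an entry relative to the cache base pointer, which bounds the cache
       at MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1 bytes. The bits from
       COMMIT_INDEX_SHIFT (32+8 = 40) upward hold the commit/time index used
       for lazy invalidation. BLOCK_SIZE is the allocation granularity in
       bytes (64 bytes = 16 floats, see getBlockPtr()). */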
    /*! Per thread tessellation ref cache */
    static __thread ThreadWorkState* init_t_state;
    static ThreadWorkState* current_t_state;

    static __forceinline ThreadWorkState *threadState()
    {
      if (unlikely(!init_t_state))
        /* sets init_t_state, can't return pointer due to macosx icc bug */
        SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
      return init_t_state;
    }

    struct Tag
    {
      __forceinline Tag() : data(0) {}

      __forceinline Tag(void* ptr, size_t combinedTime) {
        init(ptr,combinedTime);
      }

      __forceinline Tag(size_t ptr, size_t combinedTime) {
        init((void*)ptr,combinedTime);
      }

      __forceinline void init(void* ptr, size_t combinedTime)
      {
        if (ptr == nullptr) {
          data = 0;
          return;
        }
        int64_t new_root_ref = (int64_t) ptr;
        new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
        assert( new_root_ref <= (int64_t)REF_TAG_MASK );
        new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
        data = new_root_ref;
      }

      __forceinline int64_t get() const { return data.load(); }
      __forceinline void set( int64_t v ) { data.store(v); }
      __forceinline void reset() { data.store(0); }

    private:
      atomic<int64_t> data;
    };
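
    /* Worked example of the encoding above (illustrative, not from the
       original source): a patch stored at byte offset 0x1000 from
       getDataPtr() with combinedTime == 3 yields

         data = ((int64_t)3 << COMMIT_INDEX_SHIFT) | 0x1000 = 0x30000001000

       so (data & REF_TAG_MASK) recovers the offset 0x1000 and
       extractCommitIndex(data) recovers the time value 3. */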
    static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

    struct CacheEntry
    {
      Tag tag;
      SpinLock mutex;
    };

  private:

    float *data;
    size_t size;
    size_t maxBlocks;
    ThreadWorkState *threadWorkState;

    __aligned(64) std::atomic<size_t> localTime;
    __aligned(64) std::atomic<size_t> next_block;
    __aligned(64) SpinLock reset_state;
    __aligned(64) SpinLock linkedlist_mtx;
    __aligned(64) std::atomic<size_t> switch_block_threshold;
    __aligned(64) std::atomic<size_t> numRenderThreads;

  public:

    SharedLazyTessellationCache();
    ~SharedLazyTessellationCache();

    void getNextRenderThreadWorkState();

    __forceinline size_t maxAllocSize() const {
      return switch_block_threshold;
    }

    __forceinline size_t getCurrentIndex() { return localTime.load(); }
    __forceinline void   addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

    __forceinline size_t getTime(const size_t globalTime) {
      return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
    }

    __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1)  { return t_state->counter.fetch_add(plus); }
    __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

    __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

    static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); }
    static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
    static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
    static __forceinline size_t getState() { return threadState()->counter.load(); }
    static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

    static __forceinline size_t getTCacheTime(const size_t globalTime) {
      return sharedLazyTessellationCache.getTime(globalTime);
    }

    /* per thread lock */
    __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
    {
      while(1)
      {
        size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
        if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
        {
          /* lock failed, wait until sync phase is over */
          sharedLazyTessellationCache.unlockThread(t_state,-1);
          sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
        }
        else
          break;
      }
    }
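
    /* Locking model (descriptive note, added for clarity): each render thread
       keeps a counter in its ThreadWorkState. lockThreadLoop() increments it
       and only succeeds while the counter stays below THREAD_BLOCK_ATOMIC_ADD;
       the flush/reset path presumably raises the counter by
       THREAD_BLOCK_ATOMIC_ADD to block new readers and then uses
       waitForUsersLessEqual() to wait until all active readers have released
       their locks. */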
    static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = entry.tag.get();
      CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

      if (likely(subdiv_patch_root_ref != 0))
      {
        const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
        const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

        if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
        {
          CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
          return (void*) subdiv_patch_root;
        }
      }
      CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
      return nullptr;
    }
    template<typename Constructor>
    static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
    {
      ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

      while (true)
      {
        sharedLazyTessellationCache.lockThreadLoop(t_state);
        void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
        if (patch) return (decltype(constructor())) patch;

        if (entry.mutex.try_lock())
        {
          if (!validTag(entry.tag,globalTime))
          {
            auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
            auto ret = constructor(); // thread is locked here!
            assert(ret);
            /* this should never return nullptr */
            auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
            auto time = before ? timeBefore : timeAfter;
            __memory_barrier();
            entry.tag = SharedLazyTessellationCache::Tag(ret,time);
            __memory_barrier();
            entry.mutex.unlock();
            return ret;
          }
          entry.mutex.unlock();
        }
        SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
      }
    }
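
    /* Usage sketch (illustrative only; buildGridData and numGridBytes are
       hypothetical names, not part of this header): callers typically pass a
       lambda that allocates from the cache and builds the lazy data. The
       per-thread lock acquired inside lookup() is still held on return and
       must be released by the caller once it is done with the pointer:

         void* grid = SharedLazyTessellationCache::lookup(entry, globalTime, [&] () -> void* {
           void* mem = SharedLazyTessellationCache::malloc(numGridBytes); // hypothetical size
           buildGridData(mem);                                            // hypothetical builder
           return mem;
         });
         // ... use grid ...
         SharedLazyTessellationCache::unlock();
    */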
    __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
    {
#if FORCE_SIMPLE_FLUSH == 1
      return i == getTime(globalTime);
#else
      return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
    }

    static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
    {
      return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
    }
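
    /* Invalidation note (added for clarity): with NUM_CACHE_SEGMENTS == 8, an
       entry created at time t stays valid while the current time is at most
       t+7; e.g. an entry tagged with time 10 is still valid at time 17 but is
       treated as a miss at time 18. Data older than the last 8 cache
       generations is thereby evicted lazily. */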
    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = tag.get();
      if (subdiv_patch_root_ref == 0) return false;
      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
    }

    void waitForUsersLessEqual(ThreadWorkState *const t_state,
                               const unsigned int users);

    __forceinline size_t alloc(const size_t blocks)
    {
      if (unlikely(blocks >= switch_block_threshold))
        throw_RTCError(RTC_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

      assert(blocks < switch_block_threshold);
      size_t index = next_block.fetch_add(blocks);
      if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
      return index;
    }
    static __forceinline void* malloc(const size_t bytes)
    {
      size_t block_index = -1;
      ThreadWorkState *const t_state = threadState();
      while (true)
      {
        block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
        if (block_index == (size_t)-1)
        {
          sharedLazyTessellationCache.unlockThread(t_state);
          sharedLazyTessellationCache.allocNextSegment();
          sharedLazyTessellationCache.lockThread(t_state);
          continue;
        }
        break;
      }
      return sharedLazyTessellationCache.getBlockPtr(block_index);
    }
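
    /* Allocation note (added for clarity): malloc() rounds the request up to
       whole 64-byte blocks and bump-allocates them via alloc(). When the
       current segment is exhausted, the calling thread drops its read lock,
       calls allocNextSegment() (defined elsewhere; presumably advances
       localTime and may wait for other readers), re-acquires the lock and
       retries. The returned pointer is therefore only safe to use while the
       thread lock is held and the corresponding tag is still valid. */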
    __forceinline void *getBlockPtr(const size_t block_index)
    {
      assert(block_index < maxBlocks);
      assert(data);
      assert(block_index*16 <= size);
      return (void*)&data[block_index*16];
    }

    __forceinline void*  getDataPtr()      { return data; }
    __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
    __forceinline size_t getMaxBlocks()    { return maxBlocks; }
    __forceinline size_t getSize()         { return size; }

    void allocNextSegment();
    void realloc(const size_t newSize);

    void reset();

    static SharedLazyTessellationCache sharedLazyTessellationCache;
  };
}