// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "../builders/primref.h"
#include "../../common/tasking/taskscheduler.h"

namespace embree
{
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /*! maximum allocation size */

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)
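    // Note: 2MB presumably matches the x86-64 huge page size (PAGE_SIZE_2M used
    // below); subtracting maxAlignment keeps a maximally sized allocation, plus
    // its alignment padding, within a single 2MB region.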
    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per-thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }

      /* Allocate aligned memory from the thread's memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);
        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
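        // (align - cur) & (align-1) is the padding needed to round 'cur' up to the
        // next multiple of 'align'; this assumes 'align' is a power of two.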
        size_t ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* if the allocation is too large, allocate with the parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }

      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;
      char* ptr;             //!< pointer to memory block
      size_t cur;            //!< current location of the allocator
      size_t end;            //!< end of the memory block
      size_t allocBlockSize; //!< block size for allocations
      size_t bytesUsed;      //!< number of total bytes allocated
      size_t bytesWasted;    //!< number of bytes wasted
    };

    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return;
        Lock<MutexSys> lock(mutex);
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        if (alloc.load()) {
          alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
          alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this);
      }

      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
        Lock<MutexSys> lock(mutex);
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
        alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
      MutexSys mutex;
      std::atomic<FastAllocator*> alloc; //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };
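    // Note: ThreadLocal2 bundles two ThreadLocal allocators per thread so that a
    // builder can keep two independent allocation streams (e.g. nodes vs. primitives,
    // see CachedAllocator::malloc0/malloc1 below) in separate memory blocks.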
    FastAllocator (Device* device,
                   bool osAllocation,
                   bool useUSM = false,
                   bool blockAllocation = true)
      : device(device)
      , slotMask(0)
      , defaultBlockSize(PAGE_SIZE)
      , estimatedSize(0)
      , growSize(PAGE_SIZE)
      , maxGrowSize(maxAllocationSize)
      , usedBlocks(nullptr)
      , freeBlocks(nullptr)
      , useUSM(useUSM)
      , blockAllocation(blockAllocation)
      , use_single_mode(false)
      , log2_grow_size_scale(0)
      , bytesUsed(0)
      , bytesFree(0)
      , bytesWasted(0)
      , atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC)
      , primrefarray(device,0)
    {
      //if (osAllocation && useUSM)
      //  throw std::runtime_error("USM allocation cannot be combined with OS allocation.");
      if (osAllocation && useUSM) {
        abort();
      }

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        //assert(!slotMutex[i].isLocked());
      }
    }

    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    void setOSallocation(bool flag)
    {
      atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
        Lock<MutexSys> lock(s_thread_local_allocators_lock);
        s_thread_local_allocators.push_back(make_unique(alloc));
      }
      return alloc;
    }

  public:

    __forceinline void join(ThreadLocal2* alloc)
    {
      Lock<MutexSys> lock(s_thread_local_allocators_lock);
      thread_local_allocators.push_back(alloc);
    }

  public:

    struct CachedAllocator
    {
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }
    private:
      FastAllocator* allocator;
    };
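    // Illustrative usage from a builder's perspective (a sketch only; Node and
    // numPrimitives are hypothetical, the allocator calls are from this header):
    //
    //   FastAllocator alloc(device,false);
    //   alloc.init_estimate(numPrimitives*sizeof(Node));
    //   FastAllocator::CachedAllocator thread = alloc.getCachedAllocator();
    //   void* node = thread.malloc0(sizeof(Node),64);  // first allocation stream
    //   void* prim = thread.malloc1(64,16);            // second allocation stream
    //   alloc.cleanup();                               // after the build finished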
    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic  = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks
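    //! an overhead factor of N bounds the waste caused by unfilled blocks to 1/N
    //! of the estimated allocation, e.g. 20 -> 5% and 8 -> 12.5%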
    /* calculates a single-threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;

      /* if we do not have to limit the number of threads, use the optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit the number of threads by calculating a proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }

    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }

    /*! initializes the grow size */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most mainAllocOverhead gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots while still guaranteeing
       * the mainAllocOverhead */
      slotMask = 0x0;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */
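      // slotMask selects how many of the MAX_THREAD_USED_BLOCK_SLOTS block slots are
      // active (threads pick a slot via threadID & slotMask in malloc()), trading a
      // little extra per-slot block overhead for less contention on the slot mutexes.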
      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for sufficiently large scenes we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }

    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,useUSM,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }
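    // Note: init() additionally pre-allocates a first block of bytesAllocate bytes,
    // whereas init_estimate() below only derives the sizing parameters.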
    /*! initializes the allocator */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      //initGrowSizeAndNumSlots(bytesEstimate,false);
      initGrowSizeAndNumSlots(bytesEstimate,false);
    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to the beginning of the free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device,useUSM); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device,useUSM); freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }

    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          //if (ptr == nullptr && !blockAllocation)
          //  throw std::bad_alloc();
          if (ptr == nullptr && !blockAllocation) {
            abort();
          }
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
          Lock<MutexSys> lock(slotMutex[slot]);
          if (myUsedBlocks == threadUsedBlocks[slot]) {
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails, allocate a new block */
        {
          Lock<MutexSys> lock(mutex);
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }

    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
      Lock<MutexSys> lock(mutex);
      const size_t sizeof_Header = offsetof(Block,data[0]);
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }

    /* special allocation, used only by the Morton builder and only a single time per build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages);
        if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };

    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }

    size_t getUsedBytes() {
      return bytesUsed;
    }

    size_t getWastedBytes() {
      return bytesWasted;
    }

    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)
        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,EMBREE_OS_MALLOC,false),
          stat_2M(alloc,EMBREE_OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)
        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << "  alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "                                                            "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << "  alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << "  total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << "  4K    : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << "  2M    : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << "  malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << "  shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    void print_blocks()
    {
      std::cout << "  estimatedSize = " << estimatedSize
                << ", slotMask = " << slotMask
                << ", use_single_mode = " << use_single_mode
                << ", maxGrowSize = " << maxGrowSize
                << ", defaultBlockSize = " << defaultBlockSize
                << std::endl;

      std::cout << "  used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << "  free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }

  private:

    struct Block
    {
      __forceinline static void* blockAlignedMalloc(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesAlignment)
      {
        if (useUSM) return device->malloc(bytesAllocate, bytesAlignment);
        else        return alignedMalloc (bytesAllocate, bytesAlignment);
      }

      __forceinline static void blockAlignedFree(Device* device, bool useUSM, void* ptr)
      {
        if (useUSM) return device->free(ptr);
        else        return alignedFree(ptr);
      }

      static Block* create(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this risks
         * fragmenting the virtual address space and reaching the limit
         * of vm.max_map_count = 65k under Linux. */
        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;

        /* consume full 4k pages when using os_malloc */
        if (atype == EMBREE_OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin +              0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == EMBREE_OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else                         prev_next = &block->next;
        }
        return head;
      }

      void clear_list(Device* device, bool useUSM)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device, useUSM);
          block = next;
        }
      }

      void clear_block (Device* device, bool useUSM)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          blockAlignedFree(device, useUSM, this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }
        else if (atype == EMBREE_OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }
        else /* if (atype == SHARED) */ {
        }
      }
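      // Lock-free bump allocation: concurrent threads race on cur.fetch_add(). A
      // thread that overshoots reserveEnd gets nullptr; a partial request may be
      // handed the remaining tail of the block, with bytes_in updated accordingly.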
      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i);

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
        }
        return &data[i];
      }

      void* ptr() {
        return &data[cur];
      }

      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd);
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if      (atype_i == ANY_TYPE ) return true;
        else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
        else return atype_i == atype;
      }

      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      void print_block() const
      {
        if      (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
        else if (atype == SHARED) std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
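      // data[] acts as a C-style flexible array member: the usable payload starts at
      // data[0] and extends to the end of the memory allocated in Block::create().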
    };

  public:
    static const size_t blockHeaderSize = offsetof(Block,data[0]);

  private:
    Device* device;
    size_t slotMask;
    size_t defaultBlockSize;
    size_t estimatedSize;
    size_t growSize;
    size_t maxGrowSize;

    MutexSys mutex;
    MutexSys slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;

    bool useUSM;
    bool blockAllocation = true;
    bool use_single_mode;

    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;

    static __thread ThreadLocal2* thread_local_allocator2;
    static MutexSys s_thread_local_allocators_lock;
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
    std::vector<ThreadLocal2*> thread_local_allocators;

    AllocationType atype;
    mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
  };
}