// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "../builders/primref.h"
#include "../../common/tasking/taskscheduler.h"

namespace embree
{
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /* default settings */
    //static const size_t defaultBlockSize = 4096;

    /*! maximum allocation size */
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }

      /* Allocate aligned memory from the thread's memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);

        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
        size_t ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }
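
      /* Worked example for the offset computation above (illustrative numbers
       * only): with cur = 10 and align = 16, ofs = (16 - 10) & 15 = 6, so cur
       * advances to 16 + bytes and the returned pointer &ptr[cur - bytes]
       * starts at the 16-byte aligned offset 16. The mask trick assumes align
       * is a power of two, which maxAlignment guarantees. */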

      /*! returns amount of used bytes */
      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;
      char* ptr;             //!< pointer to memory block
      size_t cur;            //!< current location of the allocator
      size_t end;            //!< end of the memory block
      size_t allocBlockSize; //!< block size for allocations
      size_t bytesUsed;      //!< number of total bytes allocated
      size_t bytesWasted;    //!< number of bytes wasted
    };

    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return;
        Lock<MutexSys> lock(mutex);
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        if (alloc.load()) {
          alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
          alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this);
      }

      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
        Lock<MutexSys> lock(mutex);
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
        alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
      MutexSys mutex;
      std::atomic<FastAllocator*> alloc; //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };

    FastAllocator (Device* device,
                   bool osAllocation,
                   bool useUSM = false,
                   bool blockAllocation = true)
      : device(device)
      , slotMask(0)
      , defaultBlockSize(PAGE_SIZE)
      , estimatedSize(0)
      , growSize(PAGE_SIZE)
      , maxGrowSize(maxAllocationSize)
      , usedBlocks(nullptr)
      , freeBlocks(nullptr)
      , useUSM(useUSM)
      , blockAllocation(blockAllocation)
      , use_single_mode(false)
      , log2_grow_size_scale(0)
      , bytesUsed(0)
      , bytesFree(0)
      , bytesWasted(0)
      , atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC)
      , primrefarray(device,0)
    {
      if (osAllocation && useUSM)
        abort(); //throw std::runtime_error("USM allocation cannot be combined with OS allocation.");

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        //assert(!slotMutex[i].isLocked());
      }
    }

    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    void setOSallocation(bool flag)
    {
      atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
        Lock<MutexSys> lock(s_thread_local_allocators_lock);
        s_thread_local_allocators.push_back(make_unique(alloc));
      }
      return alloc;
    }

  public:
    __forceinline void join(ThreadLocal2* alloc)
    {
      Lock<MutexSys> lock(s_thread_local_allocators_lock);
      thread_local_allocators.push_back(alloc);
    }

  public:

    struct CachedAllocator
    {
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }
    private:
      FastAllocator* allocator;
    };
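
    /* Minimal usage sketch for the builder interface above (illustrative
     * only; Node, Leaf and numPrims are placeholders, not part of this file):
     *
     *   FastAllocator::Create create(&allocator);
     *   FastAllocator::CachedAllocator alloc = create();        // per worker thread
     *   void* node   = alloc.malloc0(sizeof(Node), 64);         // e.g. node data
     *   void* leaves = alloc.malloc1(numPrims*sizeof(Leaf), 16); // e.g. leaf data
     *
     * malloc0 and malloc1 draw from two separate thread local blocks, unless
     * use_single_mode routes both to the same ThreadLocal. */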

    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic = 20;  //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks

    /* calculates a single threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;

      /* if we do not have to limit number of threads use optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit number of threads by calculating proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }
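
    /* Worked example for the threshold above (illustrative numbers only):
     * with use_single_mode = false, threadLocalAllocOverhead = 20 and
     * defaultBlockSize = 4096 (PAGE_SIZE), singleThreadBytes = 2*20*4096 =
     * 163840. For a scene estimated at 1 MB on a 16-thread machine,
     * ceil(1048576/163840) = 7 < 16, so parallelism is limited and the
     * single thread threshold becomes
     * ceil(branchingFactor*163840/bytesPerPrimitive). */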

    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }

    /*! initializes the grow size */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most mainAllocOverhead gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots by still guaranteeing
       * the mainAllocOverhead */
      slotMask = 0x0;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */

      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }

    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();

      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove

      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }

      if (bytesReserve == 0) bytesReserve = bytesAllocate;

      freeBlocks = Block::create(device,useUSM,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }
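
    /* Typical lifecycle sketch (illustrative only; the build step is a
     * placeholder for whatever builder uses this allocator):
     *
     *   allocator.init(bytesAllocate, bytesReserve, bytesEstimate);
     *   // parallel build: each worker obtains a CachedAllocator via
     *   // getCachedAllocator() and allocates from thread local blocks
     *   allocator.cleanup(); // unbind thread locals, fold stats back
     *   allocator.clear();   // free blocks, or reset() to reuse them
     */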

    /*! initializes the allocator */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();

      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }

      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      //initGrowSizeAndNumSlots(bytesEstimate,false);
      initGrowSizeAndNumSlots(bytesEstimate,false);
    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to begin of free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device,useUSM); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device,useUSM); freeBlocks = nullptr;

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }

    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr == nullptr && !blockAllocation)
            abort(); //throw std::bad_alloc();
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
          Lock<MutexSys> lock(slotMutex[slot]);
          if (myUsedBlocks == threadUsedBlocks[slot]) {
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails allocate new block */
        {
          Lock<MutexSys> lock(mutex);
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }

    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
      Lock<MutexSys> lock(mutex);

      const size_t sizeof_Header = offsetof(Block,data[0]);
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }
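
    /* Usage sketch for addBlock (illustrative only; 'buffer' and 'size' are
     * placeholders): application owned memory, e.g. the shared primref array,
     * can be donated as a SHARED block and is stripped again by
     * Block::remove_shared_blocks() during reset():
     *
     *   allocator.addBlock(buffer, size);
     */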

    /* special allocation, only used by the morton builder, a single time per build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages);
        if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };

    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }

    size_t getUsedBytes() {
      return bytesUsed;
    }

    size_t getWastedBytes() {
      return bytesWasted;
    }

    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)
        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,EMBREE_OS_MALLOC,false),
          stat_2M(alloc,EMBREE_OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)
        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << " "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << " 4K    : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << " 2M    : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    void print_blocks()
    {
      std::cout << " estimatedSize = " << estimatedSize
                << ", slotMask = " << slotMask
                << ", use_single_mode = " << use_single_mode
                << ", maxGrowSize = " << maxGrowSize
                << ", defaultBlockSize = " << defaultBlockSize
                << std::endl;

      std::cout << " used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << " free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }

  private:

    struct Block
    {
      __forceinline static void* blockAlignedMalloc(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesAlignment)
      {
        if (useUSM) return device->malloc(bytesAllocate, bytesAlignment);
        else        return alignedMalloc (bytesAllocate, bytesAlignment);
      }

      __forceinline static void blockAlignedFree(Device* device, bool useUSM, void* ptr)
      {
        if (useUSM) return device->free(ptr);
        else        return alignedFree(ptr);
      }

      static Block* create(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this could
         * cause a risk of fragmenting the virtual address space and
         * reach the limit of vm.max_map_count = 65k under Linux. */
        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;

        /* consume full 4k pages when using os_malloc */
        if (atype == EMBREE_OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == EMBREE_OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else                        prev_next = &block->next;
        }
        return head;
      }

      void clear_list(Device* device, bool useUSM)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device, useUSM);
          block = next;
        }
      }

      void clear_block (Device* device, bool useUSM)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          blockAlignedFree(device, useUSM, this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }
        else if (atype == EMBREE_OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }
        else /* if (atype == SHARED) */ {
        }
      }

      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i);

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
        }
        return &data[i];
      }
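
      /* Note on the allocation above: cur.fetch_add() makes concurrent
       * allocations from one block lock-free. A thread that overshoots
       * reserveEnd gets nullptr (or, for partial allocations, the remaining
       * bytes clamped into bytes_in) and the caller falls back to
       * FastAllocator::malloc to obtain a fresh block. */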

      void* ptr() {
        return &data[cur];
      }

      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd);
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if      (atype_i == ANY_TYPE) return true;
        else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
        else return atype_i == atype;
      }

      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      void print_block() const
      {
        if (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
        else if (atype == SHARED) std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };

  public:
    static const size_t blockHeaderSize = offsetof(Block,data[0]);

  private:
    Device* device;
    size_t slotMask;
    size_t defaultBlockSize;
    size_t estimatedSize;
    size_t growSize;
    size_t maxGrowSize;
    MutexSys mutex;
    MutexSys slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;
    bool useUSM;
    bool blockAllocation = true;
    bool use_single_mode;
    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;
    static __thread ThreadLocal2* thread_local_allocator2;
    static MutexSys s_thread_local_allocators_lock;
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
    std::vector<ThreadLocal2*> thread_local_allocators;
    AllocationType atype;
    mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
  };
}