alloc.h

// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "primref.h"

namespace embree
{
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /*! maximum allocation size */

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }
      /* Allocate aligned memory from the thread's memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);

        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
        size_t ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }

      /*! returns amount of used bytes */
      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;
      char*  ptr;            //!< pointer to memory block
      size_t cur;            //!< current location of the allocator
      size_t end;            //!< end of the memory block
      size_t allocBlockSize; //!< block size for allocations
      size_t bytesUsed;      //!< number of total bytes allocated
      size_t bytesWasted;    //!< number of bytes wasted
    };
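    /* Note on the bump allocation in ThreadLocal::malloc above: for a
     * power-of-two 'align', the expression (align - cur) & (align-1) gives the
     * number of padding bytes needed to advance 'cur' to the next aligned
     * offset. A small illustrative sketch (example values, not part of the
     * original interface):
     *
     *   size_t cur = 13, align = 16;
     *   size_t ofs = (align - cur) & (align-1);   // ofs == 3, so 13+3 == 16
     *
     * When the padded allocation fits into the current block, the padding is
     * accounted for in bytesWasted. */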
    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return;
        Lock<SpinLock> lock(mutex);
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        if (alloc.load()) {
          alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
          alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this);
      }
      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
        Lock<SpinLock> lock(mutex);
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
        alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
      SpinLock mutex;                    //!< required as unbind is called from other threads
      std::atomic<FastAllocator*> alloc; //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };
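    /* How the allocator is typically used, as far as this header shows (a
     * hedged sketch, not code taken from the Embree builders; 'device', the
     * byte counts and 'Node' are placeholders):
     *
     *   FastAllocator allocator(device, false);              // aligned-malloc mode
     *   allocator.init(bytesAllocate, bytesReserve, bytesEstimate);
     *   FastAllocator::CachedAllocator alloc = allocator.getCachedAllocator();
     *   void* node = alloc(sizeof(Node), 64);                // thread local bump allocation
     *   allocator.cleanup();                                 // after the build
     */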
    FastAllocator (Device* device, bool osAllocation)
      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC),
        primrefarray(device,0)
    {
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        assert(!slotMutex[i].isLocked());
      }
    }

    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    void setOSallocation(bool flag)
    {
      atype = flag ? OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
        Lock<SpinLock> lock(s_thread_local_allocators_lock);
        s_thread_local_allocators.push_back(make_unique(alloc));
      }
      return alloc;
    }

  public:

    __forceinline void join(ThreadLocal2* alloc)
    {
      Lock<SpinLock> lock(thread_local_allocators_lock);
      thread_local_allocators.push_back(alloc);
    }

  public:

    struct CachedAllocator
    {
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }

    private:
      FastAllocator* allocator;
    };
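    /* The Create functor is what a parallel builder can hand to its task
     * system so that each worker obtains a CachedAllocator on demand. A
     * hypothetical sketch (the worker context is a placeholder, not code from
     * this repository):
     *
     *   FastAllocator::Create create(&allocator);
     *   // inside each worker task:
     *   FastAllocator::CachedAllocator alloc = create();
     *   void* mem0 = alloc.malloc0(bytes, 16);   // first thread local allocator
     *   void* mem1 = alloc.malloc1(bytes, 16);   // second one, unless use_single_mode
     */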
    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic  = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks
    /* calculates a single threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;
      /* if we do not have to limit the number of threads use the optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit number of threads by calculating proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }
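    /* Worked example for the threshold above (assumed numbers, not taken from
     * the source): with use_single_mode == false and defaultBlockSize == 4096,
     * singleThreadBytes = 2*20*4096 = 163840. For bytesEstimated = 1048576 and
     * numPrimitives = 10000, the estimate only covers ceil(1048576/163840) = 7
     * threads' worth of block memory, so on a machine with more threads the
     * single thread threshold becomes ceil(2*163840/104.8576) = 3125 primitives
     * for branchingFactor = 2. */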
    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }

    /*! initializes the grow size */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots by still guaranteeing
       * the mainAllocationOverhead */
      slotMask = 0x0;

      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */

      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }

    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }

    /*! initializes the allocator */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      //initGrowSizeAndNumSlots(bytesEstimate,false);
      initGrowSizeAndNumSlots(bytesEstimate,false);
    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to begin of free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }
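    /* Each call to incGrowSizeScale() doubles the returned scale factor
     * (2, 4, 8, ...) and caps it at 2^16, so the global blocks created in
     * malloc() below grow geometrically until min(growSize*scale, maxGrowSize)
     * saturates. For example (assumed values): with growSize = 64 KB and
     * maxGrowSize = 2 MB, consecutive global block sizes would be 128 KB,
     * 256 KB, 512 KB, ... up to 2 MB. */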
    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
          Lock<SpinLock> lock(slotMutex[slot]);
          if (myUsedBlocks == threadUsedBlocks[slot]) {
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails allocate new block */
        {
          Lock<SpinLock> lock(mutex);
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }

    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
      Lock<SpinLock> lock(mutex);
      const size_t sizeof_Header = offsetof(Block,data[0]);
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }
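    /* addBlock() lets caller-owned memory be reused as a SHARED block. A
     * hedged sketch (buffer and size are placeholders, not from this file):
     *
     *   std::vector<char> scratch(1 << 20);
     *   allocator.addBlock(scratch.data(), scratch.size()); // ignored if < 4096 bytes remain after alignment
     *
     * The memory has to outlive its use by the allocator: clear_block() does
     * not free SHARED blocks, and reset() drops them from the free list. */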
    /* special allocation, used only by the Morton builder, a single time for each build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages);
        if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };

    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }

    size_t getUsedBytes() {
      return bytesUsed;
    }

    size_t getWastedBytes() {
      return bytesWasted;
    }

    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)
        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,OS_MALLOC,false),
          stat_2M(alloc,OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)
        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << " "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    void print_blocks()
    {
      std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;

      std::cout << " used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << " free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }

  private:

    struct Block
    {
      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this could
         * cause a risk of fragmenting the virtual address space and
         * reach the limit of vm.max_map_count = 65k under Linux. */
        if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;
        /* consume full 4k pages when using os_malloc */
        if (atype == OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else prev_next = &block->next;
        }
        return head;
      }

      void clear_list(MemoryMonitorInterface* device)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device);
          block = next;
        }
      }

      void clear_block (MemoryMonitorInterface* device)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          alignedFree(this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }
        else if (atype == OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }
        else /* if (atype == SHARED) */ {
        }
      }

      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i);

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
        }
        return &data[i];
      }

      void* ptr() {
        return &data[cur];
      }

      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd);
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if      (atype_i == ANY_TYPE ) return true;
        else if (atype == OS_MALLOC)   return atype_i == atype && huge_pages_i == huge_pages;
        else                           return atype_i == atype;
      }

      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      void print_block() const
      {
        if (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == OS_MALLOC) std::cout << "O";
        else if (atype == SHARED) std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };

  private:
    Device* device;
    SpinLock mutex;
    size_t slotMask;
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;
    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];

    bool use_single_mode;
    size_t defaultBlockSize;
    size_t estimatedSize;
    size_t growSize;
    size_t maxGrowSize;
    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;

    static __thread ThreadLocal2* thread_local_allocator2;
    static SpinLock s_thread_local_allocators_lock;
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;

    SpinLock thread_local_allocators_lock;
    std::vector<ThreadLocal2*> thread_local_allocators;
    AllocationType atype;
    mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
  };
}