// ======================================================================== //
// Copyright 2009-2017 Intel Corporation                                    //
//                                                                          //
// Licensed under the Apache License, Version 2.0 (the "License");          //
// you may not use this file except in compliance with the License.         //
// You may obtain a copy of the License at                                  //
//                                                                          //
//     http://www.apache.org/licenses/LICENSE-2.0                           //
//                                                                          //
// Unless required by applicable law or agreed to in writing, software      //
// distributed under the License is distributed on an "AS IS" BASIS,        //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and      //
// limitations under the License.                                           //
// ======================================================================== //

#pragma once

#include "default.h"
#include "device.h"
namespace embree
{
  class FastAllocator
  {
    /*! maximal supported alignment */
    static const size_t maxAlignment = 64;

    /* default settings */
    //static const size_t defaultBlockSize = 4096;

    /*! maximal allocation size */
#define maxAllocationSize size_t(4*1024*1024-maxAlignment)

    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED };
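
    /* The allocation modes above are used as follows (see Block::create and addBlock):
       ALIGNED_MALLOC - blocks come from alignedMalloc on the heap,
       OS_MALLOC      - blocks are reserved/committed through os_reserve/os_commit,
       SHARED         - blocks wrap user supplied memory handed in via addBlock
                        and are never freed by the allocator itself. */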
    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal() {}

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (FastAllocator* alloc)
        : alloc(alloc), ptr(nullptr), cur(0), end(0), allocBlockSize(alloc->defaultBlockSize), bytesUsed(0), bytesWasted(0) {}

      /*! resets the allocator */
      __forceinline void reset()
      {
        ptr = nullptr;
        cur = end = 0;
        bytesWasted = bytesUsed = 0;
      }

      /* Allocate aligned memory from the threads memory block. */
      __forceinline void* operator() (size_t bytes, size_t align = 16) {
        return malloc(bytes,align);
      }

      /* Allocate aligned memory from the threads memory block. */
      __forceinline void* malloc(size_t bytes, size_t align = 16)
      {
        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
        size_t ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }

      /*! returns amount of used bytes */
      size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of wasted bytes */
      size_t getWastedBytes() const { return bytesWasted + (end-cur); }

    public:
      FastAllocator* alloc;  //!< parent allocator
      char* ptr;             //!< pointer to memory block
      size_t cur;            //!< current location of the allocator
      size_t end;            //!< end of the memory block
      size_t allocBlockSize; //!< block size for allocations

    private:
      size_t bytesUsed;      //!< number of total bytes allocated
      size_t bytesWasted;    //!< number of bytes wasted
    };
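
    /* Worked example of the bump allocation above (illustrative, not part of the
       original header): with cur=40 and align=16 the padding is
       ofs = (16-40) & 15 = 8, so the returned pointer is ptr+48 and 8 bytes are
       counted as wasted; if cur is already a multiple of align the padding is 0. */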
    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_STRUCT;

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal2 (FastAllocator* alloc)
      {
        allocators[0] = ThreadLocal(alloc); alloc0 = &allocators[0];
        allocators[1] = ThreadLocal(alloc); alloc1 = &allocators[1];
        if (alloc->use_single_mode) alloc1 = &allocators[0];
      }

      /*! resets the allocator */
      __forceinline void reset() {
        allocators[0].reset();
        allocators[1].reset();
      }

      /*! returns amount of used bytes */
      size_t getUsedBytes() const { return allocators[0].getUsedBytes() + allocators[1].getUsedBytes(); }

      /*! returns amount of wasted bytes */
      size_t getWastedBytes() const { return allocators[0].getWastedBytes() + allocators[1].getWastedBytes(); }

    public:
      ThreadLocal* alloc0;
      ThreadLocal* alloc1;

    private:
      ThreadLocal allocators[2];
    };
    /*! Builder interface to create thread local allocator */
    struct CreateAlloc2
    {
    public:
      __forceinline CreateAlloc2 (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline ThreadLocal2* operator() () const { return allocator->threadLocal2(); }
    private:
      FastAllocator* allocator;
    };
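
    /* Illustrative usage sketch (assumed caller code, not part of this header;
       'device', 'Node', 'Prim' and 'numPrimitives' are placeholders): a builder
       obtains one ThreadLocal2 per worker thread and bump-allocates from it,
       e.g. nodes from alloc0 and primitives from alloc1, so both kinds of data
       end up in separate blocks.

         FastAllocator allocator(device,false);
         allocator.init_estimate(numPrimitives*sizeof(Prim));
         FastAllocator::CreateAlloc2 createAlloc(&allocator);
         FastAllocator::ThreadLocal2* tl = createAlloc();   // per worker thread
         Node* node = (Node*) tl->alloc0->malloc(sizeof(Node),16);
         Prim* prim = (Prim*) tl->alloc1->malloc(sizeof(Prim),16);
    */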
    FastAllocator (Device* device, bool osAllocation)
      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE),
        growSize(PAGE_SIZE), log2_grow_size_scale(0), bytesUsed(0), bytesWasted(0), thread_local_allocators2(this), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC)
    {
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        assert(!slotMutex[i].isLocked());
      }
    }

    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* threadLocal() {
      return thread_local_allocators2.get()->alloc0;
    }

    /*! returns both fast thread local allocators */
    __forceinline ThreadLocal2* threadLocal2() {
      return thread_local_allocators2.get();
    }

    /*! initializes the grow size */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesAllocate, size_t bytesReserve = 0)
    {
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      bytesReserve = ((bytesReserve+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); // always consume full pages
      growSize = clamp(bytesReserve,size_t(PAGE_SIZE),maxAllocationSize); // PAGE_SIZE - maxAlignment ?

      log2_grow_size_scale = 0;
      slotMask = 0x0;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesAllocate >  4*maxAllocationSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesAllocate >  8*maxAllocationSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesAllocate > 16*maxAllocationSize) slotMask = 0x7;
    }
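
    /* The slot mask selects how many of the MAX_THREAD_USED_BLOCK_SLOTS per-thread
       block slots are active: builds larger than 4/8/16 times maxAllocationSize
       use 2/4/8 slots. FastAllocator::malloc picks a slot via
       threadIndex & slotMask, which spreads block creation over several spinlocks
       instead of funneling every thread through one global lock. */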
    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve = 0)
    {
      internal_fix_used_blocks();

      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1;

      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }

      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
      defaultBlockSize = clamp(bytesAllocate/4,size_t(128),size_t(PAGE_SIZE+maxAlignment));
      initGrowSizeAndNumSlots(bytesAllocate,bytesReserve);
    }

    /*! initializes the allocator */
    void init_estimate(size_t bytesAllocate, const bool single_mode = false)
    {
      internal_fix_used_blocks();

      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }

      /* single allocator mode ? */
      use_single_mode = single_mode;
      defaultBlockSize = clamp(bytesAllocate/4,size_t(128),size_t(PAGE_SIZE+maxAlignment));

      /* if in memory conservative single_mode, reduce bytesAllocate/growSize by 2 */
      initGrowSizeAndNumSlots(single_mode == false ? bytesAllocate : bytesAllocate/2);
    }
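
    /* Note on the two entry points above: init() pre-allocates one block of
       bytesAllocate bytes (reserving bytesReserve when OS allocation is used) and
       is typically used when the memory consumption of a build is known up front,
       while init_estimate() only sizes defaultBlockSize/growSize from an estimate
       and lets blocks be created lazily on first allocation. */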
    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      for (size_t t=0; t<thread_local_allocators2.threads.size(); t++) {
        bytesUsed   += thread_local_allocators2.threads[t]->getUsedBytes();
        bytesWasted += thread_local_allocators2.threads[t]->getWastedBytes();
      }

      thread_local_allocators2.clear();
    }

    /*! shrinks all memory blocks to the actually used size */
    void shrink ()
    {
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
        if (threadUsedBlocks[i].load() != nullptr) threadUsedBlocks[i].load()->shrink_list(device);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->shrink_list(device);
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device);
      freeBlocks = nullptr;
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed = 0;
      bytesWasted = 0;

      /* first reset all used blocks */
      if (usedBlocks.load() != nullptr) usedBlocks.load()->reset_list();

      /* move all used blocks to begin of free block list */
      while (usedBlocks.load() != nullptr) {
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* reset all thread local allocators */
      thread_local_allocators2.apply([] (ThreadLocal2* alloc) { alloc->reset(); });
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed = 0;
      bytesWasted = 0;
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device);
      usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device);
      freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
    }

    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }
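
    /* Each time a new block has to be created because no free block is available,
       the factor returned above doubles (2, 4, 8, ..., capped at 2^16), so growSize
       ramps up geometrically for large builds instead of growing one small block
       at a time. */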
    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadIndex = TaskScheduler::threadIndex();
        size_t slot = threadIndex & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_UNKNOWN_ERROR,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
          Lock<SpinLock> lock(slotMutex[slot]);
          if (myUsedBlocks == threadUsedBlocks[slot]) {
            const size_t allocSize = min(max(growSize,bytes),size_t(maxAllocationSize));
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype);
          }
          continue;
        }

        /* if this fails allocate new block */
        {
          Lock<SpinLock> lock(mutex);
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              //growSize = min(2*growSize,size_t(maxAllocationSize+maxAlignment));
              const size_t allocSize = min(growSize*incGrowSizeScale(),size_t(maxAllocationSize+maxAlignment))-maxAlignment;
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype);
            }
          }
        }
      }
    }
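
    /* Allocation strategy of malloc() above: (1) bump-allocate from the block
       currently assigned to this thread's slot; (2) if that fails and no free
       blocks exist, create a fresh block under the per-slot spinlock so threads
       on different slots do not contend; (3) otherwise take the global mutex and
       either recycle a block from the free list or create a new, geometrically
       growing block, then retry. */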
    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
      Lock<SpinLock> lock(mutex);
      const size_t sizeof_Header = offsetof(Block,data[0]);
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }
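
    /* Illustrative only (assumed caller code): handing a user owned buffer to the
       allocator. The buffer start is rounded up to maxAlignment, a SHARED Block
       header is placed at the aligned address, and the remainder is prepended to
       the free list; SHARED memory is never freed by the allocator itself.

         void* buffer = alignedMalloc(1024*1024,64);   // owned by the caller
         allocator.addBlock(buffer,1024*1024);
    */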
    /* special allocation, only used by the Morton builder a single time per build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    size_t getAllocatedBytes() const
    {
      size_t bytesAllocated = 0;
      if (freeBlocks.load()) bytesAllocated += freeBlocks.load()->getAllocatedBytes();
      if (usedBlocks.load()) bytesAllocated += usedBlocks.load()->getAllocatedBytes();
      return bytesAllocated;
    }

    size_t getReservedBytes() const
    {
      size_t bytesReserved = 0;
      if (freeBlocks.load()) bytesReserved += freeBlocks.load()->getReservedBytes();
      if (usedBlocks.load()) bytesReserved += usedBlocks.load()->getReservedBytes();
      return bytesReserved;
    }

    size_t getUsedBytes() const
    {
      size_t bytes = bytesUsed;
      for (size_t t=0; t<thread_local_allocators2.threads.size(); t++)
        bytes += thread_local_allocators2.threads[t]->getUsedBytes();
      return bytes;
    }

    size_t getFreeBytes() const
    {
      size_t bytesFree = 0;
      if (freeBlocks.load()) bytesFree += freeBlocks.load()->getAllocatedBytes();
      return bytesFree;
    }

    size_t getWastedBytes() const
    {
      size_t bytes = bytesWasted;
      if (usedBlocks.load()) bytes += usedBlocks.load()->getFreeBytes();
      for (size_t t=0; t<thread_local_allocators2.threads.size(); t++)
        bytes += thread_local_allocators2.threads[t]->getWastedBytes();
      return bytes;
    }

    void print_statistics(bool verbose = false)
    {
      size_t bytesFree = getFreeBytes();
      size_t bytesAllocated = getAllocatedBytes();
      size_t bytesReserved = getReservedBytes();
      size_t bytesUsed = getUsedBytes();
      size_t bytesWasted = getWastedBytes();
      printf(" allocated = %3.3fMB, reserved = %3.3fMB, used = %3.3fMB (%3.2f%%), wasted = %3.3fMB (%3.2f%%), free = %3.3fMB (%3.2f%%)\n",
             1E-6f*bytesAllocated, 1E-6f*bytesReserved,
             1E-6f*bytesUsed, 100.0f*bytesUsed/bytesAllocated,
             1E-6f*bytesWasted, 100.0f*bytesWasted/bytesAllocated,
             1E-6f*bytesFree, 100.0f*bytesFree/bytesAllocated);
      if (verbose)
      {
        std::cout << " slotMask = " << slotMask << std::endl;
        std::cout << " use_single_mode = " << use_single_mode << std::endl;
        std::cout << " defaultBlockSize = " << defaultBlockSize << std::endl;
        std::cout << " used blocks = ";
        if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
        std::cout << "[END]" << std::endl;
        std::cout << " free blocks = ";
        if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
        std::cout << "[END]" << std::endl;
      }
    }
  private:

    struct Block
    {
      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = ((sizeof_Header+bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); // always consume full pages
        bytesReserve  = ((sizeof_Header+bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); // always consume full pages

        /* either use alignedMalloc or os_reserve/os_commit */
        void* ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          if (bytesReserve == (2*PAGE_SIZE_2M))
          {
            /* full 2M alignment for very first block */
            const size_t alignment = (next == nullptr) ? PAGE_SIZE_2M : PAGE_SIZE;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);

            const size_t ptr_aligned_begin = ((size_t)ptr) & ~(PAGE_SIZE_2M-1);
            /* first os_advise could fail as speculative */
            os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M);
            /* second os_advise should succeed as with 4M block */
            os_advise((void*)(ptr_aligned_begin + PAGE_SIZE_2M),PAGE_SIZE_2M);

            return new (ptr) Block(atype,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = CACHELINE_SIZE;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);
            return new (ptr) Block(atype,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          ptr = os_reserve(bytesReserve);
          os_commit(ptr,bytesAllocate);
          return new (ptr) Block(atype,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0);
        }
        else
          assert(false);

        return nullptr;
      }
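
      /* The special case above for bytesReserve == 2*PAGE_SIZE_2M aligns the very
         first heap block to a 2MB boundary and calls os_advise on both 2MB halves,
         presumably so the operating system can back the block with huge pages;
         blocks further down the list only get page or cache line alignment. */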
      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
        //for (size_t i=0; i<allocEnd; i+=defaultBlockSize) data[i] = 0;
      }

      void clear_list(MemoryMonitorInterface* device)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device);
          block = next;
        }
      }

      void clear_block (MemoryMonitorInterface* device)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          alignedFree(this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }
        else if (atype == OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }
        else /* if (atype == SHARED) */ {
        }
      }

      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i);

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
          if (atype == OS_MALLOC)
            os_commit(&data[i],bytes); // FIXME: optimize, may get called frequently
        }
        return &data[i];
      }
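
      /* Note on the 'partial' flag above: with partial=false the block either
         satisfies the full request or returns nullptr, while with partial=true a
         nearly full block may hand out whatever tail it has left, writing the
         reduced size back through bytes_in; ThreadLocal::malloc uses this to
         finish off the current block before requesting a full new one. */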
      void* ptr() {
        return &data[cur];
      }

      void reset_list ()
      {
        for (Block* block = this; block; block = block->next)
          block->reset_block();
      }

      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      void shrink_list (MemoryMonitorInterface* device)
      {
        for (Block* block = this; block; block = block->next)
          block->shrink_block(device);
      }

      void shrink_block (MemoryMonitorInterface* device)
      {
        if (atype == OS_MALLOC)
        {
          const size_t sizeof_Header = offsetof(Block,data[0]);
          size_t newSize = os_shrink(this,sizeof_Header+getBlockUsedBytes(),reserveEnd+sizeof_Header);
          if (device) device->memoryMonitor(newSize-sizeof_Header-allocEnd,true);
          reserveEnd = allocEnd = newSize-sizeof_Header;
        }
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd);
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      size_t getBlockFreeBytes() const {
        return max(allocEnd,size_t(cur))-cur;
      }

      size_t getUsedBytes() const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next)
          bytes += block->getBlockUsedBytes();
        return bytes;
      }

      size_t getAllocatedBytes() const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next)
          bytes += block->getBlockAllocatedBytes();
        return bytes;
      }

      size_t getReservedBytes() const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next)
          bytes += block->getBlockReservedBytes();
        return bytes;
      }

      size_t getFreeBytes() const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next)
          bytes += block->getBlockFreeBytes();
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      void print_block() const
      {
        if      (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == OS_MALLOC)      std::cout << "O";
        else if (atype == SHARED)         std::cout << "S";
        std::cout << "[" << getBlockUsedBytes() << ", " << getBlockAllocatedBytes() << ", " << getBlockReservedBytes() << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };
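
    /* Layout note for the Block struct above: assuming 8-byte size_t/pointers and
       a 4-byte enum, the header fields occupy 44 bytes and the align[] array pads
       them up to maxAlignment, so data[0], the first byte handed out by
       Block::malloc, starts on a 64-byte boundary; offsetof(Block,data[0]) is used
       throughout as the header size. */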
  private:
    Device* device;
    SpinLock mutex;
    size_t slotMask;
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;
    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
    bool use_single_mode;
    size_t defaultBlockSize;
    size_t growSize;
    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size
    size_t bytesUsed;                         //!< number of total bytes used
    size_t bytesWasted;                       //!< number of total wasted bytes
    ThreadLocalData<ThreadLocal2,FastAllocator*> thread_local_allocators2; //!< thread local allocators
    AllocationType atype;
  };
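
  /* Illustrative lifecycle sketch (assumed caller code; 'device' and 'sceneBytes'
     are placeholders): an allocator is initialized once per build, filled by the
     worker threads through their ThreadLocal2 allocators, and then trimmed.

       FastAllocator allocator(device,true);      // true: reserve/commit via the OS
       allocator.init(sceneBytes,2*sceneBytes);   // pre-allocate, reserve extra
       // ... parallel build using allocator.threadLocal2() per thread ...
       allocator.cleanup();                       // fold thread local statistics
       allocator.shrink();                        // return unused reserved pages
       allocator.print_statistics();
  */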
}