/**************************************************************************/
/* multi_uma_buffer.h */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/

#pragma once

#include "servers/rendering/rendering_server.h"

class MultiUmaBufferBase {
protected:
	LocalVector<RID> buffers;
	uint32_t curr_idx = UINT32_MAX;
	uint64_t last_frame_mapped = UINT64_MAX;

	const uint32_t max_extra_buffers;

#ifdef DEBUG_ENABLED
	const char *debug_name;
#endif

	MultiUmaBufferBase(uint32_t p_max_extra_buffers, const char *p_debug_name) :
			max_extra_buffers(p_max_extra_buffers)
#ifdef DEBUG_ENABLED
			,
			debug_name(p_debug_name)
#endif
	{
	}

#ifdef DEV_ENABLED
	~MultiUmaBufferBase() {
		DEV_ASSERT(buffers.is_empty() && "Forgot to call uninit()!");
	}
#endif

public:
	void uninit() {
		if (is_print_verbose_enabled()) {
			print_line("MultiUmaBuffer '"
#ifdef DEBUG_ENABLED
					+ String(debug_name) +
#else
					"{DEBUG_ENABLED unavailable}"
#endif
					"' used a total of " + itos(buffers.size()) +
					" buffers. A large number may indicate a waste of VRAM and can be brought down by tweaking MAX_EXTRA_BUFFERS for this buffer.");
		}

		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
		for (RID buffer : buffers) {
			if (buffer.is_valid()) {
				rd->free_rid(buffer);
			}
		}
		buffers.clear();
	}

	void shrink_to_max_extra_buffers() {
		DEV_ASSERT(curr_idx == 0u && "This function can only be called after a reset and before the buffer is advanced (prepare_for_upload) again!");
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
		uint32_t elem_count = buffers.size();

		if (elem_count > max_extra_buffers) {
			if (is_print_verbose_enabled()) {
				print_line("MultiUmaBuffer '"
#ifdef DEBUG_ENABLED
						+ String(debug_name) +
#else
						"{DEBUG_ENABLED unavailable}"
#endif
						"' peaked at " + itos(elem_count) + " elements and is being shrunk to " + itos(max_extra_buffers) +
						". If you see this message often, then something is wrong with rendering or MAX_EXTRA_BUFFERS needs to be increased.");
			}
		}

		while (elem_count > max_extra_buffers) {
			--elem_count;
			if (buffers[elem_count].is_valid()) {
				rd->free_rid(buffers[elem_count]);
			}
			buffers.remove_at(elem_count);
		}
	}
};

enum class MultiUmaBufferType : uint8_t {
	UNIFORM,
	STORAGE,
	VERTEX,
};

/// Interface for making it easier to work with UMA.
///
/// # What is UMA?
///
/// It stands for Unified Memory Architecture. There are two kinds of UMA:
/// 1. HW UMA. This is the case of iGPUs (especially Android, iOS, Apple ARM-based macOS, PS4 & PS5).
///    The CPU and GPU share the same die and the same memory, so regular RAM and VRAM are internally
///    the same thing. There may be some differences between them in practice due to cache
///    synchronization behaviors, or the regular RAM bandwidth may be purposely throttled (as is the
///    case of PS4 & PS5).
/// 2. "Pretended UMA". On PC, Desktop GPUs with ReBAR enabled can pretend VRAM behaves like normal
///    RAM, while internally the data is moved across the PCIe bus. This can cause differences
///    in execution time of the routines that write to GPU buffers, as the region is often uncached
///    (i.e. write-combined) and PCIe latency and bandwidth are vastly different from regular RAM.
///    Without ReBAR, the amount of UMA memory is limited to 256 MB (shared by the entire system).
///
/// Since this type of memory is often uncached, it is not well-suited for downloading GPU -> CPU,
/// but rather for uploading CPU -> GPU.
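///
/// A minimal illustrative sketch (not from the original docs; `GpuParams` and `fill_params` are
/// hypothetical): build the data in regular cached RAM first, then do a single sequential copy into
/// the mapped region obtained from this interface (see map_raw_for_upload() below), and never read
/// that region back on the CPU:
///
///     GpuParams params;
///     fill_params(&params); // Prepare everything in cached system RAM.
///     uma_buffer.prepare_for_upload();
///     void *dst = uma_buffer.map_raw_for_upload(0u);
///     memcpy(dst, &params, sizeof(params)); // One forward, sequential write.
///     // Never read from `dst`: reads from uncached / write-combined memory are extremely slow.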
///
/// # When to use UMA buffers?
///
/// UMA buffers have various caveats and improper usage might lead to visual glitches. Therefore they
/// should be used sparingly, where they make a difference. Check that all of the following hold:
/// 1. Data is uploaded from CPU to GPU every (or almost every) frame.
/// 2. Data is always uploaded from scratch. Partial uploads are unsupported.
/// 3. If uploading multiple times per frame (e.g. for multiple passes), the number of uploads
///    per frame is relatively stable (occasional spikes are fine if using MAX_EXTRA_BUFFERS).
///
/// # Why the caveats?
///
/// This is due to our inability to detect race conditions. If you write to a UMA buffer, submit
/// GPU commands and then write more data to it, we can't guarantee that you won't be writing to a
/// region the GPU is currently reading from. Tools like the validation layers cannot detect this
/// race condition at all, making it very hard to troubleshoot.
///
/// Therefore the safest approach is to use an interface that forces users to upload everything at once.
/// There is one exception for performance: map_raw_for_upload() will return a pointer, and it is your
/// responsibility to make sure you don't use that pointer again after submitting.
/// USE THIS API CALL SPARINGLY AND WITH CARE.
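///
/// For instance, the following sequence (an illustrative sketch, not code taken from the engine) is
/// exactly the undetectable race described above:
///
///     uma_buffer.prepare_for_upload();
///     float *dst = (float *)uma_buffer.map_raw_for_upload(0u);
///     dst[0] = 1.0f;
///     // ... GPU work that reads this buffer is submitted here ...
///     dst[1] = 2.0f; // Race: the GPU may be reading the region we are still writing to,
///                    // and no validation layer will report it.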
///
/// Since we forbid uploading more data to a buffer after it has already been uploaded to, this
/// interface will create more buffers. This means users will need more UniformSets (i.e. uniform_set_create).
///
/// # How to use
///
/// Example code 01:
///
///     MultiUmaBuffer<1> uma_buffer = MultiUmaBuffer<1>("Debug name displayed if run with --verbose");
///     uma_buffer.set_uniform_size(0, max_size_bytes);
///
///     for (uint32_t i = 0u; i < num_passes; ++i) {
///         uma_buffer.prepare_for_upload(); // Creates a new buffer (if none exists already)
///                                          // of max_size_bytes. Must be called.
///         uma_buffer.upload(0, src_data, size_bytes);
///
///         if (!uniform_set[i]) {
///             RD::Uniform u;
///             u.binding = 1;
///             u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC;
///             u.append_id(uma_buffer._get(0u));
///             uniform_set[i] = rd->uniform_set_create( ... );
///         }
///     }
///
///     // On shutdown (or if you need to call set_size again).
///     uma_buffer.uninit();
///
/// Example code 02:
///
///     uma_buffer.prepare_for_upload();
///     RID rid = uma_buffer.get_for_upload(0u);
///     rd->buffer_update(rid, 0, sizeof(BakeParameters), &bake_parameters);
///     RD::Uniform u; // Skipping full initialization of u. See Example 01.
///     u.append_id(rid);
///
/// Example code 03:
///
///     uma_buffer.prepare_for_upload();
///     void *dst_data = uma_buffer.map_raw_for_upload(0u);
///     memcpy(dst_data, src_data, size_bytes);
///     rd->buffer_flush(uma_buffer._get(0u));
///     RD::Uniform u; // Skipping full initialization of u. See Example 01.
///     u.append_id(uma_buffer._get(0u));
///
/// # Tricks
///
/// Godot's shadow mapping code calls uma_buffer.uniform_buffers._get(-p_pass_offset) (i.e. a negative
/// value) because, for various reasons, that code was written like this:
///
///     for (uint32_t i = 0u; i < num_passes; ++i) {
///         uma_buffer.prepare_for_upload();
///         uma_buffer.upload(0, src_data, size_bytes);
///     }
///
///     for (uint32_t i = 0u; i < num_passes; ++i) {
///         RD::Uniform u;
///         u.binding = 1;
///         u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC;
///         u.append_id(uma_buffer._get(-(num_passes - 1u - i)));
///         uniform_set[i] = rd->uniform_set_create( ... );
///     }
///
/// Every time prepare_for_upload() is called, uma_buffer._get(-idx) will return a different RID(*).
/// Thus with a negative value we can address previously prepared buffers. This is fine as long as
/// idx doesn't exceed the number of times the user called prepare_for_upload() this frame.
///
/// (*) This RID will be returned again on the next frame after the same number of prepare_for_upload()
/// calls, unless the number of times it was called exceeded MAX_EXTRA_BUFFERS.
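///
/// A worked illustration (numbers assumed here, with NUM_BUFFERS == 1 and three passes in one frame):
///
///     uma_buffer.prepare_for_upload(); // curr_idx == 0 -> buffers[0]
///     uma_buffer.prepare_for_upload(); // curr_idx == 1 -> buffers[1]
///     uma_buffer.prepare_for_upload(); // curr_idx == 2 -> buffers[2]
///
///     uma_buffer._get(0u); // buffers[2], the buffer prepared by the last call.
///     uma_buffer._get(-1); // Unsigned wrap-around: buffers[2 - 1], i.e. the second call's buffer.
///     uma_buffer._get(-2); // buffers[0], the first call's buffer.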
///
/// # Template parameters
///
/// ## NUM_BUFFERS
///
/// How many buffers we should track. E.g. instead of doing this:
///
///     MultiUmaBuffer<1> omni_lights = /*...*/;
///     MultiUmaBuffer<1> spot_lights = /*...*/;
///     MultiUmaBuffer<1> directional_lights = /*...*/;
///
///     omni_lights.set_uniform_size(0u, omni_size);
///     spot_lights.set_uniform_size(0u, spot_size);
///     directional_lights.set_uniform_size(0u, dir_size);
///
///     omni_lights.prepare_for_upload();
///     spot_lights.prepare_for_upload();
///     directional_lights.prepare_for_upload();
///
/// You can do this:
///
///     MultiUmaBuffer<3> lights = /*...*/;
///
///     lights.set_uniform_size(0u, omni_size);
///     lights.set_uniform_size(1u, spot_size);
///     lights.set_uniform_size(2u, dir_size);
///
///     lights.prepare_for_upload();
///
/// This approach works as long as all the buffers are prepared for upload at the same time.
/// It saves some overhead.
///
/// ## MAX_EXTRA_BUFFERS
///
/// Upper limit on the number of buffers per frame.
///
/// There are times when rendering might spike for exceptional reasons, calling prepare_for_upload()
/// far more often than usual, never to do so again. Without a limit, this causes an increase in
/// memory usage that is never reclaimed until shutdown.
///
/// MAX_EXTRA_BUFFERS can be used to handle such spikes, by deallocating the extra buffers.
/// Example:
///
///     MultiUmaBuffer<1, 6> buffer("Debug name");
///
///     // Normal frame (assuming up to 6 passes is considered normal):
///     for (uint32_t i = 0u; i < 6u; ++i) {
///         buffer.prepare_for_upload();
///         ...
///         buffer.upload(...);
///     }
///
///     // Exceptional frame:
///     for (uint32_t i = 0u; i < 24u; ++i) {
///         buffer.prepare_for_upload();
///         ...
///         buffer.upload(...);
///     }
///
/// After the frame is done, those extra 18 buffers will be deleted.
/// Launching Godot with --verbose will print diagnostic information.
template <uint32_t NUM_BUFFERS, uint32_t MAX_EXTRA_BUFFERS = UINT32_MAX>
class MultiUmaBuffer : public MultiUmaBufferBase {
	struct BufferInfo {
		uint32_t size_bytes = 0;
		MultiUmaBufferType type = MultiUmaBufferType::UNIFORM;
	};

	BufferInfo buffer_info[NUM_BUFFERS];

#ifdef DEV_ENABLED
	bool can_upload[NUM_BUFFERS] = {};
#endif

	void push() {
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
		for (uint32_t i = 0u; i < NUM_BUFFERS; ++i) {
			const BufferInfo &info = buffer_info[i];
			RID buffer;
			switch (info.type) {
				case MultiUmaBufferType::STORAGE:
					buffer = rd->storage_buffer_create(info.size_bytes, Vector<uint8_t>(), BitField<RenderingDevice::StorageBufferUsage>(), RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);
					break;
				case MultiUmaBufferType::VERTEX:
					buffer = rd->vertex_buffer_create(info.size_bytes, Vector<uint8_t>(), RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);
					break;
				case MultiUmaBufferType::UNIFORM:
				default:
					buffer = rd->uniform_buffer_create(info.size_bytes, Vector<uint8_t>(), RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);
					break;
			}
			buffers.push_back(buffer);
		}
	}

public:
	MultiUmaBuffer(const char *p_debug_name) :
			MultiUmaBufferBase(MAX_EXTRA_BUFFERS, p_debug_name) {}

	uint32_t get_curr_idx() const { return curr_idx; }

	void set_size(uint32_t p_idx, uint32_t p_size_bytes, MultiUmaBufferType p_type) {
		DEV_ASSERT(buffers.is_empty());
		buffer_info[p_idx].size_bytes = p_size_bytes;
		buffer_info[p_idx].type = p_type;
		curr_idx = UINT32_MAX;
		last_frame_mapped = UINT64_MAX;
	}

	void set_size(uint32_t p_idx, uint32_t p_size_bytes, bool p_is_storage) {
		set_size(p_idx, p_size_bytes, p_is_storage ? MultiUmaBufferType::STORAGE : MultiUmaBufferType::UNIFORM);
	}

	void set_uniform_size(uint32_t p_idx, uint32_t p_size_bytes) {
		set_size(p_idx, p_size_bytes, MultiUmaBufferType::UNIFORM);
	}

	void set_storage_size(uint32_t p_idx, uint32_t p_size_bytes) {
		set_size(p_idx, p_size_bytes, MultiUmaBufferType::STORAGE);
	}

	void set_vertex_size(uint32_t p_idx, uint32_t p_size_bytes) {
		set_size(p_idx, p_size_bytes, MultiUmaBufferType::VERTEX);
	}

	uint32_t get_size(uint32_t p_idx) const { return buffer_info[p_idx].size_bytes; }

	// Gets the raw buffer. Use with care.
	// If you call this function, make sure to have called prepare_for_upload() first.
	// Do not call _get() then prepare_for_upload().
	RID _get(uint32_t p_idx) {
		return buffers[curr_idx * NUM_BUFFERS + p_idx];
	}

	/**
	 * @param p_append True if you wish to append more data to the existing buffer.
	 * @return False if it was possible to append to the current buffer. True if the internal buffer
	 * changed and its contents must be written from scratch.
	 */
	bool prepare_for_map(bool p_append) {
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
		const uint64_t frames_drawn = rd->get_frames_drawn();
		if (last_frame_mapped == frames_drawn) {
			if (!p_append) {
				++curr_idx;
			}
		} else {
			p_append = false;
			curr_idx = 0u;
			if (max_extra_buffers != UINT32_MAX) {
				shrink_to_max_extra_buffers();
			}
		}
		last_frame_mapped = frames_drawn;

		if (curr_idx * NUM_BUFFERS >= buffers.size()) {
			push();
		}

#ifdef DEV_ENABLED
		if (!p_append) {
			for (size_t i = 0u; i < NUM_BUFFERS; ++i) {
				can_upload[i] = true;
			}
		}
#endif

		return !p_append;
	}
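
	// Illustrative sketch of the append path of prepare_for_map() above (assumptions only, not from
	// the original docs; `mapped_ptr`, `bytes_written`, `extra_data` and `extra_size` are hypothetical
	// caller state). The caller accumulates data across several calls in one frame and re-maps only
	// when the underlying buffer changes:
	//
	//     if (uma_buffer.prepare_for_map(true)) {
	//         // The internal buffer changed: re-map and start writing from the beginning again
	//         // (previously appended data must be re-written too).
	//         mapped_ptr = (uint8_t *)uma_buffer.map_raw_for_upload(0u);
	//         bytes_written = 0u;
	//     }
	//     memcpy(mapped_ptr + bytes_written, extra_data, extra_size);
	//     bytes_written += extra_size;
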
	void prepare_for_upload() {
		prepare_for_map(false);
	}

	void *map_raw_for_upload(uint32_t p_idx) {
#ifdef DEV_ENABLED
		DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice.");
		can_upload[p_idx] = false;
#endif
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
		return rd->buffer_persistent_map_advance(buffers[curr_idx * NUM_BUFFERS + p_idx]);
	}

	RID get_for_upload(uint32_t p_idx) {
#ifdef DEV_ENABLED
		DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice.");
		can_upload[p_idx] = false;
#endif
		return buffers[curr_idx * NUM_BUFFERS + p_idx];
	}

	void upload(uint32_t p_idx, const void *p_src_data, uint32_t p_size_bytes) {
#ifdef DEV_ENABLED
		DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice.");
		can_upload[p_idx] = false;
#endif
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
		rd->buffer_update(buffers[curr_idx * NUM_BUFFERS + p_idx], 0, p_size_bytes, p_src_data, true);
	}
};