
vulkan: Use persistently-mapped ring buffer for uniform data

Put it in the 256 MiB of device-local, host-visible memory that AMD and NVIDIA GPUs have (and Intel of course reports all CPU memory as device-local, so it works there too).

Also make some improvements to frame buffering / frame data rotation.
rdb committed 4 years ago
commit 46411b1639
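The heart of the change is a circular (ring) allocator over the uniform buffer: each allocation bumps the head, and finished frames release the tail. Below is a minimal sketch of such an allocator; the interface (alloc, get_head, set_tail, get_size, get_capacity) is inferred from the call sites in the diffs further down, not copied from Panda3D's actual CircularAllocator.

```cpp
#include <cstddef>

// Minimal sketch of a ring allocator in the spirit of the
// CircularAllocator this commit introduces (illustrative only).
class RingAllocator {
public:
  RingAllocator(size_t capacity = 0, size_t alignment = 1) :
    _capacity(capacity), _alignment(alignment) {}

  // Reserves `size` bytes and returns their offset, or -1 if the ring
  // is full.  Allocations are contiguous, so a request that does not
  // fit before the end of the buffer wraps around to offset 0.
  ptrdiff_t alloc(size_t size) {
    size_t start = (_head + _alignment - 1) / _alignment * _alignment;
    size_t waste = start - _head;  // alignment padding
    if (start + size > _capacity) {
      start = 0;                   // wrap to the beginning
      waste = _capacity - _head;   // the skipped tail end counts as used
    }
    if (_used + waste + size > _capacity) {
      return -1;  // would overwrite data the GPU may still be reading
    }
    _head = start + size;
    _used += waste + size;
    return (ptrdiff_t)start;
  }

  // Releases everything up to a previously recorded head watermark,
  // once the GPU is known to be done with it.  (This sketch assumes a
  // single frame never occupies the entire ring.)
  void set_tail(size_t tail) {
    _used -= (tail + _capacity - _tail) % _capacity;
    _tail = tail;
  }

  size_t get_head() const { return _head; }
  size_t get_size() const { return _used; }
  size_t get_capacity() const { return _capacity; }

private:
  size_t _capacity, _alignment;
  size_t _head = 0, _tail = 0, _used = 0;  // bytes currently in flight
};
```

Each frame records get_head() as a watermark at submission time; once that frame's fence signals, set_tail() makes the space reusable. The finish_frame() hunk further down does exactly this.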

+ 1 - 1
panda/src/vulkandisplay/config_vulkandisplay.cxx

@@ -53,7 +53,7 @@ ConfigVariableInt64 vulkan_global_uniform_buffer_size
 ("vulkan-global-uniform-buffer-size", 64 * 1024,
  PRC_DESC("This value indicates how large the uniform buffer should be that is "
           "allocated to contain all of the global uniforms values used by all "
-          "shaders in a give frame.  To optimize this value, enable "
+          "shaders in a given frame.  To optimize this value, enable "
           "\"notify-level-vulkandisplay debug\" and look for the highest value "
           "in the message \"Used at most # bytes of global uniform buffer.\" "
           "in the most complex scene, then add a generous safety margin."));

+ 27 - 0
panda/src/vulkandisplay/vulkanGraphicsPipe.cxx

@@ -390,6 +390,33 @@ VulkanGraphicsPipe() : _max_allocation_size(0) {
         vulkandisplay_cat.debug(false) << ", multi-instance";
       }
       vulkandisplay_cat.debug(false) << "\n";
+
+      for (size_t ti = 0; ti < _memory_properties.memoryTypeCount; ++ti) {
+        const VkMemoryType &type = _memory_properties.memoryTypes[ti];
+        if (type.heapIndex == i) {
+          std::ostream &out = vulkandisplay_cat.debug();
+          out << "    Type " << ti << ":";
+          if (type.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) {
+            out << " DEVICE_LOCAL";
+          }
+          if (type.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+            out << " HOST_VISIBLE";
+          }
+          if (type.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) {
+            out << " HOST_COHERENT";
+          }
+          if (type.propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) {
+            out << " HOST_CACHED";
+          }
+          if (type.propertyFlags & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT) {
+            out << " LAZILY_ALLOCATED";
+          }
+          if (type.propertyFlags & VK_MEMORY_PROPERTY_PROTECTED_BIT) {
+            out << " PROTECTED";
+          }
+          out << "\n";
+        }
+      }
     }
 
     // Enumerate supported extensions.
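The new debug output makes it easy to spot heaps that expose a type with both DEVICE_LOCAL and HOST_VISIBLE set. A hedged sketch of how such a type is typically selected (standard Vulkan API; the helper is illustrative, not Panda3D's create_buffer):

```cpp
#include <vulkan/vulkan.h>
#include <cstdint>

// Returns the index of the first memory type that is allowed by
// type_bits and has all of the required property flags, or -1.
int find_memory_type(VkPhysicalDevice gpu, uint32_t type_bits,
                     VkMemoryPropertyFlags required) {
  VkPhysicalDeviceMemoryProperties props;
  vkGetPhysicalDeviceMemoryProperties(gpu, &props);
  for (uint32_t i = 0; i < props.memoryTypeCount; ++i) {
    if ((type_bits & (1u << i)) != 0 &&
        (props.memoryTypes[i].propertyFlags & required) == required) {
      return (int)i;
    }
  }
  return -1;  // not available; fall back to plain host-visible memory
}
```

On AMD and NVIDIA a DEVICE_LOCAL | HOST_VISIBLE type usually corresponds to the 256 MiB BAR window mentioned in the commit message; the GSG change below applies exactly this two-tier policy, preferring DEVICE_LOCAL | HOST_VISIBLE | HOST_COHERENT and falling back to HOST_VISIBLE | HOST_COHERENT.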

+ 180 - 96
panda/src/vulkandisplay/vulkanGraphicsStateGuardian.cxx

@@ -152,7 +152,7 @@ VulkanGraphicsStateGuardian(GraphicsEngine *engine, VulkanGraphicsPipe *pipe,
   }
 
   // Create two command buffers per frame.
-  const uint32_t num_command_buffers = 2 * sizeof(_frame_data_pool) / sizeof(FrameData);
+  const uint32_t num_command_buffers = 2 * _frame_data_capacity;
   VkCommandBufferAllocateInfo alloc_info;
   alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
   alloc_info.pNext = nullptr;
@@ -329,16 +329,28 @@ VulkanGraphicsStateGuardian(GraphicsEngine *engine, VulkanGraphicsPipe *pipe,
   }
 
   // Create a uniform buffer that we'll use for everything.
+  // Some cards set aside 256 MiB of device-local host-visible memory for data
+  // like this, so we use that.
   VkDeviceSize uniform_buffer_size = vulkan_global_uniform_buffer_size;
   if (!create_buffer(uniform_buffer_size, _uniform_buffer, _uniform_buffer_memory,
-                     VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
-                     VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)) {
+                     VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
+                     (VkMemoryPropertyFlagBits)(VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))) {
+    // No?  Put it in GPU-accessible CPU memory, then.
+    if (!create_buffer(uniform_buffer_size, _uniform_buffer, _uniform_buffer_memory,
+                       VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
+                       (VkMemoryPropertyFlagBits)(VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))) {
+      vulkandisplay_cat.error()
+        << "Failed to create global uniform buffer.\n";
+    }
+    return;
+  }
+  _uniform_buffer_ptr = _uniform_buffer_memory.map_persistent();
+  if (_uniform_buffer_ptr == nullptr) {
     vulkandisplay_cat.error()
-      << "Failed to create uniform buffer buffer.\n";
+      << "Failed to map global uniform buffer.\n";
     return;
   }
-  _uniform_buffer_size = uniform_buffer_size;
-  _uniform_buffer_offset_alignment = limits.minUniformBufferOffsetAlignment;
+  _uniform_buffer_allocator = CircularAllocator(uniform_buffer_size, limits.minUniformBufferOffsetAlignment);
 
   // Fill in the features supported by this physical device.
   _is_hardware = (pipe->_gpu_properties.deviceType != VK_PHYSICAL_DEVICE_TYPE_CPU);
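Because the buffer is allocated HOST_COHERENT and mapped persistently, uniforms can be written with a plain memcpy: coherent memory guarantees the write is visible to the GPU by the next vkQueueSubmit, with no explicit vkFlushMappedMemoryRanges. A minimal sketch (the struct and function are hypothetical):

```cpp
#include <cstdint>
#include <cstring>

// Hypothetical uniform block, just for illustration.
struct alignas(16) CameraUniforms {
  float projection[16];
  float view[16];
};

// Writes a uniform block into the persistently-mapped ring buffer at
// an offset handed out by the allocator.  No flush needed on
// HOST_COHERENT memory.
void write_uniforms(void *mapped_base, uint32_t offset,
                    const CameraUniforms &data) {
  std::memcpy(static_cast<char *>(mapped_base) + offset, &data, sizeof(data));
}
```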
@@ -541,19 +553,19 @@ close_gsg() {
 
   // Call finish_frame() on all frames.  We don't need to wait on the fences due
   // to the above call.
-  const size_t num_frames = (sizeof(_frame_data_pool) / sizeof(FrameData));
-
-  while (_frame_data_tail != _frame_data_head) {
-    FrameData &frame_data = _frame_data_pool[_frame_data_tail];
+  if (_frame_data_head != _frame_data_capacity) {
+    do {
+      FrameData &frame_data = _frame_data_pool[_frame_data_tail];
+      finish_frame(frame_data);
 
-    ++_last_finished_frame;
-    _frame_data_tail = (_frame_data_tail + 1) % num_frames;
+      _frame_data_tail = (_frame_data_tail + 1) % _frame_data_capacity;
+    }
+    while (_frame_data_tail != _frame_data_head);
 
-    finish_frame(frame_data);
+    _frame_data_head = _frame_data_capacity;
+    _frame_data_tail = 0;
   }
 
-  nassertv(_frame_data_tail == _frame_data_head);
-
   // We need to release all prepared resources, since the upcall to close_gsg
   // will cause the PreparedGraphicsObjects to be cleared out.
   {
@@ -561,6 +573,7 @@ close_gsg() {
     if (pgo != nullptr) {
       // Create a temporary FrameData to hold all objects we need to destroy.
       FrameData frame_data;
+      frame_data._frame_index = ++_frame_counter;
       _frame_data = &frame_data;
       pgo->release_all_now(this);
       _frame_data = nullptr;
@@ -2220,46 +2233,52 @@ begin_frame(Thread *current_thread) {
   nassertr_always(!_closing_gsg, false);
   nassertr_always(_frame_data == nullptr, false);
 
-  const size_t num_frames = (sizeof(_frame_data_pool) / sizeof(FrameData));
-  size_t current_head = _frame_data_head;
-  _frame_data = &_frame_data_pool[current_head];
-  _frame_data_head = (current_head + 1) % num_frames;
-
-  // Increase the frame counter, which we use to determine whether we've
-  // updated any resources in this frame.
-  _frame_data->_frame_index = ++_frame_counter;
-
-  VkFence reset_fences[num_frames];
+  // First, finish old, finished frames.
+  VkFence reset_fences[_frame_data_capacity];
   size_t num_reset_fences = 0;
 
-  while (_frame_data_tail != current_head) {
-    FrameData &frame_data = _frame_data_pool[_frame_data_tail];
+  if (_frame_data_head != _frame_data_capacity) {
+    while (true) {
+      FrameData &frame_data = _frame_data_pool[_frame_data_tail];
+      if (vkGetFenceStatus(_device, frame_data._fence) == VK_NOT_READY) {
+        // This frame is not yet ready, so abort the loop here, since there's no
+        // use checking frames that come after this one.
+        break;
+      }
 
-    if (_frame_data_tail == _frame_data_head) {
-      // We have reached the limit of queued frames, so we must wait.
-      VkResult err;
-      err = vkWaitForFences(_device, 1, &frame_data._fence, VK_TRUE, 1000000000ULL);
-      if (err == VK_TIMEOUT) {
-        vulkandisplay_cat.error()
-          << "Timed out waiting for previous frame to complete rendering.\n";
-        _frame_data = nullptr;
-        return false;
-      } else if (err) {
-        vulkan_error(err, "Failure waiting for command buffer fence");
-        return false;
+      // This frame has completed execution.
+      reset_fences[num_reset_fences++] = frame_data._fence;
+      finish_frame(frame_data);
+
+      _frame_data_tail = (_frame_data_tail + 1) % _frame_data_capacity;
+      if (_frame_data_tail == _frame_data_head) {
+        // This was the last one, it's now empty.
+        _frame_data_head = _frame_data_capacity;
+        _frame_data_tail = 0;
+        break;
       }
-    } else if (vkGetFenceStatus(_device, frame_data._fence) == VK_NOT_READY) {
-      // This frame is not yet ready, so abort the loop here, since there's no
-      // use checking frames that come after this one.
-      break;
+    }
+  }
+
+  // If the frame queue is full, we must wait until a frame is done.
+  if (_frame_data_tail == _frame_data_head) {
+    FrameData &frame_data = _frame_data_pool[_frame_data_tail];
+    VkResult err;
+    err = vkWaitForFences(_device, 1, &frame_data._fence, VK_TRUE, 1000000000ULL);
+    if (err == VK_TIMEOUT) {
+      vulkandisplay_cat.error()
+        << "Timed out waiting for previous frame to complete rendering.\n";
+      return false;
+    }
+    else if (err) {
+      vulkan_error(err, "Failure waiting for command buffer fence");
+      return false;
     }
 
     // This frame has completed execution.
     reset_fences[num_reset_fences++] = frame_data._fence;
-    ++_last_finished_frame;
-    _frame_data_tail = (_frame_data_tail + 1) % num_frames;
-
     finish_frame(frame_data);
+    _frame_data_tail = (_frame_data_tail + 1) % _frame_data_capacity;
   }
 
   // Reset the used fences to unsignaled status.
@@ -2268,19 +2287,7 @@ begin_frame(Thread *current_thread) {
     nassertr(!err, false);
   }
 
-  // Recycle the global uniform buffer.  It's probably not worth it to expect
-  // values to stick around between frames, because the vast majority of global
-  // uniforms will change more frequently than that anyway.
-  if (vulkandisplay_cat.is_debug()) {
-    static VkDeviceSize max_used = 0;
-    if (_uniform_buffer_offset > max_used) {
-      max_used = _uniform_buffer_offset;
-      vulkandisplay_cat.debug()
-        << "Used at most " << max_used << " of " << _uniform_buffer_size
-        << " bytes of global uniform buffer.\n";
-    }
-  }
-  _uniform_buffer_offset = 0;
+  _frame_data = &_frame_data_pool[_frame_data_head % _frame_data_capacity];
 
   // Begin the transfer command buffer, for preparing resources.
   VkCommandBufferBeginInfo begin_info;
@@ -2292,10 +2299,16 @@ begin_frame(Thread *current_thread) {
   VkResult err;
   err = vkBeginCommandBuffer(_frame_data->_transfer_cmd, &begin_info);
   if (err) {
-    vulkan_error(err, "Can't begin command buffer");
+    vulkan_error(err, "Can't begin transfer command buffer");
+    _frame_data = nullptr;
     return false;
   }
 
+  // Increase the frame counter, which we use to determine whether we've
+  // updated any resources in this frame.
+  _frame_data->_frame_index = ++_frame_counter;
+  _frame_data_head = (_frame_data_head + 1) % _frame_data_capacity;
+
   // Make sure we have a white texture.
   if (_white_texture.is_null()) {
     _white_texture = new Texture();
@@ -2310,38 +2323,38 @@ begin_frame(Thread *current_thread) {
 
   // Call the GSG's begin_frame, which will cause any queued-up release() and
   // prepare() methods to be called.  Note that some of them may add to the
-  // command buffer, which is why we've begun it already.
-  if (!GraphicsStateGuardian::begin_frame(current_thread)) {
-    return false;
+  // transfer command buffer, which is why we've begun it already.
+  if (GraphicsStateGuardian::begin_frame(current_thread)) {
+    // Now begin the main (ie. graphics) command buffer.
+    err = vkBeginCommandBuffer(_frame_data->_cmd, &begin_info);
+    if (!err) {
+      return true;
+    }
+    vulkan_error(err, "Can't begin command buffer");
   }
 
-  // Let's submit our preparation calls, so the GPU has something to munch on.
-  //vkEndCommandBuffer(_frame_data->_transfer_cmd);
+  // We've already started putting stuff in the transfer command buffer, so now
+  // we are obliged to submit it, even if we won't actually end up rendering
+  // anything in this frame.
+  vkEndCommandBuffer(_frame_data->_transfer_cmd);
 
-  /*VkSubmitInfo submit_info;
+  VkSubmitInfo submit_info;
   submit_info.pNext = nullptr;
   submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
   submit_info.waitSemaphoreCount = 0;
   submit_info.pWaitSemaphores = nullptr;
   submit_info.pWaitDstStageMask = nullptr;
   submit_info.commandBufferCount = 1;
-  submit_info.pCommandBuffers = &_transfer_cmd;
+  submit_info.pCommandBuffers = &_frame_data->_transfer_cmd;
   submit_info.signalSemaphoreCount = 0;
   submit_info.pSignalSemaphores = nullptr;
-  err = vkQueueSubmit(_queue, 1, &submit_info, VK_NULL_HANDLE);
-  if (err) {
-    vulkan_error(err, "Failed to submit preparation command buffer");
-    return false;
-  }*/
 
-  // Now begin the main (ie. graphics) command buffer.
-  err = vkBeginCommandBuffer(_frame_data->_cmd, &begin_info);
+  err = vkQueueSubmit(_queue, 1, &submit_info, _frame_data->_fence);
   if (err) {
-    vulkan_error(err, "Can't begin command buffer");
-    return false;
+    vulkan_error(err, "Error submitting queue");
   }
-
-  return true;
+  _frame_data = nullptr;
+  return false;
 }
 
 /**
@@ -2382,6 +2395,22 @@ end_frame(Thread *current_thread) {
   nassertv(_frame_data->_transfer_cmd != VK_NULL_HANDLE);
   vkEndCommandBuffer(_frame_data->_transfer_cmd);
 
+  // Note down the current watermark of the uniform buffer.
+  _frame_data->_uniform_buffer_head = _uniform_buffer_allocator.get_head();
+
+  // Report how much UBO memory was used.
+  size_t used = _uniform_buffer_allocator.get_size();
+  if (used > _uniform_buffer_max_used) {
+    _uniform_buffer_max_used = used;
+
+    if (vulkandisplay_cat.is_debug()) {
+      vulkandisplay_cat.debug()
+        << "Used at most " << _uniform_buffer_max_used << " of "
+        << _uniform_buffer_allocator.get_capacity()
+        << " bytes of global uniform buffer.\n";
+    }
+  }
+
   // Issue commands to transition the staging buffers of the texture downloads
   // to make sure that the previous copy operations are visible to host reads.
   if (!_download_queue.empty()) {
@@ -2483,9 +2512,14 @@ end_frame(Thread *current_thread) {
 
 /**
  * Called after the frame has finished executing to clean up any resources.
+ * All frames *must* be finished in order!  It is illegal to call this for a
+ * frame when a preceding frame has not finished yet!
  */
 void VulkanGraphicsStateGuardian::
 finish_frame(FrameData &frame_data) {
+  ++_last_finished_frame;
+  nassertv(frame_data._frame_index == _last_finished_frame);
+
   for (VkBufferView buffer_view : frame_data._pending_destroy_buffer_views) {
     vkDestroyBufferView(_device, buffer_view, nullptr);
   }
@@ -2524,6 +2558,9 @@ finish_frame(FrameData &frame_data) {
     }
     frame_data._pending_free_descriptor_sets.clear();
   }
+
+  // Make the used uniform buffer space available.
+  _uniform_buffer_allocator.set_tail(frame_data._uniform_buffer_head);
 }
 
 /**
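Together, end_frame() and finish_frame() implement a simple protocol over the ring: record the allocator's head as a watermark when the frame's commands are submitted, then move the tail up to that watermark once the frame's fence has signaled. Sketched with the RingAllocator from the earlier example (the real members live on VulkanGraphicsStateGuardian and FrameData):

```cpp
struct Frame {
  size_t uniform_head = 0;  // allocator watermark at submission time
};

void on_end_frame(RingAllocator &ubo, Frame &frame) {
  frame.uniform_head = ubo.get_head();  // everything before this is ours
}

void on_finish_frame(RingAllocator &ubo, Frame &frame) {
  // Fence has signaled: the GPU no longer reads this frame's uniforms.
  ubo.set_tail(frame.uniform_head);
}
```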
@@ -3995,32 +4032,79 @@ update_sattr_descriptor_set(VkDescriptorSet ds, const ShaderAttrib *attr) {
 }
 
 /**
- * Reserves space in the global uniform buffer.
+ * Returns a writable pointer to the dynamic uniform buffer.
  */
-VkDeviceSize VulkanGraphicsStateGuardian::
-update_dynamic_uniform_buffer(void *data, VkDeviceSize size) {
+void *VulkanGraphicsStateGuardian::
+alloc_dynamic_uniform_buffer(VkDeviceSize size, uint32_t &offset) {
   if (size == 0) {
-    return 0;
+    offset = 0;
+    return nullptr;
+  }
+
+  ssize_t result = _uniform_buffer_allocator.alloc(size);
+  if (result >= 0) {
+    offset = (uint32_t)result;
+    return (char *)_uniform_buffer_ptr + result;
+  }
+
+  TrueClock *clock = TrueClock::get_global_ptr();
+  double start_time = clock->get_short_raw_time();
+
+  VkFence reset_fences[_frame_data_capacity - 1];
+  size_t num_reset_fences = 0;
+
+  // Wait for the last frame to be done, that should free up some space.
+  while (result < 0 && &_frame_data_pool[_frame_data_tail] != _frame_data) {
+    FrameData &frame_data = _frame_data_pool[_frame_data_tail];
+    VkResult err;
+    err = vkWaitForFences(_device, 1, &frame_data._fence, VK_TRUE, 1000000000ULL);
+    if (err == VK_TIMEOUT) {
+      vulkandisplay_cat.error()
+        << "Timed out waiting for previous frame to complete rendering.\n";
+      break;
+    }
+    else if (err) {
+      vulkan_error(err, "Failure waiting for command buffer fence");
+      break;
+    }
+
+    // This frame has completed execution.
+    reset_fences[num_reset_fences++] = frame_data._fence;
+    finish_frame(frame_data);
+    _frame_data_tail = (_frame_data_tail + 1) % _frame_data_capacity;
+
+    // Try the allocation again.
+    result = _uniform_buffer_allocator.alloc(size);
   }
 
-  VkDeviceSize offset = _uniform_buffer_offset;
-  VkDeviceSize align = _uniform_buffer_offset_alignment;
-  offset = offset - 1 - (offset - 1) % align + align;
+  // Reset the used fences to unsignaled status.
+  if (num_reset_fences > 0) {
+    VkResult err = vkResetFences(_device, num_reset_fences, reset_fences);
+    nassertr(!err, nullptr);
+  }
+
+  if (result >= 0) {
+    double end_time = clock->get_short_raw_time();
+    double stall_time = (end_time - start_time);
+    vulkandisplay_cat.warning()
+      << "Stalled for " << (stall_time * 1000) << " ms due to running out of "
+         "global uniform buffer space trying to allocate " << size << " bytes."
+         "  Increase vulkan-global-uniform-buffer-size for best performance"
+      << " (current is " << _uniform_buffer_allocator.get_capacity() << ").\n";
+
+    offset = (uint32_t)result;
+    return (char *)_uniform_buffer_ptr + result;
+  }
 
   //TODO: fail more gracefully.  Create a new buffer on the fly and manage
   // multiple buffers?  Or submit work and insert a fence, then replace the
   // buffer with a new one?
-  if (offset + size > _uniform_buffer_size) {
-    vulkandisplay_cat.error()
-      << "Ran out of space in the global uniform buffer.  Increase "
-         "vulkan-global-uniform-buffer-size in Config.prc.\n";
-    abort();
-    return offset;
-  }
-
-  vkCmdUpdateBuffer(_frame_data->_transfer_cmd, _uniform_buffer, offset, size, data);
-  _uniform_buffer_offset = offset + size;
-  return offset;
+  vulkandisplay_cat.error()
+    << "Used up entire global uniform buffer in a single frame, cannot "
+       "recover.  Increase vulkan-global-uniform-buffer-size substantially"
+    << " (current is " << _uniform_buffer_allocator.get_capacity() << ").\n";
+  abort();
+  return nullptr;
 }
 
 /**

+ 11 - 8
panda/src/vulkandisplay/vulkanGraphicsStateGuardian.h

@@ -16,6 +16,7 @@
 
 #include "config_vulkandisplay.h"
 #include "vulkanMemoryPage.h"
+#include "circularAllocator.h"
 
 class VulkanIndexBufferContext;
 class VulkanSamplerContext;
@@ -175,7 +176,7 @@ public:
   bool update_lattr_descriptor_set(VkDescriptorSet ds, const LightAttrib *attr);
   bool update_tattr_descriptor_set(VkDescriptorSet ds, const TextureAttrib *attr);
   bool update_sattr_descriptor_set(VkDescriptorSet ds, const ShaderAttrib *attr);
-  VkDeviceSize update_dynamic_uniform_buffer(void *data, VkDeviceSize size);
+  void *alloc_dynamic_uniform_buffer(VkDeviceSize size, uint32_t &offset);
 
   uint32_t get_color_palette_offset(const LColor &color);
 
@@ -204,10 +205,10 @@ private:
   // Single large uniform buffer used for everything in a frame.
   VkBuffer _uniform_buffer;
   VulkanMemoryBlock _uniform_buffer_memory;
-  VkDeviceSize _uniform_buffer_size = 0;
-  VkDeviceSize _uniform_buffer_offset = 0;
-  VkDeviceSize _uniform_buffer_offset_alignment;
+  CircularAllocator _uniform_buffer_allocator;
+  void *_uniform_buffer_ptr = nullptr;
   VkDescriptorSet _uniform_descriptor_set;
+  VkDeviceSize _uniform_buffer_max_used = 0;
 
   // Stores current framebuffer info.
   VkRenderPass _render_pass;
@@ -249,7 +250,7 @@ private:
 
   // Keep track of all the individual allocations.
   Mutex _allocator_lock;
-  pvector<VulkanMemoryPage> _memory_pages;
+  pdeque<VulkanMemoryPage> _memory_pages;
   VkDeviceSize _total_allocated;
 
   struct FrameData {
@@ -266,12 +267,14 @@ private:
     pvector<VkImageView> _pending_destroy_image_views;
     pvector<VkSampler> _pending_destroy_samplers;
     pvector<VkDescriptorSet> _pending_free_descriptor_sets;
+
+    VkDeviceSize _uniform_buffer_head = 0;
   };
-  FrameData _frame_data_pool[5];
+  static const size_t _frame_data_capacity = 5;
+  FrameData _frame_data_pool[_frame_data_capacity];
+  size_t _frame_data_head = _frame_data_capacity;
   size_t _frame_data_tail = 0;
-  size_t _frame_data_head = 0;
   FrameData *_frame_data = nullptr;
-  FrameData *_last_frame_data = nullptr;
 
   uint64_t _frame_counter = 0;
   uint64_t _last_finished_frame = 0;
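One subtlety in the new bookkeeping: because head == tail has to mean "full", the out-of-range value _frame_data_capacity doubles as the "empty" sentinel for the head, which is why it is the new initial value. A reading of the invariant, as inferred from the diff (helper names are mine, not Panda3D's):

```cpp
#include <cstddef>

// head == capacity  -> no frames in flight (ring is empty)
// head == tail      -> every slot is in flight (ring is full)
bool frames_empty(size_t head, size_t capacity) { return head == capacity; }
bool frames_full(size_t head, size_t tail) { return head == tail; }
```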

+ 92 - 5
panda/src/vulkandisplay/vulkanMemoryPage.I

@@ -22,7 +22,8 @@ VulkanMemoryPage(Mutex &lock) :
   _memory(VK_NULL_HANDLE),
   _type_index(0),
   _flags(0),
-  _linear_tiling(false) {
+  _linear_tiling(false),
+  _persistent_ptr(nullptr) {
 }
 
 /**
@@ -36,7 +37,8 @@ VulkanMemoryPage(VkDevice device, VkDeviceMemory memory, VkDeviceSize size,
   _memory(memory),
   _type_index(type_index),
   _flags(flags),
-  _linear_tiling(false) {
+  _linear_tiling(false),
+  _persistent_ptr(nullptr) {
 
   nassertv(_next == this && _prev == this);
 }
@@ -51,7 +53,8 @@ VulkanMemoryPage(VulkanMemoryPage &&from) :
   _memory(from._memory),
   _type_index(from._type_index),
   _flags(from._flags),
-  _linear_tiling(from._linear_tiling) {
+  _linear_tiling(from._linear_tiling),
+  _persistent_ptr(from._persistent_ptr) {
 
   // Prevent double free.
   from._memory = VK_NULL_HANDLE;
@@ -65,6 +68,7 @@ INLINE VulkanMemoryPage::
   if (_memory != VK_NULL_HANDLE) {
     vkFreeMemory(_device, _memory, nullptr);
     _memory = VK_NULL_HANDLE;
+    _persistent_ptr = nullptr;
   }
 }
 
@@ -80,6 +84,21 @@ meets_requirements(const VkMemoryRequirements &reqs, VkFlags required_flags,
           reqs.size <= (VkDeviceSize)get_max_size());
 }
 
+/**
+ * Maps this memory page persistently.
+ */
+INLINE void *VulkanMemoryPage::
+ensure_persistently_mapped() {
+  if (_persistent_ptr == nullptr) {
+    VkResult err = vkMapMemory(_device, _memory, 0, VK_WHOLE_SIZE, 0, &_persistent_ptr);
+    if (err) {
+      vulkan_error(err, "Failed to map memory");
+      _persistent_ptr = nullptr;
+    }
+  }
+  return _persistent_ptr;
+}
+
 /**
  * Returns the VkDeviceMemory of the page this block is located in.
  */
@@ -122,13 +141,19 @@ bind_buffer(VkBuffer buffer) {
 
 /**
  * Maps this block and returns the mapped memory address in a RAII wrapper.
- * Only one block in the page can be mapped at any given time.
+ * Only one block in the page can be mapped at any given time, except if the
+ * entire memory page is enabled for persistent mapping.
  */
 INLINE VulkanMemoryMapping VulkanMemoryBlock::
 map() {
   VulkanMemoryPage *page = (VulkanMemoryPage *)get_allocator();
   VulkanMemoryMapping ptr(page);
-  assert(!ptr);
+  if (page->_persistent_ptr != nullptr) {
+    ptr._data = (char *)page->_persistent_ptr + get_start();
+    return ptr;
+  }
+  ptr._device = page->_device;
+  ptr._memory = page->_memory;
   VkResult err = vkMapMemory(page->_device, page->_memory, get_start(), get_size(), 0, &ptr._data);
   if (err) {
     vulkan_error(err, "Failed to map memory");
@@ -137,6 +162,26 @@ map() {
   return ptr;
 }
 
+/**
+ * Maps the entire page this block is in persistently and return a pointer to
+ * this block within the mapping.  Many blocks can be mapped at the same time.
+ */
+INLINE void *VulkanMemoryBlock::
+map_persistent() {
+  VulkanMemoryPage *page = (VulkanMemoryPage *)get_allocator();
+  void *ptr = page->ensure_persistently_mapped();
+  nassertr(ptr != nullptr, nullptr);
+  return (char *)ptr + get_start();
+}
+
+/**
+ * Just grabs the page lock.
+ */
+INLINE VulkanMemoryMapping::
+VulkanMemoryMapping(VulkanMemoryPage *page) :
+  _holder(page->_lock) {
+}
+
 /**
  * Move constructor.
  */
@@ -149,4 +194,46 @@ VulkanMemoryMapping(VulkanMemoryMapping &&from) noexcept :
 
   // Prevent double unmap
   from._data = nullptr;
+  from._memory = VK_NULL_HANDLE;
+}
+
+/**
+ * Destructor, auto-unmaps if necessary.
+ */
+INLINE VulkanMemoryMapping::
+~VulkanMemoryMapping() {
+  if (_data != nullptr) {
+    unmap();
+  }
+}
+
+/**
+ * Move assignment operator.
+ */
+INLINE VulkanMemoryMapping &VulkanMemoryMapping::
+operator =(VulkanMemoryMapping &&from) noexcept {
+  if (_data != nullptr) {
+    unmap();
+  }
+  _holder = std::move(from._holder);
+  _device = from._device;
+  _memory = from._memory;
+  _data = from._data;
+  from._data = nullptr;
+  from._memory = VK_NULL_HANDLE;
+  return *this;
+}
+
+/**
+ * Unmaps the memory.  Does nothing if the page is persistently mapped.
+ */
+INLINE void VulkanMemoryMapping::
+unmap() {
+  nassertv_always(_data != nullptr);
+  if (_memory != VK_NULL_HANDLE) {
+    vkUnmapMemory(_device, _memory);
+    _memory = VK_NULL_HANDLE;
+  }
+  _data = nullptr;
+  _holder.unlock();
 }
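Vulkan allows at most one active vkMapMemory per VkDeviceMemory object, which is why the page caches a single whole-page pointer and hands out per-block offsets instead of mapping each block separately. A minimal sketch of the persistent-mapping call itself (handles are assumed to refer to HOST_VISIBLE memory):

```cpp
#include <vulkan/vulkan.h>
#include <cassert>

// Maps the whole allocation once and keeps the pointer for the
// allocation's lifetime.
void *map_whole_allocation(VkDevice device, VkDeviceMemory memory) {
  void *ptr = nullptr;
  VkResult err = vkMapMemory(device, memory, 0, VK_WHOLE_SIZE, 0, &ptr);
  assert(err == VK_SUCCESS);
  return ptr;  // valid until vkUnmapMemory or vkFreeMemory
}
```

This is also why unmap() above only releases the page lock when the page is persistently mapped: with _memory cleared to VK_NULL_HANDLE, there is nothing to vkUnmapMemory.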

+ 12 - 19
panda/src/vulkandisplay/vulkanMemoryPage.h

@@ -35,12 +35,16 @@ public:
                                  VkFlags required_flags = 0,
                                  bool linear_tiling = false);
 
+private:
+  INLINE void *ensure_persistently_mapped();
+
 private:
   VkDevice _device;
   VkDeviceMemory _memory;
   uint32_t _type_index;
   VkFlags _flags;
   bool _linear_tiling;
+  void *_persistent_ptr;
 
   friend class VulkanGraphicsStateGuardian;
   friend class VulkanMemoryBlock;
@@ -62,7 +66,9 @@ public:
 
   INLINE bool bind_image(VkImage image);
   INLINE bool bind_buffer(VkBuffer buffer);
+
   INLINE VulkanMemoryMapping map();
+  INLINE void *map_persistent();
 
   friend class VulkanMemoryPage;
 };
@@ -73,27 +79,14 @@ public:
  */
 class VulkanMemoryMapping {
 public:
-  VulkanMemoryMapping(VulkanMemoryPage *page) :
-    _holder(page->_lock),
-    _device(page->_device),
-    _memory(page->_memory) {
-  }
-
+  INLINE VulkanMemoryMapping(VulkanMemoryPage *page);
   VulkanMemoryMapping(const VulkanMemoryMapping &copy) = delete;
   INLINE VulkanMemoryMapping(VulkanMemoryMapping &&from) noexcept;
+  INLINE ~VulkanMemoryMapping();
 
-  ~VulkanMemoryMapping() {
-    if (_data != nullptr) {
-      unmap();
-    }
-  }
+  INLINE VulkanMemoryMapping &operator =(VulkanMemoryMapping &&from) noexcept;
 
-  void unmap() {
-    nassertv_always(_data != nullptr);
-    vkUnmapMemory(_device, _memory);
-    _data = nullptr;
-    _holder.unlock();
-  }
+  INLINE void unmap();
 
   explicit operator bool() {
     return _data != nullptr;
@@ -116,8 +109,8 @@ public:
   }
 
   std::unique_lock<Mutex> _holder;
-  VkDevice _device;
-  VkDeviceMemory _memory;
+  VkDevice _device = VK_NULL_HANDLE;
+  VkDeviceMemory _memory = VK_NULL_HANDLE;
   void *_data = nullptr;
 };
 

+ 4 - 7
panda/src/vulkandisplay/vulkanShaderContext.cxx

@@ -312,7 +312,8 @@ update_sattr_uniforms(VulkanGraphicsStateGuardian *gsg) {
     return 0;
   }
 
-  void *ptr = alloca(_ptr_block_size);
+  uint32_t ubo_offset;
+  void *ptr = gsg->alloc_dynamic_uniform_buffer(_ptr_block_size, ubo_offset);
 
   size_t i = 0;
   for (Shader::ShaderPtrSpec &spec : _shader->_ptr_spec) {
@@ -420,7 +421,7 @@ update_sattr_uniforms(VulkanGraphicsStateGuardian *gsg) {
     }
   }
 
-  return gsg->update_dynamic_uniform_buffer(ptr, _ptr_block_size);
+  return ubo_offset;
 }
 
 /**
@@ -435,7 +436,7 @@ update_dynamic_uniforms(VulkanGraphicsStateGuardian *gsg, int altered) {
   if (altered & _mat_deps) {
     gsg->update_shader_matrix_cache(_shader, _mat_part_cache, altered);
 
-    void *ptr = alloca(_mat_block_size);
+    void *ptr = gsg->alloc_dynamic_uniform_buffer(_mat_block_size, _dynamic_uniform_offset);
 
     size_t i = 0;
     for (Shader::ShaderMatSpec &spec : _mat_spec) {
@@ -598,10 +599,6 @@ update_dynamic_uniforms(VulkanGraphicsStateGuardian *gsg, int altered) {
         continue;
       }
     }
-
-    uint32_t offset = gsg->update_dynamic_uniform_buffer(ptr, _mat_block_size);
-    _dynamic_uniform_offset = offset;
-    return offset;
   }
 
   return _dynamic_uniform_offset;
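The net effect on the shader-context side: uniforms used to be staged with alloca() and then copied on the GPU timeline with vkCmdUpdateBuffer; now they are written once, directly through the persistent mapping. A hedged before/after contrast (fill_uniforms() is a hypothetical stand-in for the loops over _ptr_spec / _mat_spec):

```cpp
#include <vulkan/vulkan.h>
#include <cassert>
#include <cstdint>
#include <cstring>

void fill_uniforms(void *dst, size_t size) { std::memset(dst, 0, size); }

// Before: stage the block on the CPU stack, then record a copy that
// executes on the GPU timeline.  (The real code used alloca(); a
// fixed buffer keeps this sketch portable.)
void update_old(VkCommandBuffer transfer_cmd, VkBuffer ubo,
                VkDeviceSize offset, size_t size) {
  char tmp[1024];
  assert(size <= sizeof(tmp));
  fill_uniforms(tmp, size);
  vkCmdUpdateBuffer(transfer_cmd, ubo, offset, size, tmp);
}

// After: write once, straight through the persistent mapping; no
// staging copy and no transfer command at all.
void update_new(void *mapped_ring, uint32_t offset, size_t size) {
  fill_uniforms(static_cast<char *>(mapped_ring) + offset, size);
}
```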