瀏覽代碼

Merge pull request #45672 from reduz/barrier-optimization

Rewrote how barriers work for faster rendering
Rémi Verschelde 4 年之前
父節點
當前提交
2ba66c1457

+ 262 - 34
drivers/vulkan/rendering_device_vulkan.cpp

@@ -1627,6 +1627,9 @@ void RenderingDeviceVulkan::_memory_barrier(VkPipelineStageFlags p_src_stage_mas
 	mem_barrier.srcAccessMask = p_src_access;
 	mem_barrier.dstAccessMask = p_dst_sccess;
 
+	if (p_src_stage_mask == 0 || p_dst_stage_mask == 0) {
+		return; //no barrier, since this is invalid
+	}
 	vkCmdPipelineBarrier(p_sync_with_draw ? frames[frame].draw_command_buffer : frames[frame].setup_command_buffer, p_src_stage_mask, p_dst_stage_mask, 0, 1, &mem_barrier, 0, nullptr, 0, nullptr);
 }
 
@@ -2477,6 +2480,10 @@ Error RenderingDeviceVulkan::texture_update(RID p_texture, uint32_t p_layer, con
 			access_flags |= VK_ACCESS_TRANSFER_WRITE_BIT;
 		}
 
+		if (barrier_flags == 0) {
+			barrier_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+		}
+
 		VkImageMemoryBarrier image_memory_barrier;
 		image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
 		image_memory_barrier.pNext = nullptr;
@@ -2496,6 +2503,13 @@ Error RenderingDeviceVulkan::texture_update(RID p_texture, uint32_t p_layer, con
 		vkCmdPipelineBarrier(command_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, barrier_flags, 0, 0, nullptr, 0, nullptr, 1, &image_memory_barrier);
 	}
 
+	if (texture->used_in_frame != frames_drawn) {
+		texture->used_in_raster = false;
+		texture->used_in_compute = false;
+		texture->used_in_frame = frames_drawn;
+	}
+	texture->used_in_transfer = true;
+
 	return OK;
 }
 
@@ -2844,6 +2858,10 @@ Error RenderingDeviceVulkan::texture_copy(RID p_from_texture, RID p_to_texture,
 			access_flags |= VK_ACCESS_TRANSFER_WRITE_BIT;
 		}
 
+		if (barrier_flags == 0) {
+			barrier_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+		}
+
 		{ //restore src
 			VkImageMemoryBarrier image_memory_barrier;
 			image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
@@ -3011,6 +3029,10 @@ Error RenderingDeviceVulkan::texture_resolve_multisample(RID p_from_texture, RID
 			access_flags |= VK_ACCESS_TRANSFER_WRITE_BIT;
 		}
 
+		if (barrier_flags == 0) {
+			barrier_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+		}
+
 		{ //restore src
 			VkImageMemoryBarrier image_memory_barrier;
 			image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
@@ -3143,6 +3165,10 @@ Error RenderingDeviceVulkan::texture_clear(RID p_texture, const Color &p_color,
 			access_flags |= VK_ACCESS_TRANSFER_WRITE_BIT;
 		}
 
+		if (barrier_flags == 0) {
+			barrier_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+		}
+
 		VkImageMemoryBarrier image_memory_barrier;
 		image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
 		image_memory_barrier.pNext = nullptr;
@@ -3163,6 +3189,13 @@ Error RenderingDeviceVulkan::texture_clear(RID p_texture, const Color &p_color,
 		vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, barrier_flags, 0, 0, nullptr, 0, nullptr, 1, &image_memory_barrier);
 	}
 
+	if (src_tex->used_in_frame != frames_drawn) {
+		src_tex->used_in_raster = false;
+		src_tex->used_in_compute = false;
+		src_tex->used_in_frame = frames_drawn;
+	}
+	src_tex->used_in_transfer = true;
+
 	return OK;
 }
 
@@ -3289,6 +3322,7 @@ VkRenderPass RenderingDeviceVulkan::_render_pass_create(const Vector<AttachmentF
 					dependency_from_external.srcStageMask |= reading_stages;
 				}
 			} break;
+			case INITIAL_ACTION_CLEAR_REGION_CONTINUE:
 			case INITIAL_ACTION_CONTINUE: {
 				if (p_format[i].usage_flags & TEXTURE_USAGE_COLOR_ATTACHMENT_BIT) {
 					description.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
@@ -3296,7 +3330,7 @@ VkRenderPass RenderingDeviceVulkan::_render_pass_create(const Vector<AttachmentF
 					description.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
 				} else if (p_format[i].usage_flags & TEXTURE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
 					description.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
-					description.initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; //don't care what is there
+					description.initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
 					description.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
 				} else {
 					description.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
@@ -3425,8 +3459,13 @@ VkRenderPass RenderingDeviceVulkan::_render_pass_create(const Vector<AttachmentF
 	render_pass_create_info.pAttachments = attachments.ptr();
 	render_pass_create_info.subpassCount = 1;
 	render_pass_create_info.pSubpasses = &subpass;
-	render_pass_create_info.dependencyCount = 2;
-	render_pass_create_info.pDependencies = dependencies;
+	// Disabled: these subpass dependencies seem to only prevent raster and compute from running at the same time.
+	// Other barriers seem to protect the render pass fine.
+	//	render_pass_create_info.dependencyCount = 2;
+	//	render_pass_create_info.pDependencies = dependencies;
+
+	render_pass_create_info.dependencyCount = 0;
+	render_pass_create_info.pDependencies = nullptr;
 
 	VkRenderPass render_pass;
 	VkResult res = vkCreateRenderPass(device, &render_pass_create_info, nullptr, &render_pass);
@@ -4108,6 +4147,8 @@ RID RenderingDeviceVulkan::shader_create(const Vector<ShaderStageData> &p_stages
 
 	bool is_compute = false;
 
+	uint32_t compute_local_size[3] = { 0, 0, 0 };
+
 	for (int i = 0; i < p_stages.size(); i++) {
 		if (p_stages[i].shader_stage == SHADER_STAGE_COMPUTE) {
 			is_compute = true;
@@ -4124,6 +4165,11 @@ RID RenderingDeviceVulkan::shader_create(const Vector<ShaderStageData> &p_stages
 			ERR_FAIL_COND_V_MSG(result != SPV_REFLECT_RESULT_SUCCESS, RID(),
 					"Reflection of SPIR-V shader stage '" + String(shader_stage_names[p_stages[i].shader_stage]) + "' failed parsing shader.");
 
+			if (is_compute) {
+				compute_local_size[0] = module.entry_points->local_size.x;
+				compute_local_size[1] = module.entry_points->local_size.y;
+				compute_local_size[2] = module.entry_points->local_size.z;
+			}
 			uint32_t binding_count = 0;
 			result = spvReflectEnumerateDescriptorBindings(&module, &binding_count, nullptr);
 			ERR_FAIL_COND_V_MSG(result != SPV_REFLECT_RESULT_SUCCESS, RID(),
@@ -4328,6 +4374,7 @@ RID RenderingDeviceVulkan::shader_create(const Vector<ShaderStageData> &p_stages
 					}
 				}
 			}
+
 			uint32_t pc_count = 0;
 			result = spvReflectEnumeratePushConstantBlocks(&module, &pc_count, nullptr);
 			ERR_FAIL_COND_V_MSG(result != SPV_REFLECT_RESULT_SUCCESS, RID(),
@@ -4376,6 +4423,9 @@ RID RenderingDeviceVulkan::shader_create(const Vector<ShaderStageData> &p_stages
 	shader.fragment_outputs = fragment_outputs;
 	shader.push_constant = push_constant;
 	shader.is_compute = is_compute;
+	shader.compute_local_size[0] = compute_local_size[0];
+	shader.compute_local_size[1] = compute_local_size[1];
+	shader.compute_local_size[2] = compute_local_size[2];
 
 	String error_text;
 
@@ -5216,7 +5266,14 @@ Error RenderingDeviceVulkan::buffer_update(RID p_buffer, uint32_t p_offset, uint
 #ifdef FORCE_FULL_BARRIER
 	_full_barrier(true);
 #else
-	_buffer_memory_barrier(buffer->buffer, p_offset, p_size, VK_PIPELINE_STAGE_TRANSFER_BIT, dst_stage_mask, VK_ACCESS_TRANSFER_WRITE_BIT, dst_access, true);
+	if (dst_stage_mask == 0) {
+		dst_stage_mask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+	}
+
+	if (p_post_barrier != RD::BARRIER_MASK_NO_BARRIER) {
+		_buffer_memory_barrier(buffer->buffer, p_offset, p_size, VK_PIPELINE_STAGE_TRANSFER_BIT, dst_stage_mask, VK_ACCESS_TRANSFER_WRITE_BIT, dst_access, dst_stage_mask);
+	}
+
 #endif
 	return err;
 }
@@ -5255,7 +5312,12 @@ Error RenderingDeviceVulkan::buffer_clear(RID p_buffer, uint32_t p_offset, uint3
 #ifdef FORCE_FULL_BARRIER
 	_full_barrier(true);
 #else
-	_buffer_memory_barrier(buffer->buffer, p_offset, p_size, VK_PIPELINE_STAGE_TRANSFER_BIT, dst_stage_mask, VK_ACCESS_TRANSFER_WRITE_BIT, dst_access, p_post_barrier);
+	if (dst_stage_mask == 0) {
+		dst_stage_mask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+	}
+
+	_buffer_memory_barrier(buffer->buffer, p_offset, p_size, VK_PIPELINE_STAGE_TRANSFER_BIT, dst_stage_mask, VK_ACCESS_TRANSFER_WRITE_BIT, dst_access, dst_stage_mask);
+
 #endif
 	return OK;
 }
@@ -5710,6 +5772,9 @@ RID RenderingDeviceVulkan::compute_pipeline_create(RID p_shader) {
 	pipeline.pipeline_layout = shader->pipeline_layout;
 	pipeline.shader = p_shader;
 	pipeline.push_constant_size = shader->push_constant.push_constant_size;
+	pipeline.local_group_size[0] = shader->compute_local_size[0];
+	pipeline.local_group_size[1] = shader->compute_local_size[1];
+	pipeline.local_group_size[2] = shader->compute_local_size[2];
 
 	//create ID to associate with this pipeline
 	RID id = compute_pipeline_owner.make_rid(pipeline);
@@ -6019,7 +6084,7 @@ RenderingDevice::DrawListID RenderingDeviceVulkan::draw_list_begin(RID p_framebu
 	_THREAD_SAFE_METHOD_
 
 	ERR_FAIL_COND_V_MSG(draw_list != nullptr, INVALID_ID, "Only one draw list can be active at the same time.");
-	ERR_FAIL_COND_V_MSG(compute_list != nullptr, INVALID_ID, "Only one draw/compute list can be active at the same time.");
+	ERR_FAIL_COND_V_MSG(compute_list != nullptr && !compute_list->state.allow_draw_overlap, INVALID_ID, "Only one draw/compute list can be active at the same time.");
 
 	Framebuffer *framebuffer = framebuffer_owner.getornull(p_framebuffer);
 	ERR_FAIL_COND_V(!framebuffer, INVALID_ID);
@@ -6040,7 +6105,14 @@ RenderingDevice::DrawListID RenderingDeviceVulkan::draw_list_begin(RID p_framebu
 
 		viewport_offset = regioni.position;
 		viewport_size = regioni.size;
-
+		if (p_initial_color_action == INITIAL_ACTION_CLEAR_REGION_CONTINUE) {
+			needs_clear_color = true;
+			p_initial_color_action = INITIAL_ACTION_CONTINUE;
+		}
+		if (p_initial_depth_action == INITIAL_ACTION_CLEAR_REGION_CONTINUE) {
+			needs_clear_depth = true;
+			p_initial_depth_action = INITIAL_ACTION_CONTINUE;
+		}
 		if (p_initial_color_action == INITIAL_ACTION_CLEAR_REGION) {
 			needs_clear_color = true;
 			p_initial_color_action = INITIAL_ACTION_KEEP;
@@ -6388,6 +6460,19 @@ void RenderingDeviceVulkan::draw_list_bind_uniform_set(DrawListID p_list, RID p_
 	dl->state.sets[p_index].uniform_set_format = uniform_set->format;
 	dl->state.sets[p_index].uniform_set = p_uniform_set;
 
+	uint32_t mst_count = uniform_set->mutable_storage_textures.size();
+	if (mst_count) {
+		Texture **mst_textures = const_cast<UniformSet *>(uniform_set)->mutable_storage_textures.ptrw();
+		for (uint32_t i = 0; i < mst_count; i++) {
+			if (mst_textures[i]->used_in_frame != frames_drawn) {
+				mst_textures[i]->used_in_frame = frames_drawn;
+				mst_textures[i]->used_in_transfer = false;
+				mst_textures[i]->used_in_compute = false;
+			}
+			mst_textures[i]->used_in_raster = true;
+		}
+	}
+
 #ifdef DEBUG_ENABLED
 	{ //validate that textures bound are not attached as framebuffer bindings
 		uint32_t attachable_count = uniform_set->attachable_textures.size();
@@ -6673,23 +6758,43 @@ void RenderingDeviceVulkan::draw_list_end(uint32_t p_post_barrier) {
 		access_flags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
 	}
 	if (p_post_barrier & BARRIER_MASK_RASTER) {
-		barrier_flags |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
-		access_flags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_INDEX_READ_BIT | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
+		barrier_flags |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT /*| VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT*/;
+		access_flags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_INDEX_READ_BIT | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT /*| VK_ACCESS_INDIRECT_COMMAND_READ_BIT*/;
 	}
 	if (p_post_barrier & BARRIER_MASK_TRANSFER) {
 		barrier_flags |= VK_PIPELINE_STAGE_TRANSFER_BIT;
 		access_flags |= VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT;
 	}
 
+	if (barrier_flags == 0) {
+		barrier_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+	}
+
 	draw_list_bound_textures.clear();
 
-	for (int i = 0; i < draw_list_storage_textures.size(); i++) {
+	VkImageMemoryBarrier *image_barriers = nullptr;
+
+	uint32_t image_barrier_count = draw_list_storage_textures.size();
+
+	if (image_barrier_count) {
+		image_barriers = (VkImageMemoryBarrier *)alloca(sizeof(VkImageMemoryBarrier) * draw_list_storage_textures.size());
+	}
+
+	uint32_t src_stage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
+	uint32_t src_access = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+
+	if (image_barrier_count) {
+		src_stage |= VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
+		src_access |= VK_ACCESS_SHADER_WRITE_BIT;
+	}
+
+	for (uint32_t i = 0; i < image_barrier_count; i++) {
 		Texture *texture = texture_owner.getornull(draw_list_storage_textures[i]);
 
-		VkImageMemoryBarrier image_memory_barrier;
+		VkImageMemoryBarrier &image_memory_barrier = image_barriers[i];
 		image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
 		image_memory_barrier.pNext = nullptr;
-		image_memory_barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
+		image_memory_barrier.srcAccessMask = src_access;
 		image_memory_barrier.dstAccessMask = access_flags;
 		image_memory_barrier.oldLayout = texture->layout;
 		image_memory_barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
@@ -6703,8 +6808,6 @@ void RenderingDeviceVulkan::draw_list_end(uint32_t p_post_barrier) {
 		image_memory_barrier.subresourceRange.baseArrayLayer = texture->base_layer;
 		image_memory_barrier.subresourceRange.layerCount = texture->layers;
 
-		vkCmdPipelineBarrier(frames[frame].draw_command_buffer, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags, 0, 0, nullptr, 0, nullptr, 1, &image_memory_barrier);
-
 		texture->layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
 	}
 
@@ -6717,7 +6820,17 @@ void RenderingDeviceVulkan::draw_list_end(uint32_t p_post_barrier) {
 #ifdef FORCE_FULL_BARRIER
 	_full_barrier(true);
 #else
-	_memory_barrier(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, barrier_flags, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, access_flags, true);
+
+	VkMemoryBarrier mem_barrier;
+	mem_barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
+	mem_barrier.pNext = nullptr;
+	mem_barrier.srcAccessMask = src_access;
+	mem_barrier.dstAccessMask = access_flags;
+
+	if (image_barrier_count > 0 || p_post_barrier != BARRIER_MASK_NO_BARRIER) {
+		vkCmdPipelineBarrier(frames[frame].draw_command_buffer, src_stage, barrier_flags, 0, 1, &mem_barrier, 0, nullptr, image_barrier_count, image_barriers);
+	}
+
 #endif
 }
 
@@ -6725,12 +6838,13 @@ void RenderingDeviceVulkan::draw_list_end(uint32_t p_post_barrier) {
 /**** COMPUTE LISTS ****/
 /***********************/
 
-RenderingDevice::ComputeListID RenderingDeviceVulkan::compute_list_begin() {
-	ERR_FAIL_COND_V_MSG(draw_list != nullptr, INVALID_ID, "Only one draw list can be active at the same time.");
+RenderingDevice::ComputeListID RenderingDeviceVulkan::compute_list_begin(bool p_allow_draw_overlap) { // Opens the compute list and returns ID_TYPE_COMPUTE_LIST, or INVALID_ID if one of the checks below fails.
+	ERR_FAIL_COND_V_MSG(!p_allow_draw_overlap && draw_list != nullptr, INVALID_ID, "Only one draw list can be active at the same time."); // An open draw list is rejected unless the caller explicitly allowed overlap.
 	ERR_FAIL_COND_V_MSG(compute_list != nullptr, INVALID_ID, "Only one draw/compute list can be active at the same time.");
 
 	compute_list = memnew(ComputeList);
 	compute_list->command_buffer = frames[frame].draw_command_buffer;
+	compute_list->state.allow_draw_overlap = p_allow_draw_overlap; // Stored in the list state; draw_list_begin() reads this flag to decide whether an open compute list is acceptable.
 
 	return ID_TYPE_COMPUTE_LIST;
 }
@@ -6787,6 +6901,9 @@ void RenderingDeviceVulkan::compute_list_bind_compute_pipeline(ComputeListID p_l
 		}
 
 		cl->state.pipeline_shader = pipeline->shader;
+		cl->state.local_group_size[0] = pipeline->local_group_size[0];
+		cl->state.local_group_size[1] = pipeline->local_group_size[1];
+		cl->state.local_group_size[2] = pipeline->local_group_size[2];
 	}
 
 #ifdef DEBUG_ENABLED
@@ -6824,11 +6941,24 @@ void RenderingDeviceVulkan::compute_list_bind_uniform_set(ComputeListID p_list,
 	cl->state.sets[p_index].uniform_set = p_uniform_set;
 
 	uint32_t textures_to_sampled_count = uniform_set->mutable_sampled_textures.size();
+	uint32_t textures_to_storage_count = uniform_set->mutable_storage_textures.size();
+
 	Texture **textures_to_sampled = uniform_set->mutable_sampled_textures.ptrw();
 
+	VkImageMemoryBarrier *texture_barriers = nullptr;
+
+	if (textures_to_sampled_count + textures_to_storage_count) {
+		texture_barriers = (VkImageMemoryBarrier *)alloca(sizeof(VkImageMemoryBarrier) * (textures_to_sampled_count + textures_to_storage_count));
+	}
+	uint32_t texture_barrier_count = 0;
+
+	uint32_t src_stage_flags = 0;
+
 	for (uint32_t i = 0; i < textures_to_sampled_count; i++) {
 		if (textures_to_sampled[i]->layout != VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
-			VkImageMemoryBarrier image_memory_barrier;
+			src_stage_flags |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+
+			VkImageMemoryBarrier &image_memory_barrier = texture_barriers[texture_barrier_count++];
 			image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
 			image_memory_barrier.pNext = nullptr;
 			image_memory_barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
@@ -6845,23 +6975,55 @@ void RenderingDeviceVulkan::compute_list_bind_uniform_set(ComputeListID p_list,
 			image_memory_barrier.subresourceRange.baseArrayLayer = textures_to_sampled[i]->base_layer;
 			image_memory_barrier.subresourceRange.layerCount = textures_to_sampled[i]->layers;
 
-			vkCmdPipelineBarrier(cl->command_buffer, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_memory_barrier);
-
 			textures_to_sampled[i]->layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
 
 			cl->state.textures_to_sampled_layout.erase(textures_to_sampled[i]);
 		}
+
+		if (textures_to_sampled[i]->used_in_frame != frames_drawn) {
+			textures_to_sampled[i]->used_in_frame = frames_drawn;
+			textures_to_sampled[i]->used_in_transfer = false;
+			textures_to_sampled[i]->used_in_raster = false;
+		}
+		textures_to_sampled[i]->used_in_compute = true;
 	}
 
-	uint32_t textures_to_storage_count = uniform_set->mutable_storage_textures.size();
 	Texture **textures_to_storage = uniform_set->mutable_storage_textures.ptrw();
 
 	for (uint32_t i = 0; i < textures_to_storage_count; i++) {
 		if (textures_to_storage[i]->layout != VK_IMAGE_LAYOUT_GENERAL) {
-			VkImageMemoryBarrier image_memory_barrier;
+			uint32_t src_access_flags = 0;
+
+			if (textures_to_storage[i]->used_in_frame == frames_drawn) {
+				if (textures_to_storage[i]->used_in_compute) {
+					src_stage_flags |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+					src_access_flags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
+				}
+				if (textures_to_storage[i]->used_in_raster) {
+					src_stage_flags |= VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT;
+					src_access_flags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
+				}
+				if (textures_to_storage[i]->used_in_transfer) {
+					src_stage_flags |= VK_PIPELINE_STAGE_TRANSFER_BIT;
+					src_access_flags |= VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT;
+				}
+
+				textures_to_storage[i]->used_in_compute = false;
+				textures_to_storage[i]->used_in_raster = false;
+				textures_to_storage[i]->used_in_compute = false;
+
+			} else {
+				src_access_flags = 0;
+				textures_to_storage[i]->used_in_compute = false;
+				textures_to_storage[i]->used_in_raster = false;
+				textures_to_storage[i]->used_in_compute = false;
+				textures_to_storage[i]->used_in_frame = frames_drawn;
+			}
+
+			VkImageMemoryBarrier &image_memory_barrier = texture_barriers[texture_barrier_count++];
 			image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
 			image_memory_barrier.pNext = nullptr;
-			image_memory_barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
+			image_memory_barrier.srcAccessMask = src_access_flags;
 			image_memory_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
 			image_memory_barrier.oldLayout = textures_to_storage[i]->layout;
 			image_memory_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
@@ -6875,14 +7037,20 @@ void RenderingDeviceVulkan::compute_list_bind_uniform_set(ComputeListID p_list,
 			image_memory_barrier.subresourceRange.baseArrayLayer = textures_to_storage[i]->base_layer;
 			image_memory_barrier.subresourceRange.layerCount = textures_to_storage[i]->layers;
 
-			vkCmdPipelineBarrier(cl->command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_memory_barrier);
-
 			textures_to_storage[i]->layout = VK_IMAGE_LAYOUT_GENERAL;
 
 			cl->state.textures_to_sampled_layout.insert(textures_to_storage[i]); //needs to go back to sampled layout afterwards
 		}
 	}
 
+	if (texture_barrier_count) {
+		if (src_stage_flags == 0) {
+			src_stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+		}
+
+		vkCmdPipelineBarrier(cl->command_buffer, src_stage_flags, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, texture_barrier_count, texture_barriers);
+	}
+
 #if 0
 	{ //validate that textures bound are not attached as framebuffer bindings
 		uint32_t attachable_count = uniform_set->attachable_textures.size();
@@ -6976,6 +7144,27 @@ void RenderingDeviceVulkan::compute_list_dispatch(ComputeListID p_list, uint32_t
 	vkCmdDispatch(cl->command_buffer, p_x_groups, p_y_groups, p_z_groups);
 }
 
+// Dispatches enough work groups to cover the requested number of threads on each axis.
+// The per-axis group count is ceil(threads / local_group_size), using the local group size
+// stored in the list state when the compute pipeline was bound.
+// Reports an error and dispatches nothing if p_list is not the compute list ID, no compute
+// list is open, any thread count is zero, or any stored local group size is zero.
+void RenderingDeviceVulkan::compute_list_dispatch_threads(ComputeListID p_list, uint32_t p_x_threads, uint32_t p_y_threads, uint32_t p_z_threads) {
+	ERR_FAIL_COND(p_list != ID_TYPE_COMPUTE_LIST);
+	ERR_FAIL_COND(!compute_list);
+
+	ComputeList *cl = compute_list;
+
+#ifdef DEBUG_ENABLED
+
+	ERR_FAIL_COND_MSG(!cl->validation.pipeline_active, "No compute pipeline was set before attempting to draw.");
+
+	if (cl->validation.pipeline_push_constant_size > 0) {
+		//using push constants, check that they were supplied
+		ERR_FAIL_COND_MSG(!cl->validation.pipeline_push_constant_supplied,
+				"The shader in this pipeline requires a push constant to be set before drawing, but it's not present.");
+	}
+
+#endif
+
+	// A zero thread count would wrap around in the unsigned "(threads - 1)" below and request a huge dispatch.
+	ERR_FAIL_COND_MSG(p_x_threads == 0 || p_y_threads == 0 || p_z_threads == 0, "Dispatch thread counts must be greater than zero on every axis.");
+
+	// The local group size keeps its zero default until a compute pipeline is bound, and that is
+	// only validated above in debug builds, so guard the divisions explicitly.
+	const uint32_t *local_size = cl->state.local_group_size;
+	ERR_FAIL_COND_MSG(local_size[0] == 0 || local_size[1] == 0 || local_size[2] == 0, "The compute list has no valid local group size; bind a compute pipeline before dispatching threads.");
+
+	// Round up so that a partially filled last group is still dispatched.
+	compute_list_dispatch(p_list, (p_x_threads - 1) / local_size[0] + 1, (p_y_threads - 1) / local_size[1] + 1, (p_z_threads - 1) / local_size[2] + 1);
+}
+
 void RenderingDeviceVulkan::compute_list_dispatch_indirect(ComputeListID p_list, RID p_buffer, uint32_t p_offset) {
 	ERR_FAIL_COND(p_list != ID_TYPE_COMPUTE_LIST);
 	ERR_FAIL_COND(!compute_list);
@@ -7047,7 +7236,7 @@ void RenderingDeviceVulkan::compute_list_end(uint32_t p_post_barrier) {
 	uint32_t access_flags = 0;
 	if (p_post_barrier & BARRIER_MASK_COMPUTE) {
 		barrier_flags |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
-		access_flags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
+		access_flags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
 	}
 	if (p_post_barrier & BARRIER_MASK_RASTER) {
 		barrier_flags |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
@@ -7058,8 +7247,22 @@ void RenderingDeviceVulkan::compute_list_end(uint32_t p_post_barrier) {
 		access_flags |= VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT;
 	}
 
+	if (barrier_flags == 0) {
+		barrier_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+	}
+
+	VkImageMemoryBarrier *image_barriers = nullptr;
+
+	uint32_t image_barrier_count = compute_list->state.textures_to_sampled_layout.size();
+
+	if (image_barrier_count) {
+		image_barriers = (VkImageMemoryBarrier *)alloca(sizeof(VkImageMemoryBarrier) * image_barrier_count);
+	}
+
+	uint32_t barrier_idx = 0;
+
 	for (Set<Texture *>::Element *E = compute_list->state.textures_to_sampled_layout.front(); E; E = E->next()) {
-		VkImageMemoryBarrier image_memory_barrier;
+		VkImageMemoryBarrier &image_memory_barrier = image_barriers[barrier_idx++];
 		image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
 		image_memory_barrier.pNext = nullptr;
 		image_memory_barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
@@ -7076,19 +7279,33 @@ void RenderingDeviceVulkan::compute_list_end(uint32_t p_post_barrier) {
 		image_memory_barrier.subresourceRange.baseArrayLayer = E->get()->base_layer;
 		image_memory_barrier.subresourceRange.layerCount = E->get()->layers;
 
-		// TODO: Look at the usages in the compute list and determine tighter dst stage and access masks based on some "final" usage equivalent
-		vkCmdPipelineBarrier(compute_list->command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, barrier_flags, 0, 0, nullptr, 0, nullptr, 1, &image_memory_barrier);
-
 		E->get()->layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+
+		if (E->get()->used_in_frame != frames_drawn) {
+			E->get()->used_in_transfer = false;
+			E->get()->used_in_raster = false;
+			E->get()->used_in_compute = false;
+			E->get()->used_in_frame = frames_drawn;
+		}
 	}
 
-	memdelete(compute_list);
-	compute_list = nullptr;
 #ifdef FORCE_FULL_BARRIER
 	_full_barrier(true);
 #else
-	_memory_barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, barrier_flags, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_INDEX_READ_BIT | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT, true);
+	VkMemoryBarrier mem_barrier;
+	mem_barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
+	mem_barrier.pNext = nullptr;
+	mem_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
+	mem_barrier.dstAccessMask = access_flags;
+
+	if (image_barrier_count > 0 || p_post_barrier != BARRIER_MASK_NO_BARRIER) {
+		vkCmdPipelineBarrier(compute_list->command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, barrier_flags, 0, 1, &mem_barrier, 0, nullptr, image_barrier_count, image_barriers);
+	}
+
 #endif
+
+	memdelete(compute_list);
+	compute_list = nullptr;
 }
 
 void RenderingDeviceVulkan::barrier(uint32_t p_from, uint32_t p_to) {
@@ -7111,7 +7328,7 @@ void RenderingDeviceVulkan::barrier(uint32_t p_from, uint32_t p_to) {
 	uint32_t dst_access_flags = 0;
 	if (p_to & BARRIER_MASK_COMPUTE) {
 		dst_barrier_flags |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
-		dst_access_flags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
+		dst_access_flags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
 	}
 	if (p_to & BARRIER_MASK_RASTER) {
 		dst_barrier_flags |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
@@ -7325,6 +7542,16 @@ void RenderingDeviceVulkan::draw_command_end_label() {
 	context->command_end_label(frames[frame].draw_command_buffer);
 }
 
+// Returns the device vendor name by forwarding to the Vulkan context.
+String RenderingDeviceVulkan::get_device_vendor_name() const {
+	String vendor_name = context->get_device_vendor_name();
+	return vendor_name;
+}
+// Returns the device name by forwarding to the Vulkan context.
+String RenderingDeviceVulkan::get_device_name() const {
+	String name = context->get_device_name();
+	return name;
+}
+// Returns the device's pipeline cache identifier string by forwarding to the Vulkan context.
+String RenderingDeviceVulkan::get_device_pipeline_cache_uuid() const {
+	String cache_uuid = context->get_device_pipeline_cache_uuid();
+	return cache_uuid;
+}
+
 void RenderingDeviceVulkan::_finalize_command_bufers() {
 	if (draw_list) {
 		ERR_PRINT("Found open draw list at the end of the frame, this should never happen (further drawing will likely not work).");
@@ -7377,6 +7604,7 @@ void RenderingDeviceVulkan::_begin_frame() {
 
 	if (frames[frame].timestamp_count) {
 		vkGetQueryPoolResults(device, frames[frame].timestamp_pool, 0, frames[frame].timestamp_count, sizeof(uint64_t) * max_timestamp_query_elements, frames[frame].timestamp_result_values, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT);
+		vkCmdResetQueryPool(frames[frame].setup_command_buffer, frames[frame].timestamp_pool, 0, frames[frame].timestamp_count);
 		SWAP(frames[frame].timestamp_names, frames[frame].timestamp_result_names);
 		SWAP(frames[frame].timestamp_cpu_values, frames[frame].timestamp_cpu_result_values);
 	}

+ 16 - 1
drivers/vulkan/rendering_device_vulkan.h

@@ -141,6 +141,11 @@ class RenderingDeviceVulkan : public RenderingDevice {
 
 		VkImageLayout layout;
 
+		uint64_t used_in_frame = 0;
+		bool used_in_transfer = false;
+		bool used_in_raster = false;
+		bool used_in_compute = false;
+
 		uint32_t read_aspect_mask = 0;
 		uint32_t barrier_aspect_mask = 0;
 		bool bound = false; //bound to framebffer
@@ -528,6 +533,8 @@ class RenderingDeviceVulkan : public RenderingDevice {
 
 		PushConstant push_constant;
 
+		uint32_t compute_local_size[3] = { 0, 0, 0 };
+
 		bool is_compute = false;
 		int max_output = 0;
 		Vector<Set> sets;
@@ -686,6 +693,7 @@ class RenderingDeviceVulkan : public RenderingDevice {
 		VkPipeline pipeline = VK_NULL_HANDLE;
 		uint32_t push_constant_size = 0;
 		uint32_t push_constant_stages = 0;
+		uint32_t local_group_size[3] = { 0, 0, 0 };
 	};
 
 	RID_Owner<ComputePipeline, true> compute_pipeline_owner;
@@ -808,8 +816,10 @@ class RenderingDeviceVulkan : public RenderingDevice {
 			uint32_t set_count = 0;
 			RID pipeline;
 			RID pipeline_shader;
+			uint32_t local_group_size[3] = { 0, 0, 0 };
 			VkPipelineLayout pipeline_layout = VK_NULL_HANDLE;
 			uint32_t pipeline_push_constant_stages = 0;
+			bool allow_draw_overlap;
 		} state;
 
 #ifdef DEBUG_ENABLED
@@ -1028,13 +1038,14 @@ public:
 	/**** COMPUTE LISTS ****/
 	/***********************/
 
-	virtual ComputeListID compute_list_begin();
+	virtual ComputeListID compute_list_begin(bool p_allow_draw_overlap = false);
 	virtual void compute_list_bind_compute_pipeline(ComputeListID p_list, RID p_compute_pipeline);
 	virtual void compute_list_bind_uniform_set(ComputeListID p_list, RID p_uniform_set, uint32_t p_index);
 	virtual void compute_list_set_push_constant(ComputeListID p_list, const void *p_data, uint32_t p_data_size);
 	virtual void compute_list_add_barrier(ComputeListID p_list);
 
 	virtual void compute_list_dispatch(ComputeListID p_list, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups);
+	virtual void compute_list_dispatch_threads(ComputeListID p_list, uint32_t p_x_threads, uint32_t p_y_threads, uint32_t p_z_threads);
 	virtual void compute_list_dispatch_indirect(ComputeListID p_list, RID p_buffer, uint32_t p_offset);
 	virtual void compute_list_end(uint32_t p_post_barrier = BARRIER_MASK_ALL);
 
@@ -1085,6 +1096,10 @@ public:
 	virtual void draw_command_insert_label(String p_label_name, const Color p_color = Color(1, 1, 1, 1));
 	virtual void draw_command_end_label();
 
+	virtual String get_device_vendor_name() const;
+	virtual String get_device_name() const;
+	virtual String get_device_pipeline_cache_uuid() const;
+
 	RenderingDeviceVulkan();
 	~RenderingDeviceVulkan();
 };

+ 54 - 8
drivers/vulkan/vulkan_context.cpp

@@ -380,7 +380,8 @@ Error VulkanContext::_create_physical_device() {
 		ERR_FAIL_V(ERR_CANT_CREATE);
 	}
 	/* for now, just grab the first physical device */
-	gpu = physical_devices[0];
+	uint32_t device_index = 0;
+	gpu = physical_devices[device_index];
 	free(physical_devices);
 
 	/* Look for device extensions */
@@ -389,6 +390,40 @@ Error VulkanContext::_create_physical_device() {
 	enabled_extension_count = 0;
 	memset(extension_names, 0, sizeof(extension_names));
 
+	/* Get identifier properties */
+	vkGetPhysicalDeviceProperties(gpu, &gpu_props);
+
+	static const struct { // Local lookup table pairing a numeric vendor id with a printable name.
+		uint32_t id; // Value compared with gpu_props.vendorID.
+		const char *name; // Text assigned to device_vendor when the id matches.
+	} vendor_names[] = {
+		{ 0x1002, "AMD" },
+		{ 0x1010, "ImgTec" },
+		{ 0x10DE, "NVIDIA" },
+		{ 0x13B5, "ARM" },
+		{ 0x5143, "Qualcomm" },
+		{ 0x8086, "INTEL" },
+		{ 0, nullptr }, // Terminator entry: the search loop ends when it reaches a nullptr name.
+	};
+	device_name = gpu_props.deviceName;
+	pipeline_cache_id = String::hex_encode_buffer(gpu_props.pipelineCacheUUID, VK_UUID_SIZE);
+	pipeline_cache_id += "-driver-" + itos(gpu_props.driverVersion);
+	{
+		device_vendor = "Unknown";
+		uint32_t vendor_idx = 0;
+		while (vendor_names[vendor_idx].name != nullptr) {
+			if (gpu_props.vendorID == vendor_names[vendor_idx].id) {
+				device_vendor = vendor_names[vendor_idx].name;
+				break;
+			}
+			vendor_idx++;
+		}
+	}
+#ifdef DEBUG_ENABLED
+	print_line("Using Vulkan Device #" + itos(device_index) + ": " + device_vendor + " - " + device_name);
+#endif
+	device_api_version = gpu_props.apiVersion;
+
 	err = vkEnumerateDeviceExtensionProperties(gpu, nullptr, &device_extension_count, nullptr);
 	ERR_FAIL_COND_V(err, ERR_CANT_CREATE);
 
@@ -498,7 +533,6 @@ Error VulkanContext::_create_physical_device() {
 				break;
 		}
 	}
-	vkGetPhysicalDeviceProperties(gpu, &gpu_props);
 
 	/* Call with NULL data to get count */
 	vkGetPhysicalDeviceQueueFamilyProperties(gpu, &queue_family_count, nullptr);
@@ -565,6 +599,7 @@ Error VulkanContext::_create_device() {
 	}
 	err = vkCreateDevice(gpu, &sdevice, nullptr, &device);
 	ERR_FAIL_COND_V(err, ERR_CANT_CREATE);
+
 	return OK;
 }
 
@@ -1590,11 +1625,12 @@ void VulkanContext::command_begin_label(VkCommandBuffer p_command_buffer, String
 	if (!enabled_debug_utils) { // Labels are only recorded when enabled_debug_utils is set.
 		return;
 	}
+
+	CharString cs = p_label_name.utf8(); // Bind the UTF-8 buffer directly (no extra copy through get_data()); it must outlive the pointer stored in the label.
 	VkDebugUtilsLabelEXT label;
 	label.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT;
 	label.pNext = nullptr;
-	CharString label_name = p_label_name.utf8();
-	label.pLabelName = label_name.get_data();
+	label.pLabelName = cs.get_data(); // Points into cs, which stays alive for the rest of this function.
 	label.color[0] = p_color[0];
 	label.color[1] = p_color[1];
 	label.color[2] = p_color[2];
@@ -1606,11 +1642,11 @@ void VulkanContext::command_insert_label(VkCommandBuffer p_command_buffer, Strin
 	if (!enabled_debug_utils) { // Labels are only recorded when enabled_debug_utils is set.
 		return;
 	}
+	CharString cs = p_label_name.utf8(); // Bind the UTF-8 buffer directly (no extra copy through get_data()); it must outlive the pointer stored in the label.
 	VkDebugUtilsLabelEXT label;
 	label.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT;
 	label.pNext = nullptr;
-	CharString label_name = p_label_name.utf8();
-	label.pLabelName = label_name.get_data();
+	label.pLabelName = cs.get_data(); // Points into cs, which stays alive for the rest of this function.
 	label.color[0] = p_color[0];
 	label.color[1] = p_color[1];
 	label.color[2] = p_color[2];
@@ -1629,16 +1665,26 @@ void VulkanContext::set_object_name(VkObjectType p_object_type, uint64_t p_objec
 	if (!enabled_debug_utils) { // Object naming is skipped unless enabled_debug_utils is set.
 		return;
 	}
+	CharString obj_data = p_object_name.utf8(); // UTF-8 copy held in a local so the pointer stored below stays valid through the call at the end.
 	VkDebugUtilsObjectNameInfoEXT name_info;
 	name_info.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT;
 	name_info.pNext = nullptr;
 	name_info.objectType = p_object_type;
 	name_info.objectHandle = p_object_handle;
-	CharString object_name = p_object_name.utf8();
-	name_info.pObjectName = object_name.get_data();
+	name_info.pObjectName = obj_data.get_data(); // Borrowed pointer into obj_data.
 	SetDebugUtilsObjectNameEXT(device, &name_info); // Hands the filled-in struct to the debug-utils entry point for this device.
 }
 
+String VulkanContext::get_device_vendor_name() const { // Returns device_vendor, which _create_physical_device() sets to a vendor_names entry or "Unknown" when the vendor id has no match.
+	return device_vendor;
+}
+String VulkanContext::get_device_name() const { // Returns device_name, which _create_physical_device() copies from gpu_props.deviceName.
+	return device_name;
+}
+String VulkanContext::get_device_pipeline_cache_uuid() const { // Returns pipeline_cache_id: the hex-encoded gpu_props.pipelineCacheUUID followed by "-driver-" and the driver version, as built in _create_physical_device().
+	return pipeline_cache_id;
+}
+
 VulkanContext::VulkanContext() {
 	use_validation_layers = Engine::get_singleton()->is_validation_layers_enabled();
 

+ 9 - 0
drivers/vulkan/vulkan_context.h

@@ -57,6 +57,11 @@ class VulkanContext {
 	bool device_initialized = false;
 	bool inst_initialized = false;
 
+	String device_vendor;
+	String device_name;
+	String pipeline_cache_id;
+	uint32_t device_api_version = 0;
+
 	bool buffers_prepared = false;
 
 	// Present queue.
@@ -215,6 +220,10 @@ public:
 	void command_end_label(VkCommandBuffer p_command_buffer);
 	void set_object_name(VkObjectType p_object_type, uint64_t p_object_handle, String p_object_name);
 
+	String get_device_vendor_name() const;
+	String get_device_name() const;
+	String get_device_pipeline_cache_uuid() const;
+
 	VulkanContext();
 	virtual ~VulkanContext();
 };

+ 0 - 1
scene/resources/sky_material.cpp

@@ -597,5 +597,4 @@ PhysicalSkyMaterial::PhysicalSkyMaterial() {
 
 PhysicalSkyMaterial::~PhysicalSkyMaterial() { // Destructor: frees the shader RID through RS::get_singleton().
 	RS::get_singleton()->free(shader);
-	RS::get_singleton()->material_set_shader(_get_material(), RID());
 }

+ 14 - 8
servers/rendering/renderer_rd/cluster_builder_rd.cpp

@@ -400,12 +400,14 @@ void ClusterBuilderRD::begin(const Transform &p_view_transform, const CameraMatr
 void ClusterBuilderRD::bake_cluster() {
 	RENDER_TIMESTAMP(">Bake Cluster");
 
+	RD::get_singleton()->draw_command_begin_label("Bake Light Cluster");
+
 	//clear cluster buffer
-	RD::get_singleton()->buffer_clear(cluster_buffer, 0, cluster_buffer_size);
+	RD::get_singleton()->buffer_clear(cluster_buffer, 0, cluster_buffer_size, 0);
 
 	if (render_element_count > 0) {
 		//clear render buffer
-		RD::get_singleton()->buffer_clear(cluster_render_buffer, 0, cluster_render_buffer_size);
+		RD::get_singleton()->buffer_clear(cluster_render_buffer, 0, cluster_render_buffer_size, 0);
 
 		{ //fill state uniform
 
@@ -420,15 +422,16 @@ void ClusterBuilderRD::bake_cluster() {
 			state.cluster_depth_offset = (render_element_max / 32);
 			state.cluster_data_size = state.cluster_depth_offset + render_element_max;
 
-			RD::get_singleton()->buffer_update(state_uniform, 0, sizeof(StateUniform), &state);
+			RD::get_singleton()->buffer_update(state_uniform, 0, sizeof(StateUniform), &state, 0);
 		}
 
 		//update instances
 
-		RD::get_singleton()->buffer_update(element_buffer, 0, sizeof(RenderElementData) * render_element_count, render_elements);
+		RD::get_singleton()->buffer_update(element_buffer, 0, sizeof(RenderElementData) * render_element_count, render_elements, 0);
 
 		RENDER_TIMESTAMP("Render Elements");
 
+		RD::get_singleton()->barrier(RD::BARRIER_MASK_TRANSFER, RD::BARRIER_MASK_RASTER);
 		//render elements
 		{
 			RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(framebuffer, RD::INITIAL_ACTION_DROP, RD::FINAL_ACTION_DISCARD, RD::INITIAL_ACTION_DROP, RD::FINAL_ACTION_DISCARD);
@@ -469,7 +472,7 @@ void ClusterBuilderRD::bake_cluster() {
 				RD::get_singleton()->draw_list_draw(draw_list, true, instances);
 				i += instances;
 			}
-			RD::get_singleton()->draw_list_end();
+			RD::get_singleton()->draw_list_end(RD::BARRIER_MASK_COMPUTE);
 		}
 		//store elements
 		RENDER_TIMESTAMP("Pack Elements");
@@ -491,12 +494,15 @@ void ClusterBuilderRD::bake_cluster() {
 
 			RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(ClusterBuilderSharedDataRD::ClusterStore::PushConstant));
 
-			RD::get_singleton()->compute_list_dispatch_threads(compute_list, cluster_screen_size.x, cluster_screen_size.y, 1, 8, 8, 1);
+			RD::get_singleton()->compute_list_dispatch_threads(compute_list, cluster_screen_size.x, cluster_screen_size.y, 1);
 
-			RD::get_singleton()->compute_list_end();
+			RD::get_singleton()->compute_list_end(RD::BARRIER_MASK_RASTER | RD::BARRIER_MASK_COMPUTE);
 		}
+	} else {
+		RD::get_singleton()->barrier(RD::BARRIER_MASK_TRANSFER, RD::BARRIER_MASK_RASTER | RD::BARRIER_MASK_COMPUTE);
 	}
 	RENDER_TIMESTAMP("<Bake Cluster");
+	RD::get_singleton()->draw_command_end_label();
 }
 
 void ClusterBuilderRD::debug(ElementType p_element) {
@@ -519,7 +525,7 @@ void ClusterBuilderRD::debug(ElementType p_element) {
 
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(ClusterBuilderSharedDataRD::ClusterDebug::PushConstant));
 
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, screen_size.x, screen_size.y, 1, 8, 8, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, screen_size.x, screen_size.y, 1);
 
 	RD::get_singleton()->compute_list_end();
 }

+ 57 - 100
servers/rendering/renderer_rd/effects_rd.cpp

@@ -299,15 +299,12 @@ void EffectsRD::copy_to_rect(RID p_source_rd_texture, RID p_dest_texture, const
 	copy.push_constant.target[0] = p_rect.position.x;
 	copy.push_constant.target[1] = p_rect.position.y;
 
-	int32_t x_groups = (p_rect.size.width - 1) / 8 + 1;
-	int32_t y_groups = (p_rect.size.height - 1) / 8 + 1;
-
 	RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
 	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, copy.pipelines[p_8_bit_dst ? COPY_MODE_SIMPLY_COPY_8BIT : COPY_MODE_SIMPLY_COPY]);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_source_rd_texture), 0);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_dest_texture), 3);
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy.push_constant, sizeof(CopyPushConstant));
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_rect.size.width, p_rect.size.height, 1);
 	RD::get_singleton()->compute_list_end();
 }
 
@@ -322,15 +319,12 @@ void EffectsRD::copy_cubemap_to_panorama(RID p_source_cube, RID p_dest_panorama,
 	copy.push_constant.target[1] = 0;
 	copy.push_constant.camera_z_far = p_lod;
 
-	int32_t x_groups = (p_panorama_size.width - 1) / 8 + 1;
-	int32_t y_groups = (p_panorama_size.height - 1) / 8 + 1;
-
 	RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
 	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, copy.pipelines[p_is_array ? COPY_MODE_CUBE_ARRAY_TO_PANORAMA : COPY_MODE_CUBE_TO_PANORAMA]);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_source_cube), 0);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_dest_panorama), 3);
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy.push_constant, sizeof(CopyPushConstant));
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_panorama_size.width, p_panorama_size.height, 1);
 	RD::get_singleton()->compute_list_end();
 }
 
@@ -349,15 +343,12 @@ void EffectsRD::copy_depth_to_rect_and_linearize(RID p_source_rd_texture, RID p_
 	copy.push_constant.camera_z_far = p_z_far;
 	copy.push_constant.camera_z_near = p_z_near;
 
-	int32_t x_groups = (p_rect.size.width - 1) / 8 + 1;
-	int32_t y_groups = (p_rect.size.height - 1) / 8 + 1;
-
 	RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
 	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, copy.pipelines[COPY_MODE_LINEARIZE_DEPTH]);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_source_rd_texture), 0);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_dest_texture), 3);
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy.push_constant, sizeof(CopyPushConstant));
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_rect.size.width, p_rect.size.height, 1);
 	RD::get_singleton()->compute_list_end();
 }
 
@@ -374,15 +365,12 @@ void EffectsRD::copy_depth_to_rect(RID p_source_rd_texture, RID p_dest_texture,
 	copy.push_constant.target[0] = p_rect.position.x;
 	copy.push_constant.target[1] = p_rect.position.y;
 
-	int32_t x_groups = (p_rect.size.width - 1) / 8 + 1;
-	int32_t y_groups = (p_rect.size.height - 1) / 8 + 1;
-
 	RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
 	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, copy.pipelines[COPY_MODE_SIMPLY_COPY_DEPTH]);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_source_rd_texture), 0);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_dest_texture), 3);
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy.push_constant, sizeof(CopyPushConstant));
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_rect.size.width, p_rect.size.height, 1);
 	RD::get_singleton()->compute_list_end();
 }
 
@@ -400,14 +388,11 @@ void EffectsRD::set_color(RID p_dest_texture, const Color &p_color, const Rect2i
 	copy.push_constant.set_color[2] = p_color.b;
 	copy.push_constant.set_color[3] = p_color.a;
 
-	int32_t x_groups = (p_region.size.width - 1) / 8 + 1;
-	int32_t y_groups = (p_region.size.height - 1) / 8 + 1;
-
 	RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
 	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, copy.pipelines[p_8bit_dst ? COPY_MODE_SET_COLOR_8BIT : COPY_MODE_SET_COLOR]);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_dest_texture), 3);
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy.push_constant, sizeof(CopyPushConstant));
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_region.size.width, p_region.size.height, 1);
 	RD::get_singleton()->compute_list_end();
 }
 
@@ -420,8 +405,6 @@ void EffectsRD::gaussian_blur(RID p_source_rd_texture, RID p_texture, RID p_back
 	copy.push_constant.section[2] = p_region.size.width;
 	copy.push_constant.section[3] = p_region.size.height;
 
-	int32_t x_groups = (p_region.size.width - 1) / 8 + 1;
-	int32_t y_groups = (p_region.size.height - 1) / 8 + 1;
 	//HORIZONTAL
 	RD::DrawListID compute_list = RD::get_singleton()->compute_list_begin();
 	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, copy.pipelines[p_8bit_dst ? COPY_MODE_GAUSSIAN_COPY_8BIT : COPY_MODE_GAUSSIAN_COPY]);
@@ -431,7 +414,7 @@ void EffectsRD::gaussian_blur(RID p_source_rd_texture, RID p_texture, RID p_back
 	copy.push_constant.flags = base_flags | COPY_FLAG_HORIZONTAL;
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy.push_constant, sizeof(CopyPushConstant));
 
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_region.size.width, p_region.size.height, 1);
 
 	RD::get_singleton()->compute_list_add_barrier(compute_list);
 
@@ -442,7 +425,7 @@ void EffectsRD::gaussian_blur(RID p_source_rd_texture, RID p_texture, RID p_back
 	copy.push_constant.flags = base_flags;
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy.push_constant, sizeof(CopyPushConstant));
 
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_region.size.width, p_region.size.height, 1);
 	RD::get_singleton()->compute_list_end();
 }
 
@@ -452,9 +435,6 @@ void EffectsRD::gaussian_glow(RID p_source_rd_texture, RID p_back_texture, const
 	CopyMode copy_mode = p_first_pass && p_auto_exposure.is_valid() ? COPY_MODE_GAUSSIAN_GLOW_AUTO_EXPOSURE : COPY_MODE_GAUSSIAN_GLOW;
 	uint32_t base_flags = 0;
 
-	int32_t x_groups = (p_size.width + 7) / 8;
-	int32_t y_groups = (p_size.height + 7) / 8;
-
 	copy.push_constant.section[2] = p_size.x;
 	copy.push_constant.section[3] = p_size.y;
 
@@ -479,16 +459,13 @@ void EffectsRD::gaussian_glow(RID p_source_rd_texture, RID p_back_texture, const
 	copy.push_constant.flags = base_flags | (p_first_pass ? COPY_FLAG_GLOW_FIRST_PASS : 0) | (p_high_quality ? COPY_FLAG_HIGH_QUALITY_GLOW : 0);
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy.push_constant, sizeof(CopyPushConstant));
 
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_size.width, p_size.height, 1);
 	RD::get_singleton()->compute_list_end();
 }
 
 void EffectsRD::screen_space_reflection(RID p_diffuse, RID p_normal_roughness, RenderingServer::EnvironmentSSRRoughnessQuality p_roughness_quality, RID p_blur_radius, RID p_blur_radius2, RID p_metallic, const Color &p_metallic_mask, RID p_depth, RID p_scale_depth, RID p_scale_normal, RID p_output, RID p_output_blur, const Size2i &p_screen_size, int p_max_steps, float p_fade_in, float p_fade_out, float p_tolerance, const CameraMatrix &p_camera) {
 	RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
 
-	int32_t x_groups = (p_screen_size.width - 1) / 8 + 1;
-	int32_t y_groups = (p_screen_size.height - 1) / 8 + 1;
-
 	{ //scale color and depth to half
 		ssr_scale.push_constant.camera_z_far = p_camera.get_z_far();
 		ssr_scale.push_constant.camera_z_near = p_camera.get_z_near();
@@ -506,7 +483,7 @@ void EffectsRD::screen_space_reflection(RID p_diffuse, RID p_normal_roughness, R
 
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &ssr_scale.push_constant, sizeof(ScreenSpaceReflectionScalePushConstant));
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_screen_size.width, p_screen_size.height, 1);
 
 		RD::get_singleton()->compute_list_add_barrier(compute_list);
 	}
@@ -547,7 +524,7 @@ void EffectsRD::screen_space_reflection(RID p_diffuse, RID p_normal_roughness, R
 		}
 		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_scale_normal), 2);
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_screen_size.width, p_screen_size.height, 1);
 	}
 
 	if (p_roughness_quality != RS::ENV_SSR_ROUGNESS_QUALITY_DISABLED) {
@@ -585,7 +562,7 @@ void EffectsRD::screen_space_reflection(RID p_diffuse, RID p_normal_roughness, R
 
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &ssr_filter.push_constant, sizeof(ScreenSpaceReflectionFilterPushConstant));
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_screen_size.width, p_screen_size.height, 1);
 
 		RD::get_singleton()->compute_list_add_barrier(compute_list);
 
@@ -600,7 +577,7 @@ void EffectsRD::screen_space_reflection(RID p_diffuse, RID p_normal_roughness, R
 
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &ssr_filter.push_constant, sizeof(ScreenSpaceReflectionFilterPushConstant));
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_screen_size.width, p_screen_size.height, 1);
 	}
 
 	RD::get_singleton()->compute_list_end();
@@ -609,9 +586,6 @@ void EffectsRD::screen_space_reflection(RID p_diffuse, RID p_normal_roughness, R
 void EffectsRD::sub_surface_scattering(RID p_diffuse, RID p_diffuse2, RID p_depth, const CameraMatrix &p_camera, const Size2i &p_screen_size, float p_scale, float p_depth_scale, RenderingServer::SubSurfaceScatteringQuality p_quality) {
 	RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
 
-	int32_t x_groups = (p_screen_size.width - 1) / 8 + 1;
-	int32_t y_groups = (p_screen_size.height - 1) / 8 + 1;
-
 	Plane p = p_camera.xform4(Plane(1, 0, -1, 1));
 	p.normal /= p.d;
 	float unit_size = p.normal.x;
@@ -635,7 +609,7 @@ void EffectsRD::sub_surface_scattering(RID p_diffuse, RID p_diffuse2, RID p_dept
 
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &sss.push_constant, sizeof(SubSurfaceScatteringPushConstant));
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_screen_size.width, p_screen_size.height, 1);
 
 		RD::get_singleton()->compute_list_add_barrier(compute_list);
 
@@ -646,7 +620,7 @@ void EffectsRD::sub_surface_scattering(RID p_diffuse, RID p_diffuse2, RID p_dept
 		sss.push_constant.vertical = true;
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &sss.push_constant, sizeof(SubSurfaceScatteringPushConstant));
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_screen_size.width, p_screen_size.height, 1);
 
 		RD::get_singleton()->compute_list_end();
 	}
@@ -690,15 +664,12 @@ void EffectsRD::make_mipmap(RID p_source_rd_texture, RID p_dest_texture, const S
 	copy.push_constant.section[2] = p_size.width;
 	copy.push_constant.section[3] = p_size.height;
 
-	int32_t x_groups = (p_size.width - 1) / 8 + 1;
-	int32_t y_groups = (p_size.height - 1) / 8 + 1;
-
 	RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
 	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, copy.pipelines[COPY_MODE_MIPMAP]);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_source_rd_texture), 0);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_dest_texture), 3);
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy.push_constant, sizeof(CopyPushConstant));
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_size.width, p_size.height, 1);
 	RD::get_singleton()->compute_list_end();
 }
 
@@ -719,7 +690,7 @@ void EffectsRD::copy_cubemap_to_dp(RID p_source_rd_texture, RID p_dst_framebuffe
 
 	RD::get_singleton()->draw_list_set_push_constant(draw_list, &push_constant, sizeof(CopyToDPPushConstant));
 	RD::get_singleton()->draw_list_draw(draw_list, true);
-	RD::get_singleton()->draw_list_end();
+	RD::get_singleton()->draw_list_end(RD::BARRIER_MASK_RASTER | RD::BARRIER_MASK_TRANSFER);
 }
 
 void EffectsRD::tonemapper(RID p_source_color, RID p_dst_framebuffer, const TonemapSettings &p_settings) {
@@ -804,10 +775,7 @@ void EffectsRD::luminance_reduction(RID p_source_texture, const Size2i p_source_
 
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &luminance_reduce.push_constant, sizeof(LuminanceReducePushConstant));
 
-		int32_t x_groups = (luminance_reduce.push_constant.source_size[0] - 1) / 8 + 1;
-		int32_t y_groups = (luminance_reduce.push_constant.source_size[1] - 1) / 8 + 1;
-
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, luminance_reduce.push_constant.source_size[0], luminance_reduce.push_constant.source_size[1], 1);
 
 		luminance_reduce.push_constant.source_size[0] = MAX(luminance_reduce.push_constant.source_size[0] / 8, 1);
 		luminance_reduce.push_constant.source_size[1] = MAX(luminance_reduce.push_constant.source_size[1] / 8, 1);
@@ -848,14 +816,12 @@ void EffectsRD::bokeh_dof(RID p_base_texture, RID p_depth_texture, const Size2i
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_base_texture), 0);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_depth_texture), 1);
 
-	int32_t x_groups = (p_base_texture_size.x - 1) / 8 + 1;
-	int32_t y_groups = (p_base_texture_size.y - 1) / 8 + 1;
 	bokeh.push_constant.size[0] = p_base_texture_size.x;
 	bokeh.push_constant.size[1] = p_base_texture_size.y;
 
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &bokeh.push_constant, sizeof(BokehPushConstant));
 
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_base_texture_size.x, p_base_texture_size.y, 1);
 	RD::get_singleton()->compute_list_add_barrier(compute_list);
 
 	if (p_bokeh_shape == RS::DOF_BOKEH_BOX || p_bokeh_shape == RS::DOF_BOKEH_HEXAGON) {
@@ -872,8 +838,6 @@ void EffectsRD::bokeh_dof(RID p_base_texture, RID p_depth_texture, const Size2i
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_halfsize_texture1), 0);
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_base_texture), 1);
 
-			x_groups = ((p_base_texture_size.x >> 1) - 1) / 8 + 1;
-			y_groups = ((p_base_texture_size.y >> 1) - 1) / 8 + 1;
 			bokeh.push_constant.size[0] = p_base_texture_size.x >> 1;
 			bokeh.push_constant.size[1] = p_base_texture_size.y >> 1;
 			bokeh.push_constant.half_size = true;
@@ -887,7 +851,7 @@ void EffectsRD::bokeh_dof(RID p_base_texture, RID p_depth_texture, const Size2i
 
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &bokeh.push_constant, sizeof(BokehPushConstant));
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, bokeh.push_constant.size[0], bokeh.push_constant.size[1], 1);
 		RD::get_singleton()->compute_list_add_barrier(compute_list);
 
 		//third pass
@@ -903,7 +867,7 @@ void EffectsRD::bokeh_dof(RID p_base_texture, RID p_depth_texture, const Size2i
 
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &bokeh.push_constant, sizeof(BokehPushConstant));
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, bokeh.push_constant.size[0], bokeh.push_constant.size[1], 1);
 		RD::get_singleton()->compute_list_add_barrier(compute_list);
 
 		if (p_quality == RS::DOF_BLUR_QUALITY_VERY_LOW || p_quality == RS::DOF_BLUR_QUALITY_LOW) {
@@ -914,8 +878,6 @@ void EffectsRD::bokeh_dof(RID p_base_texture, RID p_depth_texture, const Size2i
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_base_texture), 0);
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_halfsize_texture2), 1);
 
-			x_groups = (p_base_texture_size.x - 1) / 8 + 1;
-			y_groups = (p_base_texture_size.y - 1) / 8 + 1;
 			bokeh.push_constant.size[0] = p_base_texture_size.x;
 			bokeh.push_constant.size[1] = p_base_texture_size.y;
 			bokeh.push_constant.half_size = false;
@@ -923,7 +885,7 @@ void EffectsRD::bokeh_dof(RID p_base_texture, RID p_depth_texture, const Size2i
 
 			RD::get_singleton()->compute_list_set_push_constant(compute_list, &bokeh.push_constant, sizeof(BokehPushConstant));
 
-			RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+			RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_base_texture_size.x, p_base_texture_size.y, 1);
 		}
 	} else {
 		//circle
@@ -941,15 +903,13 @@ void EffectsRD::bokeh_dof(RID p_base_texture, RID p_depth_texture, const Size2i
 		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_halfsize_texture1), 0);
 		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_base_texture), 1);
 
-		x_groups = ((p_base_texture_size.x >> 1) - 1) / 8 + 1;
-		y_groups = ((p_base_texture_size.y >> 1) - 1) / 8 + 1;
 		bokeh.push_constant.size[0] = p_base_texture_size.x >> 1;
 		bokeh.push_constant.size[1] = p_base_texture_size.y >> 1;
 		bokeh.push_constant.half_size = true;
 
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &bokeh.push_constant, sizeof(BokehPushConstant));
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, bokeh.push_constant.size[0], bokeh.push_constant.size[1], 1);
 		RD::get_singleton()->compute_list_add_barrier(compute_list);
 
 		//circle is just one pass, then upscale
@@ -961,8 +921,6 @@ void EffectsRD::bokeh_dof(RID p_base_texture, RID p_depth_texture, const Size2i
 		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_base_texture), 0);
 		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_halfsize_texture1), 1);
 
-		x_groups = (p_base_texture_size.x - 1) / 8 + 1;
-		y_groups = (p_base_texture_size.y - 1) / 8 + 1;
 		bokeh.push_constant.size[0] = p_base_texture_size.x;
 		bokeh.push_constant.size[1] = p_base_texture_size.y;
 		bokeh.push_constant.half_size = false;
@@ -970,7 +928,7 @@ void EffectsRD::bokeh_dof(RID p_base_texture, RID p_depth_texture, const Size2i
 
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &bokeh.push_constant, sizeof(BokehPushConstant));
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_base_texture_size.x, p_base_texture_size.y, 1);
 	}
 
 	RD::get_singleton()->compute_list_end();
@@ -995,10 +953,9 @@ void EffectsRD::gather_ssao(RD::ComputeListID p_compute_list, const Vector<RID>
 		RD::get_singleton()->compute_list_bind_uniform_set(p_compute_list, _get_uniform_set_from_image(p_ao_slices[i]), 2);
 		RD::get_singleton()->compute_list_set_push_constant(p_compute_list, &ssao.gather_push_constant, sizeof(SSAOGatherPushConstant));
 
-		int x_groups = ((p_settings.full_screen_size.x >> (p_settings.half_size ? 2 : 1)) - 1) / 8 + 1;
-		int y_groups = ((p_settings.full_screen_size.y >> (p_settings.half_size ? 2 : 1)) - 1) / 8 + 1;
+		Size2i size = Size2i(p_settings.full_screen_size.x >> (p_settings.half_size ? 2 : 1), p_settings.full_screen_size.y >> (p_settings.half_size ? 2 : 1));
 
-		RD::get_singleton()->compute_list_dispatch(p_compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(p_compute_list, size.x, size.y, 1);
 	}
 	RD::get_singleton()->compute_list_add_barrier(p_compute_list);
 }
@@ -1072,10 +1029,9 @@ void EffectsRD::generate_ssao(RID p_depth_buffer, RID p_normal_buffer, RID p_dep
 		}
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &ssao.downsample_push_constant, sizeof(SSAODownsamplePushConstant));
 
-		int x_groups = (MAX(1, p_settings.full_screen_size.x >> (p_settings.half_size ? 2 : 1)) - 1) / 8 + 1;
-		int y_groups = (MAX(1, p_settings.full_screen_size.y >> (p_settings.half_size ? 2 : 1)) - 1) / 8 + 1;
+		Size2i size(MAX(1, p_settings.full_screen_size.x >> (p_settings.half_size ? 2 : 1)), MAX(1, p_settings.full_screen_size.y >> (p_settings.half_size ? 2 : 1)));
 
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, size.x, size.y, 1);
 		RD::get_singleton()->compute_list_add_barrier(compute_list);
 		RD::get_singleton()->draw_command_end_label(); // Downsample SSAO
 	}
@@ -1193,21 +1149,19 @@ void EffectsRD::generate_ssao(RID p_depth_buffer, RID p_normal_buffer, RID p_dep
 			RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, ssao.pipelines[SSAO_GATHER_BASE]);
 			gather_ssao(compute_list, p_ao_pong_slices, p_settings, true);
 			//generate importance map
-			int x_groups = (p_settings.quarter_screen_size.x - 1) / 8 + 1;
-			int y_groups = (p_settings.quarter_screen_size.y - 1) / 8 + 1;
 
 			RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, ssao.pipelines[SSAO_GENERATE_IMPORTANCE_MAP]);
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_ao_pong), 0);
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_importance_map), 1);
 			RD::get_singleton()->compute_list_set_push_constant(compute_list, &ssao.importance_map_push_constant, sizeof(SSAOImportanceMapPushConstant));
-			RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+			RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_settings.quarter_screen_size.x, p_settings.quarter_screen_size.y, 1);
 			RD::get_singleton()->compute_list_add_barrier(compute_list);
 			//process importance map A
 			RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, ssao.pipelines[SSAO_PROCESS_IMPORTANCE_MAPA]);
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_importance_map), 0);
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_importance_map_pong), 1);
 			RD::get_singleton()->compute_list_set_push_constant(compute_list, &ssao.importance_map_push_constant, sizeof(SSAOImportanceMapPushConstant));
-			RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+			RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_settings.quarter_screen_size.x, p_settings.quarter_screen_size.y, 1);
 			RD::get_singleton()->compute_list_add_barrier(compute_list);
 			//process Importance Map B
 			RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, ssao.pipelines[SSAO_PROCESS_IMPORTANCE_MAPB]);
@@ -1215,7 +1169,7 @@ void EffectsRD::generate_ssao(RID p_depth_buffer, RID p_normal_buffer, RID p_dep
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_importance_map), 1);
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, ssao.counter_uniform_set, 2);
 			RD::get_singleton()->compute_list_set_push_constant(compute_list, &ssao.importance_map_push_constant, sizeof(SSAOImportanceMapPushConstant));
-			RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+			RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_settings.quarter_screen_size.x, p_settings.quarter_screen_size.y, 1);
 			RD::get_singleton()->compute_list_add_barrier(compute_list);
 
 			RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, ssao.pipelines[SSAO_GATHER_ADAPTIVE]);
@@ -1272,10 +1226,8 @@ void EffectsRD::generate_ssao(RID p_depth_buffer, RID p_normal_buffer, RID p_dep
 				}
 				RD::get_singleton()->compute_list_set_push_constant(compute_list, &ssao.blur_push_constant, sizeof(SSAOBlurPushConstant));
 
-				int x_groups = ((p_settings.full_screen_size.x >> (p_settings.half_size ? 2 : 1)) - 1) / 8 + 1;
-				int y_groups = ((p_settings.full_screen_size.y >> (p_settings.half_size ? 2 : 1)) - 1) / 8 + 1;
-
-				RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+				Size2i size(p_settings.full_screen_size.x >> (p_settings.half_size ? 2 : 1), p_settings.full_screen_size.y >> (p_settings.half_size ? 2 : 1));
+				RD::get_singleton()->compute_list_dispatch_threads(compute_list, size.x, size.y, 1);
 			}
 
 			if (p_settings.quality > RS::ENV_SSAO_QUALITY_VERY_LOW) {
@@ -1313,18 +1265,15 @@ void EffectsRD::generate_ssao(RID p_depth_buffer, RID p_normal_buffer, RID p_dep
 
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &ssao.interleave_push_constant, sizeof(SSAOInterleavePushConstant));
 
-		int x_groups = (p_settings.full_screen_size.x - 1) / 8 + 1;
-		int y_groups = (p_settings.full_screen_size.y - 1) / 8 + 1;
-
-		RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_settings.full_screen_size.x, p_settings.full_screen_size.y, 1);
 		RD::get_singleton()->compute_list_add_barrier(compute_list);
 		RD::get_singleton()->draw_command_end_label(); // Interleave
 	}
 	RD::get_singleton()->draw_command_end_label(); //SSAO
-	RD::get_singleton()->compute_list_end();
+	RD::get_singleton()->compute_list_end(RD::BARRIER_MASK_TRANSFER); //wait for upcoming transfer
 
 	int zero[1] = { 0 };
-	RD::get_singleton()->buffer_update(ssao.importance_map_load_counter, 0, sizeof(uint32_t), &zero);
+	RD::get_singleton()->buffer_update(ssao.importance_map_load_counter, 0, sizeof(uint32_t), &zero, 0); //no barrier
 }
 
 void EffectsRD::roughness_limit(RID p_source_normal, RID p_roughness, const Size2i &p_size, float p_curve) {
@@ -1337,12 +1286,9 @@ void EffectsRD::roughness_limit(RID p_source_normal, RID p_roughness, const Size
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_source_normal), 0);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_roughness), 1);
 
-	int x_groups = (p_size.x - 1) / 8 + 1;
-	int y_groups = (p_size.y - 1) / 8 + 1;
-
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &roughness_limiter.push_constant, sizeof(RoughnessLimiterPushConstant)); //not used but set anyway
 
-	RD::get_singleton()->compute_list_dispatch(compute_list, x_groups, y_groups, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_size.x, p_size.y, 1);
 
 	RD::get_singleton()->compute_list_end();
 }
@@ -1455,7 +1401,7 @@ void EffectsRD::render_sky(RD::DrawListID p_list, float p_time, RID p_fb, RID p_
 	RD::get_singleton()->draw_list_draw(draw_list, true);
 }
 
-void EffectsRD::resolve_gi(RID p_source_depth, RID p_source_normal_roughness, RID p_source_giprobe, RID p_dest_depth, RID p_dest_normal_roughness, RID p_dest_giprobe, Vector2i p_screen_size, int p_samples) {
+void EffectsRD::resolve_gi(RID p_source_depth, RID p_source_normal_roughness, RID p_source_giprobe, RID p_dest_depth, RID p_dest_normal_roughness, RID p_dest_giprobe, Vector2i p_screen_size, int p_samples, uint32_t p_barrier) {
 	ResolvePushConstant push_constant;
 	push_constant.screen_size[0] = p_screen_size.x;
 	push_constant.screen_size[1] = p_screen_size.y;
@@ -1472,19 +1418,26 @@ void EffectsRD::resolve_gi(RID p_source_depth, RID p_source_normal_roughness, RI
 
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(ResolvePushConstant));
 
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_screen_size.x, p_screen_size.y, 1, 8, 8, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_screen_size.x, p_screen_size.y, 1);
 
-	RD::get_singleton()->compute_list_end();
+	RD::get_singleton()->compute_list_end(p_barrier);
 }
 
 void EffectsRD::reduce_shadow(RID p_source_shadow, RID p_dest_shadow, const Size2i &p_source_size, const Rect2i &p_source_rect, int p_shrink_limit, RD::ComputeListID compute_list) {
 	uint32_t push_constant[8] = { (uint32_t)p_source_size.x, (uint32_t)p_source_size.y, (uint32_t)p_source_rect.position.x, (uint32_t)p_source_rect.position.y, (uint32_t)p_shrink_limit, 0, 0, 0 };
 
-	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, shadow_reduce.pipelines[SHADOW_REDUCE_REDUCE]);
-	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_image_pair(p_source_shadow, p_dest_shadow), 0);
+	uint32_t height = p_source_rect.size.height; // thread rows to dispatch; halved below on the subgroup path
+	if (true) { // subgroup path is hard-coded on for now. TODO: detect subgroup support instead of assuming it
+		RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, shadow_reduce.pipelines[p_shrink_limit == 1 ? SHADOW_REDUCE_REDUCE_SUBGROUPS_8 : SHADOW_REDUCE_REDUCE_SUBGROUPS]); // the _8 variant is chosen only when p_shrink_limit is exactly 1
+		height /= 2; // because the subgroup kernel is 8x4; integer division, so an odd height rounds down
+	} else { // currently unreachable: the condition above is the literal true
+		RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, shadow_reduce.pipelines[SHADOW_REDUCE_REDUCE]);
+	}
+	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_texture(p_source_shadow), 0); // source goes in set 0
+	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_dest_shadow), 1); // destination goes in set 1 (previously both shared set 0 as an image pair)
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(uint32_t) * 8);
 
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_source_rect.size.width, p_source_rect.size.height, 1, 8, 8, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_source_rect.size.width, height, 1); // local group size is no longer passed by the caller
 }
 void EffectsRD::filter_shadow(RID p_shadow, RID p_backing_shadow, const Size2i &p_source_size, const Rect2i &p_source_rect, RenderingServer::EnvVolumetricFogShadowFilter p_filter, RD::ComputeListID compute_list, bool p_vertical, bool p_horizontal) {
 	uint32_t push_constant[8] = { (uint32_t)p_source_size.x, (uint32_t)p_source_size.y, (uint32_t)p_source_rect.position.x, (uint32_t)p_source_rect.position.y, 0, 0, 0, 0 };
@@ -1506,9 +1459,10 @@ void EffectsRD::filter_shadow(RID p_shadow, RID p_backing_shadow, const Size2i &
 	if (p_vertical) {
 		push_constant[6] = 1;
 		push_constant[7] = 0;
-		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_image_pair(p_shadow, p_backing_shadow), 0);
+		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_shadow), 0);
+		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_backing_shadow), 1);
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(uint32_t) * 8);
-		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_source_rect.size.width, p_source_rect.size.height, 1, 8, 8, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_source_rect.size.width, p_source_rect.size.height, 1);
 	}
 	if (p_vertical && p_horizontal) {
 		RD::get_singleton()->compute_list_add_barrier(compute_list);
@@ -1516,9 +1470,10 @@ void EffectsRD::filter_shadow(RID p_shadow, RID p_backing_shadow, const Size2i &
 	if (p_horizontal) {
 		push_constant[6] = 0;
 		push_constant[7] = 1;
-		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_compute_uniform_set_from_image_pair(p_backing_shadow, p_shadow), 0);
+		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_backing_shadow), 0);
+		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, _get_uniform_set_from_image(p_shadow), 1);
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(uint32_t) * 8);
-		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_source_rect.size.width, p_source_rect.size.height, 1, 8, 8, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_source_rect.size.width, p_source_rect.size.height, 1);
 	}
 }
 
@@ -2020,6 +1975,8 @@ EffectsRD::EffectsRD() {
 	{
 		Vector<String> shadow_reduce_modes;
 		shadow_reduce_modes.push_back("\n#define MODE_REDUCE\n");
+		shadow_reduce_modes.push_back("\n#define MODE_REDUCE_SUBGROUP\n");
+		shadow_reduce_modes.push_back("\n#define MODE_REDUCE_SUBGROUP\n#define MODE_REDUCE_8\n");
 		shadow_reduce_modes.push_back("\n#define MODE_FILTER\n");
 
 		shadow_reduce.shader.initialize(shadow_reduce_modes);

+ 3 - 1
servers/rendering/renderer_rd/effects_rd.h

@@ -599,6 +599,8 @@ class EffectsRD {
 
 	enum ShadowReduceMode {
 		SHADOW_REDUCE_REDUCE,
+		SHADOW_REDUCE_REDUCE_SUBGROUPS,
+		SHADOW_REDUCE_REDUCE_SUBGROUPS_8,
 		SHADOW_REDUCE_FILTER,
 		SHADOW_REDUCE_MAX
 	};
@@ -763,7 +765,7 @@ public:
 	void merge_specular(RID p_dest_framebuffer, RID p_specular, RID p_base, RID p_reflection);
 	void sub_surface_scattering(RID p_diffuse, RID p_diffuse2, RID p_depth, const CameraMatrix &p_camera, const Size2i &p_screen_size, float p_scale, float p_depth_scale, RS::SubSurfaceScatteringQuality p_quality);
 
-	void resolve_gi(RID p_source_depth, RID p_source_normal_roughness, RID p_source_giprobe, RID p_dest_depth, RID p_dest_normal_roughness, RID p_dest_giprobe, Vector2i p_screen_size, int p_samples);
+	void resolve_gi(RID p_source_depth, RID p_source_normal_roughness, RID p_source_giprobe, RID p_dest_depth, RID p_dest_normal_roughness, RID p_dest_giprobe, Vector2i p_screen_size, int p_samples, uint32_t p_barrier = RD::BARRIER_MASK_ALL);
 
 	void reduce_shadow(RID p_source_shadow, RID p_dest_shadow, const Size2i &p_source_size, const Rect2i &p_source_rect, int p_shrink_limit, RenderingDevice::ComputeListID compute_list);
 	void filter_shadow(RID p_shadow, RID p_backing_shadow, const Size2i &p_source_size, const Rect2i &p_source_rect, RS::EnvVolumetricFogShadowFilter p_filter, RenderingDevice::ComputeListID compute_list, bool p_vertical = true, bool p_horizontal = true);

文件差異過大導致無法顯示
+ 360 - 175
servers/rendering/renderer_rd/renderer_scene_render_forward.cpp


+ 153 - 126
servers/rendering/renderer_rd/renderer_scene_render_forward.h

@@ -50,6 +50,15 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 		MAX_GI_PROBES = 8,
 		MAX_LIGHTMAPS = 8,
 		MAX_GI_PROBES_PER_INSTANCE = 2,
+		INSTANCE_DATA_BUFFER_MIN_SIZE = 4096
+	};
+
+	enum RenderListType { // selects which per-type render list an operation targets
+		RENDER_LIST_OPAQUE, //used for opaque objects
+		RENDER_LIST_ALPHA, //used for transparent objects
+		RENDER_LIST_SECONDARY, //used for shadows and other objects
+		RENDER_LIST_MAX // count of the list types above; sizes per-list arrays such as render_list[] and instance_buffer[]
+
 	};
 
 	/* Scene Shader */
@@ -245,7 +254,7 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 
 	RID shadow_sampler;
 	RID render_base_uniform_set;
-	RID render_pass_uniform_set;
+	LocalVector<RID> render_pass_uniform_sets;
 	RID sdfgi_pass_uniform_set;
 
 	uint64_t lightmap_texture_array_version = 0xFFFFFFFF;
@@ -257,7 +266,58 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 
 	void _update_render_base_uniform_set();
 	RID _setup_sdfgi_render_pass_uniform_set(RID p_albedo_texture, RID p_emission_texture, RID p_emission_aniso_texture, RID p_geom_facing_texture);
-	RID _setup_render_pass_uniform_set(RID p_render_buffers, RID p_radiance_texture, RID p_shadow_atlas, RID p_reflection_atlas, RID p_cluster_buffer, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_lightmaps, bool p_use_directional_shadow_atlas = false);
+	RID _setup_render_pass_uniform_set(RenderListType p_render_list, RID p_render_buffers, RID p_radiance_texture, RID p_shadow_atlas, RID p_reflection_atlas, RID p_cluster_buffer, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_lightmaps, bool p_use_directional_shadow_atlas = false, int p_index = 0);
+
+	enum PassMode { // kind of pass being rendered; stored in RenderListParameters::pass_mode and ShadowPass::pass_mode
+		PASS_MODE_COLOR, // also the default value of RenderListParameters::pass_mode
+		PASS_MODE_COLOR_SPECULAR,
+		PASS_MODE_COLOR_TRANSPARENT,
+		PASS_MODE_SHADOW,
+		PASS_MODE_SHADOW_DP,
+		PASS_MODE_DEPTH,
+		PASS_MODE_DEPTH_NORMAL_ROUGHNESS,
+		PASS_MODE_DEPTH_NORMAL_ROUGHNESS_GIPROBE,
+		PASS_MODE_DEPTH_MATERIAL,
+		PASS_MODE_SDF,
+	};
+
+	struct GeometryInstanceSurfaceDataCache;
+	struct RenderElementInfo;
+
+	struct RenderListParameters { // argument bundle for rendering one list; the constructor below copies each argument into its member
+		GeometryInstanceSurfaceDataCache **elements = nullptr;
+		RenderElementInfo *element_info = nullptr; // NOTE(review): presumably indexed in step with elements; confirm at the call sites
+		int element_count = 0;
+		bool reverse_cull = false;
+		PassMode pass_mode = PASS_MODE_COLOR;
+		bool no_gi = false;
+		RID render_pass_uniform_set;
+		bool force_wireframe = false;
+		Vector2 uv_offset;
+		Plane lod_plane;
+		float lod_distance_multiplier = 0.0;
+		float screen_lod_threshold = 0.0;
+		RD::FramebufferFormatID framebuffer_format = 0; // not a constructor argument; stays 0 until assigned separately
+		uint32_t element_offset = 0;
+		uint32_t barrier = RD::BARRIER_MASK_ALL;
+
+		RenderListParameters(GeometryInstanceSurfaceDataCache **p_elements, RenderElementInfo *p_element_info, int p_element_count, bool p_reverse_cull, PassMode p_pass_mode, bool p_no_gi, RID p_render_pass_uniform_set, bool p_force_wireframe = false, const Vector2 &p_uv_offset = Vector2(), const Plane &p_lod_plane = Plane(), float p_lod_distance_multiplier = 0.0, float p_screen_lod_threshold = 0.0, uint32_t p_element_offset = 0, uint32_t p_barrier = RD::BARRIER_MASK_ALL) { // sets every member except framebuffer_format
+			elements = p_elements;
+			element_info = p_element_info;
+			element_count = p_element_count;
+			reverse_cull = p_reverse_cull;
+			pass_mode = p_pass_mode;
+			no_gi = p_no_gi;
+			render_pass_uniform_set = p_render_pass_uniform_set;
+			force_wireframe = p_force_wireframe;
+			uv_offset = p_uv_offset;
+			lod_plane = p_lod_plane;
+			lod_distance_multiplier = p_lod_distance_multiplier;
+			screen_lod_threshold = p_screen_lod_threshold;
+			element_offset = p_element_offset;
+			barrier = p_barrier;
+		}
+	};
 
 	struct LightmapData {
 		float normal_xform[12];
@@ -367,9 +427,24 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 			uint32_t pancake_shadows;
 		};
 
+		struct PushConstant { // four 32-bit words in total (1 + 1 + 2)
+			uint32_t base_index; // NOTE(review): presumably the first InstanceData index used by the draw; confirm against the shader
+			uint32_t uv_offset; //packed
+			uint32_t pad[2]; // unused filler words
+		};
+
+		struct InstanceData { // per-instance record; instance_data[] keeps one LocalVector of these per render list
+			float transform[16]; // 16 floats; NOTE(review): presumably a 4x4 matrix, confirm the layout against the shader
+			uint32_t flags;
+			uint32_t instance_uniforms_ofs; //base offset in global buffer for instance variables
+			uint32_t gi_offset; //GI information when using lightmapping (VCT or lightmap index)
+			uint32_t layer_mask;
+			float lightmap_uv_scale[4];
+		};
+
 		UBO ubo;
 
-		RID uniform_buffer;
+		LocalVector<RID> uniform_buffers;
 
 		LightmapData lightmaps[MAX_LIGHTMAPS];
 		RID lightmap_ids[MAX_LIGHTMAPS];
@@ -378,6 +453,10 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 		uint32_t max_lightmaps;
 		RID lightmap_buffer;
 
+		RID instance_buffer[RENDER_LIST_MAX];
+		uint32_t instance_buffer_size[RENDER_LIST_MAX] = { 0, 0, 0 };
+		LocalVector<InstanceData> instance_data[RENDER_LIST_MAX];
+
 		LightmapCaptureData *lightmap_captures;
 		uint32_t max_lightmap_captures;
 		RID lightmap_capture_buffer;
@@ -390,10 +469,29 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 		bool used_depth_texture = false;
 		bool used_sss = false;
 
+		struct ShadowPass { // one recorded shadow pass; instances are stored in shadow_passes
+			uint32_t element_from; // NOTE(review): this struct has no default member initializers, so scalar fields are indeterminate unless value-initialized or assigned
+			uint32_t element_count;
+			bool flip_cull;
+			PassMode pass_mode;
+
+			RID rp_uniform_set;
+			Plane camera_plane;
+			float lod_distance_multiplier;
+			float screen_lod_threshold;
+
+			RID framebuffer;
+			RD::InitialAction initial_depth_action;
+			RD::FinalAction final_depth_action;
+			Rect2i rect;
+		};
+
+		LocalVector<ShadowPass> shadow_passes;
+
 	} scene_state;
 
 	static RendererSceneRenderForward *singleton;
-	uint64_t render_pass;
+
 	double time;
 	RID default_shader;
 	RID default_material;
@@ -407,51 +505,15 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 	RID default_vec4_xform_buffer;
 	RID default_vec4_xform_uniform_set;
 
-	enum PassMode {
-		PASS_MODE_COLOR,
-		PASS_MODE_COLOR_SPECULAR,
-		PASS_MODE_COLOR_TRANSPARENT,
-		PASS_MODE_SHADOW,
-		PASS_MODE_SHADOW_DP,
-		PASS_MODE_DEPTH,
-		PASS_MODE_DEPTH_NORMAL_ROUGHNESS,
-		PASS_MODE_DEPTH_NORMAL_ROUGHNESS_GIPROBE,
-		PASS_MODE_DEPTH_MATERIAL,
-		PASS_MODE_SDF,
-	};
-
-	void _setup_environment(RID p_environment, RID p_render_buffers, const CameraMatrix &p_cam_projection, const Transform &p_cam_transform, RID p_reflection_probe, bool p_no_fog, const Size2i &p_screen_size, uint32_t p_cluster_size, uint32_t p_max_cluster_elements, RID p_shadow_atlas, bool p_flip_y, const Color &p_default_bg_color, float p_znear, float p_zfar, bool p_opaque_render_buffers = false, bool p_pancake_shadows = false);
+	void _setup_environment(RID p_environment, RID p_render_buffers, const CameraMatrix &p_cam_projection, const Transform &p_cam_transform, RID p_reflection_probe, bool p_no_fog, const Size2i &p_screen_size, uint32_t p_cluster_size, uint32_t p_max_cluster_elements, RID p_shadow_atlas, bool p_flip_y, const Color &p_default_bg_color, float p_znear, float p_zfar, bool p_opaque_render_buffers = false, bool p_pancake_shadows = false, int p_index = 0);
 	void _setup_giprobes(const PagedArray<RID> &p_giprobes);
 	void _setup_lightmaps(const PagedArray<RID> &p_lightmaps, const Transform &p_cam_transform);
 
-	struct GeometryInstanceSurfaceDataCache;
-
-	struct RenderListParameters {
-		GeometryInstanceSurfaceDataCache **elements = nullptr;
-		int element_count = 0;
-		bool reverse_cull = false;
-		PassMode pass_mode = PASS_MODE_COLOR;
-		bool no_gi = false;
-		RID render_pass_uniform_set;
-		bool force_wireframe = false;
-		Vector2 uv_offset;
-		Plane lod_plane;
-		float lod_distance_multiplier = 0.0;
-		float screen_lod_threshold = 0.0;
-		RD::FramebufferFormatID framebuffer_format = 0;
-		RenderListParameters(GeometryInstanceSurfaceDataCache **p_elements, int p_element_count, bool p_reverse_cull, PassMode p_pass_mode, bool p_no_gi, RID p_render_pass_uniform_set, bool p_force_wireframe = false, const Vector2 &p_uv_offset = Vector2(), const Plane &p_lod_plane = Plane(), float p_lod_distance_multiplier = 0.0, float p_screen_lod_threshold = 0.0) {
-			elements = p_elements;
-			element_count = p_element_count;
-			reverse_cull = p_reverse_cull;
-			pass_mode = p_pass_mode;
-			no_gi = p_no_gi;
-			render_pass_uniform_set = p_render_pass_uniform_set;
-			force_wireframe = p_force_wireframe;
-			uv_offset = p_uv_offset;
-			lod_plane = p_lod_plane;
-			lod_distance_multiplier = p_lod_distance_multiplier;
-			screen_lod_threshold = p_screen_lod_threshold;
-		}
+	struct RenderElementInfo { // per-element info held in bit-fields whose widths total 32 (22 + 1 + 1 + 8)
+		uint32_t repeat : 22; // 22 bits, so the largest storable value is 4194303
+		uint32_t uses_lightmap : 1;
+		uint32_t uses_forward_gi : 1;
+		uint32_t lod_index : 8; // same 8-bit width as lod_index in the sort key
 	};
 
 	template <PassMode p_pass_mode>
@@ -465,7 +527,9 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 
 	uint32_t render_list_thread_threshold = 500;
 
-	void _fill_render_list(const PagedArray<GeometryInstance *> &p_instances, PassMode p_pass_mode, const CameraMatrix &p_cam_projection, const Transform &p_cam_transform, bool p_using_sdfgi = false, bool p_using_opaque_gi = false);
+	void _update_instance_data_buffer(RenderListType p_render_list);
+	void _fill_instance_data(RenderListType p_render_list, uint32_t p_offset = 0, int32_t p_max_elements = -1, bool p_update_buffer = true);
+	void _fill_render_list(RenderListType p_render_list, const PagedArray<GeometryInstance *> &p_instances, PassMode p_pass_mode, const CameraMatrix &p_cam_projection, const Transform &p_cam_transform, bool p_using_sdfgi = false, bool p_using_opaque_gi = false, const Plane &p_lod_camera_plane = Plane(), float p_lod_distance_multiplier = 0.0, float p_screen_lod_threshold = 0.0, bool p_append = false);
 
 	Map<Size2i, RID> sdfgi_framebuffer_size_cache;
 
@@ -493,14 +557,17 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 
 		union {
 			struct {
-				uint32_t geometry_id;
-				uint32_t material_id;
-				uint32_t shader_id;
-				uint32_t surface_type : 4;
-				uint32_t uses_forward_gi : 1; //set during addition
-				uint32_t uses_lightmap : 1; //set during addition
-				uint32_t depth_layer : 4; //set during addition
-				uint32_t priority : 8;
+				uint64_t lod_index : 8;
+				uint64_t surface_index : 10;
+				uint64_t geometry_id : 32;
+				uint64_t material_id_low : 14;
+
+				uint64_t material_id_hi : 18;
+				uint64_t shader_id : 32;
+				uint64_t uses_forward_gi : 1;
+				uint64_t uses_lightmap : 1;
+				uint64_t depth_layer : 4;
+				uint64_t priority : 8;
 			};
 			struct {
 				uint64_t sort_key1;
@@ -532,20 +599,20 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 		float lod_model_scale = 1.0;
 		AABB transformed_aabb; //needed for LOD
 		float depth = 0;
-		struct PushConstant {
-			float transform[16];
-			uint32_t flags;
-			uint32_t instance_uniforms_ofs; //base offset in global buffer for instance variables
-			uint32_t gi_offset; //GI information when using lightmapping (VCT or lightmap index)
-			uint32_t layer_mask;
-			float lightmap_uv_scale[4];
-		} push_constant;
+		uint32_t gi_offset_cache = 0;
+		uint32_t flags_cache = 0;
+		bool store_transform_cache = true;
+		int32_t shader_parameters_offset = -1;
+		uint32_t lightmap_slice_index;
+		Rect2 lightmap_uv_scale;
+		uint32_t layer_mask = 1;
 		RID transforms_uniform_set;
 		uint32_t instance_count = 0;
 		RID mesh_instance;
 		bool can_sdfgi = false;
 		//used during setup
 		uint32_t base_flags = 0;
+		Transform transform;
 		RID gi_probes[MAX_GI_PROBES_PER_INSTANCE];
 		RID lightmap_instance;
 		GeometryInstanceLightmapSH *lightmap_sh = nullptr;
@@ -558,21 +625,14 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 			RS::InstanceType base_type;
 
 			RID skeleton;
-
-			uint32_t layer_mask = 1;
-
 			Vector<RID> surface_materials;
 			RID material_override;
-			Transform transform;
 			AABB aabb;
-			int32_t shader_parameters_offset = -1;
 
 			bool use_dynamic_gi = false;
 			bool use_baked_light = false;
 			bool cast_double_sided_shaodows = false;
 			bool mirror = false;
-			Rect2 lightmap_uv_scale;
-			uint32_t lightmap_slice_index = 0;
 			bool dirty_dependencies = false;
 
 			RendererStorage::DependencyTracker dependency_tracker;
@@ -604,16 +664,12 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 	/* Render List */
 
 	struct RenderList {
-		int max_elements;
-
-		GeometryInstanceSurfaceDataCache **elements = nullptr;
-
-		int element_count;
-		int alpha_element_count;
+		LocalVector<GeometryInstanceSurfaceDataCache *> elements;
+		LocalVector<RenderElementInfo> element_info;
 
 		void clear() {
-			element_count = 0;
-			alpha_element_count = 0;
+			elements.clear(); // both vectors are emptied together
+			element_info.clear();
 		}
 
 		//should eventually be replaced by radix
@@ -624,13 +680,14 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 			}
 		};
 
-		void sort_by_key(bool p_alpha) {
+		void sort_by_key() { // the alpha flag is gone: this list no longer has a separate alpha region
 			SortArray<GeometryInstanceSurfaceDataCache *, SortByKey> sorter;
-			if (p_alpha) {
-				sorter.sort(&elements[max_elements - alpha_element_count], alpha_element_count);
-			} else {
-				sorter.sort(elements, element_count);
-			}
+			sorter.sort(elements.ptr(), elements.size()); // sorts every element using SortByKey
+		}
+
+		void sort_by_key_range(uint32_t p_from, uint32_t p_size) { // sorts only the p_size elements starting at index p_from
+			SortArray<GeometryInstanceSurfaceDataCache *, SortByKey> sorter;
+			sorter.sort(elements.ptr() + p_from, p_size); // no bounds check here; the caller must keep p_from + p_size within elements.size()
 		}
 
 		struct SortByDepth {
@@ -639,14 +696,10 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 			}
 		};
 
-		void sort_by_depth(bool p_alpha) { //used for shadows
+		void sort_by_depth() { //used for shadows
 
 			SortArray<GeometryInstanceSurfaceDataCache *, SortByDepth> sorter;
-			if (p_alpha) {
-				sorter.sort(&elements[max_elements - alpha_element_count], alpha_element_count);
-			} else {
-				sorter.sort(elements, element_count);
-			}
+			sorter.sort(elements.ptr(), elements.size()); // sorts every element using SortByDepth
 		}
 
 		struct SortByReverseDepthAndPriority {
@@ -658,50 +711,24 @@ class RendererSceneRenderForward : public RendererSceneRenderRD {
 		void sort_by_reverse_depth_and_priority(bool p_alpha) { //used for alpha
 
 			SortArray<GeometryInstanceSurfaceDataCache *, SortByReverseDepthAndPriority> sorter;
-			if (p_alpha) {
-				sorter.sort(&elements[max_elements - alpha_element_count], alpha_element_count);
-			} else {
-				sorter.sort(elements, element_count);
-			}
+			sorter.sort(elements.ptr(), elements.size()); // NOTE(review): p_alpha is no longer read here although it is still in the signature; the sibling sort methods dropped it
 		}
 
 		_FORCE_INLINE_ void add_element(GeometryInstanceSurfaceDataCache *p_element) {
-			if (element_count + alpha_element_count >= max_elements) {
-				return;
-			}
-			elements[element_count] = p_element;
-			element_count++;
-		}
-
-		_FORCE_INLINE_ void add_alpha_element(GeometryInstanceSurfaceDataCache *p_element) {
-			if (element_count + alpha_element_count >= max_elements) {
-				return;
-			}
-			int idx = max_elements - alpha_element_count - 1;
-			elements[idx] = p_element;
-			alpha_element_count++;
-		}
-
-		void init() {
-			element_count = 0;
-			alpha_element_count = 0;
-			elements = memnew_arr(GeometryInstanceSurfaceDataCache *, max_elements);
-		}
-
-		RenderList() {
-			max_elements = 0;
-		}
-
-		~RenderList() {
-			memdelete_arr(elements);
+			elements.push_back(p_element); // plain append: the former max_elements cap and its silent drop are gone; element_info is not touched here
 		}
 	};
 	};
 
-	RenderList render_list;
+	RenderList render_list[RENDER_LIST_MAX];
 
 protected:
-	virtual void _render_scene(RID p_render_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, int p_directional_light_count, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_cluster_buffer, uint32_t p_cluster_size, uint32_t p_max_cluster_elements, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, const Color &p_default_bg_color, float p_lod_threshold);
-	virtual void _render_shadow(RID p_framebuffer, const PagedArray<GeometryInstance *> &p_instances, const CameraMatrix &p_projection, const Transform &p_transform, float p_zfar, float p_bias, float p_normal_bias, bool p_use_dp, bool use_dp_flip, bool p_use_pancake, const Plane &p_camera_plane = Plane(), float p_lod_distance_multiplier = 0.0, float p_screen_lod_threshold = 0.0, const Rect2i &p_rect = Rect2i(), bool p_flip_y = false, bool p_clear_region = true, bool p_begin = true, bool p_end = true);
+	virtual void _render_scene(RID p_render_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_cluster_buffer, uint32_t p_cluster_size, uint32_t p_max_cluster_elements, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, const Color &p_default_bg_color, float p_lod_threshold);
+
+	virtual void _render_shadow_begin();
+	virtual void _render_shadow_append(RID p_framebuffer, const PagedArray<GeometryInstance *> &p_instances, const CameraMatrix &p_projection, const Transform &p_transform, float p_zfar, float p_bias, float p_normal_bias, bool p_use_dp, bool p_use_dp_flip, bool p_use_pancake, const Plane &p_camera_plane = Plane(), float p_lod_distance_multiplier = 0.0, float p_screen_lod_threshold = 0.0, const Rect2i &p_rect = Rect2i(), bool p_flip_y = false, bool p_clear_region = true, bool p_begin = true, bool p_end = true);
+	virtual void _render_shadow_process();
+	virtual void _render_shadow_end(uint32_t p_barrier = RD::BARRIER_MASK_ALL);
+
 	virtual void _render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region);
 	virtual void _render_uv2(const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region);
 	virtual void _render_sdfgi(RID p_render_buffers, const Vector3i &p_from, const Vector3i &p_size, const AABB &p_bounds, const PagedArray<GeometryInstance *> &p_instances, const RID &p_albedo_texture, const RID &p_emission_texture, const RID &p_emission_aniso_texture, const RID &p_geom_facing_texture);

文件差異過大導致無法顯示
+ 446 - 305
servers/rendering/renderer_rd/renderer_scene_render_rd.cpp


+ 62 - 10
servers/rendering/renderer_rd/renderer_scene_render_rd.h

@@ -109,8 +109,13 @@ protected:
 	void _setup_reflections(const PagedArray<RID> &p_reflections, const Transform &p_camera_inverse_transform, RID p_environment);
 	void _setup_giprobes(RID p_render_buffers, const Transform &p_transform, const PagedArray<RID> &p_gi_probes, uint32_t &r_gi_probes_used);
 
-	virtual void _render_scene(RID p_render_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, int p_directional_light_count, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_cluster_buffer, uint32_t p_cluster_size, uint32_t p_cluster_max_elements, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, const Color &p_default_color, float p_screen_lod_threshold) = 0;
-	virtual void _render_shadow(RID p_framebuffer, const PagedArray<GeometryInstance *> &p_instances, const CameraMatrix &p_projection, const Transform &p_transform, float p_zfar, float p_bias, float p_normal_bias, bool p_use_dp, bool use_dp_flip, bool p_use_pancake, const Plane &p_camera_plane = Plane(), float p_lod_distance_multiplier = 0.0, float p_screen_lod_threshold = 0.0, const Rect2i &p_rect = Rect2i(), bool p_flip_y = false, bool p_clear_region = true, bool p_begin = true, bool p_end = true) = 0;
+	virtual void _render_scene(RID p_render_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_cluster_buffer, uint32_t p_cluster_size, uint32_t p_cluster_max_elements, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, const Color &p_default_color, float p_screen_lod_threshold) = 0;
+
+	virtual void _render_shadow_begin() = 0;
+	virtual void _render_shadow_append(RID p_framebuffer, const PagedArray<GeometryInstance *> &p_instances, const CameraMatrix &p_projection, const Transform &p_transform, float p_zfar, float p_bias, float p_normal_bias, bool p_use_dp, bool p_use_dp_flip, bool p_use_pancake, const Plane &p_camera_plane = Plane(), float p_lod_distance_multiplier = 0.0, float p_screen_lod_threshold = 0.0, const Rect2i &p_rect = Rect2i(), bool p_flip_y = false, bool p_clear_region = true, bool p_begin = true, bool p_end = true) = 0;
+	virtual void _render_shadow_process() = 0;
+	virtual void _render_shadow_end(uint32_t p_barrier = RD::BARRIER_MASK_ALL) = 0;
+
 	virtual void _render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region) = 0;
 	virtual void _render_uv2(const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region) = 0;
 	virtual void _render_sdfgi(RID p_render_buffers, const Vector3i &p_from, const Vector3i &p_size, const AABB &p_bounds, const PagedArray<GeometryInstance *> &p_instances, const RID &p_albedo_texture, const RID &p_emission_texture, const RID &p_emission_aniso_texture, const RID &p_geom_facing_texture) = 0;
@@ -132,8 +137,16 @@ protected:
 	void _setup_sky(RID p_environment, RID p_render_buffers, const CameraMatrix &p_projection, const Transform &p_transform, const Size2i p_screen_size);
 	void _update_sky(RID p_environment, const CameraMatrix &p_projection, const Transform &p_transform);
 	void _draw_sky(bool p_can_continue_color, bool p_can_continue_depth, RID p_fb, RID p_environment, const CameraMatrix &p_projection, const Transform &p_transform);
+	void _pre_process_gi(RID p_render_buffers, const Transform &p_transform);
 	void _process_gi(RID p_render_buffers, RID p_normal_roughness_buffer, RID p_gi_probe_buffer, RID p_environment, const CameraMatrix &p_projection, const Transform &p_transform, const PagedArray<RID> &p_gi_probes);
 
+	bool _needs_post_prepass_render(bool p_use_gi);
+	void _post_prepass_render(bool p_use_gi);
+	void _pre_resolve_render(bool p_use_gi);
+
+	void _pre_opaque_render(bool p_use_ssao, bool p_use_gi, RID p_normal_roughness_buffer, RID p_gi_probe_buffer);
+	uint32_t _get_render_state_directional_light_count() const;
+
 	// needed for a single argument calls (material and uv2)
 	PagedArrayPool<GeometryInstance *> cull_argument_pool;
 	PagedArray<GeometryInstance *> cull_argument; //need this to exist
@@ -651,7 +664,7 @@ private:
 
 		RS::LightType light_type = RS::LIGHT_DIRECTIONAL;
 
-		ShadowTransform shadow_transform[4];
+		ShadowTransform shadow_transform[6];
 
 		AABB aabb;
 		RID self;
@@ -1031,8 +1044,14 @@ private:
 		float y_mult = 1.0;
 
 		uint32_t render_pass = 0;
+
+		int32_t cascade_dynamic_light_count[SDFGI::MAX_CASCADES]; //used dynamically
 	};
 
+	void _sdfgi_update_light(RID p_render_buffers, RID p_environment);
+	void _sdfgi_update_probes(RID p_render_buffers, RID p_environment);
+	void _sdfgi_store_probes(RID p_render_buffers);
+
 	RS::EnvironmentSDFGIRayCount sdfgi_ray_count = RS::ENV_SDFGI_RAY_COUNT_16;
 	RS::EnvironmentSDFGIFramesToConverge sdfgi_frames_to_converge = RS::ENV_SDFGI_CONVERGE_IN_10_FRAMES;
 	RS::EnvironmentSDFGIFramesToUpdateLight sdfgi_frames_to_update_light = RS::ENV_SDFGI_UPDATE_LIGHT_IN_4_FRAMES;
@@ -1460,6 +1479,41 @@ private:
 
 	} cluster;
 
+	struct RenderState { // Scene-render state; most fields are named after render_scene() arguments (presumably cached per call -- confirm in the .cpp).
+		RID render_buffers;
+		Transform cam_transform;
+		CameraMatrix cam_projection;
+		bool cam_ortogonal = false;
+		const PagedArray<GeometryInstance *> *instances = nullptr; // Pointer members default to nullptr until something assigns them.
+		const PagedArray<RID> *lights = nullptr;
+		const PagedArray<RID> *reflection_probes = nullptr;
+		const PagedArray<RID> *gi_probes = nullptr;
+		const PagedArray<RID> *decals = nullptr;
+		const PagedArray<RID> *lightmaps = nullptr;
+		RID environment;
+		RID camera_effects;
+		RID shadow_atlas;
+		RID reflection_atlas;
+		RID reflection_probe;
+		int reflection_probe_pass = 0;
+		float screen_lod_threshold = 0.0;
+
+		const RenderShadowData *render_shadows = nullptr; // Array/count pair, named after the matching render_scene() arguments.
+		int render_shadow_count = 0;
+		const RenderSDFGIData *render_sdfgi_regions = nullptr; // Array/count pair, named after the matching render_scene() arguments.
+		int render_sdfgi_region_count = 0;
+		const RenderSDFGIUpdateData *sdfgi_update_data = nullptr; // render_scene() defaults this argument to nullptr.
+
+		uint32_t directional_light_count = 0;
+		uint32_t gi_probe_count = 0;
+
+		LocalVector<int> cube_shadows; // NOTE(review): presumably indices into render_shadows -- confirm against the .cpp.
+		LocalVector<int> shadows;
+		LocalVector<int> directional_shadows;
+
+		bool depth_prepass_used = false; // Explicitly initialized like the other scalar members so it is never read while indeterminate.
+	} render_state;
+
 	struct VolumetricFog {
 		uint32_t width = 0;
 		uint32_t height = 0;
@@ -1547,6 +1601,10 @@ private:
 	uint32_t max_cluster_elements = 512;
 	bool low_end = false;
 
+	void _render_shadow_pass(RID p_light, RID p_shadow_atlas, int p_pass, const PagedArray<GeometryInstance *> &p_instances, const Plane &p_camera_plane = Plane(), float p_lod_distance_multiplier = 0, float p_screen_lod_threshold = 0.0, bool p_open_pass = true, bool p_close_pass = true, bool p_clear_region = true);
+	void _render_sdfgi_region(RID p_render_buffers, int p_region, const PagedArray<GeometryInstance *> &p_instances);
+	void _render_sdfgi_static_lights(RID p_render_buffers, uint32_t p_cascade_count, const uint32_t *p_cascade_indices, const PagedArray<RID> *p_positional_light_cull_result);
+
 public:
 	virtual Transform geometry_instance_get_transform(GeometryInstance *p_instance) = 0;
 	virtual AABB geometry_instance_get_aabb(GeometryInstance *p_instance) = 0;
@@ -1594,7 +1652,6 @@ public:
 	virtual int sdfgi_get_pending_region_count(RID p_render_buffers) const;
 	virtual AABB sdfgi_get_pending_region_bounds(RID p_render_buffers, int p_region) const;
 	virtual uint32_t sdfgi_get_pending_region_cascade(RID p_render_buffers, int p_region) const;
-	virtual void sdfgi_update_probes(RID p_render_buffers, RID p_environment, const Vector<RID> &p_directional_lights, const RID *p_positional_light_instances, uint32_t p_positional_light_count);
 	RID sdfgi_get_ubo() const { return gi.sdfgi_ubo; }
 	/* SKY API */
 
@@ -1997,15 +2054,10 @@ public:
 	float render_buffers_get_volumetric_fog_end(RID p_render_buffers);
 	float render_buffers_get_volumetric_fog_detail_spread(RID p_render_buffers);
 
-	void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold);
-
-	void render_shadow(RID p_light, RID p_shadow_atlas, int p_pass, const PagedArray<GeometryInstance *> &p_instances, const Plane &p_camera_plane = Plane(), float p_lod_distance_multiplier = 0, float p_screen_lod_threshold = 0.0);
+	void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr);
 
 	void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region);
 
-	void render_sdfgi(RID p_render_buffers, int p_region, const PagedArray<GeometryInstance *> &p_instances);
-	void render_sdfgi_static_lights(RID p_render_buffers, uint32_t p_cascade_count, const uint32_t *p_cascade_indices, const PagedArray<RID> *p_positional_light_cull_result);
-
 	void render_particle_collider_heightfield(RID p_collider, const Transform &p_transform, const PagedArray<GeometryInstance *> &p_instances);
 
 	virtual void set_scene_pass(uint64_t p_pass) {

+ 8 - 8
servers/rendering/renderer_rd/renderer_storage_rd.cpp

@@ -3098,7 +3098,7 @@ void RendererStorageRD::update_mesh_instances() {
 			RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(SkeletonShader::PushConstant));
 
 			//dispatch without barrier, so all is done at the same time
-			RD::get_singleton()->compute_list_dispatch_threads(compute_list, push_constant.vertex_count, 1, 1, 64, 1, 1);
+			RD::get_singleton()->compute_list_dispatch_threads(compute_list, push_constant.vertex_count, 1, 1);
 		}
 
 		mi->dirty = false;
@@ -4555,7 +4555,7 @@ void RendererStorageRD::_particles_process(Particles *p_particles, float p_delta
 
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(ParticlesShader::PushConstant));
 
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_particles->amount, 1, 1, 64, 1, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_particles->amount, 1, 1);
 
 	RD::get_singleton()->compute_list_end();
 }
@@ -4609,7 +4609,7 @@ void RendererStorageRD::particles_set_view_axis(RID p_particles, const Vector3 &
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_sort_uniform_set, 1);
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy_push_constant, sizeof(ParticlesShader::CopyPushConstant));
 
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, particles->amount, 1, 1, 64, 1, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, particles->amount, 1, 1);
 
 	RD::get_singleton()->compute_list_end();
 
@@ -4621,7 +4621,7 @@ void RendererStorageRD::particles_set_view_axis(RID p_particles, const Vector3 &
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_sort_uniform_set, 1);
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy_push_constant, sizeof(ParticlesShader::CopyPushConstant));
 
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, particles->amount, 1, 1, 64, 1, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, particles->amount, 1, 1);
 
 	RD::get_singleton()->compute_list_end();
 }
@@ -4728,7 +4728,7 @@ void RendererStorageRD::update_particles() {
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_copy_uniform_set, 0);
 			RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy_push_constant, sizeof(ParticlesShader::CopyPushConstant));
 
-			RD::get_singleton()->compute_list_dispatch_threads(compute_list, particles->amount, 1, 1, 64, 1, 1);
+			RD::get_singleton()->compute_list_dispatch_threads(compute_list, particles->amount, 1, 1);
 
 			RD::get_singleton()->compute_list_end();
 		}
@@ -6980,7 +6980,7 @@ void RendererStorageRD::render_target_sdf_process(RID p_render_target) {
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, rt->sdf_buffer_process_uniform_sets[1], 0); //fill [0]
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(RenderTargetSDF::PushConstant));
 
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, push_constant.size[0], push_constant.size[1], 1, 8, 8, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, push_constant.size[0], push_constant.size[1], 1);
 
 	/* Process */
 
@@ -6996,7 +6996,7 @@ void RendererStorageRD::render_target_sdf_process(RID p_render_target) {
 		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, rt->sdf_buffer_process_uniform_sets[swap ? 1 : 0], 0);
 		push_constant.stride = stride;
 		RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(RenderTargetSDF::PushConstant));
-		RD::get_singleton()->compute_list_dispatch_threads(compute_list, push_constant.size[0], push_constant.size[1], 1, 8, 8, 1);
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, push_constant.size[0], push_constant.size[1], 1);
 		stride /= 2;
 		swap = !swap;
 		RD::get_singleton()->compute_list_add_barrier(compute_list);
@@ -7007,7 +7007,7 @@ void RendererStorageRD::render_target_sdf_process(RID p_render_target) {
 	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, rt_sdf.pipelines[shrink ? RenderTargetSDF::SHADER_STORE_SHRINK : RenderTargetSDF::SHADER_STORE]);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, rt->sdf_buffer_process_uniform_sets[swap ? 1 : 0], 0);
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(RenderTargetSDF::PushConstant));
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, push_constant.size[0], push_constant.size[1], 1, 8, 8, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, push_constant.size[0], push_constant.size[1], 1);
 
 	RD::get_singleton()->compute_list_end();
 }

+ 12 - 8
servers/rendering/renderer_rd/renderer_storage_rd.h

@@ -1482,13 +1482,7 @@ public:
 		return s->lod_count > 0;
 	}
 
-	_FORCE_INLINE_ RID mesh_surface_get_index_array(void *p_surface) const {
-		Mesh::Surface *s = reinterpret_cast<Mesh::Surface *>(p_surface);
-
-		return s->index_array;
-	}
-
-	_FORCE_INLINE_ RID mesh_surface_get_index_array_with_lod(void *p_surface, float p_model_scale, float p_distance_threshold, float p_lod_threshold) const {
+	_FORCE_INLINE_ uint32_t mesh_surface_get_lod(void *p_surface, float p_model_scale, float p_distance_threshold, float p_lod_threshold) const {
 		Mesh::Surface *s = reinterpret_cast<Mesh::Surface *>(p_surface);
 
 		int32_t current_lod = -1;
@@ -1500,9 +1494,19 @@ public:
 			current_lod = i;
 		}
 		if (current_lod == -1) {
+			return 0;
+		} else {
+			return current_lod + 1;
+		}
+	}
+
+	_FORCE_INLINE_ RID mesh_surface_get_index_array(void *p_surface, uint32_t p_lod) const {
+		Mesh::Surface *s = reinterpret_cast<Mesh::Surface *>(p_surface);
+
+		if (p_lod == 0) {
 			return s->index_array;
 		} else {
-			return s->lods[current_lod].index_array;
+			return s->lods[p_lod - 1].index_array;
 		}
 	}
 

+ 18 - 1
servers/rendering/renderer_rd/shader_rd.cpp

@@ -301,6 +301,7 @@ void ShaderRD::_compile_variant(uint32_t p_variant, Version *p_version) {
 
 		builder.append(compute_codev.get_data()); // version info (if exists)
 		builder.append("\n"); //make sure defines begin at newline
+		builder.append(base_compute_defines.get_data());
 		builder.append(general_defines.get_data());
 		builder.append(variant_defines[p_variant].get_data());
 
@@ -401,7 +402,6 @@ RS::ShaderNativeSourceCode ShaderRD::version_get_native_source_code(RID p_versio
 
 			builder.append(fragment_codev.get_data()); // version info (if exists)
 			builder.append("\n"); //make sure defines begin at newline
-
 			builder.append(general_defines.get_data());
 			builder.append(variant_defines[i].get_data());
 			for (int j = 0; j < version->custom_defines.size(); j++) {
@@ -440,6 +440,7 @@ RS::ShaderNativeSourceCode ShaderRD::version_get_native_source_code(RID p_versio
 
 			builder.append(compute_codev.get_data()); // version info (if exists)
 			builder.append("\n"); //make sure defines begin at newline
+			builder.append(base_compute_defines.get_data());
 			builder.append(general_defines.get_data());
 			builder.append(variant_defines[i].get_data());
 
@@ -596,6 +597,22 @@ bool ShaderRD::is_variant_enabled(int p_variant) const {
 	return variants_enabled[p_variant];
 }
 
+ShaderRD::ShaderRD() {
+	// Choose the NATIVE_LOCAL_* macros stored in base_compute_defines.
+	// Do not feel forced to use this, in most cases it makes little to no difference.
+	const bool prefer_32_threads = RD::get_singleton()->get_device_vendor_name() == "NVIDIA";
+
+	String defines_text;
+	if (!prefer_32_threads) {
+		defines_text = "\n#define NATIVE_LOCAL_GROUP_SIZE 64\n#define NATIVE_LOCAL_SIZE_2D_X 8\n#define NATIVE_LOCAL_SIZE_2D_Y 8\n";
+	} else {
+		defines_text = "\n#define NATIVE_LOCAL_GROUP_SIZE 32\n#define NATIVE_LOCAL_SIZE_2D_X 8\n#define NATIVE_LOCAL_SIZE_2D_Y 4\n";
+	}
+
+	// Keep the text as a CharString so its raw data can be handed to the source builder.
+	base_compute_defines = defines_text.ascii();
+}
+
 void ShaderRD::initialize(const Vector<String> &p_variant_defines, const String &p_general_defines) {
 	ERR_FAIL_COND(variant_defines.size());
 	ERR_FAIL_COND(p_variant_defines.size() == 0);

+ 3 - 1
servers/rendering/renderer_rd/shader_rd.h

@@ -99,8 +99,10 @@ class ShaderRD {
 
 	const char *name;
 
+	CharString base_compute_defines;
+
 protected:
-	ShaderRD() {}
+	ShaderRD();
 	void setup(const char *p_vertex_code, const char *p_fragment_code, const char *p_compute_code, const char *p_name);
 
 public:

+ 52 - 39
servers/rendering/renderer_rd/shaders/scene_forward.glsl

@@ -89,12 +89,6 @@ MATERIAL_UNIFORMS
 } material;
 #endif
 
-/* clang-format off */
-
-VERTEX_SHADER_GLOBALS
-
-/* clang-format on */
-
 invariant gl_Position;
 
 #ifdef MODE_DUAL_PARABOLOID
@@ -103,28 +97,43 @@ layout(location = 8) out float dp_clip;
 
 #endif
 
+layout(location = 9) out flat uint instance_index;
+
+/* clang-format off */
+
+VERTEX_SHADER_GLOBALS
+
+/* clang-format on */
+
 void main() {
 	vec4 instance_custom = vec4(0.0);
 #if defined(COLOR_USED)
 	color_interp = color_attrib;
 #endif
 
-	mat4 world_matrix = draw_call.transform;
+	instance_index = draw_call.instance_index; // Assign the flat output declared above; redeclaring it as a local here shadowed it and left the varying unwritten for the fragment stage.
+
+	bool is_multimesh = bool(instances.data[instance_index].flags & INSTANCE_FLAGS_MULTIMESH);
+	if (!is_multimesh) {
+		instance_index += gl_InstanceIndex;
+	}
+
+	mat4 world_matrix = instances.data[instance_index].transform;
 
 	mat3 world_normal_matrix;
-	if (bool(draw_call.flags & INSTANCE_FLAGS_NON_UNIFORM_SCALE)) {
+	if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_NON_UNIFORM_SCALE)) {
 		world_normal_matrix = inverse(mat3(world_matrix));
 	} else {
 		world_normal_matrix = mat3(world_matrix);
 	}
 
-	if (bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH)) {
+	if (is_multimesh) {
 		//multimesh, instances are for it
-		uint offset = (draw_call.flags >> INSTANCE_FLAGS_MULTIMESH_STRIDE_SHIFT) & INSTANCE_FLAGS_MULTIMESH_STRIDE_MASK;
+		uint offset = (instances.data[instance_index].flags >> INSTANCE_FLAGS_MULTIMESH_STRIDE_SHIFT) & INSTANCE_FLAGS_MULTIMESH_STRIDE_MASK;
 		offset *= gl_InstanceIndex;
 
 		mat4 matrix;
-		if (bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH_FORMAT_2D)) {
+		if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_MULTIMESH_FORMAT_2D)) {
 			matrix = mat4(transforms.data[offset + 0], transforms.data[offset + 1], vec4(0.0, 0.0, 1.0, 0.0), vec4(0.0, 0.0, 0.0, 1.0));
 			offset += 2;
 		} else {
@@ -132,14 +141,14 @@ void main() {
 			offset += 3;
 		}
 
-		if (bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH_HAS_COLOR)) {
+		if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_MULTIMESH_HAS_COLOR)) {
 #ifdef COLOR_USED
 			color_interp *= transforms.data[offset];
 #endif
 			offset += 1;
 		}
 
-		if (bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH_HAS_CUSTOM_DATA)) {
+		if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_MULTIMESH_HAS_CUSTOM_DATA)) {
 			instance_custom = transforms.data[offset];
 		}
 
@@ -161,7 +170,7 @@ void main() {
 #endif
 
 #if 0
-	if (bool(draw_call.flags & INSTANCE_FLAGS_SKELETON)) {
+	if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_SKELETON)) {
 		//multimesh, instances are for it
 
 		uvec2 bones_01 = uvec2(bone_attrib.x & 0xFFFF, bone_attrib.x >> 16) * 3;
@@ -304,7 +313,8 @@ VERTEX_SHADER_CODE
 #endif
 #ifdef MODE_RENDER_MATERIAL
 	if (scene_data.material_uv2_mode) {
-		gl_Position.xy = (uv2_attrib.xy + draw_call.lightmap_uv_scale.xy) * 2.0 - 1.0;
+		vec2 uv_offset = unpackHalf2x16(draw_call.uv_offset);
+		gl_Position.xy = (uv2_attrib.xy + uv_offset) * 2.0 - 1.0;
 		gl_Position.z = 0.00001;
 		gl_Position.w = 1.0;
 	}
@@ -350,9 +360,11 @@ layout(location = 8) in float dp_clip;
 
 #endif
 
+layout(location = 9) in flat uint instance_index;
+
 //defines to keep compatibility with vertex
 
-#define world_matrix draw_call.transform
+#define world_matrix instances.data[instance_index].transform
 #define projection_matrix scene_data.projection_matrix
 
 #if defined(ENABLE_SSS) && defined(ENABLE_TRANSMITTANCE)
@@ -1770,7 +1782,7 @@ vec4 fog_process(vec3 vertex) {
 		}
 	}
 
-	float fog_amount = 1.0 - exp(vertex.z * scene_data.fog_density);
+	float fog_amount = 1.0 - exp(min(0.0, vertex.z * scene_data.fog_density));
 
 	if (abs(scene_data.fog_height_density) > 0.001) {
 		float y = (scene_data.camera_matrix * vec4(vertex, 1.0)).y;
@@ -2083,7 +2095,7 @@ FRAGMENT_SHADER_CODE
 #endif
 				uint decal_index = 32 * i + bit;
 
-				if (!bool(decals.data[decal_index].mask & draw_call.layer_mask)) {
+				if (!bool(decals.data[decal_index].mask & instances.data[instance_index].layer_mask)) {
 					continue; //not masked
 				}
 
@@ -2210,8 +2222,8 @@ FRAGMENT_SHADER_CODE
 #ifdef USE_LIGHTMAP
 
 	//lightmap
-	if (bool(draw_call.flags & INSTANCE_FLAGS_USE_LIGHTMAP_CAPTURE)) { //has lightmap capture
-		uint index = draw_call.gi_offset;
+	if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_LIGHTMAP_CAPTURE)) { //has lightmap capture
+		uint index = instances.data[instance_index].gi_offset;
 
 		vec3 wnormal = mat3(scene_data.camera_matrix) * normal;
 		const float c1 = 0.429043;
@@ -2230,12 +2242,12 @@ FRAGMENT_SHADER_CODE
 						  2.0 * c2 * lightmap_captures.data[index].sh[1].rgb * wnormal.y +
 						  2.0 * c2 * lightmap_captures.data[index].sh[2].rgb * wnormal.z);
 
-	} else if (bool(draw_call.flags & INSTANCE_FLAGS_USE_LIGHTMAP)) { // has actual lightmap
-		bool uses_sh = bool(draw_call.flags & INSTANCE_FLAGS_USE_SH_LIGHTMAP);
-		uint ofs = draw_call.gi_offset & 0xFFFF;
+	} else if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_LIGHTMAP)) { // has actual lightmap
+		bool uses_sh = bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_SH_LIGHTMAP);
+		uint ofs = instances.data[instance_index].gi_offset & 0xFFFF;
 		vec3 uvw;
-		uvw.xy = uv2 * draw_call.lightmap_uv_scale.zw + draw_call.lightmap_uv_scale.xy;
-		uvw.z = float((draw_call.gi_offset >> 16) & 0xFFFF);
+		uvw.xy = uv2 * instances.data[instance_index].lightmap_uv_scale.zw + instances.data[instance_index].lightmap_uv_scale.xy;
+		uvw.z = float((instances.data[instance_index].gi_offset >> 16) & 0xFFFF);
 
 		if (uses_sh) {
 			uvw.z *= 4.0; //SH textures use 4 times more data
@@ -2244,7 +2256,7 @@ FRAGMENT_SHADER_CODE
 			vec3 lm_light_l1_0 = textureLod(sampler2DArray(lightmap_textures[ofs], material_samplers[SAMPLER_LINEAR_CLAMP]), uvw + vec3(0.0, 0.0, 2.0), 0.0).rgb;
 			vec3 lm_light_l1p1 = textureLod(sampler2DArray(lightmap_textures[ofs], material_samplers[SAMPLER_LINEAR_CLAMP]), uvw + vec3(0.0, 0.0, 3.0), 0.0).rgb;
 
-			uint idx = draw_call.gi_offset >> 20;
+			uint idx = instances.data[instance_index].gi_offset >> 20;
 			vec3 n = normalize(lightmaps.data[idx].normal_xform * normal);
 
 			ambient_light += lm_light_l0 * 0.282095f;
@@ -2264,7 +2276,7 @@ FRAGMENT_SHADER_CODE
 	}
 #elif defined(USE_FORWARD_GI)
 
-	if (bool(draw_call.flags & INSTANCE_FLAGS_USE_SDFGI)) { //has lightmap capture
+	if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_SDFGI)) { //instance uses SDFGI
 
 		//make vertex orientation the world one, but still align to camera
 		vec3 cam_pos = mat3(scene_data.camera_matrix) * vertex;
@@ -2336,9 +2348,9 @@ FRAGMENT_SHADER_CODE
 		}
 	}
 
-	if (bool(draw_call.flags & INSTANCE_FLAGS_USE_GIPROBE)) { // process giprobes
+	if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_GIPROBE)) { // process giprobes
 
-		uint index1 = draw_call.gi_offset & 0xFFFF;
+		uint index1 = instances.data[instance_index].gi_offset & 0xFFFF;
 		vec3 ref_vec = normalize(reflect(normalize(vertex), normal));
 		//find arbitrary tangent and bitangent, then build a matrix
 		vec3 v0 = abs(normal.z) < 0.999 ? vec3(0.0, 0.0, 1.0) : vec3(0.0, 1.0, 0.0);
@@ -2350,7 +2362,7 @@ FRAGMENT_SHADER_CODE
 		vec4 spec_accum = vec4(0.0);
 		gi_probe_compute(index1, vertex, normal, ref_vec, normal_mat, roughness * roughness, ambient_light, specular_light, spec_accum, amb_accum);
 
-		uint index2 = draw_call.gi_offset >> 16;
+		uint index2 = instances.data[instance_index].gi_offset >> 16;
 
 		if (index2 != 0xFFFF) {
 			gi_probe_compute(index2, vertex, normal, ref_vec, normal_mat, roughness * roughness, ambient_light, specular_light, spec_accum, amb_accum);
@@ -2369,7 +2381,7 @@ FRAGMENT_SHADER_CODE
 	}
 #elif !defined(LOW_END_MODE)
 
-	if (bool(draw_call.flags & INSTANCE_FLAGS_USE_GI_BUFFERS)) { //use GI buffers
+	if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_GI_BUFFERS)) { //use GI buffers
 
 		vec2 coord;
 
@@ -2448,7 +2460,7 @@ FRAGMENT_SHADER_CODE
 #endif
 				uint reflection_index = 32 * i + bit;
 
-				if (!bool(reflections.data[reflection_index].mask & draw_call.layer_mask)) {
+				if (!bool(reflections.data[reflection_index].mask & instances.data[instance_index].layer_mask)) {
 					continue; //not masked
 				}
 
@@ -2519,7 +2531,7 @@ FRAGMENT_SHADER_CODE
 				break;
 			}
 
-			if (!bool(directional_lights.data[i].mask & draw_call.layer_mask)) {
+			if (!bool(directional_lights.data[i].mask & instances.data[instance_index].layer_mask)) {
 				continue; //not masked
 			}
 
@@ -2838,7 +2850,7 @@ FRAGMENT_SHADER_CODE
 				break;
 			}
 
-			if (!bool(directional_lights.data[i].mask & draw_call.layer_mask)) {
+			if (!bool(directional_lights.data[i].mask & instances.data[instance_index].layer_mask)) {
 				continue; //not masked
 			}
 
@@ -2968,7 +2980,7 @@ FRAGMENT_SHADER_CODE
 #endif
 					uint light_index = 32 * i + bit;
 
-					if (!bool(omni_lights.data[light_index].mask & draw_call.layer_mask)) {
+					if (!bool(omni_lights.data[light_index].mask & instances.data[instance_index].layer_mask)) {
 						continue; //not masked
 					}
 
@@ -3041,7 +3053,7 @@ FRAGMENT_SHADER_CODE
 
 					uint light_index = 32 * i + bit;
 
-					if (!bool(spot_lights.data[light_index].mask & draw_call.layer_mask)) {
+					if (!bool(spot_lights.data[light_index].mask & instances.data[instance_index].layer_mask)) {
 						continue; //not masked
 					}
 
@@ -3214,9 +3226,9 @@ FRAGMENT_SHADER_CODE
 		normal_roughness_output_buffer = vec4(normal * 0.5 + 0.5, roughness);
 
 #ifdef MODE_RENDER_GIPROBE
-		if (bool(draw_call.flags & INSTANCE_FLAGS_USE_GIPROBE)) { // process giprobes
-			uint index1 = draw_call.gi_offset & 0xFFFF;
-			uint index2 = draw_call.gi_offset >> 16;
+		if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_GIPROBE)) { // process giprobes
+			uint index1 = instances.data[instance_index].gi_offset & 0xFFFF;
+			uint index2 = instances.data[instance_index].gi_offset >> 16;
 			giprobe_buffer.x = index1 & 0xFF;
 			giprobe_buffer.y = index2 & 0xFF;
 		} else {
@@ -3275,6 +3287,7 @@ FRAGMENT_SHADER_CODE
 
 	// Draw "fixed" fog before volumetric fog to ensure volumetric fog can appear in front of the sky.
 	frag_color.rgb = mix(frag_color.rgb, fog.rgb, fog.a);
+	;
 
 #endif //MODE_MULTIPLE_RENDER_TARGETS
 

+ 142 - 134
servers/rendering/renderer_rd/shaders/scene_forward_inc.glsl

@@ -21,12 +21,10 @@
 #endif
 
 layout(push_constant, binding = 0, std430) uniform DrawCall {
-	mat4 transform;
-	uint flags;
-	uint instance_uniforms_ofs; //base offset in global buffer for instance variables
-	uint gi_offset; //GI information when using lightmapping (VCT or lightmap index)
-	uint layer_mask;
-	vec4 lightmap_uv_scale;
+	uint instance_index;
+	uint uv_offset;
+	uint pad0;
+	uint pad1;
 }
 draw_call;
 
@@ -45,96 +43,13 @@ draw_call;
 #define SAMPLER_NEAREST_WITH_MIPMAPS_ANISOTROPIC_REPEAT 10
 #define SAMPLER_LINEAR_WITH_MIPMAPS_ANISOTROPIC_REPEAT 11
 
-layout(set = 0, binding = 1) uniform sampler material_samplers[12];
-
-layout(set = 0, binding = 2) uniform sampler shadow_sampler;
-
 #define SDFGI_MAX_CASCADES 8
 
-layout(set = 0, binding = 3, std140) uniform SceneData {
-	mat4 projection_matrix;
-	mat4 inv_projection_matrix;
-
-	mat4 camera_matrix;
-	mat4 inv_camera_matrix;
-
-	vec2 viewport_size;
-	vec2 screen_pixel_size;
-
-	uint cluster_shift;
-	uint cluster_width;
-	uint cluster_type_size;
-	uint max_cluster_element_count_div_32;
-
-	//use vec4s because std140 doesnt play nice with vec2s, z and w are wasted
-	vec4 directional_penumbra_shadow_kernel[32];
-	vec4 directional_soft_shadow_kernel[32];
-	vec4 penumbra_shadow_kernel[32];
-	vec4 soft_shadow_kernel[32];
-
-	uint directional_penumbra_shadow_samples;
-	uint directional_soft_shadow_samples;
-	uint penumbra_shadow_samples;
-	uint soft_shadow_samples;
-
-	vec4 ambient_light_color_energy;
+/* Set 1: Base Pass (never changes) */
 
-	float ambient_color_sky_mix;
-	bool use_ambient_light;
-	bool use_ambient_cubemap;
-	bool use_reflection_cubemap;
-
-	mat3 radiance_inverse_xform;
-
-	vec2 shadow_atlas_pixel_size;
-	vec2 directional_shadow_pixel_size;
-
-	uint directional_light_count;
-	float dual_paraboloid_side;
-	float z_far;
-	float z_near;
-
-	bool ssao_enabled;
-	float ssao_light_affect;
-	float ssao_ao_affect;
-	bool roughness_limiter_enabled;
-
-	float roughness_limiter_amount;
-	float roughness_limiter_limit;
-	uvec2 roughness_limiter_pad;
-
-	vec4 ao_color;
-
-	mat4 sdf_to_bounds;
-
-	ivec3 sdf_offset;
-	bool material_uv2_mode;
-
-	ivec3 sdf_size;
-	bool gi_upscale_for_msaa;
-
-	bool volumetric_fog_enabled;
-	float volumetric_fog_inv_length;
-	float volumetric_fog_detail_spread;
-	uint volumetric_fog_pad;
-
-	bool fog_enabled;
-	float fog_density;
-	float fog_height;
-	float fog_height_density;
-
-	vec3 fog_light_color;
-	float fog_sun_scatter;
-
-	float fog_aerial_perspective;
-
-	float time;
-	float reflection_multiplier; // one normally, zero when rendering reflections
-
-	bool pancake_shadows;
-}
+layout(set = 0, binding = 1) uniform sampler material_samplers[12];
 
-scene_data;
+layout(set = 0, binding = 2) uniform sampler shadow_sampler;
 
 #define INSTANCE_FLAGS_USE_GI_BUFFERS (1 << 6)
 #define INSTANCE_FLAGS_USE_SDFGI (1 << 7)
@@ -153,22 +68,22 @@ scene_data;
 #define INSTANCE_FLAGS_SKELETON (1 << 19)
 #define INSTANCE_FLAGS_NON_UNIFORM_SCALE (1 << 20)
 
-layout(set = 0, binding = 4, std430) restrict readonly buffer OmniLights {
+layout(set = 0, binding = 3, std430) restrict readonly buffer OmniLights {
 	LightData data[];
 }
 omni_lights;
 
-layout(set = 0, binding = 5, std430) restrict readonly buffer SpotLights {
+layout(set = 0, binding = 4, std430) restrict readonly buffer SpotLights {
 	LightData data[];
 }
 spot_lights;
 
-layout(set = 0, binding = 6) buffer restrict readonly ReflectionProbeData {
+layout(set = 0, binding = 5) buffer restrict readonly ReflectionProbeData {
 	ReflectionData data[];
 }
 reflections;
 
-layout(set = 0, binding = 7, std140) uniform DirectionalLights {
+layout(set = 0, binding = 6, std140) uniform DirectionalLights {
 	DirectionalLightData data[MAX_DIRECTIONAL_LIGHT_DATA_STRUCTS];
 }
 directional_lights;
@@ -180,7 +95,7 @@ struct Lightmap {
 	mat3 normal_xform;
 };
 
-layout(set = 0, binding = 8, std140) restrict readonly buffer Lightmaps {
+layout(set = 0, binding = 7, std140) restrict readonly buffer Lightmaps {
 	Lightmap data[];
 }
 lightmaps;
@@ -189,20 +104,20 @@ struct LightmapCapture {
 	vec4 sh[9];
 };
 
-layout(set = 0, binding = 9, std140) restrict readonly buffer LightmapCaptures {
+layout(set = 0, binding = 8, std140) restrict readonly buffer LightmapCaptures {
 	LightmapCapture data[];
 }
 lightmap_captures;
 
-layout(set = 0, binding = 10) uniform texture2D decal_atlas;
-layout(set = 0, binding = 11) uniform texture2D decal_atlas_srgb;
+layout(set = 0, binding = 9) uniform texture2D decal_atlas;
+layout(set = 0, binding = 10) uniform texture2D decal_atlas_srgb;
 
-layout(set = 0, binding = 12, std430) restrict readonly buffer Decals {
+layout(set = 0, binding = 11, std430) restrict readonly buffer Decals {
 	DecalData data[];
 }
 decals;
 
-layout(set = 0, binding = 13, std430) restrict readonly buffer GlobalVariableData {
+layout(set = 0, binding = 12, std430) restrict readonly buffer GlobalVariableData {
 	vec4 data[];
 }
 global_variables;
@@ -216,7 +131,7 @@ struct SDFGIProbeCascadeData {
 	float to_cell; // 1/bounds * grid_size
 };
 
-layout(set = 0, binding = 14, std140) uniform SDFGI {
+layout(set = 0, binding = 13, std140) uniform SDFGI {
 	vec3 grid_size;
 	uint max_cascades;
 
@@ -246,47 +161,140 @@ sdfgi;
 
 #endif //LOW_END_MODE
 
-// decal atlas
+/* Set 1: Render Pass (changes per render pass) */
 
-/* Set 1, Radiance */
+layout(set = 1, binding = 0, std140) uniform SceneData {
+	mat4 projection_matrix;
+	mat4 inv_projection_matrix;
+
+	mat4 camera_matrix;
+	mat4 inv_camera_matrix;
+
+	vec2 viewport_size;
+	vec2 screen_pixel_size;
+
+	uint cluster_shift;
+	uint cluster_width;
+	uint cluster_type_size;
+	uint max_cluster_element_count_div_32;
+
+	//use vec4s because std140 doesn't play nice with vec2s; z and w are wasted
+	vec4 directional_penumbra_shadow_kernel[32];
+	vec4 directional_soft_shadow_kernel[32];
+	vec4 penumbra_shadow_kernel[32];
+	vec4 soft_shadow_kernel[32];
+
+	uint directional_penumbra_shadow_samples;
+	uint directional_soft_shadow_samples;
+	uint penumbra_shadow_samples;
+	uint soft_shadow_samples;
+
+	vec4 ambient_light_color_energy;
+
+	float ambient_color_sky_mix;
+	bool use_ambient_light;
+	bool use_ambient_cubemap;
+	bool use_reflection_cubemap;
+
+	mat3 radiance_inverse_xform;
+
+	vec2 shadow_atlas_pixel_size;
+	vec2 directional_shadow_pixel_size;
+
+	uint directional_light_count;
+	float dual_paraboloid_side;
+	float z_far;
+	float z_near;
+
+	bool ssao_enabled;
+	float ssao_light_affect;
+	float ssao_ao_affect;
+	bool roughness_limiter_enabled;
+
+	float roughness_limiter_amount;
+	float roughness_limiter_limit;
+	uvec2 roughness_limiter_pad;
+
+	vec4 ao_color;
+
+	mat4 sdf_to_bounds;
+
+	ivec3 sdf_offset;
+	bool material_uv2_mode;
+
+	ivec3 sdf_size;
+	bool gi_upscale_for_msaa;
+
+	bool volumetric_fog_enabled;
+	float volumetric_fog_inv_length;
+	float volumetric_fog_detail_spread;
+	uint volumetric_fog_pad;
+
+	bool fog_enabled;
+	float fog_density;
+	float fog_height;
+	float fog_height_density;
+
+	vec3 fog_light_color;
+	float fog_sun_scatter;
+
+	float fog_aerial_perspective;
+
+	float time;
+	float reflection_multiplier; // one normally, zero when rendering reflections
+
+	bool pancake_shadows;
+}
+
+scene_data;
+
+struct InstanceData { // element type of the InstanceDataBuffer storage buffer declared right after this struct
+	mat4 transform;
+	uint flags; // NOTE(review): presumably a combination of the INSTANCE_FLAGS_* bits defined earlier in this file - confirm
+	uint instance_uniforms_ofs; // base offset in global buffer for instance variables
+	uint gi_offset; // GI information when using lightmapping (VCT or lightmap index)
+	uint layer_mask;
+	vec4 lightmap_uv_scale;
+};
+
+layout(set = 1, binding = 1, std430) buffer restrict readonly InstanceDataBuffer {
+	InstanceData data[];
+}
+instances;
 
 #ifdef USE_RADIANCE_CUBEMAP_ARRAY
 
-layout(set = 1, binding = 0) uniform textureCubeArray radiance_cubemap;
+layout(set = 1, binding = 2) uniform textureCubeArray radiance_cubemap;
 
 #else
 
-layout(set = 1, binding = 0) uniform textureCube radiance_cubemap;
+layout(set = 1, binding = 2) uniform textureCube radiance_cubemap;
 
 #endif
 
-/* Set 2, Reflection and Shadow Atlases (view dependent) */
-
-layout(set = 1, binding = 1) uniform textureCubeArray reflection_atlas;
+layout(set = 1, binding = 3) uniform textureCubeArray reflection_atlas;
 
-layout(set = 1, binding = 2) uniform texture2D shadow_atlas;
+layout(set = 1, binding = 4) uniform texture2D shadow_atlas;
 
-layout(set = 1, binding = 3) uniform texture2D directional_shadow_atlas;
+layout(set = 1, binding = 5) uniform texture2D directional_shadow_atlas;
 
-layout(set = 1, binding = 4) uniform texture2DArray lightmap_textures[MAX_LIGHTMAP_TEXTURES];
+layout(set = 1, binding = 6) uniform texture2DArray lightmap_textures[MAX_LIGHTMAP_TEXTURES];
 
-#ifndef LOW_END_MODE
-layout(set = 1, binding = 5) uniform texture3D gi_probe_textures[MAX_GI_PROBES];
+#ifndef LOW_END_MODE
+layout(set = 1, binding = 7) uniform texture3D gi_probe_textures[MAX_GI_PROBES]; // only declared when LOW_END_MODE is not defined
 #endif
 
-layout(set = 1, binding = 6, std430) buffer restrict readonly ClusterBuffer {
+layout(set = 1, binding = 8, std430) buffer restrict readonly ClusterBuffer {
 	uint data[];
 }
 cluster_buffer;
 
-/* Set 3, Render Buffers */
-
 #ifdef MODE_RENDER_SDF
 
-layout(r16ui, set = 1, binding = 7) uniform restrict writeonly uimage3D albedo_volume_grid;
-layout(r32ui, set = 1, binding = 8) uniform restrict writeonly uimage3D emission_grid;
-layout(r32ui, set = 1, binding = 9) uniform restrict writeonly uimage3D emission_aniso_grid;
-layout(r32ui, set = 1, binding = 10) uniform restrict uimage3D geom_facing_grid;
+layout(r16ui, set = 1, binding = 9) uniform restrict writeonly uimage3D albedo_volume_grid;
+layout(r32ui, set = 1, binding = 10) uniform restrict writeonly uimage3D emission_grid;
+layout(r32ui, set = 1, binding = 11) uniform restrict writeonly uimage3D emission_aniso_grid;
+layout(r32ui, set = 1, binding = 12) uniform restrict uimage3D geom_facing_grid;
 
 //still need to be present for shaders that use it, so remap them to something
 #define depth_buffer shadow_atlas
@@ -295,17 +303,17 @@ layout(r32ui, set = 1, binding = 10) uniform restrict uimage3D geom_facing_grid;
 
 #else
 
-layout(set = 1, binding = 7) uniform texture2D depth_buffer;
-layout(set = 1, binding = 8) uniform texture2D color_buffer;
+layout(set = 1, binding = 9) uniform texture2D depth_buffer;
+layout(set = 1, binding = 10) uniform texture2D color_buffer;
 
 #ifndef LOW_END_MODE
 
-layout(set = 1, binding = 9) uniform texture2D normal_roughness_buffer;
-layout(set = 1, binding = 10) uniform texture2D ao_buffer;
-layout(set = 1, binding = 11) uniform texture2D ambient_buffer;
-layout(set = 1, binding = 12) uniform texture2D reflection_buffer;
-layout(set = 1, binding = 13) uniform texture2DArray sdfgi_lightprobe_texture;
-layout(set = 1, binding = 14) uniform texture3D sdfgi_occlusion_cascades;
+layout(set = 1, binding = 11) uniform texture2D normal_roughness_buffer;
+layout(set = 1, binding = 12) uniform texture2D ao_buffer;
+layout(set = 1, binding = 13) uniform texture2D ambient_buffer;
+layout(set = 1, binding = 14) uniform texture2D reflection_buffer;
+layout(set = 1, binding = 15) uniform texture2DArray sdfgi_lightprobe_texture;
+layout(set = 1, binding = 16) uniform texture3D sdfgi_occlusion_cascades;
 
 struct GIProbeData {
 	mat4 xform;
@@ -323,22 +331,22 @@ struct GIProbeData {
 	uint mipmaps;
 };
 
-layout(set = 1, binding = 15, std140) uniform GIProbes {
+layout(set = 1, binding = 17, std140) uniform GIProbes {
 	GIProbeData data[MAX_GI_PROBES];
 }
 gi_probes;
 
-layout(set = 1, binding = 16) uniform texture3D volumetric_fog_texture;
+layout(set = 1, binding = 18) uniform texture3D volumetric_fog_texture;
 
 #endif // LOW_END_MODE
 
 #endif
 
-/* Set 4 Skeleton & Instancing (Multimesh) */
+/* Set 2 Skeleton & Instancing (can change per item) */
 
 layout(set = 2, binding = 0, std430) restrict readonly buffer Transforms {
 	vec4 data[];
 }
 transforms;
 
-/* Set 5 User Material */
+/* Set 3 User Material */

+ 61 - 3
servers/rendering/renderer_rd/shaders/shadow_reduce.glsl

@@ -6,8 +6,20 @@ VERSION_DEFINES
 
 #define BLOCK_SIZE 8
 
+#ifdef MODE_REDUCE_SUBGROUP
+
+#extension GL_KHR_shader_subgroup_ballot : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+
+//nvidia friendly, max 32
+layout(local_size_x = 8, local_size_y = 4, local_size_z = 1) in;
+
+#else
+
 layout(local_size_x = BLOCK_SIZE, local_size_y = BLOCK_SIZE, local_size_z = 1) in;
 
+#endif
+
 #ifdef MODE_REDUCE
 
 shared float tmp_data[BLOCK_SIZE * BLOCK_SIZE];
@@ -16,8 +28,12 @@ const uint unswizzle_table[BLOCK_SIZE] = uint[](0, 0, 0, 1, 0, 2, 1, 3);
 
 #endif
 
-layout(r32f, set = 0, binding = 0) uniform restrict readonly image2D source_depth;
-layout(r32f, set = 0, binding = 1) uniform restrict writeonly image2D dst_depth;
+#if defined(MODE_REDUCE) || defined(MODE_REDUCE_SUBGROUP)
+layout(set = 0, binding = 0) uniform sampler2D source_depth;
+#else
+layout(r16, set = 0, binding = 0) uniform restrict readonly image2D source_depth;
+#endif
+layout(r16, set = 1, binding = 0) uniform restrict writeonly image2D dst_depth;
 
 layout(push_constant, binding = 1, std430) uniform Params {
 	ivec2 source_size;
@@ -29,6 +45,48 @@ layout(push_constant, binding = 1, std430) uniform Params {
 params;
 
 void main() {
+#ifdef MODE_REDUCE_SUBGROUP
+
+	uvec2 local_pos = gl_LocalInvocationID.xy;
+	ivec2 image_offset = params.source_offset;
+	ivec2 image_pos = image_offset + ivec2(gl_GlobalInvocationID.xy * ivec2(1, 2));
+
+	float depth = texelFetch(source_depth, min(image_pos, params.source_size - ivec2(1)), 0).r;
+	depth += texelFetch(source_depth, min(image_pos + ivec2(0, 1), params.source_size - ivec2(1)), 0).r;
+	depth *= 0.5;
+
+#ifdef MODE_REDUCE_8
+	//fast version, reduce all
+	float depth_average = subgroupAdd(depth) / 32.0;
+	if (local_pos == uvec2(0)) {
+		imageStore(dst_depth, image_pos / 8, vec4(depth_average));
+	}
+#else
+	//bit slower version, reduce by regions
+	uint group_size = (8 / params.min_size);
+	uvec2 group_id = local_pos / (8 / params.min_size);
+
+	uvec4 mask;
+	float depth_average = 0;
+
+	while (true) {
+		uvec2 first = subgroupBroadcastFirst(group_id);
+		mask = subgroupBallot(first == group_id);
+		if (first == group_id) {
+			depth_average = subgroupAdd(depth);
+			break;
+		}
+	}
+
+	depth_average /= float(group_size * group_size);
+
+	if (local_pos == group_id) {
+		imageStore(dst_depth, image_pos / int(group_size), vec4(depth_average));
+	}
+#endif
+
+#endif
+
 #ifdef MODE_REDUCE
 
 	uvec2 pos = gl_LocalInvocationID.xy;
@@ -36,7 +94,7 @@ void main() {
 	ivec2 image_offset = params.source_offset;
 	ivec2 image_pos = image_offset + ivec2(gl_GlobalInvocationID.xy);
 	uint dst_t = swizzle_table[pos.y] * BLOCK_SIZE + swizzle_table[pos.x];
-	tmp_data[dst_t] = imageLoad(source_depth, min(image_pos, params.source_size - ivec2(1))).r;
+	tmp_data[dst_t] = texelFetch(source_depth, min(image_pos, params.source_size - ivec2(1)), 0).r;
 	ivec2 image_size = params.source_size;
 
 	uint t = pos.y * BLOCK_SIZE + pos.x;

+ 121 - 87
servers/rendering/renderer_scene_cull.cpp

@@ -1906,6 +1906,9 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 			RS::LightOmniShadowMode shadow_mode = RSG::storage->light_omni_get_shadow_mode(p_instance->base);
 
 			if (shadow_mode == RS::LIGHT_OMNI_SHADOW_DUAL_PARABOLOID || !scene_render->light_instances_can_render_shadow_cube()) {
+				if (max_shadows_used + 2 > MAX_UPDATE_SHADOWS) {
+					return true;
+				}
 				for (int i = 0; i < 2; i++) {
 					//using this one ensures that raster deferred will have it
 					RENDER_TIMESTAMP("Culling Shadow Paraboloid" + itos(i));
@@ -1922,7 +1925,6 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 					planes.write[4] = light_transform.xform(Plane(Vector3(0, -1, z).normalized(), radius));
 					planes.write[5] = light_transform.xform(Plane(Vector3(0, 0, -z), 0));
 
-					geometry_instances_to_shadow_render.clear();
 					instance_shadow_cull_result.clear();
 
 					Vector<Vector3> points = Geometry3D::compute_convex_mesh_points(&planes[0], planes.size());
@@ -1943,6 +1945,8 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 
 					Plane near_plane(light_transform.origin, light_transform.basis.get_axis(2) * z);
 
+					RendererSceneRender::RenderShadowData &shadow_data = render_shadow_data[max_shadows_used++];
+
 					for (int j = 0; j < (int)instance_shadow_cull_result.size(); j++) {
 						Instance *instance = instance_shadow_cull_result[j];
 						if (!instance->visible || !((1 << instance->base_type) & RS::INSTANCE_GEOMETRY_MASK) || !static_cast<InstanceGeometryData *>(instance->base_data)->can_cast_shadows) {
@@ -1957,16 +1961,21 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 							}
 						}
 
-						geometry_instances_to_shadow_render.push_back(static_cast<InstanceGeometryData *>(instance->base_data)->geometry_instance);
+						shadow_data.instances.push_back(static_cast<InstanceGeometryData *>(instance->base_data)->geometry_instance);
 					}
 
 					RSG::storage->update_mesh_instances();
 
 					scene_render->light_instance_set_shadow_transform(light->instance, CameraMatrix(), light_transform, radius, 0, i, 0);
-					scene_render->render_shadow(light->instance, p_shadow_atlas, i, geometry_instances_to_shadow_render);
+					shadow_data.light = light->instance;
+					shadow_data.pass = i;
 				}
 			} else { //shadow cube
 
+				if (max_shadows_used + 6 > MAX_UPDATE_SHADOWS) {
+					return true;
+				}
+
 				real_t radius = RSG::storage->light_get_param(p_instance->base, RS::LIGHT_PARAM_RANGE);
 				CameraMatrix cm;
 				cm.set_perspective(90, 1, 0.01, radius);
@@ -1996,7 +2005,6 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 
 					Vector<Plane> planes = cm.get_projection_planes(xform);
 
-					geometry_instances_to_shadow_render.clear();
 					instance_shadow_cull_result.clear();
 
 					Vector<Vector3> points = Geometry3D::compute_convex_mesh_points(&planes[0], planes.size());
@@ -2015,7 +2023,7 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 
 					p_scenario->indexers[Scenario::INDEXER_GEOMETRY].convex_query(planes.ptr(), planes.size(), points.ptr(), points.size(), cull_convex);
 
-					Plane near_plane(xform.origin, -xform.basis.get_axis(2));
+					RendererSceneRender::RenderShadowData &shadow_data = render_shadow_data[max_shadows_used++];
 
 					for (int j = 0; j < (int)instance_shadow_cull_result.size(); j++) {
 						Instance *instance = instance_shadow_cull_result[j];
@@ -2030,22 +2038,28 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 							}
 						}
 
-						geometry_instances_to_shadow_render.push_back(static_cast<InstanceGeometryData *>(instance->base_data)->geometry_instance);
+						shadow_data.instances.push_back(static_cast<InstanceGeometryData *>(instance->base_data)->geometry_instance);
 					}
 
 					RSG::storage->update_mesh_instances();
 					scene_render->light_instance_set_shadow_transform(light->instance, cm, xform, radius, 0, i, 0);
-					scene_render->render_shadow(light->instance, p_shadow_atlas, i, geometry_instances_to_shadow_render);
+
+					shadow_data.light = light->instance;
+					shadow_data.pass = i;
 				}
 
 				//restore the regular DP matrix
-				scene_render->light_instance_set_shadow_transform(light->instance, CameraMatrix(), light_transform, radius, 0, 0, 0);
+				//scene_render->light_instance_set_shadow_transform(light->instance, CameraMatrix(), light_transform, radius, 0, 0, 0);
 			}
 
 		} break;
 		case RS::LIGHT_SPOT: {
 			RENDER_TIMESTAMP("Culling Spot Light");
 
+			if (max_shadows_used + 1 > MAX_UPDATE_SHADOWS) {
+				return true;
+			}
+
 			real_t radius = RSG::storage->light_get_param(p_instance->base, RS::LIGHT_PARAM_RANGE);
 			real_t angle = RSG::storage->light_get_param(p_instance->base, RS::LIGHT_PARAM_SPOT_ANGLE);
 
@@ -2054,7 +2068,6 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 
 			Vector<Plane> planes = cm.get_projection_planes(light_transform);
 
-			geometry_instances_to_shadow_render.clear();
 			instance_shadow_cull_result.clear();
 
 			Vector<Vector3> points = Geometry3D::compute_convex_mesh_points(&planes[0], planes.size());
@@ -2073,7 +2086,7 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 
 			p_scenario->indexers[Scenario::INDEXER_GEOMETRY].convex_query(planes.ptr(), planes.size(), points.ptr(), points.size(), cull_convex);
 
-			Plane near_plane(light_transform.origin, -light_transform.basis.get_axis(2));
+			RendererSceneRender::RenderShadowData &shadow_data = render_shadow_data[max_shadows_used++];
 
 			for (int j = 0; j < (int)instance_shadow_cull_result.size(); j++) {
 				Instance *instance = instance_shadow_cull_result[j];
@@ -2088,13 +2101,14 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 						RSG::storage->mesh_instance_check_for_update(instance->mesh_instance);
 					}
 				}
-				geometry_instances_to_shadow_render.push_back(static_cast<InstanceGeometryData *>(instance->base_data)->geometry_instance);
+				shadow_data.instances.push_back(static_cast<InstanceGeometryData *>(instance->base_data)->geometry_instance);
 			}
 
 			RSG::storage->update_mesh_instances();
 
 			scene_render->light_instance_set_shadow_transform(light->instance, cm, light_transform, radius, 0, 0, 0);
-			scene_render->render_shadow(light->instance, p_shadow_atlas, 0, geometry_instances_to_shadow_render);
+			shadow_data.light = light->instance;
+			shadow_data.pass = 0;
 
 		} break;
 	}
@@ -2147,14 +2161,13 @@ void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_
 
 	RID environment = _render_get_environment(p_camera, p_scenario);
 
-	_prepare_scene(camera->transform, camera_matrix, ortho, camera->vaspect, p_render_buffers, environment, camera->visible_layers, p_scenario, p_shadow_atlas, RID(), p_screen_lod_threshold);
-	_render_scene(p_render_buffers, camera->transform, camera_matrix, ortho, environment, camera->effects, p_scenario, p_shadow_atlas, RID(), -1, p_screen_lod_threshold);
+	_render_scene(camera->transform, camera_matrix, ortho, camera->vaspect, p_render_buffers, environment, camera->effects, camera->visible_layers, p_scenario, p_shadow_atlas, RID(), -1, p_screen_lod_threshold);
 #endif
 }
 
 void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
 	// render for AR/VR interface
-
+#if 0
 	Camera *camera = camera_owner.getornull(p_camera);
 	ERR_FAIL_COND(!camera);
 
@@ -2234,6 +2247,7 @@ void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_
 
 	// And render our scene...
 	_render_scene(p_render_buffers, cam_transform, camera_matrix, false, environment, camera->effects, p_scenario, p_shadow_atlas, RID(), -1, p_screen_lod_threshold);
+#endif
 };
 
 void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, FrustumCullData *cull_data) {
@@ -2452,7 +2466,7 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes
 	}
 }
 
-void RendererSceneCull::_prepare_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, float p_screen_lod_threshold, bool p_using_shadows) {
+void RendererSceneCull::_render_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows) {
 	// Note, in stereo rendering:
 	// - p_cam_transform will be a transform in the middle of our two eyes
 	// - p_cam_projection is a wider frustrum that encompasses both eyes
@@ -2466,6 +2480,7 @@ void RendererSceneCull::_prepare_scene(const Transform p_cam_transform, const Ca
 	scene_render->set_scene_pass(render_pass);
 
 	if (p_render_buffers.is_valid()) {
+		//no rendering code here, this is only to set up what needs to be done, request regions, etc.
 		scene_render->sdfgi_update(p_render_buffers, p_environment, p_cam_transform.origin); //update conditions for SDFGI (whether its used or not)
 	}
 
@@ -2596,62 +2611,28 @@ void RendererSceneCull::_prepare_scene(const Transform p_cam_transform, const Ca
 
 	//render shadows
 
-	for (uint32_t i = 0; i < cull.shadow_count; i++) {
-		for (uint32_t j = 0; j < cull.shadows[i].cascade_count; j++) {
-			const Cull::Shadow::Cascade &c = cull.shadows[i].cascades[j];
-			//			print_line("shadow " + itos(i) + " cascade " + itos(j) + " elements: " + itos(c.cull_result.size()));
-			scene_render->light_instance_set_shadow_transform(cull.shadows[i].light_instance, c.projection, c.transform, c.zfar, c.split, j, c.shadow_texel_size, c.bias_scale, c.range_begin, c.uv_scale);
-			scene_render->render_shadow(cull.shadows[i].light_instance, p_shadow_atlas, j, frustum_cull_result.directional_shadows[i].cascade_geometry_instances[j], near_plane, p_cam_projection.get_lod_multiplier(), p_screen_lod_threshold);
-		}
-	}
+	max_shadows_used = 0;
 
-	//render SDFGI
+	if (p_using_shadows) { //setup shadow maps
 
-	{
-		if (cull.sdfgi.region_count > 0) {
-			//update regions
-			for (uint32_t i = 0; i < cull.sdfgi.region_count; i++) {
-				scene_render->render_sdfgi(p_render_buffers, i, frustum_cull_result.sdfgi_region_geometry_instances[i]);
-			}
-			//check if static lights were culled
-			bool static_lights_culled = false;
-			for (uint32_t i = 0; i < cull.sdfgi.cascade_light_count; i++) {
-				if (frustum_cull_result.sdfgi_cascade_lights[i].size()) {
-					static_lights_culled = true;
-					break;
-				}
-			}
+		// Directional Shadows
 
-			if (static_lights_culled) {
-				scene_render->render_sdfgi_static_lights(p_render_buffers, cull.sdfgi.cascade_light_count, cull.sdfgi.cascade_light_index, frustum_cull_result.sdfgi_cascade_lights);
+		for (uint32_t i = 0; i < cull.shadow_count; i++) {
+			for (uint32_t j = 0; j < cull.shadows[i].cascade_count; j++) {
+				const Cull::Shadow::Cascade &c = cull.shadows[i].cascades[j];
+				//			print_line("shadow " + itos(i) + " cascade " + itos(j) + " elements: " + itos(c.cull_result.size()));
+				scene_render->light_instance_set_shadow_transform(cull.shadows[i].light_instance, c.projection, c.transform, c.zfar, c.split, j, c.shadow_texel_size, c.bias_scale, c.range_begin, c.uv_scale);
+				if (max_shadows_used == MAX_UPDATE_SHADOWS) {
+					continue;
+				}
+				render_shadow_data[max_shadows_used].light = cull.shadows[i].light_instance;
+				render_shadow_data[max_shadows_used].pass = j;
+				render_shadow_data[max_shadows_used].instances.merge_unordered(frustum_cull_result.directional_shadows[i].cascade_geometry_instances[j]);
+				max_shadows_used++;
 			}
 		}
 
-		if (p_render_buffers.is_valid()) {
-			scene_render->sdfgi_update_probes(p_render_buffers, p_environment, directional_lights, scenario->dynamic_lights.ptr(), scenario->dynamic_lights.size());
-		}
-	}
-
-	//light_samplers_culled=0;
-
-	/*
-	print_line("OT: "+rtos( (OS::get_singleton()->get_ticks_usec()-t)/1000.0));
-	print_line("OTO: "+itos(p_scenario->octree.get_octant_count()));
-	print_line("OTE: "+itos(p_scenario->octree.get_elem_count()));
-	print_line("OTP: "+itos(p_scenario->octree.get_pair_count()));
-	*/
-
-	/* STEP 3 - PROCESS PORTALS, VALIDATE ROOMS */
-	//removed, will replace with culling
-
-	/* STEP 4 - REMOVE FURTHER CULLED OBJECTS, ADD LIGHTS */
-
-	/* STEP 5 - PROCESS POSITIONAL LIGHTS */
-
-	if (p_using_shadows) { //setup shadow maps
-
-		//SortArray<Instance*,_InstanceLightsort> sorter;
-		//sorter.sort(light_cull_result,light_cull_count);
+		// Positional Shadows
 		for (uint32_t i = 0; i < (uint32_t)frustum_cull_result.lights.size(); i++) {
 			Instance *ins = frustum_cull_result.lights[i];
 
@@ -2738,12 +2719,49 @@ void RendererSceneCull::_prepare_scene(const Transform p_cam_transform, const Ca
 
 			bool redraw = scene_render->shadow_atlas_update_light(p_shadow_atlas, light->instance, coverage, light->last_version);
 
-			if (redraw) {
+			if (redraw && max_shadows_used < MAX_UPDATE_SHADOWS) {
 				//must redraw!
 				RENDER_TIMESTAMP(">Rendering Light " + itos(i));
 				light->shadow_dirty = _light_instance_update_shadow(ins, p_cam_transform, p_cam_projection, p_cam_orthogonal, p_cam_vaspect, p_shadow_atlas, scenario, p_screen_lod_threshold);
 				RENDER_TIMESTAMP("<Rendering Light " + itos(i));
+			} else {
+				light->shadow_dirty = redraw;
+			}
+		}
+	}
+
+	//render SDFGI
+
+	{
+		sdfgi_update_data.update_static = false;
+
+		if (cull.sdfgi.region_count > 0) {
+			//update regions
+			for (uint32_t i = 0; i < cull.sdfgi.region_count; i++) {
+				render_sdfgi_data[i].instances.merge_unordered(frustum_cull_result.sdfgi_region_geometry_instances[i]);
+				render_sdfgi_data[i].region = i;
 			}
+			//check if static lights were culled
+			bool static_lights_culled = false;
+			for (uint32_t i = 0; i < cull.sdfgi.cascade_light_count; i++) {
+				if (frustum_cull_result.sdfgi_cascade_lights[i].size()) {
+					static_lights_culled = true;
+					break;
+				}
+			}
+
+			if (static_lights_culled) {
+				sdfgi_update_data.static_cascade_count = cull.sdfgi.cascade_light_count;
+				sdfgi_update_data.static_cascade_indices = cull.sdfgi.cascade_light_index;
+				sdfgi_update_data.static_positional_lights = frustum_cull_result.sdfgi_cascade_lights;
+				sdfgi_update_data.update_static = true;
+			}
+		}
+
+		if (p_render_buffers.is_valid()) {
+			sdfgi_update_data.directional_lights = &directional_lights;
+			sdfgi_update_data.positional_light_instances = scenario->dynamic_lights.ptr();
+			sdfgi_update_data.positional_light_count = scenario->dynamic_lights.size();
 		}
 	}
 
@@ -2751,6 +2769,28 @@ void RendererSceneCull::_prepare_scene(const Transform p_cam_transform, const Ca
 	for (int i = 0; i < directional_lights.size(); i++) {
 		frustum_cull_result.light_instances.push_back(directional_lights[i]);
 	}
+
+	RID camera_effects;
+	if (p_force_camera_effects.is_valid()) {
+		camera_effects = p_force_camera_effects;
+	} else {
+		camera_effects = scenario->camera_effects;
+	}
+	/* PROCESS GEOMETRY AND DRAW SCENE */
+
+	RENDER_TIMESTAMP("Render Scene ");
+	scene_render->render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_orthogonal, frustum_cull_result.geometry_instances, frustum_cull_result.light_instances, frustum_cull_result.reflections, frustum_cull_result.gi_probes, frustum_cull_result.decals, frustum_cull_result.lightmaps, p_environment, camera_effects, p_shadow_atlas, p_reflection_probe.is_valid() ? RID() : scenario->reflection_atlas, p_reflection_probe, p_reflection_probe_pass, p_screen_lod_threshold, render_shadow_data, max_shadows_used, render_sdfgi_data, cull.sdfgi.region_count, &sdfgi_update_data);
+
+	for (uint32_t i = 0; i < max_shadows_used; i++) {
+		render_shadow_data[i].instances.clear();
+	}
+	max_shadows_used = 0;
+
+	for (uint32_t i = 0; i < cull.sdfgi.region_count; i++) {
+		render_sdfgi_data[i].instances.clear();
+	}
+
+	//	virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold,const RenderShadowData *p_render_shadows,int p_render_shadow_count,const RenderSDFGIData *p_render_sdfgi_regions,int p_render_sdfgi_region_count,const RenderSDFGIStaticLightData *p_render_sdfgi_static_lights=nullptr) = 0;
 }
 
 RID RendererSceneCull::_render_get_environment(RID p_camera, RID p_scenario) {
@@ -2774,21 +2814,6 @@ RID RendererSceneCull::_render_get_environment(RID p_camera, RID p_scenario) {
 	return RID();
 }
 
-void RendererSceneCull::_render_scene(RID p_render_buffers, const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, RID p_environment, RID p_force_camera_effects, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold) {
-	Scenario *scenario = scenario_owner.getornull(p_scenario);
-
-	RID camera_effects;
-	if (p_force_camera_effects.is_valid()) {
-		camera_effects = p_force_camera_effects;
-	} else {
-		camera_effects = scenario->camera_effects;
-	}
-	/* PROCESS GEOMETRY AND DRAW SCENE */
-
-	RENDER_TIMESTAMP("Render Scene ");
-	scene_render->render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_orthogonal, frustum_cull_result.geometry_instances, frustum_cull_result.light_instances, frustum_cull_result.reflections, frustum_cull_result.gi_probes, frustum_cull_result.decals, frustum_cull_result.lightmaps, p_environment, camera_effects, p_shadow_atlas, p_reflection_probe.is_valid() ? RID() : scenario->reflection_atlas, p_reflection_probe, p_reflection_probe_pass, p_screen_lod_threshold);
-}
-
 void RendererSceneCull::render_empty_scene(RID p_render_buffers, RID p_scenario, RID p_shadow_atlas) {
 #ifndef _3D_DISABLED
 
@@ -2801,7 +2826,7 @@ void RendererSceneCull::render_empty_scene(RID p_render_buffers, RID p_scenario,
 		environment = scenario->fallback_environment;
 	}
 	RENDER_TIMESTAMP("Render Empty Scene ");
-	scene_render->render_scene(p_render_buffers, Transform(), CameraMatrix(), true, PagedArray<RendererSceneRender::GeometryInstance *>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), RID(), RID(), p_shadow_atlas, scenario->reflection_atlas, RID(), 0, 0);
+	scene_render->render_scene(p_render_buffers, Transform(), CameraMatrix(), true, PagedArray<RendererSceneRender::GeometryInstance *>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), RID(), RID(), p_shadow_atlas, scenario->reflection_atlas, RID(), 0, 0, nullptr, 0, nullptr, 0, nullptr);
 #endif
 }
 
@@ -2864,8 +2889,7 @@ bool RendererSceneCull::_render_reflection_probe_step(Instance *p_instance, int
 		}
 
 		RENDER_TIMESTAMP("Render Reflection Probe, Step " + itos(p_step));
-		_prepare_scene(xform, cm, false, false, RID(), RID(), RSG::storage->reflection_probe_get_cull_mask(p_instance->base), p_instance->scenario->self, shadow_atlas, reflection_probe->instance, lod_threshold, use_shadows);
-		_render_scene(RID(), xform, cm, false, RID(), RID(), p_instance->scenario->self, shadow_atlas, reflection_probe->instance, p_step, lod_threshold);
+		_render_scene(xform, cm, false, false, RID(), RID(), RID(), RSG::storage->reflection_probe_get_cull_mask(p_instance->base), p_instance->scenario->self, shadow_atlas, reflection_probe->instance, p_step, lod_threshold, use_shadows);
 
 	} else {
 		//do roughness postprocess step until it believes it's done
@@ -3493,7 +3517,12 @@ RendererSceneCull::RendererSceneCull() {
 	instance_cull_result.set_page_pool(&instance_cull_page_pool);
 	instance_shadow_cull_result.set_page_pool(&instance_cull_page_pool);
 
-	geometry_instances_to_shadow_render.set_page_pool(&geometry_instance_cull_page_pool);
+	for (uint32_t i = 0; i < MAX_UPDATE_SHADOWS; i++) {
+		render_shadow_data[i].instances.set_page_pool(&geometry_instance_cull_page_pool);
+	}
+	for (uint32_t i = 0; i < SDFGI_MAX_CASCADES * SDFGI_MAX_REGIONS_PER_CASCADE; i++) {
+		render_sdfgi_data[i].instances.set_page_pool(&geometry_instance_cull_page_pool);
+	}
 
 	frustum_cull_result.init(&rid_cull_page_pool, &geometry_instance_cull_page_pool, &instance_cull_page_pool);
 	frustum_cull_result_threads.resize(RendererThreadPool::singleton->thread_work_pool.get_thread_count());
@@ -3510,7 +3539,12 @@ RendererSceneCull::~RendererSceneCull() {
 	instance_cull_result.reset();
 	instance_shadow_cull_result.reset();
 
-	geometry_instances_to_shadow_render.reset();
+	for (uint32_t i = 0; i < MAX_UPDATE_SHADOWS; i++) {
+		render_shadow_data[i].instances.reset();
+	}
+	for (uint32_t i = 0; i < SDFGI_MAX_CASCADES * SDFGI_MAX_REGIONS_PER_CASCADE; i++) {
+		render_sdfgi_data[i].instances.reset();
+	}
 
 	frustum_cull_result.reset();
 	for (uint32_t i = 0; i < frustum_cull_result_threads.size(); i++) {

+ 9 - 4
servers/rendering/renderer_scene_cull.h

@@ -54,7 +54,8 @@ public:
 	enum {
 		SDFGI_MAX_CASCADES = 8,
 		SDFGI_MAX_REGIONS_PER_CASCADE = 3,
-		MAX_INSTANCE_PAIRS = 32
+		MAX_INSTANCE_PAIRS = 32,
+		MAX_UPDATE_SHADOWS = 512 // Length of the fixed-size render_shadow_data array declared further down in this class.
 	};
 
 	uint64_t render_pass;
@@ -696,7 +697,6 @@ public:
 
 	PagedArray<Instance *> instance_cull_result;
 	PagedArray<Instance *> instance_shadow_cull_result;
-	PagedArray<RendererSceneRender::GeometryInstance *> geometry_instances_to_shadow_render;
 
 	struct FrustumCullResult {
 		PagedArray<RendererSceneRender::GeometryInstance *> geometry_instances;
@@ -816,6 +816,12 @@ public:
 	FrustumCullResult frustum_cull_result;
 	LocalVector<FrustumCullResult> frustum_cull_result_threads;
 
+	RendererSceneRender::RenderShadowData render_shadow_data[MAX_UPDATE_SHADOWS];
+	uint32_t max_shadows_used = 0;
+
+	RendererSceneRender::RenderSDFGIData render_sdfgi_data[SDFGI_MAX_CASCADES * SDFGI_MAX_REGIONS_PER_CASCADE];
+	RendererSceneRender::RenderSDFGIUpdateData sdfgi_update_data;
+
 	uint32_t thread_cull_threshold = 200;
 
 	RID_PtrOwner<Instance> instance_owner;
@@ -924,8 +930,7 @@ public:
 	void _frustum_cull(FrustumCullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to);
 
 	bool _render_reflection_probe_step(Instance *p_instance, int p_step);
-	void _prepare_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, float p_screen_lod_threshold, bool p_using_shadows = true);
-	void _render_scene(RID p_render_buffers, const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, RID p_environment, RID p_force_camera_effects, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold);
+	void _render_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows = true);
 	void render_empty_scene(RID p_render_buffers, RID p_scenario, RID p_shadow_atlas);
 
 	void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);

+ 23 - 5
servers/rendering/renderer_scene_render.h

@@ -87,7 +87,6 @@ public:
 	virtual int sdfgi_get_pending_region_count(RID p_render_buffers) const = 0;
 	virtual AABB sdfgi_get_pending_region_bounds(RID p_render_buffers, int p_region) const = 0;
 	virtual uint32_t sdfgi_get_pending_region_cascade(RID p_render_buffers, int p_region) const = 0;
-	virtual void sdfgi_update_probes(RID p_render_buffers, RID p_environment, const Vector<RID> &p_directional_lights, const RID *p_positional_light_instances, uint32_t p_positional_light_count) = 0;
 
 	/* SKY API */
 
@@ -195,12 +194,31 @@ public:
 
 	virtual void gi_probe_set_quality(RS::GIProbeQuality) = 0;
 
-	virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold) = 0;
+	struct RenderShadowData { // One shadow render request; render_scene() receives an array of these through p_render_shadows / p_render_shadow_count.
+		RID light; // Same role as the p_light argument of the render_shadow() overload this patch removes.
+		int pass = 0; // Same role as the removed render_shadow()'s p_pass argument.
+		PagedArray<GeometryInstance *> instances; // Same role as the removed render_shadow()'s p_instances argument.
+	};
+
+	struct RenderSDFGIData { // One SDFGI region render request; render_scene() receives an array of these through p_render_sdfgi_regions / p_render_sdfgi_region_count.
+		int region = 0; // Same role as the p_region argument of the render_sdfgi() method this patch removes.
+		PagedArray<GeometryInstance *> instances; // Same role as the removed render_sdfgi()'s p_instances argument.
+	};
+
+	struct RenderSDFGIUpdateData { // SDFGI update inputs handed to render_scene() via p_sdfgi_update_data (which may be nullptr). Every member has a default so a default-constructed instance never holds indeterminate pointers or counts.
+		bool update_static = false; // Presumably gates use of the three static_* members below; confirm against the renderer implementation.
+		uint32_t static_cascade_count = 0; // Same role as p_cascade_count of the removed render_sdfgi_static_lights().
+		uint32_t *static_cascade_indices = nullptr; // Same role as p_cascade_indices of the removed render_sdfgi_static_lights(); not owned by this struct as far as this header shows.
+		PagedArray<RID> *static_positional_lights = nullptr; // Same role as p_positional_lights of the removed render_sdfgi_static_lights().
+
+		const Vector<RID> *directional_lights = nullptr; // Same role as p_directional_lights of the removed sdfgi_update_probes().
+		const RID *positional_light_instances = nullptr; // Same role as p_positional_light_instances of the removed sdfgi_update_probes().
+		uint32_t positional_light_count = 0; // Number of entries reachable through positional_light_instances.
+	};
+
+	virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) = 0;
 
-	virtual void render_shadow(RID p_light, RID p_shadow_atlas, int p_pass, const PagedArray<GeometryInstance *> &p_instances, const Plane &p_camera_plane = Plane(), float p_lod_distance_multiplier = 0, float p_screen_lod_threshold = 0.0) = 0;
 	virtual void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region) = 0;
-	virtual void render_sdfgi(RID p_render_buffers, int p_region, const PagedArray<GeometryInstance *> &p_instances) = 0;
-	virtual void render_sdfgi_static_lights(RID p_render_buffers, uint32_t p_cascade_count, const uint32_t *p_cascade_indices, const PagedArray<RID> *p_positional_lights) = 0;
 	virtual void render_particle_collider_heightfield(RID p_collider, const Transform &p_transform, const PagedArray<GeometryInstance *> &p_instances) = 0;
 
 	virtual void set_scene_pass(uint64_t p_pass) = 0;

+ 1 - 0
servers/rendering/renderer_storage.h

@@ -98,6 +98,7 @@ public:
 
 			while (to_clean_up.size()) {
 				to_clean_up.front()->get().first->instances.erase(to_clean_up.front()->get().second);
+				dependencies.erase(to_clean_up.front()->get().first);
 				to_clean_up.pop_front();
 			}
 		}

+ 7 - 5
servers/rendering/rendering_device.cpp

@@ -240,10 +240,6 @@ void RenderingDevice::_compute_list_set_push_constant(ComputeListID p_list, cons
 	compute_list_set_push_constant(p_list, p_data.ptr(), p_data_size);
 }
 
-void RenderingDevice::compute_list_dispatch_threads(ComputeListID p_list, uint32_t p_x_threads, uint32_t p_y_threads, uint32_t p_z_threads, uint32_t p_x_local_group, uint32_t p_y_local_group, uint32_t p_z_local_group) { // Old base-class helper: turns per-axis thread counts into group counts and forwards to compute_list_dispatch().
-	compute_list_dispatch(p_list, (p_x_threads - 1) / p_x_local_group + 1, (p_y_threads - 1) / p_y_local_group + 1, (p_z_threads - 1) / p_z_local_group + 1); // Per-axis ceiling division. NOTE(review): a thread count of 0 wraps around in unsigned arithmetic, and a local group size of 0 divides by zero.
-}
-
 void RenderingDevice::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("texture_create", "format", "view", "data"), &RenderingDevice::_texture_create, DEFVAL(Array()));
 	ClassDB::bind_method(D_METHOD("texture_create_shared", "view", "with_texture"), &RenderingDevice::_texture_create_shared);
@@ -319,7 +315,7 @@ void RenderingDevice::_bind_methods() {
 
 	ClassDB::bind_method(D_METHOD("draw_list_end", "post_barrier"), &RenderingDevice::draw_list_end, DEFVAL(BARRIER_MASK_ALL));
 
-	ClassDB::bind_method(D_METHOD("compute_list_begin"), &RenderingDevice::compute_list_begin);
+	ClassDB::bind_method(D_METHOD("compute_list_begin", "allow_draw_overlap"), &RenderingDevice::compute_list_begin, DEFVAL(false));
 	ClassDB::bind_method(D_METHOD("compute_list_bind_compute_pipeline", "compute_list", "compute_pipeline"), &RenderingDevice::compute_list_bind_compute_pipeline);
 	ClassDB::bind_method(D_METHOD("compute_list_set_push_constant", "compute_list", "buffer", "size_bytes"), &RenderingDevice::_compute_list_set_push_constant);
 	ClassDB::bind_method(D_METHOD("compute_list_bind_uniform_set", "compute_list", "uniform_set", "set_index"), &RenderingDevice::compute_list_bind_uniform_set);
@@ -352,10 +348,15 @@ void RenderingDevice::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("draw_command_insert_label", "name", "color"), &RenderingDevice::draw_command_insert_label);
 	ClassDB::bind_method(D_METHOD("draw_command_end_label"), &RenderingDevice::draw_command_end_label);
 
+	ClassDB::bind_method(D_METHOD("get_device_vendor_name"), &RenderingDevice::get_device_vendor_name);
+	ClassDB::bind_method(D_METHOD("get_device_name"), &RenderingDevice::get_device_name);
+	ClassDB::bind_method(D_METHOD("get_device_pipeline_cache_uuid"), &RenderingDevice::get_device_pipeline_cache_uuid);
+
 	BIND_CONSTANT(BARRIER_MASK_RASTER);
 	BIND_CONSTANT(BARRIER_MASK_COMPUTE);
 	BIND_CONSTANT(BARRIER_MASK_TRANSFER);
 	BIND_CONSTANT(BARRIER_MASK_ALL);
+	BIND_CONSTANT(BARRIER_MASK_NO_BARRIER);
 
 	BIND_ENUM_CONSTANT(DATA_FORMAT_R4G4_UNORM_PACK8);
 	BIND_ENUM_CONSTANT(DATA_FORMAT_R4G4B4A4_UNORM_PACK16);
@@ -760,6 +761,7 @@ void RenderingDevice::_bind_methods() {
 
 	BIND_ENUM_CONSTANT(INITIAL_ACTION_CLEAR); //start rendering and clear the framebuffer (supply params)
 	BIND_ENUM_CONSTANT(INITIAL_ACTION_CLEAR_REGION); //start rendering and clear the framebuffer (supply params)
+	BIND_ENUM_CONSTANT(INITIAL_ACTION_CLEAR_REGION_CONTINUE); //continue rendering and clear the framebuffer (supply params)
 	BIND_ENUM_CONSTANT(INITIAL_ACTION_KEEP); //start rendering); but keep attached color texture contents (depth will be cleared)
 	BIND_ENUM_CONSTANT(INITIAL_ACTION_DROP); //start rendering); ignore what is there); just write above it
 	BIND_ENUM_CONSTANT(INITIAL_ACTION_CONTINUE); //continue rendering (framebuffer must have been left in "continue" state as final action previously)

+ 8 - 2
servers/rendering/rendering_device.h

@@ -343,6 +343,7 @@ public:
 		BARRIER_MASK_RASTER = 1,
 		BARRIER_MASK_COMPUTE = 2,
 		BARRIER_MASK_TRANSFER = 4,
+		BARRIER_MASK_NO_BARRIER = 8,
 		BARRIER_MASK_ALL = BARRIER_MASK_RASTER | BARRIER_MASK_COMPUTE | BARRIER_MASK_TRANSFER
 	};
 
@@ -944,6 +945,7 @@ public:
 	enum InitialAction {
 		INITIAL_ACTION_CLEAR, //start rendering and clear the whole framebuffer (region or not) (supply params)
 		INITIAL_ACTION_CLEAR_REGION, //start rendering and clear the framebuffer in the specified region (supply params)
+		INITIAL_ACTION_CLEAR_REGION_CONTINUE, //continue rendering and clear the framebuffer in the specified region (supply params)
 		INITIAL_ACTION_KEEP, //start rendering, but keep attached color texture contents (depth will be cleared)
 		INITIAL_ACTION_DROP, //start rendering, ignore what is there, just write above it
 		INITIAL_ACTION_CONTINUE, //continue rendering (framebuffer must have been left in "continue" state as final action previously)
@@ -983,12 +985,12 @@ public:
 
 	typedef int64_t ComputeListID;
 
-	virtual ComputeListID compute_list_begin() = 0;
+	virtual ComputeListID compute_list_begin(bool p_allow_draw_overlap = false) = 0;
 	virtual void compute_list_bind_compute_pipeline(ComputeListID p_list, RID p_compute_pipeline) = 0;
 	virtual void compute_list_bind_uniform_set(ComputeListID p_list, RID p_uniform_set, uint32_t p_index) = 0;
 	virtual void compute_list_set_push_constant(ComputeListID p_list, const void *p_data, uint32_t p_data_size) = 0;
 	virtual void compute_list_dispatch(ComputeListID p_list, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) = 0;
-	virtual void compute_list_dispatch_threads(ComputeListID p_list, uint32_t p_x_threads, uint32_t p_y_threads, uint32_t p_z_threads, uint32_t p_x_local_group, uint32_t p_y_local_group, uint32_t p_z_local_group);
+	virtual void compute_list_dispatch_threads(ComputeListID p_list, uint32_t p_x_threads, uint32_t p_y_threads, uint32_t p_z_threads) = 0;
 	virtual void compute_list_dispatch_indirect(ComputeListID p_list, RID p_buffer, uint32_t p_offset) = 0;
 	virtual void compute_list_add_barrier(ComputeListID p_list) = 0;
 
@@ -1078,6 +1080,10 @@ public:
 	virtual void draw_command_insert_label(String p_label_name, const Color p_color = Color(1, 1, 1, 1)) = 0;
 	virtual void draw_command_end_label() = 0;
 
+	virtual String get_device_vendor_name() const = 0;
+	virtual String get_device_name() const = 0;
+	virtual String get_device_pipeline_cache_uuid() const = 0;
+
 	static RenderingDevice *get_singleton();
 	RenderingDevice();
 

部分文件因文件數量過多而無法顯示