Sfoglia il codice sorgente

Merge pull request #70065 from clayjohn/GLES3-attribs

Use instanced array buffer instead of UBO for canvas item batching
Rémi Verschelde 2 anni fa
parent
commit
47ef0549ee

+ 72 - 106
drivers/gles3/rasterizer_canvas_gles3.cpp

@@ -568,9 +568,8 @@ void RasterizerCanvasGLES3::_render_items(RID p_to_render_target, int p_item_cou
 	_new_batch(batch_broken, index);
 
 	// Override the start position and index as we want to start from where we finished off last time.
-	state.canvas_instance_batches[state.current_batch_index].start = r_last_index * sizeof(InstanceData);
+	state.canvas_instance_batches[state.current_batch_index].start = r_last_index;
 	index = 0;
-	_align_instance_data_buffer(index);
 
 	for (int i = 0; i < p_item_count; i++) {
 		Item *ci = items[i];
@@ -630,14 +629,14 @@ void RasterizerCanvasGLES3::_render_items(RID p_to_render_target, int p_item_cou
 	}
 
 	// Copy over all data needed for rendering.
-	glBindBuffer(GL_UNIFORM_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].ubo);
+	glBindBuffer(GL_ARRAY_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].buffer);
 #ifdef WEB_ENABLED
-	glBufferSubData(GL_UNIFORM_BUFFER, r_last_index * sizeof(InstanceData), sizeof(InstanceData) * index, state.instance_data_array);
+	glBufferSubData(GL_ARRAY_BUFFER, r_last_index * sizeof(InstanceData), sizeof(InstanceData) * index, state.instance_data_array);
 #else
 	// On Desktop and mobile we map the memory without synchronizing for maximum speed.
-	void *ubo = glMapBufferRange(GL_UNIFORM_BUFFER, r_last_index * sizeof(InstanceData), index * sizeof(InstanceData), GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
-	memcpy(ubo, state.instance_data_array, index * sizeof(InstanceData));
-	glUnmapBuffer(GL_UNIFORM_BUFFER);
+	void *buffer = glMapBufferRange(GL_ARRAY_BUFFER, r_last_index * sizeof(InstanceData), index * sizeof(InstanceData), GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
+	memcpy(buffer, state.instance_data_array, index * sizeof(InstanceData));
+	glUnmapBuffer(GL_ARRAY_BUFFER);
 #endif
 
 	glDisable(GL_SCISSOR_TEST);
@@ -664,7 +663,17 @@ void RasterizerCanvasGLES3::_render_items(RID p_to_render_target, int p_item_cou
 		uint64_t specialization = 0;
 		specialization |= uint64_t(state.canvas_instance_batches[i].lights_disabled);
 		specialization |= uint64_t(!GLES3::Config::get_singleton()->float_texture_supported) << 1;
-		bool success = _bind_material(material_data, variant, specialization);
+		RID shader_version = data.canvas_shader_default_version;
+
+		if (material_data) {
+			if (material_data->shader_data->version.is_valid() && material_data->shader_data->valid) {
+				// Bind uniform buffer and textures
+				material_data->bind_uniforms();
+				shader_version = material_data->shader_data->version;
+			}
+		}
+
+		bool success = GLES3::MaterialStorage::get_singleton()->shaders.canvas_shader.version_bind_shader(shader_version, variant, specialization);
 		if (!success) {
 			continue;
 		}
@@ -1217,26 +1226,15 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
 
 	_bind_canvas_texture(state.canvas_instance_batches[p_index].tex, state.canvas_instance_batches[p_index].filter, state.canvas_instance_batches[p_index].repeat);
 
-	// Bind the region of the UBO used by this batch.
-	// If region exceeds the boundary of the UBO, just ignore.
-	uint32_t range_bytes = data.max_instances_per_batch * sizeof(InstanceData);
-	if (state.canvas_instance_batches[p_index].start >= (data.max_instances_per_ubo - 1) * sizeof(InstanceData)) {
-		return;
-	} else if (state.canvas_instance_batches[p_index].start >= (data.max_instances_per_ubo - data.max_instances_per_batch) * sizeof(InstanceData)) {
-		// If we have less than a full batch at the end, we can just draw it anyway.
-		// OpenGL will complain about the UBO being smaller than expected, but it should render fine.
-		range_bytes = (data.max_instances_per_ubo - 1) * sizeof(InstanceData) - state.canvas_instance_batches[p_index].start;
-	}
-
-	uint32_t range_start = state.canvas_instance_batches[p_index].start;
-	glBindBufferRange(GL_UNIFORM_BUFFER, INSTANCE_UNIFORM_LOCATION, state.canvas_instance_data_buffers[state.current_buffer].ubo, range_start, range_bytes);
-
 	switch (state.canvas_instance_batches[p_index].command_type) {
 		case Item::Command::TYPE_RECT:
 		case Item::Command::TYPE_NINEPATCH: {
 			glBindVertexArray(data.indexed_quad_array);
-			glDrawElements(GL_TRIANGLES, state.canvas_instance_batches[p_index].instance_count * 6, GL_UNSIGNED_INT, 0);
-			glBindBuffer(GL_UNIFORM_BUFFER, 0);
+			glBindBuffer(GL_ARRAY_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].buffer);
+			uint32_t range_start = state.canvas_instance_batches[p_index].start * sizeof(InstanceData);
+			_enable_attributes(range_start, false);
+
+			glDrawElementsInstanced(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0, state.canvas_instance_batches[p_index].instance_count);
 			glBindVertexArray(0);
 
 		} break;
@@ -1248,18 +1246,21 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
 			ERR_FAIL_COND(!pb);
 
 			glBindVertexArray(pb->vertex_array);
+			glBindBuffer(GL_ARRAY_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].buffer);
+
+			uint32_t range_start = state.canvas_instance_batches[p_index].start * sizeof(InstanceData);
+			_enable_attributes(range_start, false);
 
 			if (pb->color_disabled && pb->color != Color(1.0, 1.0, 1.0, 1.0)) {
 				glVertexAttrib4f(RS::ARRAY_COLOR, pb->color.r, pb->color.g, pb->color.b, pb->color.a);
 			}
 
 			if (pb->index_buffer != 0) {
-				glDrawElements(prim[polygon->primitive], pb->count, GL_UNSIGNED_INT, nullptr);
+				glDrawElementsInstanced(prim[polygon->primitive], pb->count, GL_UNSIGNED_INT, nullptr, 1);
 			} else {
-				glDrawArrays(prim[polygon->primitive], 0, pb->count);
+				glDrawArraysInstanced(prim[polygon->primitive], 0, pb->count, 1);
 			}
 			glBindVertexArray(0);
-			glBindBuffer(GL_UNIFORM_BUFFER, 0);
 
 			if (pb->color_disabled && pb->color != Color(1.0, 1.0, 1.0, 1.0)) {
 				// Reset so this doesn't pollute other draw calls.
@@ -1269,14 +1270,16 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
 
 		case Item::Command::TYPE_PRIMITIVE: {
 			glBindVertexArray(data.canvas_quad_array);
+			glBindBuffer(GL_ARRAY_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].buffer);
+			uint32_t range_start = state.canvas_instance_batches[p_index].start * sizeof(InstanceData);
+			_enable_attributes(range_start, true);
+
 			const GLenum primitive[5] = { GL_POINTS, GL_POINTS, GL_LINES, GL_TRIANGLES, GL_TRIANGLES };
 			int instance_count = state.canvas_instance_batches[p_index].instance_count;
-			if (instance_count > 1) {
+			ERR_FAIL_COND(instance_count <= 0);
+			if (instance_count >= 1) {
 				glDrawArraysInstanced(primitive[state.canvas_instance_batches[p_index].primitive_points], 0, state.canvas_instance_batches[p_index].primitive_points, instance_count);
-			} else {
-				glDrawArrays(primitive[state.canvas_instance_batches[p_index].primitive_points], 0, state.canvas_instance_batches[p_index].primitive_points);
 			}
-			glBindBuffer(GL_UNIFORM_BUFFER, 0);
 
 		} break;
 
@@ -1370,6 +1373,11 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
 				index_array_gl = mesh_storage->mesh_surface_get_index_buffer(surface, 0);
 				bool use_index_buffer = false;
 				glBindVertexArray(vertex_array_gl);
+				glBindBuffer(GL_ARRAY_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].buffer);
+
+				uint32_t range_start = state.canvas_instance_batches[p_index].start * sizeof(InstanceData);
+				_enable_attributes(range_start, false, instance_count);
+
 				if (index_array_gl != 0) {
 					glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_array_gl);
 					use_index_buffer = true;
@@ -1396,20 +1404,13 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
 				}
 
 				GLenum primitive_gl = prim[int(primitive)];
-				if (instance_count == 1) {
-					if (use_index_buffer) {
-						glDrawElements(primitive_gl, mesh_storage->mesh_surface_get_vertices_drawn_count(surface), mesh_storage->mesh_surface_get_index_type(surface), 0);
-					} else {
-						glDrawArrays(primitive_gl, 0, mesh_storage->mesh_surface_get_vertices_drawn_count(surface));
-					}
-				} else if (instance_count > 1) {
-					if (use_index_buffer) {
-						glDrawElementsInstanced(primitive_gl, mesh_storage->mesh_surface_get_vertices_drawn_count(surface), mesh_storage->mesh_surface_get_index_type(surface), 0, instance_count);
-					} else {
-						glDrawArraysInstanced(primitive_gl, 0, mesh_storage->mesh_surface_get_vertices_drawn_count(surface), instance_count);
-					}
-				}
 
+				if (use_index_buffer) {
+					glDrawElementsInstanced(primitive_gl, mesh_storage->mesh_surface_get_vertices_drawn_count(surface), mesh_storage->mesh_surface_get_index_type(surface), 0, instance_count);
+				} else {
+					glDrawArraysInstanced(primitive_gl, 0, mesh_storage->mesh_surface_get_vertices_drawn_count(surface), instance_count);
+				}
+				glBindBuffer(GL_ARRAY_BUFFER, 0);
 				glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
 				if (instance_count > 1) {
 					glDisableVertexAttribArray(5);
@@ -1429,7 +1430,7 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
 }
 
 void RasterizerCanvasGLES3::_add_to_batch(uint32_t &r_index, bool &r_batch_broken) {
-	if (r_index >= data.max_instances_per_ubo - 1) {
+	if (r_index >= data.max_instances_per_buffer - 1) {
 		ERR_PRINT_ONCE("Trying to draw too many items. Please increase maximum number of items in the project settings 'rendering/gl_compatibility/item_buffer_size'");
 		return;
 	}
@@ -1457,27 +1458,25 @@ void RasterizerCanvasGLES3::_new_batch(bool &r_batch_broken, uint32_t &r_index)
 	// Copy the properties of the current batch, we will manually update the things that changed.
 	Batch new_batch = state.canvas_instance_batches[state.current_batch_index];
 	new_batch.instance_count = 0;
-	new_batch.start = state.canvas_instance_batches[state.current_batch_index].start + state.canvas_instance_batches[state.current_batch_index].instance_count * sizeof(InstanceData);
+	new_batch.start = state.canvas_instance_batches[state.current_batch_index].start + state.canvas_instance_batches[state.current_batch_index].instance_count;
 
 	state.current_batch_index++;
 	state.canvas_instance_batches.push_back(new_batch);
-	_align_instance_data_buffer(r_index);
 }
 
-bool RasterizerCanvasGLES3::_bind_material(GLES3::CanvasMaterialData *p_material_data, CanvasShaderGLES3::ShaderVariant p_variant, uint64_t p_specialization) {
-	if (p_material_data) {
-		if (p_material_data->shader_data->version.is_valid() && p_material_data->shader_data->valid) {
-			// Bind uniform buffer and textures
-			p_material_data->bind_uniforms();
-			return GLES3::MaterialStorage::get_singleton()->shaders.canvas_shader.version_bind_shader(p_material_data->shader_data->version, p_variant, p_specialization);
-		} else {
-			return GLES3::MaterialStorage::get_singleton()->shaders.canvas_shader.version_bind_shader(data.canvas_shader_default_version, p_variant, p_specialization);
-		}
-	} else {
-		return GLES3::MaterialStorage::get_singleton()->shaders.canvas_shader.version_bind_shader(data.canvas_shader_default_version, p_variant, p_specialization);
+void RasterizerCanvasGLES3::_enable_attributes(uint32_t p_start, bool p_primitive, uint32_t p_rate) {
+	uint32_t split = p_primitive ? 11 : 12;
+	for (uint32_t i = 6; i < split; i++) {
+		glEnableVertexAttribArray(i);
+		glVertexAttribPointer(i, 4, GL_FLOAT, GL_FALSE, sizeof(InstanceData), CAST_INT_TO_UCHAR_PTR(p_start + (i - 6) * 4 * sizeof(float)));
+		glVertexAttribDivisor(i, p_rate);
+	}
+	for (uint32_t i = split; i <= 13; i++) {
+		glEnableVertexAttribArray(i);
+		glVertexAttribIPointer(i, 4, GL_UNSIGNED_INT, sizeof(InstanceData), CAST_INT_TO_UCHAR_PTR(p_start + (i - 6) * 4 * sizeof(float)));
+		glVertexAttribDivisor(i, p_rate);
 	}
 }
-
 RID RasterizerCanvasGLES3::light_create() {
 	CanvasLight canvas_light;
 	return canvas_light_owner.make_rid(canvas_light);
@@ -2416,8 +2415,8 @@ void RasterizerCanvasGLES3::_allocate_instance_data_buffer() {
 	GLuint new_buffers[3];
 	glGenBuffers(3, new_buffers);
 	// Batch UBO.
-	glBindBuffer(GL_UNIFORM_BUFFER, new_buffers[0]);
-	glBufferData(GL_UNIFORM_BUFFER, data.max_instance_buffer_size, nullptr, GL_STREAM_DRAW);
+	glBindBuffer(GL_ARRAY_BUFFER, new_buffers[0]);
+	glBufferData(GL_ARRAY_BUFFER, data.max_instance_buffer_size, nullptr, GL_STREAM_DRAW);
 	// Light uniform buffer.
 	glBindBuffer(GL_UNIFORM_BUFFER, new_buffers[1]);
 	glBufferData(GL_UNIFORM_BUFFER, sizeof(LightUniform) * data.max_lights_per_render, nullptr, GL_STREAM_DRAW);
@@ -2427,36 +2426,16 @@ void RasterizerCanvasGLES3::_allocate_instance_data_buffer() {
 
 	state.current_buffer = (state.current_buffer + 1);
 	DataBuffer db;
-	db.ubo = new_buffers[0];
+	db.buffer = new_buffers[0];
 	db.light_ubo = new_buffers[1];
 	db.state_ubo = new_buffers[2];
 	db.last_frame_used = RSG::rasterizer->get_frame_number();
 	state.canvas_instance_data_buffers.insert(state.current_buffer, db);
 	state.current_buffer = state.current_buffer % state.canvas_instance_data_buffers.size();
+	glBindBuffer(GL_ARRAY_BUFFER, 0);
 	glBindBuffer(GL_UNIFORM_BUFFER, 0);
 }
 
-// Batch start positions need to be aligned to the device's GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT.
-// This needs to be called anytime a new batch is created.
-void RasterizerCanvasGLES3::_align_instance_data_buffer(uint32_t &r_index) {
-	if (GLES3::Config::get_singleton()->uniform_buffer_offset_alignment > int(sizeof(InstanceData))) {
-		uint32_t offset = state.canvas_instance_batches[state.current_batch_index].start % GLES3::Config::get_singleton()->uniform_buffer_offset_alignment;
-		if (offset > 0) {
-			// uniform_buffer_offset_alignment can be 4, 16, 32, or 256. Our instance batches are 128 bytes.
-			// Accordingly, this branch is only triggered if we are 128 bytes off.
-			uint32_t offset_bytes = GLES3::Config::get_singleton()->uniform_buffer_offset_alignment - offset;
-			state.canvas_instance_batches[state.current_batch_index].start += offset_bytes;
-			// Offset the instance array so it stays in sync with batch start points.
-			// This creates gaps in the instance buffer with wasted space, but we can't help it.
-			r_index += offset_bytes / sizeof(InstanceData);
-			if (r_index > 0) {
-				// In this case we need to copy over the basic data.
-				state.instance_data_array[r_index] = state.instance_data_array[r_index - 1];
-			}
-		}
-	}
-}
-
 void RasterizerCanvasGLES3::set_time(double p_time) {
 	state.time = p_time;
 }
@@ -2601,12 +2580,12 @@ RasterizerCanvasGLES3::RasterizerCanvasGLES3() {
 		data.max_instances_per_batch = 128;
 	} else {
 		data.max_lights_per_render = 256;
-		data.max_instances_per_batch = 512;
+		data.max_instances_per_batch = 2048;
 	}
 
 	// Reserve 3 Uniform Buffers for instance data Frame N, N+1 and N+2
-	data.max_instances_per_ubo = MAX(data.max_instances_per_batch, uint32_t(GLOBAL_GET("rendering/gl_compatibility/item_buffer_size")));
-	data.max_instance_buffer_size = data.max_instances_per_ubo * sizeof(InstanceData); // 16,384 instances * 128 bytes = 2,097,152 bytes = 2,048 kb
+	data.max_instances_per_buffer = MAX(data.max_instances_per_batch, uint32_t(GLOBAL_GET("rendering/gl_compatibility/item_buffer_size")));
+	data.max_instance_buffer_size = data.max_instances_per_buffer * sizeof(InstanceData); // 16,384 instances * 128 bytes = 2,097,152 bytes = 2,048 kb
 	state.canvas_instance_data_buffers.resize(3);
 	state.canvas_instance_batches.reserve(200);
 
@@ -2614,8 +2593,8 @@ RasterizerCanvasGLES3::RasterizerCanvasGLES3() {
 		GLuint new_buffers[3];
 		glGenBuffers(3, new_buffers);
 		// Batch UBO.
-		glBindBuffer(GL_UNIFORM_BUFFER, new_buffers[0]);
-		glBufferData(GL_UNIFORM_BUFFER, data.max_instance_buffer_size, nullptr, GL_STREAM_DRAW);
+		glBindBuffer(GL_ARRAY_BUFFER, new_buffers[0]);
+		glBufferData(GL_ARRAY_BUFFER, data.max_instance_buffer_size, nullptr, GL_STREAM_DRAW);
 		// Light uniform buffer.
 		glBindBuffer(GL_UNIFORM_BUFFER, new_buffers[1]);
 		glBufferData(GL_UNIFORM_BUFFER, sizeof(LightUniform) * data.max_lights_per_render, nullptr, GL_STREAM_DRAW);
@@ -2623,41 +2602,28 @@ RasterizerCanvasGLES3::RasterizerCanvasGLES3() {
 		glBindBuffer(GL_UNIFORM_BUFFER, new_buffers[2]);
 		glBufferData(GL_UNIFORM_BUFFER, sizeof(StateBuffer), nullptr, GL_STREAM_DRAW);
 		DataBuffer db;
-		db.ubo = new_buffers[0];
+		db.buffer = new_buffers[0];
 		db.light_ubo = new_buffers[1];
 		db.state_ubo = new_buffers[2];
 		db.last_frame_used = 0;
 		db.fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
 		state.canvas_instance_data_buffers[i] = db;
 	}
+	glBindBuffer(GL_ARRAY_BUFFER, 0);
 	glBindBuffer(GL_UNIFORM_BUFFER, 0);
 
-	state.instance_data_array = memnew_arr(InstanceData, data.max_instances_per_ubo);
+	state.instance_data_array = memnew_arr(InstanceData, data.max_instances_per_buffer);
 	state.light_uniforms = memnew_arr(LightUniform, data.max_lights_per_render);
 
 	{
-		const uint32_t no_of_instances = data.max_instances_per_batch;
-
+		const uint32_t indices[6] = { 0, 2, 1, 3, 2, 0 };
 		glGenVertexArrays(1, &data.indexed_quad_array);
 		glBindVertexArray(data.indexed_quad_array);
 		glBindBuffer(GL_ARRAY_BUFFER, data.canvas_quad_vertices);
-
-		const uint32_t num_indices = 6;
-		const uint32_t quad_indices[num_indices] = { 0, 2, 1, 3, 2, 0 };
-
-		const uint32_t total_indices = no_of_instances * num_indices;
-		uint32_t *indices = new uint32_t[total_indices];
-		for (uint32_t i = 0; i < total_indices; i++) {
-			uint32_t quad = i / num_indices;
-			uint32_t quad_local = i % num_indices;
-			indices[i] = quad_indices[quad_local] + quad * num_indices;
-		}
-
 		glGenBuffers(1, &data.indexed_quad_buffer);
 		glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, data.indexed_quad_buffer);
-		glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(uint32_t) * total_indices, indices, GL_STATIC_DRAW);
+		glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(uint32_t) * 6, indices, GL_STATIC_DRAW);
 		glBindVertexArray(0);
-		delete[] indices;
 	}
 
 	String global_defines;

+ 3 - 2
drivers/gles3/rasterizer_canvas_gles3.h

@@ -245,7 +245,7 @@ public:
 		uint32_t max_lights_per_render = 256;
 		uint32_t max_lights_per_item = 16;
 		uint32_t max_instances_per_batch = 512;
-		uint32_t max_instances_per_ubo = 16384;
+		uint32_t max_instances_per_buffer = 16384;
 		uint32_t max_instance_buffer_size = 16384 * 128;
 	} data;
 
@@ -278,7 +278,7 @@ public:
 	// We track them and ensure that they don't get reused until at least 2 frames have passed
 	// to avoid the GPU stalling to wait for a resource to become available.
 	struct DataBuffer {
-		GLuint ubo = 0;
+		GLuint buffer = 0;
 		GLuint light_ubo = 0;
 		GLuint state_ubo = 0;
 		uint64_t last_frame_used = -3;
@@ -359,6 +359,7 @@ public:
 	void _add_to_batch(uint32_t &r_index, bool &r_batch_broken);
 	void _allocate_instance_data_buffer();
 	void _align_instance_data_buffer(uint32_t &r_index);
+	void _enable_attributes(uint32_t p_start, bool p_primitive, uint32_t p_rate = 1);
 
 	void set_time(double p_time);
 

+ 160 - 59
drivers/gles3/shaders/canvas.glsl

@@ -11,6 +11,7 @@ mode_instanced = #define USE_ATTRIBUTES \n#define USE_INSTANCING
 
 DISABLE_LIGHTING = false
 USE_RGBA_SHADOWS = false
+SINGLE_INSTANCE = false
 
 #[vertex]
 
@@ -25,9 +26,74 @@ layout(location = 1) in highp vec4 instance_xform0;
 layout(location = 2) in highp vec4 instance_xform1;
 layout(location = 5) in highp uvec4 instance_color_custom_data; // Color packed into xy, custom_data packed into zw for compatibility with 3D
 
+#endif // USE_INSTANCING
+
+#endif // USE_ATTRIBUTES
+
+#include "stdlib_inc.glsl"
+
+layout(location = 6) in highp vec4 attrib_A;
+layout(location = 7) in highp vec4 attrib_B;
+layout(location = 8) in highp vec4 attrib_C;
+layout(location = 9) in highp vec4 attrib_D;
+layout(location = 10) in highp vec4 attrib_E;
+#ifdef USE_PRIMITIVE
+layout(location = 11) in highp uvec4 attrib_F;
+#else
+layout(location = 11) in highp vec4 attrib_F;
+#endif
+layout(location = 12) in highp uvec4 attrib_G;
+layout(location = 13) in highp uvec4 attrib_H;
+
+#define read_draw_data_world_x attrib_A.xy
+#define read_draw_data_world_y attrib_A.zw
+#define read_draw_data_world_ofs attrib_B.xy
+#define read_draw_data_color_texture_pixel_size attrib_B.zw
+
+#ifdef USE_PRIMITIVE
+
+#define read_draw_data_point_a attrib_C.xy
+#define read_draw_data_point_b attrib_C.zw
+#define read_draw_data_point_c attrib_D.xy
+#define read_draw_data_uv_a attrib_D.zw
+#define read_draw_data_uv_b attrib_E.xy
+#define read_draw_data_uv_c attrib_E.zw
+
+#define read_draw_data_color_a_rg attrib_F.x
+#define read_draw_data_color_a_ba attrib_F.y
+#define read_draw_data_color_b_rg attrib_F.z
+#define read_draw_data_color_b_ba attrib_F.w
+#define read_draw_data_color_c_rg attrib_G.x
+#define read_draw_data_color_c_ba attrib_G.y
+
+#else
+
+#define read_draw_data_modulation attrib_C
+#define read_draw_data_ninepatch_margins attrib_D
+#define read_draw_data_dst_rect attrib_E
+#define read_draw_data_src_rect attrib_F
+
 #endif
 
+#define read_draw_data_flags attrib_G.z
+#define read_draw_data_specular_shininess attrib_G.w
+#define read_draw_data_lights attrib_H
+
+// Varyings so the per-instance info can be used in the fragment shader
+flat out vec4 varying_A;
+flat out vec2 varying_B;
+#ifndef USE_PRIMITIVE
+flat out vec4 varying_C;
+#ifndef USE_ATTRIBUTES
+#ifdef USE_NINEPATCH
+
+flat out vec2 varying_D;
+#endif
+flat out vec4 varying_E;
+#endif
 #endif
+flat out uvec2 varying_F;
+flat out uvec4 varying_G;
 
 // This needs to be outside clang-format so the ubo comment is in the right place
 #ifdef MATERIAL_UNIFORMS_USED
@@ -39,12 +105,10 @@ layout(std140) uniform MaterialUniforms{ //ubo:4
 #endif
 /* clang-format on */
 #include "canvas_uniforms_inc.glsl"
-#include "stdlib_inc.glsl"
 
 out vec2 uv_interp;
 out vec4 color_interp;
 out vec2 vertex_interp;
-flat out int draw_data_instance;
 
 #ifdef USE_NINEPATCH
 
@@ -55,35 +119,46 @@ out vec2 pixel_size_interp;
 #GLOBALS
 
 void main() {
+	varying_A = vec4(read_draw_data_world_x, read_draw_data_world_y);
+	varying_B = read_draw_data_color_texture_pixel_size;
+#ifndef USE_PRIMITIVE
+	varying_C = read_draw_data_ninepatch_margins;
+
+#ifndef USE_ATTRIBUTES
+#ifdef USE_NINEPATCH
+	varying_D = vec2(read_draw_data_dst_rect.z, read_draw_data_dst_rect.w);
+#endif // USE_NINEPATCH
+	varying_E = read_draw_data_src_rect;
+#endif // !USE_ATTRIBUTES
+#endif // USE_PRIMITIVE
+
+	varying_F = uvec2(read_draw_data_flags, read_draw_data_specular_shininess);
+	varying_G = read_draw_data_lights;
+
 	vec4 instance_custom = vec4(0.0);
 
 #ifdef USE_PRIMITIVE
-	draw_data_instance = gl_InstanceID;
 	vec2 vertex;
 	vec2 uv;
 	vec4 color;
 
 	if (gl_VertexID % 3 == 0) {
-		vertex = draw_data[draw_data_instance].point_a;
-		uv = draw_data[draw_data_instance].uv_a;
-		color = vec4(unpackHalf2x16(draw_data[draw_data_instance].color_a_rg), unpackHalf2x16(draw_data[draw_data_instance].color_a_ba));
+		vertex = read_draw_data_point_a;
+		uv = read_draw_data_uv_a;
+		color = vec4(unpackHalf2x16(read_draw_data_color_a_rg), unpackHalf2x16(read_draw_data_color_a_ba));
 	} else if (gl_VertexID % 3 == 1) {
-		vertex = draw_data[draw_data_instance].point_b;
-		uv = draw_data[draw_data_instance].uv_b;
-		color = vec4(unpackHalf2x16(draw_data[draw_data_instance].color_b_rg), unpackHalf2x16(draw_data[draw_data_instance].color_b_ba));
+		vertex = read_draw_data_point_b;
+		uv = read_draw_data_uv_b;
+		color = vec4(unpackHalf2x16(read_draw_data_color_b_rg), unpackHalf2x16(read_draw_data_color_b_ba));
 	} else {
-		vertex = draw_data[draw_data_instance].point_c;
-		uv = draw_data[draw_data_instance].uv_c;
-		color = vec4(unpackHalf2x16(draw_data[draw_data_instance].color_c_rg), unpackHalf2x16(draw_data[draw_data_instance].color_c_ba));
+		vertex = read_draw_data_point_c;
+		uv = read_draw_data_uv_c;
+		color = vec4(unpackHalf2x16(read_draw_data_color_c_rg), unpackHalf2x16(read_draw_data_color_c_ba));
 	}
 
 #elif defined(USE_ATTRIBUTES)
-	draw_data_instance = gl_InstanceID;
-#ifdef USE_INSTANCING
-	draw_data_instance = 0;
-#endif
 	vec2 vertex = vertex_attrib;
-	vec4 color = color_attrib * draw_data[draw_data_instance].modulation;
+	vec4 color = color_attrib * read_draw_data_modulation;
 	vec2 uv = uv_attrib;
 
 #ifdef USE_INSTANCING
@@ -93,30 +168,29 @@ void main() {
 #endif
 
 #else
-	draw_data_instance = gl_VertexID / 6;
 	vec2 vertex_base_arr[6] = vec2[](vec2(0.0, 0.0), vec2(0.0, 1.0), vec2(1.0, 1.0), vec2(1.0, 0.0), vec2(0.0, 0.0), vec2(1.0, 1.0));
 	vec2 vertex_base = vertex_base_arr[gl_VertexID % 6];
 
-	vec2 uv = draw_data[draw_data_instance].src_rect.xy + abs(draw_data[draw_data_instance].src_rect.zw) * ((draw_data[draw_data_instance].flags & FLAGS_TRANSPOSE_RECT) != uint(0) ? vertex_base.yx : vertex_base.xy);
-	vec4 color = draw_data[draw_data_instance].modulation;
-	vec2 vertex = draw_data[draw_data_instance].dst_rect.xy + abs(draw_data[draw_data_instance].dst_rect.zw) * mix(vertex_base, vec2(1.0, 1.0) - vertex_base, lessThan(draw_data[draw_data_instance].src_rect.zw, vec2(0.0, 0.0)));
+	vec2 uv = read_draw_data_src_rect.xy + abs(read_draw_data_src_rect.zw) * ((read_draw_data_flags & FLAGS_TRANSPOSE_RECT) != uint(0) ? vertex_base.yx : vertex_base.xy);
+	vec4 color = read_draw_data_modulation;
+	vec2 vertex = read_draw_data_dst_rect.xy + abs(read_draw_data_dst_rect.zw) * mix(vertex_base, vec2(1.0, 1.0) - vertex_base, lessThan(read_draw_data_src_rect.zw, vec2(0.0, 0.0)));
 
 #endif
 
-	mat4 model_matrix = mat4(vec4(draw_data[draw_data_instance].world_x, 0.0, 0.0), vec4(draw_data[draw_data_instance].world_y, 0.0, 0.0), vec4(0.0, 0.0, 1.0, 0.0), vec4(draw_data[draw_data_instance].world_ofs, 0.0, 1.0));
+	mat4 model_matrix = mat4(vec4(read_draw_data_world_x, 0.0, 0.0), vec4(read_draw_data_world_y, 0.0, 0.0), vec4(0.0, 0.0, 1.0, 0.0), vec4(read_draw_data_world_ofs, 0.0, 1.0));
 
 #ifdef USE_INSTANCING
 	model_matrix = model_matrix * transpose(mat4(instance_xform0, instance_xform1, vec4(0.0, 0.0, 1.0, 0.0), vec4(0.0, 0.0, 0.0, 1.0)));
 #endif // USE_INSTANCING
 
 #if !defined(USE_ATTRIBUTES) && !defined(USE_PRIMITIVE)
-	if (bool(draw_data[draw_data_instance].flags & FLAGS_USING_PARTICLES)) {
+	if (bool(read_draw_data_flags & FLAGS_USING_PARTICLES)) {
 		//scale by texture size
-		vertex /= draw_data[draw_data_instance].color_texture_pixel_size;
+		vertex /= read_draw_data_color_texture_pixel_size;
 	}
 #endif
 
-	vec2 color_texture_pixel_size = draw_data[draw_data_instance].color_texture_pixel_size.xy;
+	vec2 color_texture_pixel_size = read_draw_data_color_texture_pixel_size;
 
 #ifdef USE_POINT_SIZE
 	float point_size = 1.0;
@@ -126,7 +200,7 @@ void main() {
 	}
 
 #ifdef USE_NINEPATCH
-	pixel_size_interp = abs(draw_data[draw_data_instance].dst_rect.zw) * vertex_base;
+	pixel_size_interp = abs(read_draw_data_dst_rect.zw) * vertex_base;
 #endif
 
 #if !defined(SKIP_TRANSFORM_USED)
@@ -159,6 +233,46 @@ void main() {
 #include "canvas_uniforms_inc.glsl"
 #include "stdlib_inc.glsl"
 
+in vec2 uv_interp;
+in vec2 vertex_interp;
+in vec4 color_interp;
+
+#ifdef USE_NINEPATCH
+
+in vec2 pixel_size_interp;
+
+#endif
+
+// Can all be flat as they are the same for the whole batched instance
+flat in vec4 varying_A;
+flat in vec2 varying_B;
+#define read_draw_data_world_x varying_A.xy
+#define read_draw_data_world_y varying_A.zw
+#define read_draw_data_color_texture_pixel_size varying_B
+
+#ifndef USE_PRIMITIVE
+flat in vec4 varying_C;
+#define read_draw_data_ninepatch_margins varying_C
+
+#ifndef USE_ATTRIBUTES
+#ifdef USE_NINEPATCH
+
+flat in vec2 varying_D;
+#define read_draw_data_dst_rect_z varying_D.x
+#define read_draw_data_dst_rect_w varying_D.y
+#endif
+
+flat in vec4 varying_E;
+#define read_draw_data_src_rect varying_E
+#endif // USE_ATTRIBUTES
+#endif // USE_PRIMITIVE
+
+flat in uvec2 varying_F;
+flat in uvec4 varying_G;
+#define read_draw_data_flags varying_F.x
+#define read_draw_data_specular_shininess varying_F.y
+#define read_draw_data_lights varying_G
+
 #ifndef DISABLE_LIGHTING
 uniform sampler2D atlas_texture; //texunit:-2
 uniform sampler2D shadow_atlas_texture; //texunit:-3
@@ -170,17 +284,6 @@ uniform sampler2D specular_texture; //texunit:-7
 
 uniform sampler2D color_texture; //texunit:0
 
-in vec2 uv_interp;
-in vec4 color_interp;
-in vec2 vertex_interp;
-flat in int draw_data_instance;
-
-#ifdef USE_NINEPATCH
-
-in vec2 pixel_size_interp;
-
-#endif
-
 layout(location = 0) out vec4 frag_color;
 
 #ifdef MATERIAL_UNIFORMS_USED
@@ -366,7 +469,7 @@ float map_ninepatch_axis(float pixel, float draw_size, float tex_pixel_size, flo
 	} else if (pixel >= draw_size - margin_end) {
 		return (tex_size - (draw_size - pixel)) * tex_pixel_size;
 	} else {
-		if (!bool(draw_data[draw_data_instance].flags & FLAGS_NINEPACH_DRAW_CENTER)) {
+		if (!bool(read_draw_data_flags & FLAGS_NINEPACH_DRAW_CENTER)) {
 			draw_center--;
 		}
 
@@ -414,28 +517,26 @@ void main() {
 
 	int draw_center = 2;
 	uv = vec2(
-			map_ninepatch_axis(pixel_size_interp.x, abs(draw_data[draw_data_instance].dst_rect.z), draw_data[draw_data_instance].color_texture_pixel_size.x, draw_data[draw_data_instance].ninepatch_margins.x, draw_data[draw_data_instance].ninepatch_margins.z, int(draw_data[draw_data_instance].flags >> FLAGS_NINEPATCH_H_MODE_SHIFT) & 0x3, draw_center),
-			map_ninepatch_axis(pixel_size_interp.y, abs(draw_data[draw_data_instance].dst_rect.w), draw_data[draw_data_instance].color_texture_pixel_size.y, draw_data[draw_data_instance].ninepatch_margins.y, draw_data[draw_data_instance].ninepatch_margins.w, int(draw_data[draw_data_instance].flags >> FLAGS_NINEPATCH_V_MODE_SHIFT) & 0x3, draw_center));
+			map_ninepatch_axis(pixel_size_interp.x, abs(read_draw_data_dst_rect_z), read_draw_data_color_texture_pixel_size.x, read_draw_data_ninepatch_margins.x, read_draw_data_ninepatch_margins.z, int(read_draw_data_flags >> FLAGS_NINEPATCH_H_MODE_SHIFT) & 0x3, draw_center),
+			map_ninepatch_axis(pixel_size_interp.y, abs(read_draw_data_dst_rect_w), read_draw_data_color_texture_pixel_size.y, read_draw_data_ninepatch_margins.y, read_draw_data_ninepatch_margins.w, int(read_draw_data_flags >> FLAGS_NINEPATCH_V_MODE_SHIFT) & 0x3, draw_center));
 
 	if (draw_center == 0) {
 		color.a = 0.0;
 	}
 
-	uv = uv * draw_data[draw_data_instance].src_rect.zw + draw_data[draw_data_instance].src_rect.xy; //apply region if needed
+	uv = uv * read_draw_data_src_rect.zw + read_draw_data_src_rect.xy; //apply region if needed
 
 #endif
-	if (bool(draw_data[draw_data_instance].flags & FLAGS_CLIP_RECT_UV)) {
-		uv = clamp(uv, draw_data[draw_data_instance].src_rect.xy, draw_data[draw_data_instance].src_rect.xy + abs(draw_data[draw_data_instance].src_rect.zw));
+	if (bool(read_draw_data_flags & FLAGS_CLIP_RECT_UV)) {
+		uv = clamp(uv, read_draw_data_src_rect.xy, read_draw_data_src_rect.xy + abs(read_draw_data_src_rect.zw));
 	}
 
 #endif
 
 #ifndef USE_PRIMITIVE
-	if (bool(draw_data[draw_data_instance].flags & FLAGS_USE_MSDF)) {
-		float px_range = draw_data[draw_data_instance].ninepatch_margins.x;
-		float outline_thickness = draw_data[draw_data_instance].ninepatch_margins.y;
-		//float reserved1 = draw_data[draw_data_instance].ninepatch_margins.z;
-		//float reserved2 = draw_data[draw_data_instance].ninepatch_margins.w;
+	if (bool(read_draw_data_flags & FLAGS_USE_MSDF)) {
+		float px_range = read_draw_data_ninepatch_margins.x;
+		float outline_thickness = read_draw_data_ninepatch_margins.y;
 
 		vec4 msdf_sample = texture(color_texture, uv);
 		vec2 msdf_size = vec2(textureSize(color_texture, 0));
@@ -451,7 +552,7 @@ void main() {
 			float a = clamp(d * px_size + 0.5, 0.0, 1.0);
 			color.a = a * color.a;
 		}
-	} else if (bool(draw_data[draw_data_instance].flags & FLAGS_USE_LCD)) {
+	} else if (bool(read_draw_data_flags & FLAGS_USE_LCD)) {
 		vec4 lcd_sample = texture(color_texture, uv);
 		if (lcd_sample.a == 1.0) {
 			color.rgb = lcd_sample.rgb * color.a;
@@ -465,7 +566,7 @@ void main() {
 		color *= texture(color_texture, uv);
 	}
 
-	uint light_count = (draw_data[draw_data_instance].flags >> uint(FLAGS_LIGHT_COUNT_SHIFT)) & uint(0xF); //max 16 lights
+	uint light_count = (read_draw_data_flags >> uint(FLAGS_LIGHT_COUNT_SHIFT)) & uint(0xF); //max 16 lights
 	bool using_light = light_count > 0u || directional_light_count > 0u;
 
 	vec3 normal;
@@ -476,7 +577,7 @@ void main() {
 	bool normal_used = false;
 #endif
 
-	if (normal_used || (using_light && bool(draw_data[draw_data_instance].flags & FLAGS_DEFAULT_NORMAL_MAP_USED))) {
+	if (normal_used || (using_light && bool(read_draw_data_flags & FLAGS_DEFAULT_NORMAL_MAP_USED))) {
 		normal.xy = texture(normal_texture, uv).xy * vec2(2.0, -2.0) - vec2(1.0, -1.0);
 		normal.z = sqrt(1.0 - dot(normal.xy, normal.xy));
 		normal_used = true;
@@ -493,9 +594,9 @@ void main() {
 	bool specular_shininess_used = false;
 #endif
 
-	if (specular_shininess_used || (using_light && normal_used && bool(draw_data[draw_data_instance].flags & FLAGS_DEFAULT_SPECULAR_MAP_USED))) {
+	if (specular_shininess_used || (using_light && normal_used && bool(read_draw_data_flags & FLAGS_DEFAULT_SPECULAR_MAP_USED))) {
 		specular_shininess = texture(specular_texture, uv);
-		specular_shininess *= godot_unpackUnorm4x8(draw_data[draw_data_instance].specular_shininess);
+		specular_shininess *= godot_unpackUnorm4x8(read_draw_data_specular_shininess);
 		specular_shininess_used = true;
 	} else {
 		specular_shininess = vec4(1.0);
@@ -507,7 +608,7 @@ void main() {
 	vec2 screen_uv = vec2(0.0);
 #endif
 
-	vec2 color_texture_pixel_size = draw_data[draw_data_instance].color_texture_pixel_size.xy;
+	vec2 color_texture_pixel_size = read_draw_data_color_texture_pixel_size.xy;
 
 	vec3 light_vertex = vec3(vertex, 0.0);
 	vec2 shadow_vertex = vertex;
@@ -529,7 +630,7 @@ void main() {
 
 	if (normal_used) {
 		//convert by item transform
-		normal.xy = mat2(normalize(draw_data[draw_data_instance].world_x), normalize(draw_data[draw_data_instance].world_y)) * normal.xy;
+		normal.xy = mat2(normalize(read_draw_data_world_x), normalize(read_draw_data_world_y)) * normal.xy;
 		//convert by canvas transform
 		normal = normalize((canvas_normal_transform * vec4(normal, 0.0)).xyz);
 	}
@@ -591,15 +692,15 @@ void main() {
 		uint light_base;
 		if (i < 8u) {
 			if (i < 4u) {
-				light_base = draw_data[draw_data_instance].lights[0];
+				light_base = read_draw_data_lights[0];
 			} else {
-				light_base = draw_data[draw_data_instance].lights[1];
+				light_base = read_draw_data_lights[1];
 			}
 		} else {
 			if (i < 12u) {
-				light_base = draw_data[draw_data_instance].lights[2];
+				light_base = read_draw_data_lights[2];
 			} else {
-				light_base = draw_data[draw_data_instance].lights[3];
+				light_base = read_draw_data_lights[3];
 			}
 		}
 		light_base >>= (i & 3u) * 8u;

+ 0 - 36
drivers/gles3/shaders/canvas_uniforms_inc.glsl

@@ -27,38 +27,6 @@
 #define FLAGS_USE_MSDF uint(1 << 28)
 #define FLAGS_USE_LCD uint(1 << 29)
 
-// must be always 128 bytes long
-struct DrawData {
-	vec2 world_x;
-	vec2 world_y;
-	vec2 world_ofs;
-	vec2 color_texture_pixel_size;
-#ifdef USE_PRIMITIVE
-	vec2 point_a;
-	vec2 point_b;
-	vec2 point_c;
-	vec2 uv_a;
-	vec2 uv_b;
-	vec2 uv_c;
-	uint color_a_rg;
-	uint color_a_ba;
-	uint color_b_rg;
-	uint color_b_ba;
-	uint color_c_rg;
-	uint color_c_ba;
-#else
-	vec4 modulation;
-	vec4 ninepatch_margins;
-	vec4 dst_rect; //for built-in rect and UV
-	vec4 src_rect;
-	uint pad;
-	uint pad2;
-#endif
-	uint flags;
-	uint specular_shininess;
-	uvec4 lights;
-};
-
 layout(std140) uniform GlobalShaderUniformData { //ubo:1
 	vec4 global_shader_uniforms[MAX_GLOBAL_SHADER_UNIFORMS];
 };
@@ -116,7 +84,3 @@ layout(std140) uniform LightData { //ubo:2
 	Light light_array[MAX_LIGHTS];
 };
 #endif // DISABLE_LIGHTING
-layout(std140) uniform DrawDataInstances { //ubo:3
-
-	DrawData draw_data[MAX_DRAW_DATA_INSTANCES];
-};

+ 0 - 2
drivers/gles3/storage/config.cpp

@@ -90,8 +90,6 @@ Config::Config() {
 	glGetIntegerv(GL_MAX_UNIFORM_BLOCK_SIZE, &max_uniform_buffer_size);
 	glGetIntegerv(GL_MAX_VIEWPORT_DIMS, max_viewport_size);
 
-	glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_offset_alignment);
-
 	support_anisotropic_filter = extensions.has("GL_EXT_texture_filter_anisotropic");
 	if (support_anisotropic_filter) {
 		glGetFloatv(_GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &anisotropic_level);

+ 0 - 2
drivers/gles3/storage/config.h

@@ -67,8 +67,6 @@ public:
 	int max_renderable_lights = 0;
 	int max_lights_per_object = 0;
 
-	int uniform_buffer_offset_alignment = 0;
-
 	// TODO implement wireframe in OpenGL
 	// bool generate_wireframes;