cluster_render.glsl

#[vertex]

#version 450

VERSION_DEFINES

layout(location = 0) in vec3 vertex_attrib;

layout(location = 0) out float depth_interp;
layout(location = 1) out flat uint element_index;

layout(push_constant, binding = 0, std430) uniform Params {
	uint base_index;
	uint pad0;
	uint pad1;
	uint pad2;
}
params;
layout(set = 0, binding = 1, std140) uniform State {
	mat4 projection;

	float inv_z_far;
	uint screen_to_clusters_shift; // shift to obtain coordinates in block indices
	uint cluster_screen_width; // number of clusters across the screen width
	uint cluster_data_size; // how much data a single cluster takes

	uint cluster_depth_offset;
	uint pad0;
	uint pad1;
	uint pad2;
}
state;
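
// Example with assumed values: screen_to_clusters_shift = 5 maps every 32x32
// pixel tile to one cluster, so a 1920x1080 target yields a 60x34 cluster grid
// and cluster_screen_width = 60. The actual values are filled in by the
// CPU-side cluster builder.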
struct RenderElement {
	uint type; //0-4
	bool touches_near;
	bool touches_far;
	uint original_index;
	mat3x4 transform_inv;
	vec3 scale;
	uint pad;
};

layout(set = 0, binding = 2, std430) buffer restrict readonly RenderElements {
	RenderElement data[];
}
render_elements;
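
// Each entry mirrors one element submitted by the CPU-side cluster builder
// (assumed): scale sizes the unit proxy mesh for this element, and
// transform_inv packs a 3x4 affine transform transposed into a mat3x4, which
// is why the vertex stage multiplies as vec4(vertex, 1.0) * transform_inv.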
void main() {
	element_index = params.base_index + gl_InstanceIndex; // one instance is drawn per render element

	vec3 vertex = vertex_attrib;
	vertex *= render_elements.data[element_index].scale;

	vertex = vec4(vertex, 1.0) * render_elements.data[element_index].transform_inv;
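	// Multiplying a vec4 on the left of a mat3x4 dots it with the three vec4
	// columns, producing a vec3: this applies the transposed 3x4 affine
	// transform and leaves the vertex position in view space.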
	depth_interp = -vertex.z; // view space looks down -Z, so this is the positive distance from the camera

	gl_Position = state.projection * vec4(vertex, 1.0);
}
#[fragment]

#version 450

VERSION_DEFINES

#if defined(GL_KHR_shader_subgroup_ballot) && defined(GL_KHR_shader_subgroup_arithmetic) && defined(GL_KHR_shader_subgroup_vote)
#extension GL_KHR_shader_subgroup_ballot : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_vote : enable
#define USE_SUBGROUPS
#endif

layout(location = 0) in float depth_interp;
layout(location = 1) in flat uint element_index;

layout(set = 0, binding = 1, std140) uniform State {
	mat4 projection;

	float inv_z_far;
	uint screen_to_clusters_shift; // shift to obtain coordinates in block indices
	uint cluster_screen_width; // number of clusters across the screen width
	uint cluster_data_size; // how much data a single cluster takes

	uint cluster_depth_offset;
	uint pad0;
	uint pad1;
	uint pad2;
}
state;
// Cluster data is laid out linearly; each cell contains the following information:
// - a list of bits marking every element as used, so (max_elem_count / 32) * 4 uints
// - a uint for each element marking the depth bits (0-31) used when rendering
layout(set = 0, binding = 3, std430) buffer restrict ClusterRender {
	uint data[];
}
cluster_render;
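
// Illustrative layout, assuming the builder allows up to 256 elements: the
// usage bitmask section at the start of a cell then spans 256 / 32 = 8 uints,
// and the per-element depth masks start cluster_depth_offset uints into the
// cell, one uint per element. The real sizes arrive from the CPU through
// cluster_data_size and cluster_depth_offset.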
void main() {
	//convert from screen to cluster
	uvec2 cluster = uvec2(gl_FragCoord.xy) >> state.screen_to_clusters_shift;

	//get the linear cluster offset from the screen position
	uint cluster_offset = cluster.x + state.cluster_screen_width * cluster.y;

	//multiply by the data size to position at the beginning of the element list for this cluster
	cluster_offset *= state.cluster_data_size;

	//find the current element in the list and set its bit to mark it as used
	uint usage_write_offset = cluster_offset + (element_index >> 5);
	uint usage_write_bit = 1 << (element_index & 0x1F);
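	// Example: element_index 70 lands in uint 70 >> 5 = 2 of the cluster's
	// usage list with bit 70 & 0x1F = 6, i.e. a mask of 1 << 6 = 0x40.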
#ifdef USE_SUBGROUPS
	uint cluster_thread_group_index;

	if (!gl_HelperInvocation) {
		//http://advances.realtimerendering.com/s2017/2017_Sig_Improved_Culling_final.pdf

		uvec4 mask;

		while (true) {
			// find the cluster offset of the first active thread
			// threads that did break; go inactive and no longer count
			uint first = subgroupBroadcastFirst(cluster_offset);
			// update the mask for threads that match this cluster
			mask = subgroupBallot(first == cluster_offset);
			if (first == cluster_offset) {
				// This thread belongs to the group of threads that match this offset,
				// so exit the loop.
				break;
			}
		}

		cluster_thread_group_index = subgroupBallotExclusiveBitCount(mask);
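		// The ballot loop above partitions the subgroup by cluster_offset; the
		// exclusive bit count of this thread's ballot mask is its index within
		// that partition, so only the partition's first thread (index 0) needs
		// to issue the atomic for the shared usage word.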
		if (cluster_thread_group_index == 0) {
			atomicOr(cluster_render.data[usage_write_offset], usage_write_bit);
		}
	}
#else
	if (!gl_HelperInvocation) {
		atomicOr(cluster_render.data[usage_write_offset], usage_write_bit);
	}
#endif
	//find the current element in the depth usage list and mark the current depth as used
	float unit_depth = depth_interp * state.inv_z_far;
	uint z_bit = clamp(uint(floor(unit_depth * 32.0)), 0, 31);

	uint z_write_offset = cluster_offset + state.cluster_depth_offset + element_index;
	uint z_write_bit = 1 << z_bit;
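	// Example with an assumed z_far of 100 units: a fragment 25 units away has
	// unit_depth = 0.25, so z_bit = floor(0.25 * 32.0) = 8 and bit 8 of the
	// element's 32-slice depth mask gets set.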
#ifdef USE_SUBGROUPS
	if (!gl_HelperInvocation) {
		z_write_bit = subgroupOr(z_write_bit); //merge all Zs
		if (cluster_thread_group_index == 0) {
			atomicOr(cluster_render.data[z_write_offset], z_write_bit);
		}
	}
#else
	if (!gl_HelperInvocation) {
		atomicOr(cluster_render.data[z_write_offset], z_write_bit);
	}
#endif
}
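
// How a consumer might read these bits back when culling (a sketch under the
// layout assumptions above, not part of this shader):
//
//   uint cluster_offset = (cluster.x + cluster_screen_width * cluster.y) * cluster_data_size;
//   bool element_used = (cluster_render.data[cluster_offset + (element_index >> 5)] &
//           (1 << (element_index & 0x1F))) != 0;
//   uint depth_mask = cluster_render.data[cluster_offset + cluster_depth_offset + element_index];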