cs_gdr_stream_compaction.sc 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. /*
  2. * Copyright 2018 Kostas Anagnostou. All rights reserved.
  3. * License: https://github.com/bkaradzic/bgfx/blob/master/LICENSE
  4. */
  5. #include "bgfx_compute.sh"
  // ---- Read-only inputs ----------------------------------------------------

  // Constant per-drawcall data, three uints per drawcall:
  // index count, index-buffer offset, vertex-buffer offset.
  BUFFER_RO(drawcallConstData, uint, 0);

  // Instance data for every instance, before culling.
  BUFFER_RO(instanceDataIn, vec4, 1);

  // Per-instance visibility flags produced by the culling pass.
  BUFFER_RO(instancePredicates, bool, 2);

  // ---- Read/write and output buffers ---------------------------------------

  // Number of instances per drawcall; read and then reset to zero by this pass.
  BUFFER_RW(drawcallInstanceCount, uint, 3);

  // Draw arguments that will drive drawIndirect.
  BUFFER_RW(drawcallData, uvec4, 4);

  // Instance data of the instances that survived culling.
  BUFFER_WO(instanceDataOut, vec4, 5);

  // .y is read as the scan length, .w as the number of drawcalls.
  uniform vec4 u_cullingConfig;

  // Scratch array for the prefix sum.
  // Based on Parallel Prefix Sum (Scan) with CUDA by Mark Harris.
  SHARED uint temp[2048];
  // Stream compaction of the instance buffer.
  //
  // Every thread handles two consecutive instances:
  //   1. store their visibility flags (0 or 1) in the scratch array,
  //   2. run an up-sweep / down-sweep scan over the flags so that each slot
  //      ends up holding the number of visible instances that precede it,
  //   3. copy every visible instance to the output slot given by the scan.
  // Thread 0 then writes the indirect draw arguments for every drawcall and
  // resets the per-drawcall instance counters.
  //
  // NOTE(review): the scratch array is indexed with the global invocation id,
  // so this presumably expects to be dispatched as a single workgroup - confirm
  // against the dispatch call.
  NUM_THREADS(1024, 1, 1)
  void main()
  {
      uint threadId = gl_GlobalInvocationID.x;
      int scanLength = int(u_cullingConfig.y);
      int drawcallCount = int(u_cullingConfig.w);

      // Seed the scratch array with one flag per instance.
      bool visible = instancePredicates[2 * threadId];
      temp[2 * threadId] = uint(visible ? 1 : 0);
      visible = instancePredicates[2 * threadId + 1];
      temp[2 * threadId + 1] = uint(visible ? 1 : 0);

      // Up-sweep: build partial sums in place, doubling the stride each level.
      int stride = 1;
      for (int active = scanLength >> 1; active > 0; active >>= 1)
      {
          barrier();

          if (threadId < active)
          {
              int hi = int(stride * (2 * threadId + 2) - 1);
              int lo = hi - stride;
              temp[hi] += temp[lo];
          }

          stride <<= 1;
      }

      // Zero the last element before the down-sweep.
      if (threadId == 0)
      {
          temp[scanLength - 1] = 0;
      }

      // Down-sweep: walk back down, halving the stride each level.
      for (int active = 1; active < scanLength; active *= 2)
      {
          stride >>= 1;
          barrier();

          if (threadId < active)
          {
              int hi = int(stride * (2 * threadId + 2) - 1);
              int lo = hi - stride;

              uint leftSum = temp[lo];
              temp[lo] = temp[hi];
              temp[hi] += leftSum;
          }
      }

      barrier();

      // Scatter: each visible instance (four vec4 rows) moves to its compacted slot.
      for (int slot = 0; slot < 2; slot++)
      {
          int src = int(2 * threadId + slot);

          if (instancePredicates[src])
          {
              uint dst = temp[src];

              for (int row = 0; row < 4; row++)
              {
                  instanceDataOut[4 * dst + row] = instanceDataIn[4 * src + row];
              }
          }
      }

      // A single thread fills in the indirect draw arguments; this could
      // possibly be done in a separate compute shader.
      if (threadId == 0)
      {
          uint firstInstance = 0;

          for (int call = 0; call < drawcallCount; call++)
          {
              int constBase = call * 3;
              uint instanceCount = drawcallInstanceCount[call];

              drawIndexedIndirect(
                  drawcallData,
                  call,
                  drawcallConstData[constBase],     // number of indices
                  instanceCount,                    // number of instances
                  drawcallConstData[constBase + 1], // offset into the index buffer
                  drawcallConstData[constBase + 2], // offset into the vertex buffer
                  firstInstance                     // offset into the instance buffer
              );

              firstInstance += instanceCount;

              // Reset the counter once its value has been consumed.
              drawcallInstanceCount[call] = 0;
          }
      }
  }