// bc4.glsl — GPU BC4 (RGTC single-channel) block compressor compute shader.
  1. #[versions]
  2. unsigned = "";
  3. signed = "#define SNORM";
  4. #[compute]
  5. #version 450
  6. #VERSION_DEFINES
  7. shared vec2 g_minMaxValues[4u * 4u * 4u];
  8. shared uvec2 g_mask[4u * 4u];
  9. layout(binding = 0) uniform sampler2D srcTex;
  10. layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;
  11. layout(push_constant, std430) uniform Params {
  12. uint p_channelIdx;
  13. uint p_padding[3];
  14. }
  15. params;
  16. layout(local_size_x = 4, //
  17. local_size_y = 4, //
  18. local_size_z = 4) in;
  19. /// Each block is 16 pixels
  20. /// Each thread works on 4 pixels
  21. /// Therefore each block needs 4 threads, generating 8 masks
  22. /// At the end these 8 masks get merged into 2 and results written to output
  23. ///
  24. /// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?**
  25. ///
  26. /// A: It's a sweetspot.
  27. /// - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
  28. /// - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks)
  29. /// overhead, and also more LDS usage which reduces occupancy.
  30. /// - Long threads (e.g. 1 thread per block) misses parallelism opportunities
  31. void main() {
  32. float minVal, maxVal;
  33. vec4 srcPixel;
  34. const uint blockThreadId = gl_LocalInvocationID.x;
  35. const uvec2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u;
  36. for (uint i = 0u; i < 4u; ++i) {
  37. const uvec2 pixelsToLoad = pixelsToLoadBase + uvec2(i, blockThreadId);
  38. const vec4 value = texelFetch(srcTex, ivec2(pixelsToLoad), 0).xyzw;
  39. srcPixel[i] = params.p_channelIdx == 0 ? value.x : (params.p_channelIdx == 1 ? value.y : value.w);
  40. srcPixel[i] *= 255.0f;
  41. }
  42. minVal = min(srcPixel.x, min(srcPixel.y, srcPixel.z));
  43. maxVal = max(srcPixel.x, max(srcPixel.y, srcPixel.z));
  44. minVal = min(minVal, srcPixel.w);
  45. maxVal = max(maxVal, srcPixel.w);
  46. const uint minMaxIdxBase = (gl_LocalInvocationID.z << 4u) + (gl_LocalInvocationID.y << 2u);
  47. const uint maskIdxBase = (gl_LocalInvocationID.z << 2u) + gl_LocalInvocationID.y;
  48. g_minMaxValues[minMaxIdxBase + blockThreadId] = vec2(minVal, maxVal);
  49. g_mask[maskIdxBase] = uvec2(0u, 0u);
  50. memoryBarrierShared();
  51. barrier();
  52. // Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded
  53. for (uint i = 0u; i < 4u; ++i) {
  54. minVal = min(g_minMaxValues[minMaxIdxBase + i].x, minVal);
  55. maxVal = max(g_minMaxValues[minMaxIdxBase + i].y, maxVal);
  56. }
  57. // determine bias and emit color indices
  58. // given the choice of maxVal/minVal, these indices are optimal:
  59. // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
  60. float dist = maxVal - minVal;
  61. float dist4 = dist * 4.0f;
  62. float dist2 = dist * 2.0f;
  63. float bias = (dist < 8) ? (dist - 1) : (trunc(dist * 0.5f) + 2);
  64. bias -= minVal * 7;
  65. uint mask0 = 0u, mask1 = 0u;
  66. for (uint i = 0u; i < 4u; ++i) {
  67. float a = srcPixel[i] * 7.0f + bias;
  68. int ind = 0;
  69. // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
  70. if (a >= dist4) {
  71. ind = 4;
  72. a -= dist4;
  73. }
  74. if (a >= dist2) {
  75. ind += 2;
  76. a -= dist2;
  77. }
  78. if (a >= dist) {
  79. ind += 1;
  80. }
  81. // turn linear scale into DXT index (0/1 are extremal pts)
  82. ind = -ind & 7;
  83. ind ^= (2 > ind) ? 1 : 0;
  84. // write index
  85. const uint bits = 16u + ((blockThreadId << 2u) + i) * 3u;
  86. if (bits < 32u) {
  87. mask0 |= uint(ind) << bits;
  88. if (bits + 3u > 32u) {
  89. mask1 |= uint(ind) >> (32u - bits);
  90. }
  91. } else {
  92. mask1 |= uint(ind) << (bits - 32u);
  93. }
  94. }
  95. if (mask0 != 0u) {
  96. atomicOr(g_mask[maskIdxBase].x, mask0);
  97. }
  98. if (mask1 != 0u) {
  99. atomicOr(g_mask[maskIdxBase].y, mask1);
  100. }
  101. memoryBarrierShared();
  102. barrier();
  103. if (blockThreadId == 0u) {
  104. // Save data
  105. uvec2 outputBytes;
  106. #ifdef SNORM
  107. outputBytes.x =
  108. packSnorm4x8(vec4(maxVal * (1.0f / 255.0f) * 2.0f - 1.0f,
  109. minVal * (1.0f / 255.0f) * 2.0f - 1.0f, 0.0f, 0.0f));
  110. #else
  111. outputBytes.x = packUnorm4x8(
  112. vec4(maxVal * (1.0f / 255.0f), minVal * (1.0f / 255.0f), 0.0f, 0.0f));
  113. #endif
  114. outputBytes.x |= g_mask[maskIdxBase].x;
  115. outputBytes.y = g_mask[maskIdxBase].y;
  116. uvec2 dstUV = gl_GlobalInvocationID.yz;
  117. imageStore(dstTexture, ivec2(dstUV), uvec4(outputBytes.xy, 0u, 0u));
  118. }
  119. }