// PixelPacking.hlsli
  1. //
  2. // Copyright (c) Microsoft. All rights reserved.
  3. // This code is licensed under the MIT License (MIT).
  4. // THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
  5. // ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
  6. // IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
  7. // PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
  8. //
  9. // Developed by Minigraph
  10. //
  11. // Author: James Stanard
  12. //
  13. // RGBM is a good way to pack HDR values into R8G8B8A8_UNORM
  14. uint PackRGBM( float3 rgb, float PeakValue = 16.0 )
  15. {
  16. rgb = saturate(rgb / PeakValue);
  17. float maxVal = max(max(1e-6, rgb.x), max(rgb.y, rgb.z));
  18. maxVal = ceil(maxVal * 255.0);
  19. float divisor = (255 * 255.0) / maxVal;
  20. #if _XBOX_ONE
  21. uint RGBM = (uint)maxVal;
  22. RGBM = __XB_PackF32ToU8(rgb.r * divisor + 0.5, 3, RGBM);
  23. RGBM = __XB_PackF32ToU8(rgb.g * divisor + 0.5, 2, RGBM);
  24. RGBM = __XB_PackF32ToU8(rgb.b * divisor + 0.5, 1, RGBM);
  25. return RGBM;
  26. #else
  27. uint M = (uint)maxVal;
  28. uint R = (uint)(rgb.r * divisor + 0.5);
  29. uint G = (uint)(rgb.g * divisor + 0.5);
  30. uint B = (uint)(rgb.b * divisor + 0.5);
  31. return R << 24 | G << 16 | B << 8 | M;
  32. #endif
  33. }
  34. float3 UnpackRGBM( uint p, float PeakValue = 16.0 )
  35. {
  36. #if _XBOX_ONE
  37. float R = __XB_UnpackByte3(p);
  38. float G = __XB_UnpackByte2(p);
  39. float B = __XB_UnpackByte1(p);
  40. float M = __XB_UnpackByte0(p);
  41. #else
  42. uint R = p >> 24;
  43. uint G = (p >> 16) & 0xFF;
  44. uint B = (p >> 8) & 0xFF;
  45. uint M = p & 0xFF;
  46. #endif
  47. return float3(R, G, B) * M * PeakValue / (255.0 * 255.0);
  48. }
  49. // RGBE packs 9 bits per color channel while encoding the multiplier as a perfect power of 2 (just the exponent)
  50. // What's nice about this is that it gives you a lot more range than RGBM. This isn't proven to be bitwise
// compatible with DXGI_FORMAT_R9G9B9E5_SHAREDEXP, but if it's not, it could be made so.
  52. uint PackRGBE(float3 rgb)
  53. {
  54. float MaxChannel = max(rgb.r, max(rgb.g, rgb.b));
  55. // NextPow2 has to have the biggest exponent plus 1 (and nothing in the mantissa)
  56. float NextPow2 = asfloat((asuint(MaxChannel) + 0x800000) & 0x7F800000);
  57. // By adding NextPow2, all channels have the same exponent, shifting their mantissa bits
  58. // to the right to accomodate it. This also shifts in the implicit '1' bit of all channels.
  59. // The largest channel will always have the high bit set.
  60. rgb += NextPow2;
  61. #if _XBOX_ONE
  62. uint R = __XB_UBFE(9, 14, asuint(rgb.r));
  63. uint G = __XB_UBFE(9, 14, asuint(rgb.g));
  64. uint B = __XB_UBFE(9, 14, asuint(rgb.b));
  65. #else
  66. uint R = (asuint(rgb.r) << 9) >> 23;
  67. uint G = (asuint(rgb.g) << 9) >> 23;
  68. uint B = (asuint(rgb.b) << 9) >> 23;
  69. #endif
  70. uint E = f32tof16(NextPow2) << 17;
  71. return R | G << 9 | B << 18 | E;
  72. }
  73. float3 UnpackRGBE(uint p)
  74. {
  75. #if _XBOX_ONE
  76. float Pow2 = f16tof32(__XB_UBFE(5, 27, p) << 10);
  77. float R = asfloat(asuint(Pow2) | __XB_UBFE(9, 0, p) << 14);
  78. float G = asfloat(asuint(Pow2) | __XB_UBFE(9, 9, p) << 14);
  79. float B = asfloat(asuint(Pow2) | __XB_UBFE(9, 18, p) << 14);
  80. #else
  81. float Pow2 = f16tof32((p >> 27) << 10);
  82. float R = asfloat(asuint(Pow2) | (p << 14) & 0x7FC000);
  83. float G = asfloat(asuint(Pow2) | (p << 5) & 0x7FC000);
  84. float B = asfloat(asuint(Pow2) | (p >> 4) & 0x7FC000);
  85. #endif
  86. return float3(R, G, B) - Pow2;
  87. }
  88. // Like LogLuv "minus the log". The intention is to store Y in 16-bit float, and UV as 8-bit
  89. // unorm values. This is corrected from the widely publicized LogLuv encoding which used
  90. // the wrong color primaries for sRGB and Rec.709. Note the correct coefficients for computing
  91. // luminance ("Y") in the 2nd row of RGBtoXYZ.
  92. float3 EncodeYUV(float3 RGB)
  93. {
  94. /*
  95. // Start with the right RGBtoXYZ matrix for your color space (this one is sRGB D65)
  96. // http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
  97. static const float3x3 RGBtoXYZ =
  98. {
  99. 0.4124564, 0.3575761, 0.1804375,
  100. 0.2126729, 0.7151522, 0.0721750, <-- The color primaries determining luminance
  101. 0.0193339, 0.1191920, 0.9503041
  102. };
  103. // Compute u' and v'. These pack chrominance into two normalized channels.
  104. // u' = 4X / (X + 15Y + 3Z)
  105. // v' = 9Y / (X + 15Y + 3Z)
  106. // Expand visible spectrum from (0, 0.62) to (0, 1)
  107. // u" = u' / 0.62
  108. // v" = v' / 0.62
  109. // If we compute these two values...
  110. // X' = 4 / 9 * X
  111. // XYZ' = (X + 15 * Y + 3 * Z) * 0.62 / 9
  112. // ...we can derive our final Yu"v" from X', Y, and XYZ'
  113. // u" = X' / XYZ'
  114. // v" = Y / XYZ'
  115. // We can compute (X', Y, XYZ') by multiplying XYZ by this matrix
  116. static const float3x3 FixupMatrix =
  117. {
  118. 4.0 / 9.0, 0.0, 0.0,
  119. 0.0, 1.0, 0.0,
  120. 0.62 / 9.0, 15.0 * 0.62 / 9.0, 3.0 * 0.62 / 9.0
  121. };
  122. // But we should just concatenate the two matrices
  123. static const float3x3 EncodeMatrix = mul(FixupMatrix, RGBtoXYZ);
  124. */
  125. static const float3x3 EncodeMatrix =
  126. {
  127. 0.1833140, 0.1589227, 0.0801944,
  128. 0.2126729, 0.7151522, 0.0721750,
  129. 0.2521713, 0.7882566, 0.2834072
  130. };
  131. float3 Xp_Y_XYZp = mul(EncodeMatrix, RGB);
  132. float Y = Xp_Y_XYZp.y;
  133. float2 UV = saturate(Xp_Y_XYZp.xy / max(Xp_Y_XYZp.z, 1e-6));
  134. return float3(Y, UV);
  135. }
  136. float3 DecodeYUV(float3 YUV)
  137. {
  138. // Inverse of EncodeMatrix
  139. static const float3x3 DecodeMatrix =
  140. {
  141. 7.6649220, 0.9555189, -2.4122494,
  142. -2.2120164, 1.6682305, 0.2010778,
  143. -0.6677212, -5.4901517, 5.1156056
  144. };
  145. // Reverse of operations
  146. float Y = YUV.x;
  147. float XYZp = YUV.x / max(YUV.z, 1e-6);
  148. float Xp = YUV.y * XYZp;
  149. return mul(DecodeMatrix, float3(Xp, Y, XYZp));
  150. }
  151. // If you can't write Y and UV to separate buffers (R16_FLOAT, R8G8_UNORM), then
  152. // you can pack them into R32_UINT.
  153. uint PackYUV(float3 YUV)
  154. {
  155. uint Y = f32tof16(YUV.x);
  156. #if _XBOX_ONE
  157. uint p = __XB_PackF32ToU8(YUV.y * 255.0 + 0.5, 3, Y);
  158. return __XB_PackF32ToU8(YUV.z * 255.0 + 0.5, 2, p);
  159. #else
  160. uint U = (uint)(YUV.y * 255.0 + 0.5);
  161. uint V = (uint)(YUV.z * 255.0 + 0.5);
  162. return Y | U << 24 | V << 16;
  163. #endif
  164. }
  165. float3 UnpackYUV(uint YUV)
  166. {
  167. float Y = f16tof32(YUV);
  168. #if _XBOX_ONE
  169. float U = __XB_UnpackByte3(YUV) / 255.0;
  170. float V = __XB_UnpackByte2(YUV) / 255.0;
  171. #else
  172. float U = (YUV >> 24) / 255.0;
  173. float V = ((YUV >> 16) & 0xFF) / 255.0;
  174. #endif
  175. return float3(Y, U, V);
  176. }
  177. // To understand this, know that all math on YUV should really be done on
  178. // Y, Y*U, Y*V. You can add and blend all three of those values the same
  179. // as RGB, but for compact encoding, you only want to store Y, U, and V.
  180. float3 AddYUV( float3 YUV1, float3 YUV2 )
  181. {
  182. // Luminance is simply added; chrominance becomes a weighted average
  183. float Y = YUV1.x + YUV2.x;
  184. float2 UV = (YUV1.yz * YUV1.x + YUV2.yz * YUV2.x) / Y;
  185. return float3(Y, UV);
  186. }
  187. float3 LerpYUV( float3 YUV1, float3 YUV2, float t )
  188. {
  189. // To rescale a YUV value, you just have to rescale Y. Chroma remains the same.
  190. // After scaling luminance, you can add the two colors together. This version of
  191. // the math (as opposed to the possibly more readable code commented out below) is
  192. // more efficient. But it's interesting to note that if you kept values as Y, Y*U, Y*V,
  193. // you could simply add or lerp them.
  194. YUV1.x *= (1 - t);
  195. YUV2.x *= t;
  196. return AddYUV(YUV1, YUV2);
  197. //float Y = lerp(YUV1.x, YUV2.x, t);
  198. //float2 UV = lerp(YUV1.yz * YUV1.x, YUV2.yz * YUV2.x, t) / Y;
  199. //return float3(Y, UV);
  200. }
  201. // The standard 32-bit HDR color format. Each float has a 5-bit exponent and no sign bit.
  202. uint Pack_R11G11B10_FLOAT( float3 rgb )
  203. {
  204. uint r = (f32tof16(rgb.x) >> 4) & 0x000007FF;
  205. uint g = (f32tof16(rgb.y) << 7) & 0x003FF800;
  206. uint b = (f32tof16(rgb.z) << 17) & 0xFFC00000;
  207. return r | g | b;
  208. }
  209. float3 Unpack_R11G11B10_FLOAT( uint rgb )
  210. {
  211. float r = f16tof32((rgb << 4 ) & 0x7FF0);
  212. float g = f16tof32((rgb >> 7 ) & 0x7FF0);
  213. float b = f16tof32((rgb >> 17) & 0x7FE0);
  214. return float3(r, g, b);
  215. }
  216. // These next two encodings are great for LDR data. By knowing that our values are [0.0, 1.0]
  217. // (or [0.0, 2.0), incidentally), we can reduce how many bits we need in the exponent. We can
// immediately eliminate all positive exponents. By giving more bits to the mantissa, we can
  219. // improve precision at the expense of range. The 8E3 format goes one bit further, quadrupling
  220. // mantissa precision but increasing smallest exponent from -14 to -6. The smallest value of 8E3
  221. // is 2^-14, while the smallest value of 7E4 is 2^-21. Both are smaller than the smallest 8-bit
  222. // sRGB value, which is close to 2^-12.
  223. // This is like R11G11B10_FLOAT except that it moves one bit from each exponent to each mantissa.
  224. uint Pack_R11G11B10_E4_FLOAT( float3 rgb )
  225. {
  226. // Clamp to [0.0, 2.0). The magic number is 1.FFFFF x 2^0. (We can't represent hex floats in HLSL.)
  227. // This trick works because clamping your exponent to 0 reduces the number of bits needed by 1.
  228. rgb = clamp( rgb, 0.0, asfloat(0x3FFFFFFF) );
  229. uint r = (f32tof16(rgb.r) >> 3 ) & 0x000007FF;
  230. uint g = (f32tof16(rgb.g) << 8 ) & 0x003FF800;
  231. uint b = (f32tof16(rgb.b) << 18) & 0xFFC00000;
  232. return r | g | b;
  233. }
  234. float3 Unpack_R11G11B10_E4_FLOAT( uint rgb )
  235. {
  236. float r = f16tof32((rgb << 3 ) & 0x3FF8);
  237. float g = f16tof32((rgb >> 8 ) & 0x3FF8);
  238. float b = f16tof32((rgb >> 18) & 0x3FF0);
  239. return float3(r, g, b);
  240. }
  241. // This is like R11G11B10_FLOAT except that it moves two bits from each exponent to each mantissa.
  242. uint Pack_R11G11B10_E3_FLOAT( float3 rgb )
  243. {
  244. // Clamp to [0.0, 2.0). Divide by 256 to bias the exponent by -8. This shifts it down to use one
  245. // fewer bit while still taking advantage of the denormalization hardware. In half precision,
  246. // the exponent of 0 is 0xF. Dividing by 256 makes the max exponent 0x7--one fewer bit.
  247. rgb = clamp( rgb, 0.0, asfloat(0x3FFFFFFF) ) / 256.0;
  248. uint r = (f32tof16(rgb.r) >> 2 ) & 0x000007FF;
  249. uint g = (f32tof16(rgb.g) << 9 ) & 0x003FF800;
  250. uint b = (f32tof16(rgb.b) << 19) & 0xFFC00000;
  251. return r | g | b;
  252. }
  253. float3 Unpack_R11G11B10_E3_FLOAT( uint rgb )
  254. {
  255. float r = f16tof32((rgb << 2 ) & 0x1FFC);
  256. float g = f16tof32((rgb >> 9 ) & 0x1FFC);
  257. float b = f16tof32((rgb >> 19) & 0x1FF8);
  258. return float3(r, g, b) * 256.0;
  259. }