ScreenSpaceSubsurfaceScatteringCS.azsl 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. /*
  2. * Copyright (c) Contributors to the Open 3D Engine Project.
  3. * For complete copyright and license terms please see the LICENSE at the root of this distribution.
  4. *
  5. * SPDX-License-Identifier: Apache-2.0 OR MIT
  6. *
  7. */
  8. #include <viewsrg_all.srgi>
  9. #include <Atom/Features/PBR/Hammersley.azsli>
  10. #include <Atom/RPI/Math.azsli>
  11. #define GROUP_DIM_X 16
  12. #define GROUP_DIM_Y 16
  13. // Padding of one side, e.g. 16x16 block + 2 pixel pad = 20x20 padded block
  14. // Intended to provide at least some LDS neighbours for boundary pixels
  15. #define PAD 2
  16. #define PAD2 (PAD * 2)
  17. // This size should be number of element in thread group + number of padding element
  18. #define SHARED_MEMORY_SIZE 400
  19. // Use 60% / 30% of total samples for relatively far / far objects
  20. #define SAMPLE_RATIO_1 0.3
  21. #define SAMPLE_RATIO_2 0.6
  22. #define NUMBER_OF_SAMPLE 200
  23. // How large the diffusion profile (filter) is on the screen in pixel, sampling rate of small filters will be reduced
  24. // to improve performance
  25. #define FOOTPRINT_SMALL 1
  26. #define FOOTPRINT_MEDIUM 4
  27. // Unit conversion
  28. #define MILLIMETER_TO_METER 1e-3
  29. #define METER_TO_MILLIMETER 1e3
  30. #define DEPTH_CLIP_THRESHOLD_SCALE (0.5 * MILLIMETER_TO_METER)
  31. ShaderResourceGroup PassSrg : SRG_PerPass
  32. {
  33. Texture2D<float4> m_diffuseTexture;
  34. Texture2D<float> m_linearDepthTexture;
  35. Texture2D<float3> m_scatterDistanceTexture;
  36. RWTexture2D<float4> m_outputTexture;
  37. // Passed by SubsurfaceScatterPass.cpp from cpu
  38. float2 m_screenSize;
  39. // float3 -> diffuse color
  40. // float -> linear depth
  41. groupshared float sColorR[SHARED_MEMORY_SIZE];
  42. groupshared float sColorG[SHARED_MEMORY_SIZE];
  43. groupshared float sColorB[SHARED_MEMORY_SIZE];
  44. groupshared float sDepth [SHARED_MEMORY_SIZE];
  45. void SetColorDepth(uint index, float3 color, float depth)
  46. {
  47. sColorR[index] = color.r;
  48. sColorG[index] = color.g;
  49. sColorB[index] = color.b;
  50. sDepth[index] = depth;
  51. }
  52. float3 GetRadiance(uint index)
  53. {
  54. return float3(sColorR[index], sColorG[index], sColorB[index]);
  55. }
  56. float GetDepth(uint index)
  57. {
  58. return sDepth[index];
  59. }
  60. }
  61. // --Return Monte-Carlo importance for the given kernel element, whose equation is :
  62. // DiffusionProfile(l, s) * l / DiffusionProfilePDF(r, minS)
  63. // -> (s / (8 * PI * l) * (exp(-s * l) + exp(-s * l / 3.0))) * l / (minS / (8 * PI) * (exp(-minS * r) + exp(-minS * r / 3.0)))
  64. // -> (exp(-s * l) + exp(-s * l / 3.0)) / (exp(-minS * r) + exp(-minS * r / 3.0))
  65. // --Smallest element of shape parameter 's' is used for PDF because we sampled from this single lobe during importance sampling.
  66. // --Since volume albedo A is a constant for all samples, thus been removed from equation of diffusion profile, which can be multiplied back later
  67. // --For more information about Burley's normalized diffusion please refer to:
  68. // Christensen H.P. Burley B. 2015, Approximate Reflectance Profiles for Efficient Subsurface Scattering
  69. // https://graphics.pixar.com/library/ApproxBSSRDF/paper.pdf
  70. float3 Kernel(float l, float r, float3 s, float minS)
  71. {
  72. return l > 0 ? (exp(-s * l) + exp(-s * l / 3.0)) / (exp(-minS * r) + exp(-minS * r / 3.0)) : float3(0.0, 0.0, 0.0);
  73. }
  74. // Helper function of inverse cdf approximation
  75. float G(float u)
  76. {
  77. return 4.0 * u * (2.0 * u + sqrt(1.0 + 4.0 * u * u)) + 1.0;
  78. }
  79. // Approximation of inverse CDF via curve fitting, surves as an excellent model for graphical computation
  80. float GetSampleInverseCDFApprox(float u, float s)
  81. {
  82. // complementary value of u
  83. float uc = 1.0 - u;
  84. float oneDivThree = 1.0 / 3.0;
  85. return 3.0 * log((1.0 + 1.0 / pow(G(uc), oneDivThree) + pow(G(uc), oneDivThree)) / (4.0 * uc)) / s;
  86. }
  87. // 1D Spherical fibonacci point set for uniformly sampling azimuth angle
  88. float SF(uint index)
  89. {
  90. return 2 * PI * index * 2 / (1 + sqrt(5));
  91. }
  92. // Cosine value of pseudo random (hammersley) angle theta,
  93. // which used to rotate the direction of diffusion profile, which is a 2d plane at surface point parallel to viewing plane,
  94. // to eliminate structured noise pattern introduced by low-discrepancy sequence, this value equivalent to:
  95. // theta = 2 * pi * hammersley(index)
  96. // cosTheta = cos(theta)
  97. static const float cosTheta[4] = { 1.0, -1.0, 6.123233995736766e-17, -1.8369701987210297e-16 };
  98. // Same as above but sine value instead
  99. static const float sinTheta[4] = { 0.0, 1.2246467991473532e-16, 1.0, -1.0 };
  100. // --Get low-discrepancy 2d sample using sample index 'index' and inverse scatter distance 's'
  101. // --Radius 'r' sampled from hammersley sequance (see Hammersley.azsli::GetVanDerCorputRadicalInverse() for more information)
  102. // --Planar angle 'phi' sampled from spherical fibonacci's azimuthal component
  103. // --Sample rotated using interleaved sample strategy to avoid obvious artifacts, which repeat a
  104. // 2x2 block sampling pattern (generated using 'gridIndex', which from 0-7 to select rotation angle) across the whole image to generate well distributed noise that can be easily filtered
  105. // by a 2x2 box filter afterwards
  106. void GetSample(in uint index, in uint gridIndex, in float s, out float3 sp)
  107. {
  108. float r = GetSampleInverseCDFApprox(GetVanDerCorputRadicalInverse(index), s);
  109. float x = r * cos(SF(index));
  110. float y = r * sin(SF(index));
  111. // Rotate samples to reduce visible pattern:
  112. sp = float3(x * cosTheta[gridIndex] - y * sinTheta[gridIndex],
  113. x * sinTheta[gridIndex] + y * cosTheta[gridIndex], r);
  114. }
  115. [numthreads(GROUP_DIM_X,GROUP_DIM_Y,1)]
  116. void MainCS(
  117. uint3 dispatchID : SV_DispatchThreadID,
  118. uint3 groupThreadID : SV_GroupThreadID,
  119. uint3 groupID : SV_GroupID,
  120. uint groupIndex : SV_GroupIndex )
  121. {
  122. // Input diffuse color of current pixel, alpha channel is matte mask identify which pixel needs convolution
  123. float4 colorX = PassSrg::m_diffuseTexture.Load(uint3(dispatchID.x, dispatchID.y, 0));
  124. // unpack strength factor and quality factor
  125. float2 factorAndQuality = frac(colorX.w / float2(65536, 256));
  126. float factor = factorAndQuality.x < 0.99 ? factorAndQuality.x : 1.0;
  127. float quality = factorAndQuality.y < 0.99 ? factorAndQuality.y : 1.0;
  128. //+---------------------------+
  129. //| shared memory allocation |
  130. //+---------------------------+
  131. // X/Y thread group dimension include pad on both sides
  132. uint dimXWithPad = GROUP_DIM_X + PAD2;
  133. // Pre integration of constants used to convert thread group id to
  134. // texture coordinates, to avoid duplicate calculation, don't have special meaning
  135. uint texelOffsetX = groupID.x * GROUP_DIM_X - PAD;
  136. uint texelOffsetY = groupID.y * GROUP_DIM_Y - PAD;
  137. // --Corresponding texel coordinate for each element in the (GROUP_DIM_X + PAD) x (GROUP_DIM_Y + PAD) shared memory
  138. // due to the existence of padding to avoid bank conflicts threads are offseted to
  139. // load pixels based on coordinates in shared memory, instead of with original groupThreadID.
  140. // --All threads will load first GROUP_DIM_X x GROUP_DIM_Y elements into shared memory start from
  141. // top left corner, then portion of threads will continuously load rest of elements
  142. // --Note: since shared memory ID use top left as origin but texture coordinates use bottom left
  143. // as origin, image patch stored in shared memory is flipped vertically, but it's fine as
  144. // our profile is radial symmetric
  145. uint3 texelIndex = uint3(
  146. texelOffsetX + (groupIndex % dimXWithPad),
  147. texelOffsetY + (groupIndex / dimXWithPad),
  148. 0
  149. );
  150. PassSrg::SetColorDepth(groupIndex,
  151. PassSrg::m_diffuseTexture.Load(texelIndex).rgb,
  152. PassSrg::m_linearDepthTexture.Load(texelIndex).r
  153. );
  154. uint totalLoadingThread = SHARED_MEMORY_SIZE - GROUP_DIM_X * GROUP_DIM_Y;
  155. if(groupIndex < totalLoadingThread)
  156. {
  157. // load rest pixels into shared memory
  158. uint index = GROUP_DIM_X * GROUP_DIM_Y + groupIndex;
  159. texelIndex = uint3(
  160. texelOffsetX + (index % dimXWithPad),
  161. texelOffsetY + (index / dimXWithPad),
  162. 0
  163. );
  164. PassSrg::SetColorDepth(index,
  165. PassSrg::m_diffuseTexture.Load(texelIndex).rgb,
  166. PassSrg::m_linearDepthTexture.Load(texelIndex).r
  167. );
  168. }
  169. // If current thread don't need further convolution then terminate and return
  170. // Case 1: factor < EPSILON, factor is zero so further computation is not necessary
  171. // Case 2: factor is nonzero but subsurface scattering is disabled at this pixel
  172. if(factor < EPSILON || colorX.w < 0)
  173. {
  174. PassSrg::m_outputTexture[dispatchID.xy] = float4(colorX.rgb, 0.0f);
  175. return;
  176. }
  177. GroupMemoryBarrierWithGroupSync();
  178. //+--------------------------------------+
  179. //| subsurface scattering preprocessing |
  180. //+--------------------------------------+
  181. // Flattened index of current pixel in shared memory
  182. uint2 sMemID = groupThreadID.xy + uint2(PAD, PAD);
  183. uint sMemIndex = sMemID.y * dimXWithPad + sMemID.x;
  184. // Inverse of scatter distance, symbol 's' is used to match with literatures
  185. float3 s = rcp(PassSrg::m_scatterDistanceTexture.Load(uint3(dispatchID.x, dispatchID.y, 0)).rgb + EPSILON);
  186. float minS = min(s.x, min(s.y, s.z));
  187. // Input view space linear depth of current pixel
  188. float2 texelSize = float2(1.0 / PassSrg::m_screenSize.x, 1.0 / PassSrg::m_screenSize.y);
  189. float depthX = PassSrg::GetDepth(sMemIndex);
  190. // Map radius sampled from profile (in millimeter) to distance on view plane (in pixel), consists of three steps
  191. // 'MILLIMETER_TO_METER' converts unit of radius on diffusion profile from millimeter to meter
  192. // '(ViewSrg::m_projectionMatrix[0][0] / depthX)' projects radius on profile to distance on viewing plane
  193. // 'texelSize' maps viewing plane distance to distance in pixel
  194. float profileToScreenX = (ViewSrg::m_projectionMatrix[0][0] / depthX) * MILLIMETER_TO_METER / texelSize.x;
  195. float profileToScreenY = (ViewSrg::m_projectionMatrix[1][1] / depthX) * MILLIMETER_TO_METER / texelSize.y;
  196. // This quantity controls the strength of depth based lod mechanism
  197. // 90% confidence interval, gives a roughly linear response with respect to input minS which more stable than true maximum radius
  198. float maxRadius = GetSampleInverseCDFApprox(0.9, minS);
  199. // Radius of diffusion profile on viewing plane in pixel, accounting for projection distortion
  200. // Use longer axis of ellipsoid as footprint
  201. uint maxFootPrint = max(maxRadius * profileToScreenX, maxRadius * profileToScreenY);
  202. // Determine how much sample will be used based on the depth of current pixel
  203. uint numSample = uint(NUMBER_OF_SAMPLE * quality * (maxFootPrint <= FOOTPRINT_MEDIUM ? maxFootPrint <= FOOTPRINT_SMALL ? SAMPLE_RATIO_1 : SAMPLE_RATIO_2 : 1.0));
  204. // Grid index used for interleaved sampling
  205. // range from 0-3 in a local 2x2 block as following shows:
  206. // -+---+---+-
  207. // | 0 | 1 |
  208. // -+---+---+-
  209. // | 2 | 3 |
  210. // -+---+---+-
  211. uint sampleGridIndex = (dispatchID.y % 2) * 2 + dispatchID.x % 2;
  212. //+------------------------------------+
  213. //| subsurface scattering convolution |
  214. //+------------------------------------+
  215. float3 totalSum = float3(0.0, 0.0, 0.0);
  216. float3 weightSum = float3(0.0, 0.0, 0.0);
  217. for(uint i = 0; i < numSample; ++i)
  218. {
  219. float3 sp;
  220. GetSample(i, sampleGridIndex, minS, sp);
  221. // Distance in pixel
  222. int u = round(sp.x * profileToScreenX);
  223. int v = round(sp.y * profileToScreenY);
  224. // Sampled neightbor's ID in shared memory
  225. uint2 nbrSMemID = sMemID + uint2(u, v);
  226. // Neighbor's depth
  227. float depthNbr = 0.0;
  228. // Neighbor's color
  229. float3 colorNbr = float3(0.0, 0.0, 0.0);
  230. // Fall back to global memory to fetch pixel information
  231. // ortherwise unable to render wide range scatter effect, e.g. (scatter distance > 1)
  232. if(nbrSMemID.x >= dimXWithPad || nbrSMemID.x < 0 || nbrSMemID.y >= (GROUP_DIM_Y + PAD2) || nbrSMemID.y < 0)
  233. {
  234. // Fall back to fetch from global memory
  235. uint3 nbrTexelIndex = uint3(dispatchID.x + u, dispatchID.y + v, 0);
  236. depthNbr = PassSrg::m_linearDepthTexture.Load(nbrTexelIndex).r;
  237. colorNbr = PassSrg::m_diffuseTexture.Load(nbrTexelIndex).rgb;
  238. }
  239. else
  240. {
  241. // Fetch depth and color from shared memory
  242. nbrSMemID = sMemID + uint2(u, v);
  243. uint nbrSMemIndex = nbrSMemID.y * dimXWithPad + nbrSMemID.x;
  244. depthNbr = PassSrg::GetDepth(nbrSMemIndex);
  245. colorNbr = PassSrg::GetRadiance(nbrSMemIndex);
  246. }
  247. // Skip invalid samples (i.e. sample out of valid region (depth = 0), totally black pixel - no energy spread out (color = 0))
  248. if(depthNbr > 0.0 && (colorNbr.x + colorNbr.y + colorNbr.z) > 0.0)
  249. {
  250. // Radial distance between current pixel and its neighbour
  251. float r = sp.z;
  252. float d = abs(depthX - depthNbr) * METER_TO_MILLIMETER;
  253. // Distance taking depth into account, in theory only surface points on convex objects can be computed in this way,
  254. // but in practice it works well as an approximation in all cases
  255. float surfaceDistance = sqrt(r * r + d * d);
  256. // As the sample is drawn from the distribution of color channel with longest scatter distance,
  257. // the pdf should follows the same rule as well for consistency
  258. float3 weight = Kernel(surfaceDistance, r, s, minS);
  259. totalSum += weight * colorNbr;
  260. weightSum += weight;
  261. }
  262. }
  263. // Force normalization with sum of weights
  264. float3 subsurfaceScatteringColor = totalSum / (weightSum + EPSILON);
  265. float3 outColor = lerp(colorX.rgb, subsurfaceScatteringColor, factor);
  266. // 1.0 in alpha channel will trigger 2x2 noise reduction filter in output merger to remove low discrepancy noises
  267. PassSrg::m_outputTexture[dispatchID.xy] = float4(outColor, 1.0);
  268. }