Functions.hlsl 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764
  1. // Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
  2. // All rights reserved.
  3. // Code licensed under the BSD License.
  4. // http://www.anki3d.org/LICENSE
  5. #pragma once
  6. #include <AnKi/Shaders/Common.hlsl>
  // Convert a depth value to linear depth given the near and far plane distances.
  // Evaluates zNear / ((zNear - zFar) + zFar / depth).
  F32 linearizeDepth(F32 depth, F32 zNear, F32 zFar)
  {
      const F32 denominator = (zNear - zFar) + zFar / depth;
      return zNear / denominator;
  }
  // Convert 4 depth values at once to linear depth given the near and far plane distances.
  // Component-wise version of the scalar linearizeDepth.
  Vec4 linearizeDepth(Vec4 depth, F32 zNear, F32 zFar)
  {
      const Vec4 denominator = (zNear - zFar) + zFar / depth;
      return zNear / denominator;
  }
  // Cheaper linearizeDepth that takes precomputed coefficients: a=(n-f)/n and b=f/n
  // where n is the near plane and f the far plane.
  F32 linearizeDepthOptimal(F32 depth, F32 a, F32 b)
  {
      const F32 denominator = a + b / depth;
      return 1.0 / denominator;
  }
  // Cheaper linearizeDepth for 4 depths at once. Takes precomputed coefficients: a=(n-f)/n and b=f/n
  // where n is the near plane and f the far plane.
  Vec4 linearizeDepthOptimal(Vec4 depths, F32 a, F32 b)
  {
      const Vec4 denominator = a + b / depths;
      return 1.0 / denominator;
  }
  /// Project a vector using only the non-zero elements of a perspective matrix. Jitter is not taken into account.
  /// @param m00 Projection matrix element [0,0]. Scales x.
  /// @param m11 Projection matrix element [1,1]. Scales y.
  /// @param m22 Projection matrix element [2,2]. Multiplies z.
  /// @param m23 Projection matrix element [2,3]. Multiplies w.
  /// @param vec The vector to project.
  /// @return The projected vector. Its w is -vec.z.
  Vec4 cheapPerspectiveProjection(F32 m00, F32 m11, F32 m22, F32 m23, Vec4 vec)
  {
      const F32 projX = vec.x * m00;
      const F32 projY = vec.y * m11;
      const F32 projZ = vec.z * m22 + vec.w * m23;
      const F32 projW = -vec.z;
      return Vec4(projX, projY, projZ, projW);
  }
  /// Project a vector using only the non-zero elements of a perspective matrix. Jitter is not taken into account.
  /// @param projMat_00_11_22_23 The [0,0], [1,1], [2,2] and [2,3] matrix elements packed in x, y, z and w respectively.
  /// @param vec The vector to project.
  Vec4 cheapPerspectiveProjection(Vec4 projMat_00_11_22_23, Vec4 vec)
  {
      const F32 m00 = projMat_00_11_22_23.x;
      const F32 m11 = projMat_00_11_22_23.y;
      const F32 m22 = projMat_00_11_22_23.z;
      const F32 m23 = projMat_00_11_22_23.w;
      return cheapPerspectiveProjection(m00, m11, m22, m23, vec);
  }
  /// Unproject to view space. Jitter is not considered. See Mat4::extractPerspectiveUnprojectionParams in C++.
  /// @param unprojParams The unprojection parameters. xy scale the NDC and zw are used to reconstruct z.
  /// @param ndc The NDC xy coordinates.
  /// @param depth The depth value.
  /// @return The view space position.
  Vec3 cheapPerspectiveUnprojection(Vec4 unprojParams, Vec2 ndc, F32 depth)
  {
      const F32 viewZ = unprojParams.z / (unprojParams.w + depth);
      return Vec3(ndc * unprojParams.xy * viewZ, viewZ);
  }
  #if ANKI_FRAGMENT_SHADER
  // Catmull-Rom style filtering using 4 texture samples. Taken from shadertoy.com/view/4tyGDD
  // Uses tex.Sample (implicit derivatives), hence it's only compiled for fragment shaders.
  // @param tex The texture to filter.
  // @param sampl The sampler to use for all 4 samples.
  // @param uv The normalized texture coordinates.
  // @param texSize The size of the texture in texels.
  Vec4 textureCatmullRom4Samples(Texture2D tex, SamplerState sampl, Vec2 uv, Vec2 texSize)
  {
      const Vec2 phase = 2.0 * frac(0.5 * uv * texSize - 0.25) - 1.0;
      const Vec2 t = frac(phase);

      const Vec2 polyA = (2.0 * t - 3.5) * t + 0.5;
      const Vec2 polyB = (2.0 * t - 2.5) * t - 0.5;
      Vec4 weights = Vec4(t * polyA + 1.0, t * polyB);

      // The two sample positions per axis. xy holds the first position and zw the second
      const Vec2 posA = (((-2.0 * t + 3.0) * t + 0.5) * t - 1.5) * t / (weights.xy * texSize) + uv;
      const Vec2 posB = (((-2.0 * t + 5.0) * t - 2.5) * t - 0.5) / (polyB * texSize) + uv;
      const Vec4 samplePos = Vec4(posA, posB);

      // Flip the sign of the horizontal weights depending on the quadrant
      const F32 flip = (phase.x * phase.y > 0.0) ? 1.0 : -1.0;
      weights.xz *= flip;

      const Vec4 topLeft = tex.Sample(sampl, samplePos.xy);
      const Vec4 topRight = tex.Sample(sampl, samplePos.zy);
      const Vec4 bottomLeft = tex.Sample(sampl, samplePos.xw);
      const Vec4 bottomRight = tex.Sample(sampl, samplePos.zw);

      return (topLeft * weights.x + topRight * weights.z) * weights.y + (bottomLeft * weights.x + bottomRight * weights.z) * weights.w;
  }
  #endif
  // Bicubic filtering with cubic B-spline weights (see w0..w3) done using 4 texture samples.
  // Taken from shadertoy.com/view/4df3Dn
  // @param tex The texture to filter.
  // @param sampl The sampler used for the 4 samples. NOTE(review): presumably expected to be a bilinear sampler - confirm.
  // @param uv The normalized texture coordinates.
  // @param lod The mip level to sample. Its integer part, clamped to the last mip, selects the texel grid.
  template<typename TVec>
  TVec textureBicubic(Texture2D<TVec> tex, SamplerState sampl, Vec2 uv, F32 lod)
  {
  #define w0(a) ((1.0 / 6.0) * ((a) * ((a) * (-(a) + 3.0) - 3.0) + 1.0))
  #define w1(a) ((1.0 / 6.0) * ((a) * (a) * (3.0 * (a)-6.0) + 4.0))
  #define w2(a) ((1.0 / 6.0) * ((a) * ((a) * (-3.0 * (a) + 3.0) + 3.0) + 1.0))
  #define w3(a) ((1.0 / 6.0) * ((a) * (a) * (a)))
  #define g0(a) (w0(a) + w1(a))
  #define g1(a) (w2(a) + w3(a))
  #define h0(a) (-1.0 + w1(a) / (w0(a) + w1(a)))
  #define h1(a) (1.0 + w3(a) / (w2(a) + w3(a)))
  #define texSample(uv) tex.SampleLevel(sampl, uv, lod)

      UVec2 texSize;
      U32 mipCount;
      tex.GetDimensions(0, texSize.x, texSize.y, mipCount);

      // Size of the mip that will be sampled. A mip dimension never drops below 1 texel so clamp it, otherwise
      // non-square textures would produce a zero size and a division by zero further down
      const U32 lodi = min(U32(lod), mipCount - 1u);
      texSize = max(texSize >> lodi, UVec2(1u, 1u));

      // Move to texel space and split to integer and fractional parts
      uv = uv * texSize + 0.5;
      const Vec2 iuv = floor(uv);
      const Vec2 fuv = frac(uv);

      const F32 g0x = g0(fuv.x);
      const F32 g1x = g1(fuv.x);
      const F32 h0x = h0(fuv.x);
      const F32 h1x = h1(fuv.x);
      const F32 h0y = h0(fuv.y);
      const F32 h1y = h1(fuv.y);

      // The 4 sample positions, back in normalized coordinates
      const Vec2 p0 = (Vec2(iuv.x + h0x, iuv.y + h0y) - 0.5) / texSize;
      const Vec2 p1 = (Vec2(iuv.x + h1x, iuv.y + h0y) - 0.5) / texSize;
      const Vec2 p2 = (Vec2(iuv.x + h0x, iuv.y + h1y) - 0.5) / texSize;
      const Vec2 p3 = (Vec2(iuv.x + h1x, iuv.y + h1y) - 0.5) / texSize;

      return g0(fuv.y) * (g0x * texSample(p0) + g1x * texSample(p1)) + g1(fuv.y) * (g0x * texSample(p2) + g1x * texSample(p3));

      // Don't leak the helper macros to whoever includes this file
  #undef w0
  #undef w1
  #undef w2
  #undef w3
  #undef g0
  #undef g1
  #undef h0
  #undef h1
  #undef texSample
  }
  // Hash a 2D value to a pseudo-random number. Because of the 0.5 + 0.5 * frac(...) remap the result lies in [0.5, 1.0).
  F32 rand(Vec2 n)
  {
      const F32 phase = dot(n, Vec2(12.9898, 78.233));
      const F32 noise = frac(sin(phase) * 43758.5453);
      return 0.5 + 0.5 * noise;
  }
  // Depth aware upscale of colorTex. If the linear depths of the 4 gathered depthHalf texels are all within
  // depthThreshold of the depthFull depth then a plain sample of colorTex is returned. Otherwise the color of the
  // gathered texel with the closest depth is returned.
  // @param uv The normalized texture coordinates.
  // @param depthFull The depth texture that provides the reference depth.
  // @param depthHalf The depth texture that matches colorTex.
  // @param colorTex The color to upscale.
  // @param linearAnyClampSampler The sampler used for all samples and gathers.
  // @param linearDepthCf The a and b coefficients of linearizeDepthOptimal in x and y.
  // @param depthThreshold The max linear depth difference that is not considered a discontinuity.
  Vec4 nearestDepthUpscale(Vec2 uv, Texture2D<Vec4> depthFull, Texture2D<Vec4> depthHalf, Texture2D<Vec4> colorTex, SamplerState linearAnyClampSampler,
                           Vec2 linearDepthCf, F32 depthThreshold)
  {
      // Reference depth. Sampler not important
      const F32 rawFullDepth = depthFull.SampleLevel(linearAnyClampSampler, uv, 0.0).r;
      const F32 fullDepth = linearizeDepthOptimal(rawFullDepth, linearDepthCf.x, linearDepthCf.y);

      // The 4 candidate depths. Sampler not important
      const Vec4 rawHalfDepths = depthHalf.GatherRed(linearAnyClampSampler, uv);
      const Vec4 halfDepths = linearizeDepthOptimal(rawHalfDepths, linearDepthCf.x, linearDepthCf.y);

      const Vec4 diffs = abs(Vec4(fullDepth, fullDepth, fullDepth, fullDepth) - halfDepths);

      if(all(diffs < Vec4(depthThreshold, depthThreshold, depthThreshold, depthThreshold)))
      {
          // No major discontinuities, sample with bilinear
          return colorTex.SampleLevel(linearAnyClampSampler, uv, 0.0);
      }

      // Some discontinuities, pick the texel with the closest depth
      const Vec4 reds = colorTex.GatherRed(linearAnyClampSampler, uv);
      const Vec4 greens = colorTex.GatherGreen(linearAnyClampSampler, uv);
      const Vec4 blues = colorTex.GatherBlue(linearAnyClampSampler, uv);
      const Vec4 alphas = colorTex.GatherAlpha(linearAnyClampSampler, uv);

      U32 closest = 0u;
      F32 closestDiff = diffs.x;
      [unroll] for(U32 i = 1u; i < 4u; ++i)
      {
          if(diffs[i] < closestDiff)
          {
              closest = i;
              closestDiff = diffs[i];
          }
      }

      return Vec4(reds[closest], greens[closest], blues[closest], alphas[closest]);
  }
  // Helper of bilateralUpsample. Computes a weight that is inversely proportional to the distance of the linearized
  // depthLow depth at uv from the reference linear depth. kEpsilonF32 guards against a division by zero.
  F32 _calcDepthWeight(Texture2D depthLow, SamplerState nearestAnyClamp, Vec2 uv, F32 ref, Vec2 linearDepthCf)
  {
      const F32 rawDepth = depthLow.SampleLevel(nearestAnyClamp, uv, 0.0).r;
      const F32 depthDistance = abs(ref - linearizeDepthOptimal(rawDepth, linearDepthCf.x, linearDepthCf.y));
      return 1.0 / (kEpsilonF32 + depthDistance);
  }
  // Helper of bilateralUpsample. Samples colorLow at uv displaced by offset texels and weights the result by the
  // given weight and by the depth weight of that location.
  // @param lowInvSize The size of a texel of the low textures in UV space.
  // @param offset The offset of the tap in texels.
  // @param ref The reference linear depth.
  // @param weight The filter weight of this tap.
  // @param[in,out] normalize Accumulates the total weight (weight * depth weight) of the taps.
  // @return The weighted color.
  Vec4 _sampleAndWeight(Texture2D depthLow, Texture2D colorLow, SamplerState linearAnyClamp, SamplerState nearestAnyClamp, const Vec2 lowInvSize,
                        Vec2 uv, const Vec2 offset, const F32 ref, const F32 weight, const Vec2 linearDepthCf, inout F32 normalize)
  {
      const Vec2 tapUv = uv + offset * lowInvSize;
      const F32 depthWeight = _calcDepthWeight(depthLow, nearestAnyClamp, tapUv, ref, linearDepthCf);
      const Vec4 tapColor = colorLow.SampleLevel(linearAnyClamp, tapUv, 0.0);

      normalize += weight * depthWeight;
      return tapColor * depthWeight * weight;
  }
  // Depth aware 3x3 upsample of colorLow. Every tap is weighted by a fixed kernel (0.25 for the center, 0.125 for the
  // edge neighbours and 0.0625 for the corners) and by how close its depthLow depth is to the depthHigh depth at uv.
  // @param depthHigh The depth that provides the reference depth.
  // @param depthLow The depth that matches colorLow.
  // @param colorLow The color to upsample.
  // @param lowInvSize The size of a texel of the low textures in UV space.
  // @param uv The normalized texture coordinates.
  // @param linearDepthCf The a and b coefficients of linearizeDepthOptimal in x and y.
  Vec4 bilateralUpsample(Texture2D depthHigh, Texture2D depthLow, Texture2D colorLow, SamplerState linearAnyClamp, SamplerState nearestAnyClamp,
                         const Vec2 lowInvSize, const Vec2 uv, const Vec2 linearDepthCf)
  {
      const U32 kTapCount = 9u;
      // Center first, then the edge neighbours and finally the corners
      const Vec2 kTapOffsets[kTapCount] = {Vec2(0.0, 0.0), Vec2(-1.0, 0.0), Vec2(0.0, -1.0), Vec2(1.0, 0.0), Vec2(0.0, 1.0),
                                           Vec2(1.0, 1.0), Vec2(1.0, -1.0), Vec2(-1.0, 1.0), Vec2(-1.0, -1.0)};
      const F32 kTapWeights[kTapCount] = {0.25, 0.125, 0.125, 0.125, 0.125, 0.0625, 0.0625, 0.0625, 0.0625};

      const F32 rawDepthRef = depthHigh.SampleLevel(nearestAnyClamp, uv, 0.0).r;
      const F32 depthRef = linearizeDepthOptimal(rawDepthRef, linearDepthCf.x, linearDepthCf.y);

      F32 totalWeight = 0.0;
      Vec4 accumulated = _sampleAndWeight(depthLow, colorLow, linearAnyClamp, nearestAnyClamp, lowInvSize, uv, kTapOffsets[0], depthRef,
                                          kTapWeights[0], linearDepthCf, totalWeight);

      [unroll] for(U32 tap = 1u; tap < kTapCount; ++tap)
      {
          accumulated += _sampleAndWeight(depthLow, colorLow, linearAnyClamp, nearestAnyClamp, lowInvSize, uv, kTapOffsets[tap], depthRef,
                                          kTapWeights[tap], linearDepthCf, totalWeight);
      }

      return accumulated / totalWeight;
  }
  /// Compute the direction that can be used to sample a cube texture.
  /// @param norm The coordinates inside the face. In [-1, 1].
  /// @param faceIdx The index of the cube face.
  /// @return The normalized direction.
  Vec3 getCubemapDirection(const Vec2 norm, const U32 faceIdx)
  {
      // Odd faces look towards the negative side of their axis
      const F32 faceSign = ((faceIdx & 1u) == 1u) ? -1.0 : 1.0;

      // The axis the face looks at
      const Vec3 faceAxis = Vec3((faceIdx <= 1u) ? 1 : 0, (faceIdx & 2u) >> 1u, (faceIdx & 4u) >> 2u);
      const Vec3 zDir = faceAxis * faceSign;

      Vec3 yDir;
      if(faceIdx == 2u)
      {
          yDir = Vec3(0.0, 0.0, 1.0);
      }
      else if(faceIdx == 3u)
      {
          yDir = Vec3(0.0, 0.0, -1.0);
      }
      else
      {
          yDir = Vec3(0.0, -1.0, 0.0);
      }

      const Vec3 xDir = cross(zDir, yDir);
      return normalize(norm.x * xDir + norm.y * yDir + zDir);
  }
  // Convert 3D cubemap coordinates to 2D coordinates plus a face index. v doesn't need to be normalized.
  // @param v The direction.
  // @param[out] faceIndex The index of the face (0.0 to 5.0) that v points to. Picked using the component of v with
  //                       the largest magnitude.
  // @return The UV inside the face.
  Vec2 convertCubeUvs(const Vec3 v, out F32 faceIndex)
  {
      const Vec3 absV = abs(v);
      const Bool zIsMajor = absV.z >= absV.x && absV.z >= absV.y;
      const Bool yIsMajor = absV.y >= absV.x;

      Vec2 faceUv;
      F32 majorLength;
      if(zIsMajor)
      {
          const Bool negative = v.z < 0.0;
          faceIndex = negative ? 5.0 : 4.0;
          faceUv = Vec2(negative ? -v.x : v.x, -v.y);
          majorLength = absV.z;
      }
      else if(yIsMajor)
      {
          const Bool negative = v.y < 0.0;
          faceIndex = negative ? 3.0 : 2.0;
          faceUv = Vec2(v.x, negative ? -v.z : v.z);
          majorLength = absV.y;
      }
      else
      {
          const Bool negative = v.x < 0.0;
          faceIndex = negative ? 1.0 : 0.0;
          faceUv = Vec2(negative ? v.z : -v.z, -v.y);
          majorLength = absV.x;
      }

      // Project to the face and remap from [-1, 1] to [0, 1]
      return 0.5 / majorLength * faceUv + 0.5;
  }
  // Same as convertCubeUvs but the face index is returned as an unsigned integer.
  // @param v The direction. Doesn't need to be normalized.
  // @param[out] faceIndex The index of the face (0 to 5) that v points to.
  // @return The UV inside the face.
  Vec2 convertCubeUvsu(const Vec3 v, out U32 faceIndex)
  {
      const Vec3 absV = abs(v);

      if(absV.z >= absV.x && absV.z >= absV.y)
      {
          // Z is the major axis
          const Bool negative = v.z < 0.0;
          faceIndex = negative ? 5u : 4u;
          const Vec2 faceUv = Vec2(negative ? -v.x : v.x, -v.y);
          return 0.5 / absV.z * faceUv + 0.5;
      }

      if(absV.y >= absV.x)
      {
          // Y is the major axis
          const Bool negative = v.y < 0.0;
          faceIndex = negative ? 3u : 2u;
          const Vec2 faceUv = Vec2(v.x, negative ? -v.z : v.z);
          return 0.5 / absV.y * faceUv + 0.5;
      }

      // X is the major axis
      const Bool negative = v.x < 0.0;
      faceIndex = negative ? 1u : 0u;
      const Vec2 faceUv = Vec2(negative ? v.z : -v.z, -v.y);
      return 0.5 / absV.x * faceUv + 0.5;
  }
  // Convert a color to gray using the plain average of its 3 channels.
  template<typename T>
  vector<T, 3> grayScale(const vector<T, 3> col)
  {
      const T channelSum = col.r + col.g + col.b;
      const T average = channelSum * T(1.0 / 3.0);
      return vector<T, 3>(average, average, average);
  }
  // Change the saturation of a color by interpolating between its luminance and the color itself.
  // @param col The input color.
  // @param factor The interpolation factor. 0 gives the luminance and 1 gives col.
  template<typename T>
  vector<T, 3> saturateColor(const vector<T, 3> col, const T factor)
  {
      const vector<T, 3> kLuminanceWeights = vector<T, 3>(0.2125, 0.7154, 0.0721);
      const T luminance = dot(col, kLuminanceWeights);
      return lerp(vector<T, 3>(luminance, luminance, luminance), col, factor);
  }
  // Apply a per-channel gamma correction. Raises col to 1/gamma.
  template<typename T>
  vector<T, 3> gammaCorrection(vector<T, 3> gamma, vector<T, 3> col)
  {
      const vector<T, 3> exponent = T(1.0) / gamma;
      return pow(col, exponent);
  }
  // Read a texture and sharpen the result using the neighbouring texels.
  // @param tex The texture to read.
  // @param sampl The sampler to use.
  // @param uv The normalized texture coordinates.
  // @param sharpenFactor The strength of the effect. Can use 0.15
  // @param detailed If true use all 8 neighbours, otherwise only the 4 diagonal ones.
  // @return The sharpened color clamped to be non-negative.
  template<typename T>
  vector<T, 3> readSharpen(Texture2D<vector<T, 4> > tex, SamplerState sampl, Vec2 uv, T sharpenFactor, Bool detailed)
  {
      const vector<T, 3> center = tex.SampleLevel(sampl, uv, 0.0).rgb;

      // The diagonal neighbours are always used
      vector<T, 3> neighbours = tex.SampleLevel(sampl, uv, 0.0, IVec2(1, 1)).rgb;
      neighbours += tex.SampleLevel(sampl, uv, 0.0, IVec2(-1, -1)).rgb;
      neighbours += tex.SampleLevel(sampl, uv, 0.0, IVec2(1, -1)).rgb;
      neighbours += tex.SampleLevel(sampl, uv, 0.0, IVec2(-1, 1)).rgb;
      T neighbourCount = 4.0;

      if(detailed)
      {
          // Add the horizontal and vertical neighbours as well
          neighbours += tex.SampleLevel(sampl, uv, 0.0, IVec2(0, 1)).rgb;
          neighbours += tex.SampleLevel(sampl, uv, 0.0, IVec2(1, 0)).rgb;
          neighbours += tex.SampleLevel(sampl, uv, 0.0, IVec2(-1, 0)).rgb;
          neighbours += tex.SampleLevel(sampl, uv, 0.0, IVec2(0, -1)).rgb;
          neighbourCount = 8.0;
      }

      // Boost the center by as much as it's removed by the neighbours
      const vector<T, 3> sharpened = center * (neighbourCount * sharpenFactor + T(1.0)) - sharpenFactor * neighbours;
      return max(vector<T, 3>(0.0, 0.0, 0.0), sharpened);
  }
  // Read a texture and apply erosion: returns the per-channel minimum of the texel at uv and its 8 neighbours.
  template<typename T>
  vector<T, 3> readErosion(Texture2D<vector<T, 4> > tex, SamplerState sampl, const Vec2 uv)
  {
      vector<T, 3> minValue = tex.SampleLevel(sampl, uv, 0.0).rgb;

      // Diagonal neighbours
      minValue = min(tex.SampleLevel(sampl, uv, 0.0, IVec2(1, 1)).rgb, minValue);
      minValue = min(tex.SampleLevel(sampl, uv, 0.0, IVec2(-1, -1)).rgb, minValue);
      minValue = min(tex.SampleLevel(sampl, uv, 0.0, IVec2(1, -1)).rgb, minValue);
      minValue = min(tex.SampleLevel(sampl, uv, 0.0, IVec2(-1, 1)).rgb, minValue);

      // Horizontal and vertical neighbours
      minValue = min(tex.SampleLevel(sampl, uv, 0.0, IVec2(0, 1)).rgb, minValue);
      minValue = min(tex.SampleLevel(sampl, uv, 0.0, IVec2(1, 0)).rgb, minValue);
      minValue = min(tex.SampleLevel(sampl, uv, 0.0, IVec2(-1, 0)).rgb, minValue);
      minValue = min(tex.SampleLevel(sampl, uv, 0.0, IVec2(0, -1)).rgb, minValue);

      return minValue;
  }
  // 5 color heatmap from a factor. The gradient goes black -> blue -> green -> yellow -> red.
  // @param factor The position in the gradient. It's clamped to [0, 1] so 0 (or less) gives black and 1 (or more)
  //               gives red.
  Vec3 heatmap(const F32 factor)
  {
      // The gradient has 4 segments
      const F32 scaled = saturate(factor) * 4.0;

      // Cap the segment to the last one. Without it a factor of exactly 1.0 would start the last segment over and
      // give yellow instead of red
      const F32 segment = min(floor(scaled), 3.0);
      const F32 fractional = scaled - segment;

      if(segment < 1.0)
      {
          return lerp(Vec3(0.0, 0.0, 0.0), Vec3(0.0, 0.0, 1.0), fractional);
      }
      else if(segment < 2.0)
      {
          return lerp(Vec3(0.0, 0.0, 1.0), Vec3(0.0, 1.0, 0.0), fractional);
      }
      else if(segment < 3.0)
      {
          return lerp(Vec3(0.0, 1.0, 0.0), Vec3(1.0, 1.0, 0.0), fractional);
      }
      else
      {
          return lerp(Vec3(1.0, 1.0, 0.0), Vec3(1.0, 0.0, 0.0), fractional);
      }
  }
  // Return a color per cubemap face. The +X is red, -X dark red, +Y green, -Y dark green, +Z blue, -Z dark blue.
  // Any dir above 4 gives the -Z color.
  Vec3 colorPerCubeFace(const U32 dir)
  {
      const Vec3 kFaceColors[6] = {Vec3(1.0, 0.0, 0.0), Vec3(0.25, 0.0, 0.0), Vec3(0.0, 1.0, 0.0),
                                   Vec3(0.0, 0.25, 0.0), Vec3(0.0, 0.0, 1.0), Vec3(0.0, 0.0, 0.25)};

      // Everything that is not one of the first 5 faces falls to the last color
      return kFaceColors[min(dir, 5u)];
  }
  // Returns true if any of the channels of the color is NaN or infinite.
  Bool incorrectColor(const Vec3 c)
  {
      return any(isnan(c)) || any(isinf(c));
  }
  // Helper of cubeCoordSolidAngle. Evaluates atan2(x * y, sqrt(x^2 + y^2 + 1)).
  F32 areaElement(const F32 x, const F32 y)
  {
      const F32 numerator = x * y;
      const F32 denominator = sqrt(x * x + y * y + 1.0);
      return atan2(numerator, denominator);
  }
  // Compute the solid angle of a cubemap texel. Solid angle is the area of a sphere when projected into a cubemap.
  // It's also the delta omega (dω) in the irradiance integral and other integrals that operate in a sphere.
  // http://www.rorydriscoll.com/2012/01/15/cubemap-texel-solid-angle/
  // @param norm The coordinates of the center of the texel inside the face.
  // @param cubeFaceSize The size of the cube face in texels.
  F32 cubeCoordSolidAngle(Vec2 norm, F32 cubeFaceSize)
  {
      const F32 invFaceSize = 1.0f / cubeFaceSize;

      // The corners of the texel
      const Vec2 minCorner = norm - Vec2(invFaceSize, invFaceSize);
      const Vec2 maxCorner = norm + Vec2(invFaceSize, invFaceSize);

      const F32 a00 = areaElement(minCorner.x, minCorner.y);
      const F32 a01 = areaElement(minCorner.x, maxCorner.y);
      const F32 a10 = areaElement(maxCorner.x, minCorner.y);
      const F32 a11 = areaElement(maxCorner.x, maxCorner.y);

      return a00 - a01 - a10 + a11;
  }
  /// A convenience function to skip out of bounds invocations on post-process compute shaders.
  /// @param groupSize The size of the thread group.
  /// @param threadCount The number of threads that have actual work to do.
  /// @param svDispatchThreadId The ID of the current thread.
  /// @return True if the current thread is outside threadCount. It's always false when threadCount is a multiple of
  ///         groupSize on both dimensions.
  Bool skipOutOfBoundsInvocations(UVec2 groupSize, UVec2 threadCount, UVec2 svDispatchThreadId)
  {
      // There are spare threads only if the groups don't divide the thread count evenly
      const Bool hasPartialGroups = any((threadCount % groupSize) != UVec2(0u, 0u));
      const Bool outsideBounds = any(svDispatchThreadId >= threadCount);
      return hasPartialGroups && outsideBounds;
  }
  // Create a rotation matrix whose columns are an x axis, a y axis and the given direction as the z axis.
  // The x and y axes are built following http://jcgt.org/published/0006/01/01/
  // @param zAxis The direction. NOTE(review): presumably needs to be normalized - confirm with the callers.
  Mat3 rotationFromDirection(Vec3 zAxis)
  {
  #if 0
      const Vec3 z = zAxis;
      const Bool alignsWithXBasis = abs(z.x - 1.0) <= kEpsilonF32; // aka z == Vec3(1.0, 0.0, 0.0)
      Vec3 x = (alignsWithXBasis) ? Vec3(0.0, 0.0, 1.0) : Vec3(1.0, 0.0, 0.0);
      const Vec3 y = normalize(cross(x, z));
      x = normalize(cross(z, y));
  #else
      // http://jcgt.org/published/0006/01/01/
      const Vec3 z = zAxis;
      const F32 sign = (z.z >= 0.0) ? 1.0 : -1.0;
      const F32 a = -1.0 / (sign + z.z);
      const F32 b = z.x * z.y * a;
      // Square with a multiplication. pow() is not defined for negative bases and z.x, z.y can be negative
      const Vec3 x = Vec3(1.0 + sign * a * (z.x * z.x), sign * b, -sign * z.x);
      const Vec3 y = Vec3(b, sign + a * (z.y * z.y), -z.y);
  #endif

      Mat3 o;
      o.setColumns(x, y, z);
      return o;
  }
  414. #if ANKI_COMPUTE_SHADER && ANKI_GLSL
  // Bitfield insert: returns src with its lowest "bits" bits replaced by those of ins.
  // See getOptimalGlobalInvocationId8x8Amd
  U32 _ABfiM(U32 src, U32 ins, U32 bits)
  {
      const U32 lowMask = (1u << bits) - 1u;
      const U32 inserted = ins & lowMask;
      const U32 kept = src & (~lowMask);
      return inserted | kept;
  }
  // Bitfield extract: returns "bits" bits of src starting from bit "off".
  // See getOptimalGlobalInvocationId8x8Amd
  U32 _ABfe(U32 src, U32 off, U32 bits)
  {
      const U32 shifted = src >> off;
      return shifted & ((1u << bits) - 1u);
  }
  // Remap a linear index to a 2D coordinate by shuffling its bits.
  // See getOptimalGlobalInvocationId8x8Amd
  UVec2 _ARmpRed8x8(U32 a)
  {
      const U32 x = _ABfiM(_ABfe(a, 2u, 3u), a, 1u);
      const U32 y = _ABfiM(_ABfe(a, 3u, 3u), _ABfe(a, 1u, 2u), 2u);
      return UVec2(x, y);
  }
  // Compute a global invocation ID for 8x8 workgroups where the invocations inside the workgroup are remapped
  // using _ARmpRed8x8.
  // https://github.com/GPUOpen-Effects/FidelityFX-CAS/blob/master/ffx-cas/ffx_a.h
  UVec2 getOptimalGlobalInvocationId8x8Amd()
  {
      const UVec2 workgroupOrigin = gl_WorkGroupID.xy * UVec2(8u);
      const UVec2 remappedLocalId = _ARmpRed8x8(gl_LocalInvocationIndex);
      return workgroupOrigin + remappedLocalId;
  }
  // Compute a global invocation ID for 8x8 workgroups where the workgroups are swizzled so that they walk the grid
  // in vertical tiles that are up to 8 workgroups wide.
  // https://github.com/LouisBavoil/ThreadGroupIDSwizzling/blob/master/ThreadGroupTilingX.hlsl
  UVec2 getOptimalGlobalInvocationId8x8Nvidia()
  {
      const U32 kMaxTileWidth = 8u;
      const UVec2 kWorkgroupSize = UVec2(8u);

      const U32 groupCountX = gl_NumWorkGroups.x;
      const U32 groupCountY = gl_NumWorkGroups.y;

      // A perfect tile is kMaxTileWidth workgroups wide and covers the whole height
      const U32 groupsPerPerfectTile = kMaxTileWidth * groupCountY;
      const U32 perfectTileCount = groupCountX / kMaxTileWidth;
      const U32 groupsInAllPerfectTiles = perfectTileCount * kMaxTileWidth * groupCountY;

      const U32 flatGroupId = groupCountX * gl_WorkGroupID.y + gl_WorkGroupID.x;
      const U32 tileId = flatGroupId / groupsPerPerfectTile;
      const U32 idInsideTile = flatGroupId % groupsPerPerfectTile;

      // The last tile is narrower when the workgroup count is not a multiple of the tile width
      const Bool inLastPartialTile = groupsInAllPerfectTiles <= flatGroupId;
      U32 tileWidth = kMaxTileWidth;
      if(inLastPartialTile)
      {
          tileWidth = groupCountX % kMaxTileWidth;
      }

      const U32 tileLocalY = idInsideTile / tileWidth;
      const U32 tileLocalX = idInsideTile % tileWidth;

      const U32 swizzledFlatGroupId = tileId * kMaxTileWidth + tileLocalY * groupCountX + tileLocalX;
      const UVec2 swizzledGroupId = UVec2(swizzledFlatGroupId % groupCountX, swizzledFlatGroupId / groupCountX);

      return UVec2(kWorkgroupSize.x * swizzledGroupId.x + gl_LocalInvocationID.x, kWorkgroupSize.y * swizzledGroupId.y + gl_LocalInvocationID.y);
  }
  472. #endif
  // Gaussian distribution function.
  // @param s The standard deviation.
  // @param x The distance from the mean.
  template<typename T>
  T gaussianWeight(T s, T x)
  {
      const T normalization = T(1.0) / (s * sqrt(T(2.0) * kPi));
      const T falloff = exp((x * x) / (T(-2.0) * s * s));
      return normalization * falloff;
  }
  // Animate a blue noise value over time by offsetting it with a multiple of the golden ratio conjugate.
  // The frame index wraps around every 64 frames.
  // https://www.shadertoy.com/view/WsfBDf
  template<typename T>
  vector<T, 3> animateBlueNoise(vector<T, 3> inputBlueNoise, U32 frameIdx)
  {
      const T kGoldenRatioConjugate = 0.61803398875;
      const T wrappedFrame = T(frameIdx % 64u);
      return frac(inputBlueNoise + wrappedFrame * kGoldenRatioConjugate);
  }
  #if ANKI_FRAGMENT_SHADER
  /// Compute the mip level from the screen space derivatives of the UVs. The result is never negative.
  /// https://bgolus.medium.com/distinctive-derivative-differences-cce38d36797b
  /// @param normalizedUvs It's uv*textureResolution
  F32 computeMipLevel(Vec2 normalizedUvs)
  {
      const Vec2 uvDx = ddx_coarse(normalizedUvs);
      const Vec2 uvDy = ddy_coarse(normalizedUvs);

      // Squared length of the longest derivative. The 0.5 bellow stands for the missing sqrt
      const F32 maxSquaredLength = max(dot(uvDx, uvDx), dot(uvDy, uvDy));
      const F32 mip = 0.5 * log2(maxSquaredLength);
      return max(0.0, mip);
  }
  #endif
  #if ANKI_SUPPORTS_64BIT_TYPES
  /// The regular firstbitlow in DXC has some issues since it invokes a builtin that is only supposed to be used with
  /// 32bit input. This is an alternative implementation that works on the two 32bit halves. It expects that the
  /// input is not zero.
  I32 firstbitlow2(U64 v)
  {
      const U32 lowHalf = (U32)v;
      const U32 highHalf = (U32)(v >> 32ul);

      const I32 lowHalfBit = firstbitlow(lowHalf);
      const I32 highHalfBit = firstbitlow(highHalf);

      if(lowHalfBit >= 0)
      {
          return lowHalfBit;
      }

      // Nothing set in the low half, the bit lives in the high one
      return highHalfBit + 32;
  }
  #endif
  /// The 32bit counterpart of the 64bit firstbitlow2. Simply forwards to the builtin firstbitlow.
  I32 firstbitlow2(U32 v)
  {
      const I32 bit = firstbitlow(v);
      return bit;
  }
  /// Encode the shading rate to be stored in an SRI. The rates should be power of two, can't be zero and can't
  /// exceed 4. So the possible values are 1,2,4
  /// @param rateXY The shading rate of the 2 dimensions.
  /// @return The Y rate ends up in the 2 lowest bits and the X rate in the 2 bits above them.
  U32 encodeVrsRate(UVec2 rateXY)
  {
      const U32 encodedY = rateXY.y >> 1u;
      const U32 encodedX = (rateXY.x << 1u) & 12u;
      return encodedY | encodedX;
  }
  // Return a debug color for a shading rate. It goes from red for 1x1 to green for 4x4. Unknown rates give black.
  Vec3 visualizeVrsRate(UVec2 rate)
  {
      const U32 x = rate.x;
      const U32 y = rate.y;

      Vec3 color = Vec3(0.0, 0.0, 0.0);
      if(x == 1u && y == 1u)
      {
          color = Vec3(1.0, 0.0, 0.0);
      }
      else if((x == 2u && y == 1u) || (x == 1u && y == 2u))
      {
          color = Vec3(1.0, 0.5, 0.0);
      }
      else if((x == 2u && y == 2u) || (x == 4u && y == 1u) || (x == 1u && y == 4u))
      {
          color = Vec3(1.0, 1.0, 0.0);
      }
      else if((x == 4u && y == 2u) || (x == 2u && y == 4u))
      {
          color = Vec3(0.65, 1.0, 0.0);
      }
      else if(x == 4u && y == 4u)
      {
          color = Vec3(0.0, 1.0, 0.0);
      }

      return color;
  }
  /// Decodes a number produced by encodeVrsRate().
  /// @param texel The encoded value. Bits [2,3] hold the X rate and bits [0,1] the Y rate.
  /// @return The shading rates.
  UVec2 decodeVrsRate(U32 texel)
  {
      const U32 shiftX = (texel >> 2u) & 3u;
      const U32 shiftY = texel & 3u;
      return UVec2(1u << shiftX, 1u << shiftY);
  }
  /// 3D coordinates to equirectangular 2D coordinates.
  /// @param v The direction. NOTE(review): asin(v.y) suggests it's expected to be normalized - confirm.
  Vec2 equirectangularMapping(Vec3 v)
  {
      // Roughly 1/(2*pi) and 1/pi
      const Vec2 kAngleScale = Vec2(0.1591, 0.3183);

      Vec2 uv = Vec2(atan2(v.z, v.x), asin(v.y));
      uv = uv * kAngleScale;
      uv = uv + 0.5;
      return uv;
  }
  // Convert a linear color to sRGB. Picks the min of the linear segment (x * 12.92) and the power segment
  // (x^(1/2.4) * 1.055 - 0.055) of the transfer function.
  // @param linearRgb The linear color. It's clamped to a small positive number before the conversion.
  template<typename T>
  vector<T, 3> linearToSRgb(vector<T, 3> linearRgb)
  {
      constexpr T a = 6.10352e-5;
      constexpr T b = 1.0 / 2.4;
      linearRgb = max(vector<T, 3>(a, a, a), linearRgb);

      // Keep everything in vector<T, 3>. A hardcoded Vec3 exponent would drag the math to F32 for the other types
      const vector<T, 3> exponent = vector<T, 3>(b, b, b);
      const vector<T, 3> linearSegment = linearRgb * T(12.92);
      const vector<T, 3> powerSegment = pow(max(linearRgb, T(0.00313067)), exponent) * T(1.055) - T(0.055);
      return min(linearSegment, powerSegment);
  }
  // Convert an sRGB color to linear. Channels bellow 0.04045 use the linear segment (x / 12.92) and the rest the
  // power segment (((x + 0.055) / 1.055)^2.4).
  template<typename T>
  vector<T, 3> sRgbToLinear(vector<T, 3> sRgb)
  {
      const vector<T, 3> kThreshold = vector<T, 3>(0.04045, 0.04045, 0.04045);
      const vector<T, 3> kExponent = vector<T, 3>(2.4, 2.4, 2.4);

      const bool3 useLinearSegment = sRgb < kThreshold;
      const vector<T, 3> powerSegment = pow((sRgb + T(0.055)) / T(1.055), kExponent);
      const vector<T, 3> linearSegment = sRgb / T(12.92);
      return lerp(powerSegment, linearSegment, useLinearSegment);
  }
  // Apply film grain to a color. The grain is a multiplier around 1.0 that is derived from the UVs and the time.
  // @param color The color to modulate.
  // @param uv The UVs that seed the noise.
  // @param strength Scales the noise.
  // @param time Animates the noise.
  template<typename T>
  vector<T, 3> filmGrain(vector<T, 3> color, Vec2 uv, T strength, F32 time)
  {
      const T seed = (uv.x + 4.0) * (uv.y + 4.0) * time;

      const T termA = fmod(seed, T(13.0)) + T(1.0);
      const T termB = fmod(seed, T(123.0)) + T(1.0);
      const T noise = fmod(termA * termB, T(0.01)) - T(0.005);

      const T grain = T(1.0) - noise * strength;
      return color * grain;
  }
  #if ANKI_COMPUTE_SHADER || ANKI_WORK_GRAPH_SHADER
  /// HLSL doesn't have SubgroupID so compute it. It's a macro because we can't have functions that InterlockedAdd on local variables (the compiler
  /// can't see it's groupshared).
  /// The first lane of every wave atomically increments tmpGroupsharedU32Var and the value it gets back becomes the index of the wave. The order
  /// that the waves get their index depends on which one reaches the atomic first.
  /// Contains 2 GroupMemoryBarrierWithGroupSync() so it needs to be invoked by all the threads of the threadgroup.
  /// @param svGroupIndex Self explanatory.
  /// @param tmpGroupsharedU32Var A U32 groupshared variable that will help with the calculation.
  /// @param waveIndexInsideThreadgroup The SubgroupID.
  /// @param wavesPerThreadGroup Also calculate that in case some GPUs manage to mess this up.
  #    define ANKI_COMPUTE_WAVE_INDEX_INSIDE_THREADGROUP(svGroupIndex, tmpGroupsharedU32Var, waveIndexInsideThreadgroup, wavesPerThreadGroup) \
          do \
          { \
              if((svGroupIndex) == 0) \
              { \
                  tmpGroupsharedU32Var = 0; \
              } \
              GroupMemoryBarrierWithGroupSync(); \
              waveIndexInsideThreadgroup = 0; \
              if(WaveIsFirstLane()) \
              { \
                  InterlockedAdd(tmpGroupsharedU32Var, 1, waveIndexInsideThreadgroup); \
              } \
              GroupMemoryBarrierWithGroupSync(); \
              wavesPerThreadGroup = tmpGroupsharedU32Var; \
              waveIndexInsideThreadgroup = WaveReadLaneFirst(waveIndexInsideThreadgroup); \
          } while(false)
  #endif
  /// Perturb normal, see http://www.thetenthplanet.de/archives/1180
  /// Does normal mapping without precomputed tangents by building the tangent frame out of screen space derivatives.
  /// Since it uses ddx/ddy it's meant for the fragment shader. It assumes that green is up.
  /// @param tangentNormal The normal in tangent space.
  /// @param viewDir The view direction. Needs to be in the same space as geometricNormal.
  /// @param uv The UVs that were used to read the normal.
  /// @param geometricNormal The normal of the surface. Needs to be in the same space as viewDir.
  /// @return The normalized perturbed normal in the space of geometricNormal.
  RVec3 perturbNormal(RVec3 tangentNormal, Vec3 viewDir, Vec2 uv, Vec3 geometricNormal)
  {
      // Green is up
      const RVec3 flippedNormal = RVec3(tangentNormal.x, -tangentNormal.y, tangentNormal.z);

      // Get edge vectors of the pixel triangle
      const Vec3 edgeX = ddx(viewDir);
      const Vec3 edgeY = ddy(viewDir);
      const Vec2 uvEdgeX = ddx(uv);
      const Vec2 uvEdgeY = ddy(uv);

      // Solve the linear system
      const Vec3 edgeYPerp = cross(edgeY, geometricNormal);
      const Vec3 edgeXPerp = cross(geometricNormal, edgeX);
      const Vec3 tangent = edgeYPerp * uvEdgeX.x + edgeXPerp * uvEdgeY.x;
      const Vec3 bitangent = edgeYPerp * uvEdgeX.y + edgeXPerp * uvEdgeY.y;

      // Construct a scale-invariant frame
      const F32 invMaxLength = rsqrt(max(dot(tangent, tangent), dot(bitangent, bitangent)));
      RMat3 tbn;
      tbn.setColumns(tangent * invMaxLength, bitangent * invMaxLength, geometricNormal);

      return normalize(mul(tbn, flippedNormal));
  }
  /// Project a sphere into NDC and compute its 2D bounding box. The sphere is in view space and it should be in front
  /// of the near plane (-sphereCenter.z > sphereRadius + znear)
  /// @param sphereCenter The center of the sphere in view space. Only the absolute value of its z is used.
  /// @param sphereRadius The radius of the sphere.
  /// @param P00 projection matrix's [0,0]
  /// @param P11 projection matrix's [1,1]
  /// @param[out] aabbMin The min corner of the bounding box.
  /// @param[out] aabbMax The max corner of the bounding box.
  void projectSphereView(Vec3 sphereCenter, F32 sphereRadius, F32 P00, F32 P11, out Vec2 aabbMin, out Vec2 aabbMax)
  {
      const Vec3 c = Vec3(sphereCenter.x, sphereCenter.y, abs(sphereCenter.z));
      const Vec3 cr = c * sphereRadius;
      const F32 czr2 = c.z * c.z - sphereRadius * sphereRadius;

      // Horizontal extents
      const F32 vx = sqrt(c.x * c.x + czr2);
      const F32 minX = (vx * c.x - cr.z) / (vx * c.z + cr.x);
      const F32 maxX = (vx * c.x + cr.z) / (vx * c.z - cr.x);

      // Vertical extents
      const F32 vy = sqrt(c.y * c.y + czr2);
      const F32 minY = (vy * c.y - cr.z) / (vy * c.z + cr.y);
      const F32 maxY = (vy * c.y + cr.z) / (vy * c.z - cr.y);

      aabbMin = Vec2(minX * P00, minY * P11);
      aabbMax = Vec2(maxX * P00, maxY * P11);
  }
  // Interpolate 3 values using barycentric coordinates.
  // @param a The value that is weighted by barycentrics.x
  // @param b The value that is weighted by barycentrics.y
  // @param c The value that is weighted by barycentrics.z
  template<typename T>
  T barycentricInterpolation(T a, T b, T c, Vec3 barycentrics)
  {
      const F32 weightA = barycentrics.x;
      const F32 weightB = barycentrics.y;
      const F32 weightC = barycentrics.z;
      return a * weightA + b * weightB + c * weightC;
  }
  // Convert a flat index into the 3 indices of a 3D array of size [sizeA][sizeB][sizeC]. The last dimension (c) is
  // the fastest changing one.
  // @param flatIdx The flat index. Should be less than sizeA * sizeB * sizeC.
  // @param[out] a The index of the 1st dimension.
  // @param[out] b The index of the 2nd dimension.
  // @param[out] c The index of the 3rd dimension.
  void unflatten3dArrayIndex(const U32 sizeA, const U32 sizeB, const U32 sizeC, const U32 flatIdx, out U32 a, out U32 b, out U32 c)
  {
      ANKI_ASSERT(flatIdx < (sizeA * sizeB * sizeC));

      const U32 sliceSize = sizeB * sizeC;
      c = flatIdx % sizeC;
      b = (flatIdx / sizeC) % sizeB;
      a = (flatIdx / sliceSize) % sizeA;
  }