Functions.hlsl 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924
  1. // Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
  2. // All rights reserved.
  3. // Code licensed under the BSD License.
  4. // http://www.anki3d.org/LICENSE
  5. #pragma once
  6. #include <AnKi/Shaders/Common.hlsl>
  /// Convert a non-linear depth value to linear depth.
  /// @param depth The depth value to linearize.
  /// @param zNear Presumably the near plane distance - confirm with the callers.
  /// @param zFar Presumably the far plane distance - confirm with the callers.
  /// @return zNear / ((zNear - zFar) + zFar / depth)
  F32 linearizeDepth(F32 depth, F32 zNear, F32 zFar)
  {
      const F32 denominator = (zNear - zFar) + zFar / depth;
      return zNear / denominator;
  }
  /// Convert 4 non-linear depth values to linear depth at once. Same formula as the scalar overload, applied per component.
  /// @param depth The 4 depth values to linearize.
  /// @param zNear Presumably the near plane distance - confirm with the callers.
  /// @param zFar Presumably the far plane distance - confirm with the callers.
  Vec4 linearizeDepth(Vec4 depth, F32 zNear, F32 zFar)
  {
      const Vec4 denominator = (zNear - zFar) + zFar / depth;
      return zNear / denominator;
  }
  /// Cheaper variant of linearizeDepth that takes precomputed coefficients.
  /// @param depth The depth value to linearize.
  /// @param a Precomputed as (n-f)/n.
  /// @param b Precomputed as f/n.
  /// @return 1 / (a + b / depth)
  F32 linearizeDepthOptimal(F32 depth, F32 a, F32 b)
  {
      const F32 denominator = a + b / depth;
      return 1.0 / denominator;
  }
  /// Cheaper variant of linearizeDepth that takes precomputed coefficients. Works on 4 depth values at once.
  /// @param depths The 4 depth values to linearize.
  /// @param a Precomputed as (n-f)/n.
  /// @param b Precomputed as f/n.
  /// @return 1 / (a + b / depths), per component.
  Vec4 linearizeDepthOptimal(Vec4 depths, F32 a, F32 b)
  {
      const Vec4 denominator = a + b / depths;
      return 1.0 / denominator;
  }
  /// Project a vector using only the non-zero elements of a perspective matrix. Jitter is not taken into account.
  /// @param m00 Projection matrix element [0,0]. Scales X.
  /// @param m11 Projection matrix element [1,1]. Scales Y.
  /// @param m22 Projection matrix element [2,2]. Multiplied with Z.
  /// @param m23 Projection matrix element [2,3]. Multiplied with W.
  /// @param vec The vector to project.
  /// @return The projected vector. Its W is -vec.z.
  Vec4 cheapPerspectiveProjection(F32 m00, F32 m11, F32 m22, F32 m23, Vec4 vec)
  {
      return Vec4(vec.x * m00, vec.y * m11, vec.z * m22 + vec.w * m23, -vec.z);
  }
  /// Project a vector using only the non-zero elements of a perspective matrix. Jitter is not taken into account.
  /// @param projMat_00_11_22_23 The [0,0], [1,1], [2,2] and [2,3] elements of the projection matrix packed in XYZW.
  /// @param vec The vector to project.
  Vec4 cheapPerspectiveProjection(Vec4 projMat_00_11_22_23, Vec4 vec)
  {
      const F32 m00 = projMat_00_11_22_23.x;
      const F32 m11 = projMat_00_11_22_23.y;
      const F32 m22 = projMat_00_11_22_23.z;
      const F32 m23 = projMat_00_11_22_23.w;
      return cheapPerspectiveProjection(m00, m11, m22, m23, vec);
  }
  /// Unproject to view space. Jitter is not considered.
  /// @param unprojParams See Mat4::extractPerspectiveUnprojectionParams in C++.
  /// @param ndc The XY in NDC.
  /// @param depth The depth value.
  /// @return The unprojected position.
  Vec3 cheapPerspectiveUnprojection(Vec4 unprojParams, Vec2 ndc, F32 depth)
  {
      Vec3 viewPos;
      viewPos.z = unprojParams.z / (unprojParams.w + depth);
      viewPos.xy = ndc * unprojParams.xy * viewPos.z; // XY depends on the Z computed above
      return viewPos;
  }
  49. #if ANKI_PIXEL_SHADER
  /// Catmull-Rom filtering that uses 4 texture samples. Adapted from shadertoy.com/view/4tyGDD
  /// @param tex The texture to filter.
  /// @param sampl The sampler used by all 4 taps.
  /// @param uv Normalized texture coordinates.
  /// @param texSize The size of the texture in texels.
  Vec4 textureCatmullRom4Samples(Texture2D tex, SamplerState sampl, Vec2 uv, Vec2 texSize)
  {
      const Vec2 halff = 2.0 * frac(0.5 * uv * texSize - 0.25) - 1.0;
      const Vec2 f = frac(halff);

      const Vec2 sum0 = (2.0 * f - 3.5) * f + 0.5;
      const Vec2 sum1 = (2.0 * f - 2.5) * f - 0.5;

      // Weights of the 2 columns (X components) and the 2 rows (Y components)
      const Vec2 weightA = f * sum0 + 1.0;
      const Vec2 weightB = f * sum1;

      // The 2 sample positions per axis. They use the weights before the sign flip that follows
      const Vec2 posA = (((-2.0 * f + 3.0) * f + 0.5) * f - 1.5) * f / (weightA * texSize) + uv;
      const Vec2 posB = (((-2.0 * f + 5.0) * f - 2.5) * f - 0.5) / (sum1 * texSize) + uv;

      // Only the column weights change sign
      const F32 flip = (halff.x * halff.y > 0.0) ? 1.0 : -1.0;
      const F32 columnWeightA = weightA.x * flip;
      const F32 columnWeightB = weightB.x * flip;

      const Vec4 row0 = tex.Sample(sampl, posA) * columnWeightA + tex.Sample(sampl, Vec2(posB.x, posA.y)) * columnWeightB;
      const Vec4 row1 = tex.Sample(sampl, Vec2(posA.x, posB.y)) * columnWeightA + tex.Sample(sampl, posB) * columnWeightB;

      return row0 * weightA.y + row1 * weightB.y;
  }
  64. #endif
  /// Bicubic filtering (cubic B-spline weights) done with 4 texture samples. Adapted from shadertoy.com/view/4df3Dn
  /// @param tex The texture to filter.
  /// @param sampl The sampler used by all 4 taps. Presumably it needs to be a linear one - confirm with the callers.
  /// @param uv Normalized texture coordinates.
  /// @param lod The mip level to sample. It's clamped to the last mip when computing the texture size.
  template<typename TVec>
  TVec textureBicubic(Texture2D<TVec> tex, SamplerState sampl, Vec2 uv, F32 lod)
  {
  // Cubic B-spline basis functions
  #define w0(a) ((1.0 / 6.0) * ((a) * ((a) * (-(a) + 3.0) - 3.0) + 1.0))
  #define w1(a) ((1.0 / 6.0) * ((a) * (a) * (3.0 * (a)-6.0) + 4.0))
  #define w2(a) ((1.0 / 6.0) * ((a) * ((a) * (-3.0 * (a) + 3.0) + 3.0) + 1.0))
  #define w3(a) ((1.0 / 6.0) * ((a) * (a) * (a)))
  // Combined weights of each pair of basis functions
  #define g0(a) (w0(a) + w1(a))
  #define g1(a) (w2(a) + w3(a))
  // Texel offsets of the 2 taps per axis
  #define h0(a) (-1.0 + w1(a) / (w0(a) + w1(a)))
  #define h1(a) (1.0 + w3(a) / (w2(a) + w3(a)))
  #define texSample(uv) tex.SampleLevel(sampl, uv, lod)

      UVec2 texSize;
      U32 mipCount;
      tex.GetDimensions(0, texSize.x, texSize.y, mipCount);

      // Compute the size of the mip that will be sampled. A mip never gets smaller than one texel so clamp, otherwise non-square
      // textures end up with a zero dimension and the divisions that follow break
      const U32 lodi = min(U32(lod), mipCount - 1u);
      texSize = max(texSize >> lodi, UVec2(1u, 1u));

      uv = uv * texSize + 0.5;
      const Vec2 iuv = floor(uv);
      const Vec2 fuv = frac(uv);

      const F32 g0x = g0(fuv.x);
      const F32 g1x = g1(fuv.x);
      const F32 g0y = g0(fuv.y);
      const F32 g1y = g1(fuv.y);
      const F32 h0x = h0(fuv.x);
      const F32 h1x = h1(fuv.x);
      const F32 h0y = h0(fuv.y);
      const F32 h1y = h1(fuv.y);

      // The 4 tap positions in normalized coordinates
      const Vec2 p0 = (Vec2(iuv.x + h0x, iuv.y + h0y) - 0.5) / texSize;
      const Vec2 p1 = (Vec2(iuv.x + h1x, iuv.y + h0y) - 0.5) / texSize;
      const Vec2 p2 = (Vec2(iuv.x + h0x, iuv.y + h1y) - 0.5) / texSize;
      const Vec2 p3 = (Vec2(iuv.x + h1x, iuv.y + h1y) - 0.5) / texSize;

      return g0y * (g0x * texSample(p0) + g1x * texSample(p1)) + g1y * (g0x * texSample(p2) + g1x * texSample(p3));

  // Don't leak any of the helper macros to whoever includes this file
  #undef w0
  #undef w1
  #undef w2
  #undef w3
  #undef g0
  #undef g1
  #undef h0
  #undef h1
  #undef texSample
  }
  /// Hash-style pseudo random number from a 2D seed.
  /// @param n The seed.
  /// @return 0.5 + 0.5 * frac(...), so the result lies in [0.5, 1.0).
  F32 rand(Vec2 n)
  {
      const F32 phase = dot(n, Vec2(12.9898, 78.233));
      const F32 noise = frac(sin(phase) * 43758.5453);
      return 0.5 + 0.5 * noise;
  }
  /// Depth-aware upscale of a texture. Compares the linear depth read from depthFull against the 4 gathered depths of depthHalf. If all 4
  /// are within depthThreshold it returns a bilinear sample of colorTex, otherwise it returns the gathered texel of colorTex whose depth is
  /// the closest.
  /// @param uv The UV to sample all textures with.
  /// @param depthFull Depth texture. Only red is read. Going by the name it's the full resolution one.
  /// @param depthHalf Depth texture. Only red is read. Going by the name it's the half resolution one.
  /// @param colorTex The texture to upscale.
  /// @param linearAnyClampSampler Used for everything. It's not important for the depth reads.
  /// @param linearDepthCf The a and b coefficients of linearizeDepthOptimal.
  /// @param depthThreshold Max linear depth difference that still counts as "no discontinuity".
  Vec4 nearestDepthUpscale(Vec2 uv, Texture2D<Vec4> depthFull, Texture2D<Vec4> depthHalf, Texture2D<Vec4> colorTex, SamplerState linearAnyClampSampler,
                           Vec2 linearDepthCf, F32 depthThreshold)
  {
      // Reference depth. Sampler not important
      const F32 rawFullDepth = depthFull.SampleLevel(linearAnyClampSampler, uv, 0.0).r;
      const F32 refDepth = linearizeDepthOptimal(rawFullDepth, linearDepthCf.x, linearDepthCf.y);

      // The 4 neighbouring depths. Sampler not important
      const Vec4 rawHalfDepths = depthHalf.GatherRed(linearAnyClampSampler, uv);
      const Vec4 halfDepths = linearizeDepthOptimal(rawHalfDepths, linearDepthCf.x, linearDepthCf.y);

      const Vec4 diffs = abs(Vec4(refDepth, refDepth, refDepth, refDepth) - halfDepths);

      if(all(diffs < Vec4(depthThreshold, depthThreshold, depthThreshold, depthThreshold)))
      {
          // No major discontinuities, sample with bilinear
          return colorTex.SampleLevel(linearAnyClampSampler, uv, 0.0);
      }

      // Some discontinuities, pick the texel with the closest depth. On ties the first one wins
      U32 closest = 0u;
      [unroll] for(U32 i = 1u; i < 4u; ++i)
      {
          if(diffs[i] < diffs[closest])
          {
              closest = i;
          }
      }

      const Vec4 reds = colorTex.GatherRed(linearAnyClampSampler, uv);
      const Vec4 greens = colorTex.GatherGreen(linearAnyClampSampler, uv);
      const Vec4 blues = colorTex.GatherBlue(linearAnyClampSampler, uv);
      const Vec4 alphas = colorTex.GatherAlpha(linearAnyClampSampler, uv);

      return Vec4(reds[closest], greens[closest], blues[closest], alphas[closest]);
  }
  /// Helper of bilateralUpsample. Computes a weight that gets smaller the more the linear depth at uv differs from the reference.
  /// @param depthLow The depth texture to read. Only red is used.
  /// @param nearestAnyClamp The sampler for depthLow.
  /// @param uv Where to read depthLow.
  /// @param ref The reference linear depth.
  /// @param linearDepthCf The a and b coefficients of linearizeDepthOptimal.
  /// @return 1 / (kEpsilonF32 + abs(ref - linearDepth))
  F32 _calcDepthWeight(Texture2D depthLow, SamplerState nearestAnyClamp, Vec2 uv, F32 ref, Vec2 linearDepthCf)
  {
      const F32 rawDepth = depthLow.SampleLevel(nearestAnyClamp, uv, 0.0).r;
      const F32 depthDiff = abs(ref - linearizeDepthOptimal(rawDepth, linearDepthCf.x, linearDepthCf.y));

      // The epsilon protects from a division by zero when the depths match
      return 1.0 / (kEpsilonF32 + depthDiff);
  }
  /// Helper of bilateralUpsample. Reads a single tap and weights it.
  /// @param lowInvSize Multiplied with the offset to get the UV offset of the tap.
  /// @param uv The UV of the center.
  /// @param offset The offset of the tap. It's scaled by lowInvSize.
  /// @param ref The reference linear depth.
  /// @param weight The spatial weight of the tap.
  /// @param linearDepthCf The a and b coefficients of linearizeDepthOptimal.
  /// @param[in,out] normalize Accumulates the total weight (weight * depth weight) of the tap.
  /// @return The color of the tap multiplied by its depth weight and by weight.
  Vec4 _sampleAndWeight(Texture2D depthLow, Texture2D colorLow, SamplerState linearAnyClamp, SamplerState nearestAnyClamp, const Vec2 lowInvSize,
                        Vec2 uv, const Vec2 offset, const F32 ref, const F32 weight, const Vec2 linearDepthCf, inout F32 normalize)
  {
      const Vec2 tapUv = uv + offset * lowInvSize;
      const F32 depthWeight = _calcDepthWeight(depthLow, nearestAnyClamp, tapUv, ref, linearDepthCf);

      normalize += weight * depthWeight;

      return colorLow.SampleLevel(linearAnyClamp, tapUv, 0.0) * depthWeight * weight;
  }
  /// Depth-aware upsample. Reads a 3x3 neighbourhood of colorLow around uv and weights every tap by a spatial weight (0.25 for the center,
  /// 0.125 for the 4 direct neighbours, 0.0625 for the 4 diagonals) and by how close its depth is to the depth of depthHigh at uv.
  /// @param depthHigh Depth texture that provides the reference depth. Only red is read.
  /// @param depthLow Depth texture that matches colorLow. Only red is read.
  /// @param colorLow The texture to upsample.
  /// @param linearAnyClamp Sampler for colorLow.
  /// @param nearestAnyClamp Sampler for the depth textures.
  /// @param lowInvSize Converts tap offsets to UV offsets. Presumably 1/size of the low textures - confirm with the callers.
  /// @param uv The UV of the center.
  /// @param linearDepthCf The a and b coefficients of linearizeDepthOptimal.
  /// @return The weighted sum of the taps divided by the sum of the weights.
  Vec4 bilateralUpsample(Texture2D depthHigh, Texture2D depthLow, Texture2D colorLow, SamplerState linearAnyClamp, SamplerState nearestAnyClamp,
                         const Vec2 lowInvSize, const Vec2 uv, const Vec2 linearDepthCf)
  {
      const U32 kNeighbourCount = 8u;
      // The direct neighbours first and then the diagonals
      const Vec2 kNeighbourOffsets[8] = {Vec2(-1.0, 0.0), Vec2(0.0, -1.0), Vec2(1.0, 0.0),  Vec2(0.0, 1.0),
                                         Vec2(1.0, 1.0),  Vec2(1.0, -1.0), Vec2(-1.0, 1.0), Vec2(-1.0, -1.0)};
      const F32 kNeighbourWeights[8] = {0.125, 0.125, 0.125, 0.125, 0.0625, 0.0625, 0.0625, 0.0625};
      const F32 kCenterWeight = 0.25;

      const F32 rawDepthRef = depthHigh.SampleLevel(nearestAnyClamp, uv, 0.0).r;
      const F32 depthRef = linearizeDepthOptimal(rawDepthRef, linearDepthCf.x, linearDepthCf.y);

      F32 normalize = 0.0;

      // Center tap
      Vec4 sum = _sampleAndWeight(depthLow, colorLow, linearAnyClamp, nearestAnyClamp, lowInvSize, uv, Vec2(0.0, 0.0), depthRef, kCenterWeight,
                                  linearDepthCf, normalize);

      // The rest of the taps
      [unroll] for(U32 i = 0u; i < kNeighbourCount; ++i)
      {
          sum += _sampleAndWeight(depthLow, colorLow, linearAnyClamp, nearestAnyClamp, lowInvSize, uv, kNeighbourOffsets[i], depthRef,
                                  kNeighbourWeights[i], linearDepthCf, normalize);
      }

      return sum / normalize;
  }
  /// Compute the direction that can be used to sample a cube texture given a face and a UV inside that face. For the center of each face
  /// (uv=0.5) it returns:
  /// faceIdx 0: {1, 0, 0}
  /// faceIdx 1: {-1, 0, 0}
  /// faceIdx 2: {0, 1, 0}
  /// faceIdx 3: {0, -1, 0}
  /// faceIdx 4: {0, 0, 1}
  /// faceIdx 5: {0, 0, -1}
  /// @param uv The UV inside the face.
  /// @param faceIdx The index of the face.
  /// @return The normalized direction.
  Vec3 getCubemapDirection(const Vec2 uv, const U32 faceIdx)
  {
      const Vec2 norm = uv * 2.0 - 1.0;

      // Odd faces look towards the negative side of their axis
      const F32 faceSign = ((faceIdx & 1u) != 0u) ? -1.0 : 1.0;

      // The axis the face looks at. Bit 1 of faceIdx selects Y and bit 2 selects Z
      Vec3 zDir;
      zDir.x = (faceIdx <= 1u) ? 1.0 : 0.0;
      zDir.y = F32((faceIdx >> 1u) & 1u);
      zDir.z = F32((faceIdx >> 2u) & 1u);
      zDir *= faceSign;

      Vec3 yDir;
      if(faceIdx == 2u)
      {
          yDir = Vec3(0.0, 0.0, 1.0);
      }
      else if(faceIdx == 3u)
      {
          yDir = Vec3(0.0, 0.0, -1.0);
      }
      else
      {
          yDir = Vec3(0.0, -1.0, 0.0);
      }

      const Vec3 xDir = cross(zDir, yDir);

      return normalize(norm.x * xDir + norm.y * yDir + zDir);
  }
  /// Convert a 3D cubemap direction to 2D coordinates plus a face index. It's the opposite of getCubemapDirection.
  /// This is the exact same thing AMD is doing (v_cubeid and co) with a small difference. AMD for some reason adds 1.5 to the final result
  /// instead of 0.5.
  /// @param vec The direction. Doesn't need to be normalized.
  /// @param[out] faceIndex The face vec points to: 0 is +X, 1 is -X, 2 is +Y, 3 is -Y, 4 is +Z and 5 is -Z.
  /// @return The UV inside the face.
  template<typename T>
  Vec2 convertCubeUvs(const Vec3 vec, out T faceIndex)
  {
      const Vec3 absVec = abs(vec);

      Vec2 st;
      F32 major;

      if(absVec.z >= absVec.x && absVec.z >= absVec.y)
      {
          // Z is the major axis
          const Bool negative = vec.z < 0.0f;
          major = absVec.z;
          st = Vec2((negative) ? -vec.x : vec.x, -vec.y);
          faceIndex = (T)((negative) ? 5 : 4);
      }
      else if(absVec.y >= absVec.x)
      {
          // Y is the major axis
          const Bool negative = vec.y < 0.0f;
          major = absVec.y;
          st = Vec2(vec.x, (negative) ? -vec.z : vec.z);
          faceIndex = (T)((negative) ? 3 : 2);
      }
      else
      {
          // X is the major axis
          const Bool negative = vec.x < 0.0f;
          major = absVec.x;
          st = Vec2((negative) ? vec.z : -vec.z, -vec.y);
          faceIndex = (T)((negative) ? 1 : 0);
      }

      return st / (major * 2.0f) + 0.5f;
  }
  /// Convert a color to gray by averaging its 3 channels.
  /// @param col The input color.
  /// @return The average replicated to all 3 channels.
  template<typename T>
  vector<T, 3> grayScale(const vector<T, 3> col)
  {
      const T channelSum = col.r + col.g + col.b;
      const T average = channelSum * T(1.0 / 3.0);
      return vector<T, 3>(average, average, average);
  }
  /// Change the saturation of a color by interpolating between its luminance and the color itself.
  /// @param col The input color.
  /// @param factor 0 gives the luminance as gray, 1 gives col. It's not clamped.
  template<typename T>
  vector<T, 3> saturateColor(const vector<T, 3> col, const T factor)
  {
      const vector<T, 3> kLuminanceWeights = vector<T, 3>(0.2125, 0.7154, 0.0721);
      const T luminance = dot(col, kLuminanceWeights);
      return lerp(vector<T, 3>(luminance, luminance, luminance), col, factor);
  }
  /// Apply gamma correction.
  /// @param gamma The per channel gamma.
  /// @param col The input color.
  /// @return col raised to 1/gamma.
  template<typename T>
  vector<T, 3> gammaCorrection(vector<T, 3> gamma, vector<T, 3> col)
  {
      const vector<T, 3> exponent = T(1.0) / gamma;
      return pow(col, exponent);
  }
  /// Read a texel and sharpen it using its neighbours.
  /// @param tex The texture to read.
  /// @param sampl The sampler for all the reads.
  /// @param uv The UV of the center texel.
  /// @param sharpenFactor The strength of the sharpening. Can use 0.15
  /// @param detailed If true use 8 neighbours, otherwise only the 4 diagonal ones.
  /// @return The sharpened color. Never less than zero.
  template<typename T>
  vector<T, 3> readSharpen(Texture2D<vector<T, 4> > tex, SamplerState sampl, Vec2 uv, T sharpenFactor, Bool detailed)
  {
  #define ANKI_SHARPEN_TAP(x, y) tex.SampleLevel(sampl, uv, 0.0, IVec2(x, y)).rgb

      const vector<T, 3> center = tex.SampleLevel(sampl, uv, 0.0).rgb;

      // The diagonals
      vector<T, 3> neighbours = ANKI_SHARPEN_TAP(1, 1);
      neighbours += ANKI_SHARPEN_TAP(-1, -1);
      neighbours += ANKI_SHARPEN_TAP(1, -1);
      neighbours += ANKI_SHARPEN_TAP(-1, 1);
      T neighbourCount = 4.0;

      if(detailed)
      {
          neighbours += ANKI_SHARPEN_TAP(0, 1);
          neighbours += ANKI_SHARPEN_TAP(1, 0);
          neighbours += ANKI_SHARPEN_TAP(-1, 0);
          neighbours += ANKI_SHARPEN_TAP(0, -1);
          neighbourCount = 8.0;
      }

  #undef ANKI_SHARPEN_TAP

      // Boost the center by as much as it's subtracted from the neighbours
      const vector<T, 3> sharpened = center * (neighbourCount * sharpenFactor + T(1.0)) - sharpenFactor * neighbours;
      return max(vector<T, 3>(0.0, 0.0, 0.0), sharpened);
  }
  /// Erosion filter. Returns the per channel minimum of a texel and its 8 neighbours.
  /// @param tex The texture to read.
  /// @param sampl The sampler for all the reads.
  /// @param uv The UV of the center texel.
  template<typename T>
  vector<T, 3> readErosion(Texture2D<vector<T, 4> > tex, SamplerState sampl, const Vec2 uv)
  {
  #define ANKI_ERODE_TAP(x, y) minValue = min(tex.SampleLevel(sampl, uv, 0.0, IVec2(x, y)).rgb, minValue)

      vector<T, 3> minValue = tex.SampleLevel(sampl, uv, 0.0).rgb;

      // Diagonals
      ANKI_ERODE_TAP(1, 1);
      ANKI_ERODE_TAP(-1, -1);
      ANKI_ERODE_TAP(1, -1);
      ANKI_ERODE_TAP(-1, 1);

      // Direct neighbours
      ANKI_ERODE_TAP(0, 1);
      ANKI_ERODE_TAP(1, 0);
      ANKI_ERODE_TAP(-1, 0);
      ANKI_ERODE_TAP(0, -1);

  #undef ANKI_ERODE_TAP

      return minValue;
  }
  /// 5 color heatmap from a factor. The colors are black, blue, green, yellow and red, evenly spaced in [0, 1].
  /// @param factor The heat. It's clamped to [0, 1]: 0 and anything below map to black, 1 and anything above map to red.
  /// @return The interpolated color.
  Vec3 heatmap(const F32 factor)
  {
      // Clamp so that out of range factors don't extrapolate the colors
      const F32 scaled = saturate(factor) * 4.0;

      F32 intPart;
      const F32 fractional = modf(scaled, intPart);

      if(intPart < 1.0)
      {
          return lerp(Vec3(0.0, 0.0, 0.0), Vec3(0.0, 0.0, 1.0), fractional);
      }
      else if(intPart < 2.0)
      {
          return lerp(Vec3(0.0, 0.0, 1.0), Vec3(0.0, 1.0, 0.0), fractional);
      }
      else if(intPart < 3.0)
      {
          return lerp(Vec3(0.0, 1.0, 0.0), Vec3(1.0, 1.0, 0.0), fractional);
      }
      else if(intPart < 4.0)
      {
          return lerp(Vec3(1.0, 1.0, 0.0), Vec3(1.0, 0.0, 0.0), fractional);
      }
      else
      {
          // factor is exactly 1. The integer part is 4 and the fractional is 0 so interpolating like the previous branch would give yellow
          return Vec3(1.0, 0.0, 0.0);
      }
  }
  /// Return a color per cubemap face. The +X is red, -X dark red, +Y green, -Y dark green, +Z blue, -Z dark blue.
  /// @param dir The face index. Anything above 4 gives the color of -Z.
  Vec3 colorPerCubeFace(const U32 dir)
  {
      const Vec3 kFaceColors[6] = {Vec3(1.0, 0.0, 0.0), Vec3(0.25, 0.0, 0.0), Vec3(0.0, 1.0, 0.0),
                                   Vec3(0.0, 0.25, 0.0), Vec3(0.0, 0.0, 1.0), Vec3(0.0, 0.0, 0.25)};

      return kFaceColors[min(dir, 5u)];
  }
  /// Check if a color is broken.
  /// @return True if any of the channels is NaN or infinite.
  Bool incorrectColor(const Vec3 c)
  {
      return any(isnan(c)) || any(isinf(c));
  }
  /// Helper of cubeCoordSolidAngle.
  /// @return atan2(x * y, sqrt(x^2 + y^2 + 1))
  F32 areaElement(const F32 x, const F32 y)
  {
      const F32 len = sqrt(x * x + y * y + 1.0);
      return atan2(x * y, len);
  }
  /// Compute the solid angle of a cubemap texel, that is the area the texel covers when projected onto the unit sphere. It's also the
  /// delta omega (dω) in the irradiance integral and other integrals that operate over a sphere.
  /// http://www.rorydriscoll.com/2012/01/15/cubemap-texel-solid-angle/
  /// @param norm The center of the texel. Presumably in [-1, 1] - confirm with the callers.
  /// @param cubeFaceSize The size of the cube face in texels.
  F32 cubeCoordSolidAngle(Vec2 norm, F32 cubeFaceSize)
  {
      const F32 halfTexel = 1.0f / cubeFaceSize;

      // The 2 opposite corners of the texel
      const Vec2 lo = norm - Vec2(halfTexel, halfTexel);
      const Vec2 hi = norm + Vec2(halfTexel, halfTexel);

      const F32 areaLoLo = areaElement(lo.x, lo.y);
      const F32 areaLoHi = areaElement(lo.x, hi.y);
      const F32 areaHiLo = areaElement(hi.x, lo.y);
      const F32 areaHiHi = areaElement(hi.x, hi.y);

      return areaLoLo - areaLoHi - areaHiLo + areaHiHi;
  }
  /// A convenience function to skip out of bounds invocations on post-process compute shaders.
  /// @param groupSize The size of the threadgroup.
  /// @param threadCount The number of threads that have actual work to do.
  /// @param svDispatchThreadId The SV_DispatchThreadID of the invocation.
  /// @return True if the invocation should be skipped.
  Bool skipOutOfBoundsInvocations(UVec2 groupSize, UVec2 threadCount, UVec2 svDispatchThreadId)
  {
      // If the thread count is a multiple of the group size then the bounds are never checked
      const Bool exactFit = (threadCount.x % groupSize.x) == 0u && (threadCount.y % groupSize.y) == 0u;
      if(exactFit)
      {
          return false;
      }

      return svDispatchThreadId.x >= threadCount.x || svDispatchThreadId.y >= threadCount.y;
  }
  /// Create a rotation matrix whose 3rd column is the given direction. The other 2 columns are derived from it.
  /// @param zAxis The direction. The method of the active path is meant for normalized vectors - confirm with the callers.
  /// @return A matrix with columns (x, y, zAxis).
  Mat3 rotationFromDirection(Vec3 zAxis)
  {
  #if 0
      const Vec3 z = zAxis;
      const Bool alignsWithXBasis = abs(z.x - 1.0) <= kEpsilonF32; // aka z == Vec3(1.0, 0.0, 0.0)
      Vec3 x = (alignsWithXBasis) ? Vec3(0.0, 0.0, 1.0) : Vec3(1.0, 0.0, 0.0);
      const Vec3 y = normalize(cross(x, z));
      x = normalize(cross(z, y));
  #else
      // http://jcgt.org/published/0006/01/01/
      const Vec3 z = zAxis;
      const F32 zSign = (z.z >= 0.0) ? 1.0 : -1.0;
      const F32 a = -1.0 / (zSign + z.z);
      const F32 b = z.x * z.y * a;

      // Square with a multiplication. pow() is not defined for negative bases and z.x, z.y can be negative
      const Vec3 x = Vec3(1.0 + zSign * a * (z.x * z.x), zSign * b, -zSign * z.x);
      const Vec3 y = Vec3(b, zSign + a * (z.y * z.y), -z.y);
  #endif

      Mat3 o;
      o.setColumns(x, y, z);
      return o;
  }
  406. #if ANKI_COMPUTE_SHADER && ANKI_GLSL
  /// Bitfield insert. See getOptimalGlobalInvocationId8x8Amd
  /// @return src with its lowest "bits" bits replaced by those of ins.
  U32 _ABfiM(U32 src, U32 ins, U32 bits)
  {
      const U32 lowMask = (1u << bits) - 1u;
      const U32 kept = src & (~lowMask);
      const U32 inserted = ins & lowMask;
      return inserted | kept;
  }
  /// Bitfield extract. See getOptimalGlobalInvocationId8x8Amd
  /// @return The "bits" bits of src that start at bit "off".
  U32 _ABfe(U32 src, U32 off, U32 bits)
  {
      const U32 lowMask = (1u << bits) - 1u;
      const U32 shifted = src >> off;
      return shifted & lowMask;
  }
  /// Remap a flat index to a 2D coordinate by shuffling its bits. See getOptimalGlobalInvocationId8x8Amd
  UVec2 _ARmpRed8x8(U32 a)
  {
      const U32 x = _ABfiM(_ABfe(a, 2u, 3u), a, 1u);
      const U32 y = _ABfiM(_ABfe(a, 3u, 3u), _ABfe(a, 1u, 2u), 2u);
      return UVec2(x, y);
  }
  /// Compute a global invocation ID where the invocations inside a workgroup are remapped with _ARmpRed8x8. Hardcoded for 8x8 workgroups.
  /// https://github.com/GPUOpen-Effects/FidelityFX-CAS/blob/master/ffx-cas/ffx_a.h
  UVec2 getOptimalGlobalInvocationId8x8Amd()
  {
      const UVec2 workgroupOrigin = gl_WorkGroupID.xy * UVec2(8u);
      return workgroupOrigin + _ARmpRed8x8(gl_LocalInvocationIndex);
  }
  /// Compute a global invocation ID where the workgroups are re-ordered to walk the dispatch in vertical tiles that are up to 8 workgroups
  /// wide. Hardcoded for 8x8 workgroups.
  /// https://github.com/LouisBavoil/ThreadGroupIDSwizzling/blob/master/ThreadGroupTilingX.hlsl
  UVec2 getOptimalGlobalInvocationId8x8Nvidia()
  {
      const U32 kMaxTileWidth = 8u;
      const UVec2 kWorkgroupSize = UVec2(8u);
      const UVec2 groupCount = gl_NumWorkGroups.xy;

      // A perfect tile is kMaxTileWidth wide and covers the whole height of the dispatch
      const U32 groupsPerPerfectTile = kMaxTileWidth * groupCount.y;
      const U32 perfectTileCount = groupCount.x / kMaxTileWidth;
      const U32 groupsInAllPerfectTiles = perfectTileCount * kMaxTileWidth * groupCount.y;

      const U32 flatGroupId = groupCount.x * gl_WorkGroupID.y + gl_WorkGroupID.x;
      const U32 tileId = flatGroupId / groupsPerPerfectTile;
      const U32 idInsideTile = flatGroupId % groupsPerPerfectTile;

      // Workgroups past the perfect tiles belong to the last tile, which is narrower
      const U32 tileWidth = (groupsInAllPerfectTiles <= flatGroupId) ? (groupCount.x % kMaxTileWidth) : kMaxTileWidth;
      const U32 yInsideTile = idInsideTile / tileWidth;
      const U32 xInsideTile = idInsideTile % tileWidth;

      const U32 swizzledFlatGroupId = tileId * kMaxTileWidth + yInsideTile * groupCount.x + xInsideTile;
      const UVec2 swizzledGroupId = UVec2(swizzledFlatGroupId % groupCount.x, swizzledFlatGroupId / groupCount.x);

      return kWorkgroupSize * swizzledGroupId + gl_LocalInvocationID.xy;
  }
  464. #endif
  /// Gaussian distribution function. Play with the values here https://www.desmos.com/calculator/7oxmohg3ta
  /// @param s The sigma.
  /// @param x A factor where abs(x) is in [0, 1].
  /// @return exp(-x^2 / (2 s^2)) / (s sqrt(2 pi))
  template<typename T>
  T gaussianWeight(T s, T x)
  {
      const T amplitude = T(1) / (s * sqrt(T(2) * kPi));
      const T falloff = exp((x * x) / (T(-2) * s * s));
      return amplitude * falloff;
  }
  /// The 2D version of gaussianWeight.
  /// @param s The sigma.
  /// @param x The factor on the first axis.
  /// @param y The factor on the second axis.
  /// @return exp(-(x^2 + y^2) / (2 s^2)) / (2 pi s^2)
  template<typename T>
  T gaussianWeight2d(T s, T x, T y)
  {
      const T amplitude = T(1) / (T(2) * kPi * s * s);
      const T falloff = exp((x * x + y * y) / (T(-2) * s * s));
      return amplitude * falloff;
  }
  /// Animate a blue noise value over time by offsetting it with multiples of the golden ratio conjugate.
  /// https://www.shadertoy.com/view/WsfBDf
  /// @param inputBlueNoise The static blue noise.
  /// @param frameIdx The index of the frame. The animation repeats every 64 frames.
  /// @return The animated noise in [0, 1).
  template<typename T>
  vector<T, 3> animateBlueNoise(vector<T, 3> inputBlueNoise, U32 frameIdx)
  {
      const T kGoldenRatioConjugate = 0.61803398875;
      const T frameOffset = T(frameIdx % 64u) * kGoldenRatioConjugate;
      return frac(inputBlueNoise + frameOffset);
  }
  488. #if ANKI_PIXEL_SHADER
  /// Compute the mip level out of the screen space derivatives of the texture coordinates.
  /// https://bgolus.medium.com/distinctive-derivative-differences-cce38d36797b
  /// @param normalizedUvs This is uv*textureResolution.
  /// @return The mip level. Never less than zero.
  F32 computeMipLevel(Vec2 normalizedUvs)
  {
      const Vec2 gradX = ddx_coarse(normalizedUvs);
      const Vec2 gradY = ddy_coarse(normalizedUvs);

      // Work with squared lengths, the 0.5 that multiplies the log2 accounts for the missing sqrt
      const F32 maxLengthSq = max(dot(gradX, gradX), dot(gradY, gradY));
      const F32 mip = 0.5 * log2(maxLengthSq);

      return max(0.0, mip);
  }
  498. #endif
  499. #if ANKI_SUPPORTS_64BIT_TYPES
  /// The regular firstbitlow in DXC has some issues with 64bit input since it invokes a builtin that is only supposed to be used with
  /// 32bit input. This is an alternative implementation that runs firstbitlow on each 32bit half.
  /// @param v The value to check. It's expected to be non-zero.
  /// @return The index of the lowest set bit.
  I32 firstbitlow2(U64 v)
  {
      const U32 lowHalf = (U32)v;
      const U32 highHalf = (U32)(v >> 32ul);

      const I32 lowBit = firstbitlow(lowHalf);
      if(lowBit >= 0)
      {
          return lowBit;
      }

      // Nothing in the low half, the answer is in the high one
      const I32 highBit = firstbitlow(highHalf);
      return highBit + 32;
  }
  508. #endif
  /// The 32bit overload of firstbitlow2. Exists so that the same name can be used with both 32bit and 64bit values. Forwards to firstbitlow.
  I32 firstbitlow2(U32 v)
  {
      const I32 lowestBit = firstbitlow(v);
      return lowestBit;
  }
  514. #if ANKI_SUPPORTS_64BIT_TYPES
  /// Count the set bits of a 64bit value by running countbits on each 32bit half and adding the results.
  /// @param v The value whose bits to count.
  /// @return The number of set bits.
  U32 countbits2(U64 v)
  {
      const U32 lowHalf = U32(v);
      const U32 highHalf = U32(v >> 32ul);
      return countbits(lowHalf) + countbits(highHalf);
  }
  521. #endif
  /// Encode the shading rate to be stored in an SRI. The rates should be power of two, can't be zero and can't exceed 4. So the possible
  /// values are 1,2,4.
  /// @param rateXY The shading rate in X and Y.
  /// @return The log2 of the Y rate in bits [0,1] and the log2 of the X rate in bits [2,3]. See decodeVrsRate.
  U32 encodeVrsRate(UVec2 rateXY)
  {
      const U32 yBits = rateXY.y >> 1u; // 1,2,4 become 0,1,2
      const U32 xBits = (rateXY.x << 1u) & 12u; // 1,2,4 become 0,4,8
      return yBits | xBits;
  }
  /// Get a debug color for a shading rate. Goes from red for 1x1 to green for 4x4.
  /// @param rate The shading rate in X and Y.
  /// @return The color. Black if the rate is not a combination of 1, 2 and 4.
  Vec3 visualizeVrsRate(UVec2 rate)
  {
      if(all(rate == UVec2(1u, 1u)))
      {
          return Vec3(1.0, 0.0, 0.0);
      }

      if(all(rate == UVec2(2u, 1u)) || all(rate == UVec2(1u, 2u)))
      {
          return Vec3(1.0, 0.5, 0.0);
      }

      if(all(rate == UVec2(2u, 2u)) || all(rate == UVec2(4u, 1u)) || all(rate == UVec2(1u, 4u)))
      {
          return Vec3(1.0, 1.0, 0.0);
      }

      if(all(rate == UVec2(4u, 2u)) || all(rate == UVec2(2u, 4u)))
      {
          return Vec3(0.65, 1.0, 0.0);
      }

      if(all(rate == UVec2(4u, 4u)))
      {
          return Vec3(0.0, 1.0, 0.0);
      }

      // Unknown rate
      return Vec3(0.0, 0.0, 0.0);
  }
  /// Decodes a number produced by encodeVrsRate().
  /// @param texel The encoded rate. Bits [2,3] hold the log2 of the X rate and bits [0,1] the log2 of the Y rate.
  /// @return The shading rates.
  UVec2 decodeVrsRate(U32 texel)
  {
      const U32 log2RateX = (texel >> 2u) & 3u;
      const U32 log2RateY = texel & 3u;
      return UVec2(1u << log2RateX, 1u << log2RateY);
  }
  /// 3D coordinates to equirectangular 2D coordinates.
  /// @param v The direction. It's fed to asin as is, so presumably it needs to be normalized - confirm with the callers.
  /// @return The 2D coordinates.
  Vec2 equirectangularMapping(Vec3 v)
  {
      // 0.1591 is roughly 1/(2*pi) and 0.3183 roughly 1/pi
      const Vec2 kInvAngleRange = Vec2(0.1591, 0.3183);

      const Vec2 angles = Vec2(atan2(v.z, v.x), asin(v.y));
      return angles * kInvAngleRange + 0.5;
  }
  /// Convert a linear color to sRGB.
  /// @param linearRgb The linear color. It's clamped from below to a small positive value before the conversion.
  /// @return The smaller of the linear segment (x * 12.92) and the power segment (1.055 * x^(1/2.4) - 0.055) of the sRGB curve.
  template<typename T>
  vector<T, 3> linearToSRgb(vector<T, 3> linearRgb)
  {
      constexpr T a = 6.10352e-5;
      constexpr T b = 1.0 / 2.4;
      linearRgb = max(vector<T, 3>(a, a, a), linearRgb);

      // Build the exponent out of T as well so that the whole computation stays in the precision the caller asked for
      const vector<T, 3> exponent = vector<T, 3>(b, b, b);
      const vector<T, 3> linearSegment = linearRgb * T(12.92);
      const vector<T, 3> powerSegment = pow(max(linearRgb, T(0.00313067)), exponent) * T(1.055) - T(0.055);

      return min(linearSegment, powerSegment);
  }
  /// Convert an sRGB color to linear.
  /// @param sRgb The sRGB color.
  /// @return Per channel: x / 12.92 if x is less than 0.04045, otherwise ((x + 0.055) / 1.055)^2.4
  template<typename T>
  vector<T, 3> sRgbToLinear(vector<T, 3> sRgb)
  {
      const vector<T, 3> kCutoff = vector<T, 3>(0.04045, 0.04045, 0.04045);
      const vector<T, 3> kExponent = vector<T, 3>(2.4, 2.4, 2.4);

      const bool3 useLinearSegment = sRgb < kCutoff;
      const vector<T, 3> powerSegment = pow((sRgb + T(0.055)) / T(1.055), kExponent);
      const vector<T, 3> linearSegment = sRgb / T(12.92);

      return lerp(powerSegment, linearSegment, useLinearSegment);
  }
  /// Apply film grain to a color.
  /// @param color The input color.
  /// @param uv Used as the seed of the noise together with time.
  /// @param strength Scales the noise. The noise itself is in (-0.005, 0.005] before the scale.
  /// @param time Changes the noise over time.
  /// @return The color multiplied by (1 - noise * strength).
  template<typename T>
  vector<T, 3> filmGrain(vector<T, 3> color, Vec2 uv, T strength, F32 time)
  {
      const T x = (uv.x + 4.0) * (uv.y + 4.0) * time;

      const T termA = fmod(x, T(13.0)) + T(1.0);
      const T termB = fmod(x, T(123.0)) + T(1.0);
      const T noise = fmod(termA * termB, T(0.01)) - T(0.005);

      const T grain = T(1.0) - noise * strength;
      return color * grain;
  }
  594. #if ANKI_COMPUTE_SHADER || ANKI_WORK_GRAPH_SHADER
  /// HLSL doesn't have SubgroupID so compute it. It's a macro because we can't have functions that InterlockedAdd on local variables (the
  /// compiler can't see it's groupshared).
  /// How it works: the first thread of the group zeroes the counter, then the first lane of every wave atomically increments it and keeps
  /// the value it had before the increment. That value is the index of the wave and it's broadcasted to the rest of the lanes. After all
  /// waves have incremented it, the counter holds the number of waves. There are 2 group barriers in there so all threads of the group
  /// need to execute the macro.
  /// @param svGroupIndex Self explanatory.
  /// @param tmpGroupsharedU32Var A U32 groupshared variable that will help with the calculation.
  /// @param waveIndexInsideThreadgroup The SubgroupID.
  /// @param wavesPerThreadGroup Also calculate that in case some GPUs manage to mess this up.
  #define ANKI_COMPUTE_WAVE_INDEX_INSIDE_THREADGROUP(svGroupIndex, tmpGroupsharedU32Var, waveIndexInsideThreadgroup, wavesPerThreadGroup) \
      do \
      { \
          if((svGroupIndex) == 0) \
          { \
              tmpGroupsharedU32Var = 0; \
          } \
          GroupMemoryBarrierWithGroupSync(); \
          waveIndexInsideThreadgroup = 0; \
          if(WaveIsFirstLane()) \
          { \
              InterlockedAdd(tmpGroupsharedU32Var, 1, waveIndexInsideThreadgroup); \
          } \
          GroupMemoryBarrierWithGroupSync(); \
          wavesPerThreadGroup = tmpGroupsharedU32Var; \
          waveIndexInsideThreadgroup = WaveReadLaneFirst(waveIndexInsideThreadgroup); \
      } while(false)
  618. #endif
  /// Perturb normal, see http://www.thetenthplanet.de/archives/1180
  /// Does normal mapping in the fragment shader without requiring tangents from the vertex data. It builds the tangent frame out of screen
  /// space derivatives instead.
  /// @param tangentNormal The normal read from the normal map. It assumes that green is up.
  /// @param viewDir It's the -(eye - vertexPos). Needs to be in the same space as geometricNormal.
  /// @param uv The texture coordinates that were used to read the normal map.
  /// @param geometricNormal The normal of the surface.
  /// @return The normalized perturbed normal in the space of geometricNormal.
  Vec3 perturbNormal(Vec3 tangentNormal, Vec3 viewDir, Vec2 uv, Vec3 geometricNormal)
  {
      // Green is up
      const Vec3 flippedNormal = Vec3(tangentNormal.x, -tangentNormal.y, tangentNormal.z);

      // Get edge vectors of the pixel triangle
      const Vec3 posDx = ddx(viewDir);
      const Vec3 posDy = ddy(viewDir);
      const Vec2 uvDx = ddx(uv);
      const Vec2 uvDy = ddy(uv);

      // Solve the linear system
      const Vec3 posDyPerp = cross(posDy, geometricNormal);
      const Vec3 posDxPerp = cross(geometricNormal, posDx);
      const Vec3 tangent = posDyPerp * uvDx.x + posDxPerp * uvDy.x;
      const Vec3 bitangent = posDyPerp * uvDx.y + posDxPerp * uvDy.y;

      // Construct a scale-invariant frame
      const F32 invMax = rsqrt(max(dot(tangent, tangent), dot(bitangent, bitangent)));
      Mat3 tbn;
      tbn.setColumns(tangent * invMax, bitangent * invMax, geometricNormal);

      return normalize(mul(tbn, flippedNormal));
  }
  /// Project a sphere into NDC and get the 2D box that encloses it.
  /// @param sphereCenter The center of the sphere in view space. The sphere should be in front of the near plane
  ///                     (-sphereCenter.z > sphereRadius + znear).
  /// @param sphereRadius The radius of the sphere.
  /// @param P00 Projection matrix's [0,0].
  /// @param P11 Projection matrix's [1,1].
  /// @param[out] aabbMin The min of the box.
  /// @param[out] aabbMax The max of the box.
  void projectSphereView(Vec3 sphereCenter, F32 sphereRadius, F32 P00, F32 P11, out Vec2 aabbMin, out Vec2 aabbMax)
  {
      // Work with a positive Z
      const Vec3 center = Vec3(sphereCenter.xy, abs(sphereCenter.z));

      const Vec3 cr = center * sphereRadius;
      const F32 czr2 = center.z * center.z - sphereRadius * sphereRadius;

      // X and Y follow the same math so do both at once
      const Vec2 v = sqrt(center.xy * center.xy + czr2);
      const Vec2 mins = (v * center.xy - cr.z) / (v * center.z + cr.xy);
      const Vec2 maxs = (v * center.xy + cr.z) / (v * center.z - cr.xy);

      const Vec2 projScale = Vec2(P00, P11);
      aabbMin = mins * projScale;
      aabbMax = maxs * projScale;
  }
  /// Interpolate 3 values using barycentric coordinates.
  /// @param a The value that is weighted by barycentrics.x.
  /// @param b The value that is weighted by barycentrics.y.
  /// @param c The value that is weighted by barycentrics.z.
  /// @param barycentrics The 3 weights.
  template<typename T>
  T barycentricInterpolation(T a, T b, T c, Vec3 barycentrics)
  {
      T result = a * barycentrics.x;
      result = result + b * barycentrics.y;
      result = result + c * barycentrics.z;
      return result;
  }
  /// Convert a flat index into the 3 indices of an array[sizeA][sizeB][sizeC]. c is the fastest moving index and a the slowest.
  /// @param sizeA The size of the 1st dimension.
  /// @param sizeB The size of the 2nd dimension.
  /// @param sizeC The size of the 3rd dimension.
  /// @param flatIdx The flat index. It's asserted to be less than sizeA*sizeB*sizeC.
  /// @param[out] a The index in the 1st dimension.
  /// @param[out] b The index in the 2nd dimension.
  /// @param[out] c The index in the 3rd dimension.
  void unflatten3dArrayIndex(const U32 sizeA, const U32 sizeB, const U32 sizeC, const U32 flatIdx, out U32 a, out U32 b, out U32 c)
  {
      ANKI_ASSERT(flatIdx < (sizeA * sizeB * sizeC));

      const U32 sliceSize = sizeB * sizeC;

      c = flatIdx % sizeC;
      b = (flatIdx / sizeC) % sizeB;
      a = (flatIdx / sliceSize) % sizeA;
  }
  /// Ordered dithering with a 2x2 matrix.
  /// @param svPosition The position of the pixel. It selects the cell of the matrix.
  /// @param factor The value to compare against the threshold of the cell.
  /// @return True if factor is less than the threshold of the cell. The thresholds are 1/5, 2/5, 3/5 and 4/5.
  Bool dither2x2(Vec2 svPosition, F32 factor)
  {
      const U32 kThresholdOrder[4] = {0, 3, 2, 1};
      const F32 kAxisSize = 2.0;

      const U32 column = U32(fmod(svPosition.x, kAxisSize));
      const U32 row = U32(fmod(svPosition.y, kAxisSize));
      const U32 cell = column + row * U32(kAxisSize);

      const F32 threshold = (F32(kThresholdOrder[cell]) + 1.0) / (1.0 + kAxisSize * kAxisSize);
      return factor < threshold;
  }
  /// Ordered dithering test using a 4x4 threshold matrix.
  /// @param svPosition The pixel position. It is wrapped to the 4x4 matrix to select a threshold.
  /// @param factor The value to compare against the selected threshold.
  /// @return True if factor is less than the threshold of the matrix cell this pixel maps to.
  Bool dither4x4(Vec2 svPosition, F32 factor)
  {
      const F32 kSide = 4.0;
      const U32 kRanks[16] = {0, 12, 3, 15, 8, 4, 11, 7, 2, 14, 1, 13, 10, 6, 9, 5};

      const U32 column = U32(fmod(svPosition.x, kSide));
      const U32 row = U32(fmod(svPosition.y, kSide));
      const U32 cell = row * U32(kSide) + column;

      // Map the rank of the cell to a threshold strictly inside (0, 1)
      const F32 threshold = (F32(kRanks[cell]) + 1.0) / (1.0 + kSide * kSide);
      return factor < threshold;
  }
  // Encode a normal to octahedron UV coordinates. The result is remapped from [-1, 1] to [0, 1]
  Vec2 octahedronEncode(Vec3 n)
  {
      // Scale the vector so that the absolute values of its components sum to one
      const F32 absSum = abs(n.x) + abs(n.y) + abs(n.z);
      const Vec3 p = n / absSum;

      // Used for the negative Z hemisphere: fold over the diagonals
      const Vec2 folded = (1.0 - abs(p.yx)) * select(p.xy >= 0.0, 1.0, -1.0);
      const Vec2 oct = select(p.z >= 0.0, p.xy, folded);

      return oct * 0.5 + 0.5;
  }
  // The reverse of octahedronEncode: convert octahedron UV coordinates in [0, 1] back to a normalized vector
  // https://twitter.com/Stubbesaurus/status/937994790553227264
  Vec3 octahedronDecode(Vec2 f)
  {
      // Back to [-1, 1]
      const Vec2 p = f * 2.0 - 1.0;
      const F32 z = 1.0 - abs(p.x) - abs(p.y);

      // Non-zero only when z is negative. In that case push XY back towards the center
      const F32 t = saturate(-z);
      const Vec2 xy = p + select(p >= 0.0, -t, t);

      return normalize(Vec3(xy, z));
  }
  /// Given the size of an octahedron texture and one of its texels, find the offsets (relative to that texel) of the border texels associated
  /// with it. Both texSize and texCoord don't include the border.
  /// @param texSize The size of the texture without the border.
  /// @param texCoord The texel, in a space that doesn't include the border.
  /// @param[out] borderTexOffsets The offsets. Only the first N are written, where N is the return value.
  /// @return The number of offsets written. Can be anything from 0 to 3.
  U32 octahedronBorder(IVec2 texSize, IVec2 texCoord, out IVec2 borderTexOffsets[3])
  {
      const IVec2 lastTexel = texSize - 1;

      const bool onMinX = texCoord.x == 0;
      const bool onMinY = texCoord.y == 0;
      const bool onMaxX = texCoord.x == lastTexel.x;
      const bool onMaxY = texCoord.y == lastTexel.y;

      // Used by the edge texels
      const IVec2 mirrored = lastTexel - 2 * texCoord;

      U32 count = 0;

      // Corner texels get one diagonal offset
      if(onMinX && onMinY)
      {
          borderTexOffsets[count++] = texSize;
      }
      else if(onMinX && onMaxY)
      {
          borderTexOffsets[count++] = IVec2(texSize.x, -texSize.y);
      }
      else if(onMaxX && onMaxY)
      {
          borderTexOffsets[count++] = -texSize;
      }
      else if(onMaxX && onMinY)
      {
          borderTexOffsets[count++] = IVec2(-texSize.x, texSize.y);
      }

      // Texels on the first or last row
      if(onMinY)
      {
          borderTexOffsets[count++] = IVec2(mirrored.x, -1);
      }
      else if(onMaxY)
      {
          borderTexOffsets[count++] = IVec2(mirrored.x, 1);
      }

      // Texels on the first or last column
      if(onMinX)
      {
          borderTexOffsets[count++] = IVec2(-1, mirrored.y);
      }
      else if(onMaxX)
      {
          borderTexOffsets[count++] = IVec2(1, mirrored.y);
      }

      return count;
  }
  /// Manual trilinear sampling of a 3D texture with repeat addressing. It blends the 8 texels that surround the sampling point.
  /// @param sam The texture to read from.
  /// @param uv The texture coordinates. Only their fractional part is used.
  /// @return The blended value.
  template<typename T, U32 kComp>
  vector<T, kComp> linearTextureSampling(Texture3D<Vec4> sam, Vec3 uv)
  {
      Vec3 texSize;
      sam.GetDimensions(texSize.x, texSize.y, texSize.z);

      // Wrap the coordinates and move to texel space. The -0.5 offset places the texel centers on whole numbers
      const Vec3 texelPos = frac(uv) * texSize - 0.5;
      const Vec3 baseTexel = floor(texelPos);
      const Vec3 blend = frac(texelPos);

      vector<T, kComp> result = T(0);
      for(U32 corner = 0u; corner < 8u; ++corner)
      {
          // Bits 0, 1 and 2 of the corner index select the near (0) or far (1) texel in X, Y and Z
          const Vec3 cornerOffset = Vec3(UVec3(corner, corner >> 1u, corner >> 2u) & 1u);

          // Repeat
          Vec3 texel = baseTexel + cornerOffset;
          texel = select(texel >= 0.0, texel, texSize + texel);
          texel = select(texel < texSize, texel, texel - texSize);

          const vector<T, kComp> texelValue = sam[texel];

          // The near texel of an axis is weighted by (1 - blend) and the far one by blend
          const vector<T, 3> axisWeights = select(cornerOffset == 0.0, T(1) - blend, blend);
          const T weight = axisWeights.x * axisWeights.y * axisWeights.z;

          result += texelValue * weight;
      }

      return result;
  }
  /// Generate a 4x MSAA pattern. Returns the numbers in
  /// https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_standard_multisample_quality_levels
  /// Divide the result by 8.0 to normalize.
  /// @param sample The index of the sample.
  IVec2 generateMsaa4x(U32 sample)
  {
      // One 4bit nibble per sample, sample 0 in the lowest nibble. Every nibble stores the coordinate plus 8
      const IVec2 packedCoords = IVec2(0xA2E6, 0xEA62);

      const U32 shift = sample * 4u;
      const IVec2 shifted = packedCoords >> shift;
      const IVec2 biased = shifted & 0xF;
      return biased - 8;
  }
  /// Generate a 16x MSAA pattern. Returns the numbers in
  /// https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_standard_multisample_quality_levels
  /// Divide the result by 8.0 to normalize.
  /// @param sample The index of the sample.
  IVec2 generateMsaa16x(U32 sample)
  {
      // One 4bit nibble per sample and 8 samples per 32bit word. Every nibble stores the coordinate plus 8.
      // The first element is used by samples below 8 and the second by the rest
      const IVec2 packedCoords[2] = {IVec2(0xBDA3C579, 0x3BD67A59), IVec2(0x1EF02486, 0xF48C21E)};

      const U32 word = (sample < 8) ? 0u : 1u;
      const U32 shift = (sample & 7u) << 2u;

      const IVec2 shifted = packedCoords[word] >> shift;
      const IVec2 biased = shifted & 0xF;
      return biased - 8;
  }