
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
//                                          [FFX SPD] Single Pass Downsampler 2.0
//
//==============================================================================================================================
// LICENSE
// =======
// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved.
// -------
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to the following conditions:
// -------
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
// Software.
// -------
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
// OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
//------------------------------------------------------------------------------------------------------------------------------
// CHANGELIST v2.0
// ===============
// - Added support for cube and array textures. The SpdDownsample and SpdDownsampleH shader functions now take the
//   index of the texture slice as an additional parameter. For a regular 2D texture, use 0.
// - Added support for updating only a sub-rectangle of the texture. An additional, optional parameter workGroupOffset
//   was added to the shader functions SpdDownsample and SpdDownsampleH.
// - Added the C function SpdSetup, which helps to set up the constants to be passed in a constant buffer.
// - The global atomic counter is automatically reset to 0 by the shader at the end of each run, so you do not need to
//   clear it before every use, just once after creation.
//
//------------------------------------------------------------------------------------------------------------------------------
// INTEGRATION SUMMARY FOR CPU
// ===========================
// // You need to provide as constants:
// // - the number of mip levels to be computed (maximum is 12)
// // - the total number of thread groups: ((widthInPixels+63)>>6) * ((heightInPixels+63)>>6)
// // - workGroupOffset -> 0 by default; if you only downsample a rectangle within the source texture, use the
// //   SpdSetup function to calculate the correct offset
// ...
// // Dispatch the shader such that each thread group works on a 64x64 sub-tile of the source image.
// // For cube textures or a Texture2DArray, use the z dimension:
// vkCmdDispatch(cmdBuf, (widthInPixels+63)>>6, (heightInPixels+63)>>6, slices);
// // You can also use the SpdSetup function:
// // at the top of your cpp file:
// #define A_CPU
// #include "ffx_a.h"
// #include "ffx_spd.h"
// // before your dispatch call, use the SpdSetup function to get your constants
// varAU2(dispatchThreadGroupCountXY); // output variable
// varAU2(workGroupOffset); // output variable, these constants are required if left and top are not 0,0
// varAU2(numWorkGroupsAndMips); // output variable
// // input information about your source texture:
// // - left and top of the rectangle within your texture you want to downsample
// // - width and height of the rectangle you want to downsample
// // - if the complete source texture should get downsampled: left = 0, top = 0,
// //   width = sourceTexture.width, height = sourceTexture.height
// varAU4(rectInfo) = initAU4(0, 0, m_Texture.GetWidth(), m_Texture.GetHeight()); // left, top, width, height
// SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo);
// ...
// // constants:
// data.numWorkGroupsPerSlice = numWorkGroupsAndMips[0];
// data.mips = numWorkGroupsAndMips[1];
// data.workGroupOffset[0] = workGroupOffset[0];
// data.workGroupOffset[1] = workGroupOffset[1];
// ...
// uint32_t dispatchX = dispatchThreadGroupCountXY[0];
// uint32_t dispatchY = dispatchThreadGroupCountXY[1];
// uint32_t dispatchZ = m_CubeTexture.GetArraySize(); // slices - for a 2D texture this is 1, for a cube texture 6
// vkCmdDispatch(cmd_buf, dispatchX, dispatchY, dispatchZ);
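// //
// // For reference, a minimal sketch of uploading these constants as Vulkan push constants; this would go right
// // before the vkCmdDispatch call above. The struct layout and the pipeline layout handle (pipelineLayout) are
// // assumptions for illustration - match them to the push-constant block you actually declare in the shader.
// struct SpdConstantsCPU
// {
//     uint32_t mips;
//     uint32_t numWorkGroups;
//     float    workGroupOffset[2]; // matches the vec2/float2 workGroupOffset in the example push-constant block below
// };
// SpdConstantsCPU data;
// data.mips = numWorkGroupsAndMips[1];
// data.numWorkGroups = numWorkGroupsAndMips[0];
// data.workGroupOffset[0] = (float)workGroupOffset[0];
// data.workGroupOffset[1] = (float)workGroupOffset[1];
// vkCmdPushConstants(cmd_buf, pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(data), &data);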
//------------------------------------------------------------------------------------------------------------------------------
// INTEGRATION SUMMARY FOR GPU
// ===========================
// [SAMPLER] - if you want to use a sampler with linear filtering for loading the source image,
// additionally follow the instructions marked with [SAMPLER]
// add the following define:
// #define SPD_LINEAR_SAMPLER
// this is recommended, as using one sample() with a linear filter to reduce 2x2 is faster
// than 4x load() plus manual averaging
// // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT.
// // Note: If you use an SRGB format for UAV load() and store() (if it's supported), you need to convert to and from
// // linear space when using UAV load() and store()
// // approximate conversion to linear (load function): x*x
// // approximate conversion from linear (store function): sqrt()
// // or use the more accurate functions from ffx_a.h: AFromSrgbF1(value) and AToSrgbF1(value)
// // Recommendation: use a UNORM format instead of SRGB for UAV access, and SRGB for SRV access
// // look in the sample app to see how it's done
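// // As a concrete sketch of the approximate conversion, assuming the UNORM UAV declarations below, the transform
// // can live directly in the user-defined load/store functions (shown here in GLSL with the cheap x*x / sqrt()
// // approximation; alpha stays linear):
// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice)
// {
//     AF4 v = imageLoad(imgSrc, p);
//     return AF4(v.rgb * v.rgb, v.a); // approximate sRGB -> linear
// }
// void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice)
// {
//     imageStore(imgDst[mip], p, AF4(sqrt(value.rgb), value.a)); // approximate linear -> sRGB
// }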
// // source image
// // if cube texture, use image2DArray / Texture2DArray and adapt your load/store/sample calls
// GLSL: layout(set=0,binding=0,rgba16f) uniform image2D imgSrc;
// [SAMPLER]: layout(set=0,binding=0) uniform texture2D imgSrc;
// HLSL: [[vk::binding(0)]] Texture2D<float4> imgSrc :register(u0);
// // destination -> 12 is the maximum number of mips supported by SPD
// GLSL: layout(set=0,binding=1,rgba16f) uniform coherent image2D imgDst[12];
// HLSL: [[vk::binding(1)]] globallycoherent RWTexture2D<float4> imgDst[12] :register(u1);
// // global atomic counter - MUST be initialized to 0
// // SPD resets the counter back to 0 after each run by calling SpdResetAtomicCounter(slice)
// // if you have more than 1 slice (i.e. if you downsample a cube texture or a Texture2DArray),
// // you have an array of counters: counter[6] if you have 6 slices, for example
// // GLSL:
// layout(std430, set=0, binding=2) coherent buffer SpdGlobalAtomicBuffer
// {
//     uint counter;
// } spdGlobalAtomic;
// // HLSL:
// struct SpdGlobalAtomicBuffer
// {
//     uint counter;
// };
// [[vk::binding(2)]] globallycoherent RWStructuredBuffer<SpdGlobalAtomicBuffer> spdGlobalAtomic;
// // [SAMPLER] add sampler
// GLSL: layout(set=0, binding=3) uniform sampler srcSampler;
// HLSL: [[vk::binding(3)]] SamplerState srcSampler :register(s0);
// // constants - either push constant or constant buffer,
// // or calculate them within the shader
// // [SAMPLER] when using a sampler, also add the inverse source image size (see the sketch after the constant block below)
// // GLSL:
// layout(push_constant) uniform SpdConstants {
//     uint mips;            // needed to opt out early if mips are < 12
//     uint numWorkGroups;   // total number of thread groups, i.e. numWorkGroupsX * numWorkGroupsY * 1
//                           // it is important to NOT take the number of slices (z dimension) into account here,
//                           // as each slice has its own counter!
//     vec2 workGroupOffset; // optional - use the SpdSetup() function to calculate the correct workgroup offset
// } spdConstants;
// // HLSL:
// [[vk::push_constant]]
// cbuffer spdConstants {
//     uint mips;
//     uint numWorkGroups;
//     float2 workGroupOffset; // optional
// };
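// // [SAMPLER] a minimal sketch of the extra constant used by the sampler-based load examples further below; the
// // field name invInputSize is just the name those examples use, adapt it to your own constant block:
// // GLSL:
// layout(push_constant) uniform SpdConstants {
//     uint mips;
//     uint numWorkGroups;
//     vec2 workGroupOffset;
//     vec2 invInputSize; // 1.0 / sourceTexture.width, 1.0 / sourceTexture.height
// } spdConstants;
// // and on the CPU side, e.g.:
// data.invInputSize[0] = 1.0f / float(m_Texture.GetWidth());
// data.invInputSize[1] = 1.0f / float(m_Texture.GetHeight());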
// ...
// // Setup pre-portability-header defines (sets up the GLSL/HLSL path, etc.)
// #define A_GPU 1
// #define A_GLSL 1 // or: #define A_HLSL 1
// // if you want to use the PACKED version
// // recommended if bpc <= 16 bit
// #define A_HALF
// ...
// // Include the portability header (or copy it in without an include).
// #include "ffx_a.h"
// ...
// // Define LDS variables
// shared AF4 spdIntermediate[16][16]; // HLSL: groupshared
// shared AU1 spdCounter; // HLSL: groupshared
// // PACKED version:
// shared AH4 spdIntermediate[16][16]; // HLSL: groupshared
// // Note: You can also use
// shared AF1 spdIntermediateR[16][16];
// shared AF1 spdIntermediateG[16][16];
// shared AF1 spdIntermediateB[16][16];
// shared AF1 spdIntermediateA[16][16];
// // or, for the PACKED version:
// shared AH2 spdIntermediateRG[16][16];
// shared AH2 spdIntermediateBA[16][16];
// // This is potentially faster
// // Adapt your load and store functions accordingly, as in the sketch below
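// // For example, a minimal sketch of the intermediate load/store functions when using the split R/G/B/A LDS arrays
// // declared above (non-packed version); this is only one possible layout, adapt it as needed:
// AF4 SpdLoadIntermediate(AU1 x, AU1 y)
// {
//     return AF4(spdIntermediateR[x][y], spdIntermediateG[x][y], spdIntermediateB[x][y], spdIntermediateA[x][y]);
// }
// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value)
// {
//     spdIntermediateR[x][y] = value.x;
//     spdIntermediateG[x][y] = value.y;
//     spdIntermediateB[x][y] = value.z;
//     spdIntermediateA[x][y] = value.w;
// }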
// // if subgroup operations are not supported / you can't use SM6.0:
// #define SPD_NO_WAVE_OPERATIONS
// // Define the fetch function(s) and the reduction function
// // if you have non-power-of-2 textures, add border controls to the load and store functions
// // to make sure the borders of the mip levels look the way you want them to
// // if you don't add border controls, you'll read zeros past the border
// // if you load with a sampler, this is obviously handled by your sampler :)
// // this is also the place where you need to do a color space transformation if needed
// // E.g. if your texture format is SRGB/UNORM and you use the UAV load and store functions,
// // no automatic to/from linear conversions happen
// // there are to/from linear conversions when using the sampler and render target approach
// // conversion to linear (load function): x*x
// // conversion from linear (store function): sqrt()
// // The AU1 slice parameter is for cube textures and Texture2DArray;
// // if downsampling a Texture2D you can ignore this parameter, otherwise use it to access the correct slice
// // Load from source image
// GLSL: AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){return imageLoad(imgSrc, p);}
// HLSL: AF4 SpdLoadSourceImage(ASU2 tex, AU1 slice){return imgSrc[tex];}
// [SAMPLER] don't forget to add the define SPD_LINEAR_SAMPLER :)
// GLSL:
// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){
//     AF2 textureCoord = p * invInputSize + invInputSize;
//     return texture(sampler2D(imgSrc, srcSampler), textureCoord);
// }
// HLSL:
// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){
//     AF2 textureCoord = p * invInputSize + invInputSize;
//     return imgSrc.SampleLevel(srcSampler, textureCoord, 0);
// }
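// // If the source is a cube texture or a Texture2DArray, the slice parameter selects the layer. A minimal sketch of
// // how the load/store functions could be adapted, assuming imgSrc/imgDst are declared as array images as mentioned
// // near the resource declarations above (the declarations in comments here are illustrative, not prescriptive):
// // GLSL:
// // layout(set=0,binding=0,rgba16f) uniform image2DArray imgSrc;
// // layout(set=0,binding=1,rgba16f) uniform coherent image2DArray imgDst[12];
// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){return imageLoad(imgSrc, ASU3(p, slice));}
// void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], ASU3(p, slice), value);}
// // HLSL:
// // [[vk::binding(0)]] Texture2DArray<float4> imgSrc;
// // [[vk::binding(1)]] globallycoherent RWTexture2DArray<float4> imgDst[12];
// AF4 SpdLoadSourceImage(ASU2 tex, AU1 slice){return imgSrc[uint3(tex, slice)];}
// void SpdStore(ASU2 pix, AF4 value, AU1 mip, AU1 slice){imgDst[mip][uint3(pix, slice)] = value;}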
// // SpdLoad() takes a 32-bit signed integer 2D coordinate and loads color.
// // It loads from mip level 5; each value there was computed by a different thread group,
// // and the last active thread group reads all of its elements and computes the subsequent mips
// // reminder: if you have non-power-of-2 textures, add border controls if you do not want to read zeros past the border
// GLSL: AF4 SpdLoad(ASU2 p, AU1 slice){return imageLoad(imgDst[5],p);}
// HLSL: AF4 SpdLoad(ASU2 tex, AU1 slice){return imgDst[5][tex];}
// // Define the store function
// GLSL: void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, value);}
// HLSL: void SpdStore(ASU2 pix, AF4 value, AU1 mip, AU1 slice){imgDst[mip][pix] = value;}
// // Define the atomic counter increase function
// // each slice only reads and stores to its specific slice counter,
// // so if you have several slices it's
// // InterlockedAdd(spdGlobalAtomic[0].counter[slice], 1, spdCounter);
// // GLSL (single counter):
// void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter, 1);}
// AU1 SpdGetAtomicCounter() {return spdCounter;}
// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic.counter = 0;}
// // HLSL (single counter):
// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter, 1, spdCounter);}
// AU1 SpdGetAtomicCounter(){return spdCounter;}
// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic[0].counter = 0;}
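// // For several slices (cube texture / Texture2DArray), a minimal sketch of the per-slice counter array variant the
// // comments above refer to; the array size must match your slice count (6 here, as for a cube texture):
// // GLSL:
// // layout(std430, set=0, binding=2) coherent buffer SpdGlobalAtomicBuffer { uint counter[6]; } spdGlobalAtomic;
// void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter[slice], 1);}
// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic.counter[slice] = 0;}
// // HLSL:
// // struct SpdGlobalAtomicBuffer { uint counter[6]; };
// // [[vk::binding(2)]] globallycoherent RWStructuredBuffer<SpdGlobalAtomicBuffer> spdGlobalAtomic;
// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter[slice], 1, spdCounter);}
// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic[0].counter[slice] = 0;}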
// // Define the LDS load and store functions
// // GLSL:
// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];}
// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;}
// // HLSL:
// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];}
// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;}
// // Define your reduction function: it takes the four 2x2 values as input and returns 1 output value
// // Example below: computes the average value
// AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return (v0+v1+v2+v3)*0.25;}
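// // The reduction is freely definable; for instance, a max reduction (e.g. for building a depth or occlusion
// // pyramid) could look like this - just a sketch of an alternative, not part of the required interface:
// AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return max(max(v0, v1), max(v2, v3));}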
// // PACKED VERSION
// // Load from source image
// GLSL: AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){return AH4(imageLoad(imgSrc, p));}
// HLSL: AH4 SpdLoadSourceImageH(ASU2 tex, AU1 slice){return AH4(imgSrc[tex]);}
// [SAMPLER]
// GLSL:
// AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){
//     AF2 textureCoord = p * invInputSize + invInputSize;
//     return AH4(texture(sampler2D(imgSrc, srcSampler), textureCoord));
// }
// HLSL:
// AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){
//     AF2 textureCoord = p * invInputSize + invInputSize;
//     return AH4(imgSrc.SampleLevel(srcSampler, textureCoord, 0));
// }
// // SpdLoadH() takes a 32-bit signed integer 2D coordinate and loads color.
// // It loads from mip level 5; each value there was computed by a different thread group,
// // and the last active thread group reads all of its elements and computes the subsequent mips
// GLSL: AH4 SpdLoadH(ASU2 p, AU1 slice){return AH4(imageLoad(imgDst[5],p));}
// HLSL: AH4 SpdLoadH(ASU2 tex, AU1 slice){return AH4(imgDst[5][tex]);}
// // Define the store function
// GLSL: void SpdStoreH(ASU2 p, AH4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, AF4(value));}
// HLSL: void SpdStoreH(ASU2 pix, AH4 value, AU1 mip, AU1 slice){imgDst[mip][pix] = AF4(value);}
// // Define the atomic counter increase function
// // GLSL:
// void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter, 1);}
// AU1 SpdGetAtomicCounter() {return spdCounter;}
// // HLSL:
// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter, 1, spdCounter);}
// AU1 SpdGetAtomicCounter(){return spdCounter;}
// // Define the LDS load and store functions
// // GLSL:
// AH4 SpdLoadIntermediateH(AU1 x, AU1 y){return spdIntermediate[x][y];}
// void SpdStoreIntermediateH(AU1 x, AU1 y, AH4 value){spdIntermediate[x][y] = value;}
// // HLSL:
// AH4 SpdLoadIntermediateH(AU1 x, AU1 y){return spdIntermediate[x][y];}
// void SpdStoreIntermediateH(AU1 x, AU1 y, AH4 value){spdIntermediate[x][y] = value;}
// // Define your reduction function: it takes the four 2x2 values as input and returns 1 output value
// // Example below: computes the average value
// AH4 SpdReduce4H(AH4 v0, AH4 v1, AH4 v2, AH4 v3){return (v0+v1+v2+v3)*AH1(0.25);}
// //
// // If you only use the PACKED version:
// #define SPD_PACKED_ONLY
// // Include this SPD (single pass downsampler) header file (or copy it in without an include).
// #include "ffx_spd.h"
// ...
// // Example shader integration
// // GLSL:
// layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
// void main(){
//     // Call the downsampling function
//     // gl_WorkGroupID.z (the slice index) should be 0 if you only downsample a Texture2D!
//     SpdDownsample(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex),
//                   AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(gl_WorkGroupID.z));
//
//     // PACKED:
//     SpdDownsampleH(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex),
//                    AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(gl_WorkGroupID.z));
// ...
// // HLSL:
// [numthreads(256,1,1)]
// void main(uint3 WorkGroupId : SV_GroupID, uint LocalThreadIndex : SV_GroupIndex) {
//     SpdDownsample(AU2(WorkGroupId.xy), AU1(LocalThreadIndex),
//                   AU1(mips), AU1(numWorkGroups), AU1(WorkGroupId.z));
//
//     // PACKED:
//     SpdDownsampleH(AU2(WorkGroupId.xy), AU1(LocalThreadIndex),
//                    AU1(mips), AU1(numWorkGroups), AU1(WorkGroupId.z));
// ...
//
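// // For orientation, the pieces above assembled into one minimal (non-packed, non-sampler, single-slice) GLSL
// // compute shader. This is only a sketch using the bindings and names from this summary, not a drop-in file:
// #version 450
// #extension GL_GOOGLE_include_directive : require
// layout(set=0,binding=0,rgba16f) uniform image2D imgSrc;
// layout(set=0,binding=1,rgba16f) uniform coherent image2D imgDst[12];
// layout(std430, set=0, binding=2) coherent buffer SpdGlobalAtomicBuffer { uint counter; } spdGlobalAtomic;
// layout(push_constant) uniform SpdConstants { uint mips; uint numWorkGroups; vec2 workGroupOffset; } spdConstants;
// #define A_GPU 1
// #define A_GLSL 1
// #include "ffx_a.h"
// shared AF4 spdIntermediate[16][16];
// shared AU1 spdCounter;
// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){return imageLoad(imgSrc, p);}
// AF4 SpdLoad(ASU2 p, AU1 slice){return imageLoad(imgDst[5], p);}
// void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, value);}
// void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter, 1);}
// AU1 SpdGetAtomicCounter(){return spdCounter;}
// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic.counter = 0;}
// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];}
// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;}
// AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return (v0+v1+v2+v3)*0.25;}
// #include "ffx_spd.h"
// layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
// void main()
// {
//     SpdDownsample(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex),
//                   AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(gl_WorkGroupID.z),
//                   AU2(spdConstants.workGroupOffset));
// }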
//------------------------------------------------------------------------------------------------------------------------------
//==============================================================================================================================
// SPD Setup
//==============================================================================================================================
#ifdef A_CPU
A_STATIC void SpdSetup(outAU2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
                       outAU2 workGroupOffset, // GPU side: pass in as constant
                       outAU2 numWorkGroupsAndMips, // GPU side: pass in as constant
                       inAU4 rectInfo, // left, top, width, height
                       ASU1 mips // optional: if -1, calculate based on rect width and height
)
{
    workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left
    workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top
    AU1 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width
    AU1 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height
    dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0];
    dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1];
    numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]);
    if(mips >= 0)
    {
        numWorkGroupsAndMips[1] = AU1(mips);
    }
    else
    { // calculate based on rect width and height
        AU1 resolution = AMaxU1(rectInfo[2], rectInfo[3]);
        numWorkGroupsAndMips[1] = AU1((AMinF1(AFloorF1(ALog2F1(AF1(resolution))), AF1(12))));
    }
}
A_STATIC void SpdSetup(outAU2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
                       outAU2 workGroupOffset, // GPU side: pass in as constant
                       outAU2 numWorkGroupsAndMips, // GPU side: pass in as constant
                       inAU4 rectInfo // left, top, width, height
)
{
    SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1);
}
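// A quick worked example of the values SpdSetup produces (full-texture case, left = top = 0):
// for a 2560x1440 source texture, rectInfo = (0, 0, 2560, 1440) gives
//   dispatchThreadGroupCountXY = (40, 23)  // ceil(2560/64), ceil(1440/64)
//   numWorkGroupsAndMips[0]    = 920       // 40 * 23 thread groups per slice
//   numWorkGroupsAndMips[1]    = 11        // min(floor(log2(2560)), 12)
//   workGroupOffset            = (0, 0)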
#endif // #ifdef A_CPU
  332. //==============================================================================================================================
  333. // NON-PACKED VERSION
  334. //==============================================================================================================================
  335. #ifdef A_GPU
  336. # ifdef SPD_PACKED_ONLY
  337. // Avoid compiler error
  338. AF4 SpdLoadSourceImage(ASU2 p, AU1 slice)
  339. {
  340. return AF4(0.0, 0.0, 0.0, 0.0);
  341. }
  342. AF4 SpdLoad(ASU2 p, AU1 slice)
  343. {
  344. return AF4(0.0, 0.0, 0.0, 0.0);
  345. }
  346. void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice)
  347. {
  348. }
  349. AF4 SpdLoadIntermediate(AU1 x, AU1 y)
  350. {
  351. return AF4(0.0, 0.0, 0.0, 0.0);
  352. }
  353. void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value)
  354. {
  355. }
  356. AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3)
  357. {
  358. return AF4(0.0, 0.0, 0.0, 0.0);
  359. }
  360. # endif // #ifdef SPD_PACKED_ONLY
  361. //_____________________________________________________________/\_______________________________________________________________
  362. # if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
  363. # extension GL_KHR_shader_subgroup_quad : require
  364. # endif
  365. void SpdWorkgroupShuffleBarrier()
  366. {
  367. # ifdef A_GLSL
  368. barrier();
  369. # endif
  370. # ifdef A_HLSL
  371. GroupMemoryBarrierWithGroupSync();
  372. # endif
  373. }
  374. // Only last active workgroup should proceed
  375. bool SpdExitWorkgroup(AU1 numWorkGroups, AU1 localInvocationIndex, AU1 slice)
  376. {
  377. // global atomic counter
  378. if(localInvocationIndex == 0u)
  379. {
  380. SpdIncreaseAtomicCounter(slice);
  381. }
  382. SpdWorkgroupShuffleBarrier();
  383. return (SpdGetAtomicCounter() != (numWorkGroups - 1u));
  384. }
  385. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  386. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  387. // User defined: AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3);
  388. AF4 SpdReduceQuad(AF4 v)
  389. {
  390. # if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
  391. AF4 v0 = v;
  392. AF4 v1 = subgroupQuadSwapHorizontal(v);
  393. AF4 v2 = subgroupQuadSwapVertical(v);
  394. AF4 v3 = subgroupQuadSwapDiagonal(v);
  395. return SpdReduce4(v0, v1, v2, v3);
  396. # elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
  397. // requires SM6.0
  398. AU1 quad = WaveGetLaneIndex() & (~0x3);
  399. AF4 v0 = v;
  400. AF4 v1 = WaveReadLaneAt(v, quad | 1);
  401. AF4 v2 = WaveReadLaneAt(v, quad | 2);
  402. AF4 v3 = WaveReadLaneAt(v, quad | 3);
  403. return SpdReduce4(v0, v1, v2, v3);
  404. /*
  405. // if SM6.0 is not available, you can use the AMD shader intrinsics
  406. // the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
  407. // https://gpuopen.com/amd-gpu-services-ags-library/
  408. // works for DX11
  409. AF4 v0 = v;
  410. AF4 v1;
  411. v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
  412. v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
  413. v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
  414. v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
  415. AF4 v2;
  416. v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
  417. v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
  418. v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
  419. v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
  420. AF4 v3;
  421. v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
  422. v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
  423. v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
  424. v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
  425. return SpdReduce4(v0, v1, v2, v3);
  426. */
  427. # endif
  428. return v;
  429. }
  430. AF4 SpdReduceIntermediate(AU2 i0, AU2 i1, AU2 i2, AU2 i3)
  431. {
  432. AF4 v0 = SpdLoadIntermediate(i0.x, i0.y);
  433. AF4 v1 = SpdLoadIntermediate(i1.x, i1.y);
  434. AF4 v2 = SpdLoadIntermediate(i2.x, i2.y);
  435. AF4 v3 = SpdLoadIntermediate(i3.x, i3.y);
  436. return SpdReduce4(v0, v1, v2, v3);
  437. }
  438. AF4 SpdReduceLoad4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
  439. {
  440. AF4 v0 = SpdLoad(i0, slice);
  441. AF4 v1 = SpdLoad(i1, slice);
  442. AF4 v2 = SpdLoad(i2, slice);
  443. AF4 v3 = SpdLoad(i3, slice);
  444. return SpdReduce4(v0, v1, v2, v3);
  445. }
  446. AF4 SpdReduceLoad4(AU2 base, AU1 slice)
  447. {
  448. return SpdReduceLoad4(AU2(base + AU2(0, 0)), AU2(base + AU2(0, 1)), AU2(base + AU2(1, 0)), AU2(base + AU2(1, 1)),
  449. slice);
  450. }
  451. AF4 SpdReduceLoadSourceImage4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
  452. {
  453. AF4 v0 = SpdLoadSourceImage(i0, slice);
  454. AF4 v1 = SpdLoadSourceImage(i1, slice);
  455. AF4 v2 = SpdLoadSourceImage(i2, slice);
  456. AF4 v3 = SpdLoadSourceImage(i3, slice);
  457. return SpdReduce4(v0, v1, v2, v3);
  458. }
  459. AF4 SpdReduceLoadSourceImage(AU2 base, AU1 slice)
  460. {
  461. # ifdef SPD_LINEAR_SAMPLER
  462. return SpdLoadSourceImage(base, slice);
  463. # else
  464. return SpdReduceLoadSourceImage4(AU2(base + AU2(0, 0)), AU2(base + AU2(0, 1)), AU2(base + AU2(1, 0)),
  465. AU2(base + AU2(1, 1)), slice);
  466. # endif
  467. }
  468. void SpdDownsampleMips_0_1_Intrinsics(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  469. {
  470. AF4 v[4];
  471. AU2 tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u, y * 2u);
  472. AU2 pix = AU2(workGroupID.xy * 32u) + AU2(x, y);
  473. v[0] = SpdReduceLoadSourceImage(tex, slice);
  474. SpdStore(pix, v[0], 0u, slice);
  475. tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u + 32u, y * 2u);
  476. pix = AU2(workGroupID.xy * 32u) + AU2(x + 16u, y);
  477. v[1] = SpdReduceLoadSourceImage(tex, slice);
  478. SpdStore(pix, v[1], 0u, slice);
  479. tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u, y * 2u + 32u);
  480. pix = AU2(workGroupID.xy * 32u) + AU2(x, y + 16u);
  481. v[2] = SpdReduceLoadSourceImage(tex, slice);
  482. SpdStore(pix, v[2], 0u, slice);
  483. tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u + 32u, y * 2u + 32u);
  484. pix = AU2(workGroupID.xy * 32u) + AU2(x + 16u, y + 16u);
  485. v[3] = SpdReduceLoadSourceImage(tex, slice);
  486. SpdStore(pix, v[3], 0u, slice);
  487. if(mip <= 1u)
  488. return;
  489. v[0] = SpdReduceQuad(v[0]);
  490. v[1] = SpdReduceQuad(v[1]);
  491. v[2] = SpdReduceQuad(v[2]);
  492. v[3] = SpdReduceQuad(v[3]);
  493. if((localInvocationIndex % 4u) == 0u)
  494. {
  495. SpdStore(AU2(workGroupID.xy * 16u) + AU2(x / 2u, y / 2u), v[0], 1u, slice);
  496. SpdStoreIntermediate(x / 2u, y / 2u, v[0]);
  497. SpdStore(AU2(workGroupID.xy * 16u) + AU2(x / 2u + 8u, y / 2u), v[1], 1u, slice);
  498. SpdStoreIntermediate(x / 2u + 8u, y / 2u, v[1]);
  499. SpdStore(AU2(workGroupID.xy * 16u) + AU2(x / 2u, y / 2u + 8u), v[2], 1u, slice);
  500. SpdStoreIntermediate(x / 2u, y / 2u + 8u, v[2]);
  501. SpdStore(AU2(workGroupID.xy * 16u) + AU2(x / 2u + 8u, y / 2u + 8u), v[3], 1u, slice);
  502. SpdStoreIntermediate(x / 2u + 8u, y / 2u + 8u, v[3]);
  503. }
  504. }
  505. void SpdDownsampleMips_0_1_LDS(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  506. {
  507. AF4 v[4];
  508. AU2 tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u, y * 2u);
  509. AU2 pix = AU2(workGroupID.xy * 32u) + AU2(x, y);
  510. v[0] = SpdReduceLoadSourceImage(tex, slice);
  511. SpdStore(pix, v[0], 0u, slice);
  512. tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u + 32u, y * 2u);
  513. pix = AU2(workGroupID.xy * 32u) + AU2(x + 16u, y);
  514. v[1] = SpdReduceLoadSourceImage(tex, slice);
  515. SpdStore(pix, v[1], 0u, slice);
  516. tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u, y * 2u + 32u);
  517. pix = AU2(workGroupID.xy * 32u) + AU2(x, y + 16u);
  518. v[2] = SpdReduceLoadSourceImage(tex, slice);
  519. SpdStore(pix, v[2], 0u, slice);
  520. tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u + 32u, y * 2u + 32u);
  521. pix = AU2(workGroupID.xy * 32u) + AU2(x + 16u, y + 16u);
  522. v[3] = SpdReduceLoadSourceImage(tex, slice);
  523. SpdStore(pix, v[3], 0u, slice);
  524. if(mip <= 1u)
  525. return;
  526. for(AU1 i = 0u; i < 4u; i++)
  527. {
  528. SpdStoreIntermediate(x, y, v[i]);
  529. SpdWorkgroupShuffleBarrier();
  530. if(localInvocationIndex < 64u)
  531. {
  532. v[i] = SpdReduceIntermediate(AU2(x * 2u + 0u, y * 2u + 0u), AU2(x * 2u + 1u, y * 2u + 0u),
  533. AU2(x * 2u + 0u, y * 2u + 1u), AU2(x * 2u + 1u, y * 2u + 1u));
  534. SpdStore(AU2(workGroupID.xy * 16u) + AU2(x + (i % 2u) * 8u, y + (i / 2u) * 8u), v[i], 1u, slice);
  535. }
  536. SpdWorkgroupShuffleBarrier();
  537. }
  538. if(localInvocationIndex < 64u)
  539. {
  540. SpdStoreIntermediate(x + 0u, y + 0u, v[0]);
  541. SpdStoreIntermediate(x + 8u, y + 0u, v[1]);
  542. SpdStoreIntermediate(x + 0u, y + 8u, v[2]);
  543. SpdStoreIntermediate(x + 8u, y + 8u, v[3]);
  544. }
  545. }
  546. void SpdDownsampleMips_0_1(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  547. {
  548. # ifdef SPD_NO_WAVE_OPERATIONS
  549. SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice);
  550. # else
  551. SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice);
  552. # endif
  553. }
  554. void SpdDownsampleMip_2(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  555. {
  556. # ifdef SPD_NO_WAVE_OPERATIONS
  557. if(localInvocationIndex < 64u)
  558. {
  559. AF4 v = SpdReduceIntermediate(AU2(x * 2u + 0u, y * 2u + 0u), AU2(x * 2u + 1u, y * 2u + 0u),
  560. AU2(x * 2u + 0u, y * 2u + 1u), AU2(x * 2u + 1u, y * 2u + 1u));
  561. SpdStore(AU2(workGroupID.xy * 8u) + AU2(x, y), v, mip, slice);
  562. // store to LDS, try to reduce bank conflicts
  563. // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
  564. // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  565. // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
  566. // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  567. // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
  568. // ...
  569. // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
  570. SpdStoreIntermediate(x * 2u + y % 2u, y * 2u, v);
  571. }
  572. # else
  573. AF4 v = SpdLoadIntermediate(x, y);
  574. v = SpdReduceQuad(v);
  575. // quad index 0 stores result
  576. if(localInvocationIndex % 4u == 0u)
  577. {
  578. SpdStore(AU2(workGroupID.xy * 8u) + AU2(x / 2u, y / 2u), v, mip, slice);
  579. SpdStoreIntermediate(x + (y / 2u) % 2u, y, v);
  580. }
  581. # endif
  582. }
  583. void SpdDownsampleMip_3(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  584. {
  585. # ifdef SPD_NO_WAVE_OPERATIONS
  586. if(localInvocationIndex < 16u)
  587. {
  588. // x 0 x 0
  589. // 0 0 0 0
  590. // 0 x 0 x
  591. // 0 0 0 0
  592. AF4 v = SpdReduceIntermediate(AU2(x * 4u + 0u + 0u, y * 4u + 0u), AU2(x * 4u + 2u + 0u, y * 4u + 0u),
  593. AU2(x * 4u + 0u + 1u, y * 4u + 2u), AU2(x * 4u + 2u + 1u, y * 4u + 2u));
  594. SpdStore(AU2(workGroupID.xy * 4u) + AU2(x, y), v, mip, slice);
  595. // store to LDS
  596. // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
  597. // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  598. // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  599. // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  600. // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
  601. // ...
  602. // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
  603. // ...
  604. // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
  605. // ...
  606. SpdStoreIntermediate(x * 4u + y, y * 4u, v);
  607. }
  608. # else
  609. if(localInvocationIndex < 64u)
  610. {
  611. AF4 v = SpdLoadIntermediate(x * 2u + y % 2u, y * 2u);
  612. v = SpdReduceQuad(v);
  613. // quad index 0 stores result
  614. if(localInvocationIndex % 4u == 0u)
  615. {
  616. SpdStore(AU2(workGroupID.xy * 4u) + AU2(x / 2u, y / 2u), v, mip, slice);
  617. SpdStoreIntermediate(x * 2u + y / 2u, y * 2u, v);
  618. }
  619. }
  620. # endif
  621. }
  622. void SpdDownsampleMip_4(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  623. {
  624. # ifdef SPD_NO_WAVE_OPERATIONS
  625. if(localInvocationIndex < 4u)
  626. {
  627. // x 0 0 0 x 0 0 0
  628. // ...
  629. // 0 x 0 0 0 x 0 0
  630. AF4 v = SpdReduceIntermediate(
  631. AU2(x * 8u + 0u + 0u + y * 2u, y * 8u + 0u), AU2(x * 8u + 4u + 0u + y * 2u, y * 8u + 0u),
  632. AU2(x * 8u + 0u + 1u + y * 2u, y * 8u + 4u), AU2(x * 8u + 4u + 1u + y * 2u, y * 8u + 4u));
  633. SpdStore(AU2(workGroupID.xy * 2u) + AU2(x, y), v, mip, slice);
  634. // store to LDS
  635. // x x x x 0 ...
  636. // 0 ...
  637. SpdStoreIntermediate(x + y * 2u, 0u, v);
  638. }
  639. # else
  640. if(localInvocationIndex < 16u)
  641. {
  642. AF4 v = SpdLoadIntermediate(x * 4u + y, y * 4u);
  643. v = SpdReduceQuad(v);
  644. // quad index 0 stores result
  645. if(localInvocationIndex % 4u == 0u)
  646. {
  647. SpdStore(AU2(workGroupID.xy * 2u) + AU2(x / 2u, y / 2u), v, mip, slice);
  648. SpdStoreIntermediate(x / 2u + y, 0u, v);
  649. }
  650. }
  651. # endif
  652. }
  653. void SpdDownsampleMip_5(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  654. {
  655. # ifdef SPD_NO_WAVE_OPERATIONS
  656. if(localInvocationIndex < 1u)
  657. {
  658. // x x x x 0 ...
  659. // 0 ...
  660. AF4 v = SpdReduceIntermediate(AU2(0, 0), AU2(1, 0), AU2(2, 0), AU2(3, 0));
  661. SpdStore(AU2(workGroupID.xy), v, mip, slice);
  662. }
  663. # else
  664. if(localInvocationIndex < 4u)
  665. {
  666. AF4 v = SpdLoadIntermediate(localInvocationIndex, 0u);
  667. v = SpdReduceQuad(v);
  668. // quad index 0 stores result
  669. if(localInvocationIndex % 4u == 0u)
  670. {
  671. SpdStore(AU2(workGroupID.xy), v, mip, slice);
  672. }
  673. }
  674. # endif
  675. }
  676. void SpdDownsampleMips_6_7(AU1 x, AU1 y, AU1 mips, AU1 slice)
  677. {
  678. AU2 tex = AU2(x * 4u + 0u, y * 4u + 0u);
  679. AU2 pix = AU2(x * 2u + 0u, y * 2u + 0u);
  680. AF4 v0 = SpdReduceLoad4(tex, slice);
  681. SpdStore(pix, v0, 6u, slice);
  682. tex = AU2(x * 4u + 2u, y * 4u + 0u);
  683. pix = AU2(x * 2u + 1u, y * 2u + 0u);
  684. AF4 v1 = SpdReduceLoad4(tex, slice);
  685. SpdStore(pix, v1, 6u, slice);
  686. tex = AU2(x * 4u + 0u, y * 4u + 2u);
  687. pix = AU2(x * 2u + 0u, y * 2u + 1u);
  688. AF4 v2 = SpdReduceLoad4(tex, slice);
  689. SpdStore(pix, v2, 6u, slice);
  690. tex = AU2(x * 4u + 2u, y * 4u + 2u);
  691. pix = AU2(x * 2u + 1u, y * 2u + 1u);
  692. AF4 v3 = SpdReduceLoad4(tex, slice);
  693. SpdStore(pix, v3, 6u, slice);
  694. if(mips <= 7u)
  695. return;
  696. // no barrier needed, working on values only from the same thread
  697. AF4 v = SpdReduce4(v0, v1, v2, v3);
  698. SpdStore(AU2(x, y), v, 7u, slice);
  699. SpdStoreIntermediate(x, y, v);
  700. }
  701. void SpdDownsampleNextFour(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice)
  702. {
  703. if(mips <= baseMip)
  704. return;
  705. SpdWorkgroupShuffleBarrier();
  706. SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);
  707. if(mips <= baseMip + 1u)
  708. return;
  709. SpdWorkgroupShuffleBarrier();
  710. SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1u, slice);
  711. if(mips <= baseMip + 2u)
  712. return;
  713. SpdWorkgroupShuffleBarrier();
  714. SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2u, slice);
  715. if(mips <= baseMip + 3u)
  716. return;
  717. SpdWorkgroupShuffleBarrier();
  718. SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3u, slice);
  719. }
  720. void SpdDownsample(AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 numWorkGroups, AU1 slice)
  721. {
  722. AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64u);
  723. AU1 x = sub_xy.x + 8u * ((localInvocationIndex >> 6u) % 2u);
  724. AU1 y = sub_xy.y + 8u * ((localInvocationIndex >> 7u));
  725. SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);
  726. SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2u, mips, slice);
  727. if(mips <= 6u)
  728. return;
  729. if(SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
  730. return;
  731. SpdResetAtomicCounter(slice);
  732. // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
  733. SpdDownsampleMips_6_7(x, y, mips, slice);
  734. SpdDownsampleNextFour(x, y, AU2(0, 0), localInvocationIndex, 8u, mips, slice);
  735. }
  736. void SpdDownsample(AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 numWorkGroups, AU1 slice,
  737. AU2 workGroupOffset)
  738. {
  739. SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
  740. }
  741. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  742. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  743. //==============================================================================================================================
  744. // PACKED VERSION
  745. //==============================================================================================================================
  746. # ifdef A_HALF
  747. # ifdef A_GLSL
  748. # extension GL_EXT_shader_subgroup_extended_types_float16 : require
  749. # endif
  750. AH4 SpdReduceQuadH(AH4 v)
  751. {
  752. # if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
  753. AH4 v0 = v;
  754. AH4 v1 = subgroupQuadSwapHorizontal(v);
  755. AH4 v2 = subgroupQuadSwapVertical(v);
  756. AH4 v3 = subgroupQuadSwapDiagonal(v);
  757. return SpdReduce4H(v0, v1, v2, v3);
  758. # elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
  759. // requires SM6.0
  760. AU1 quad = WaveGetLaneIndex() & (~0x3);
  761. AH4 v0 = v;
  762. AH4 v1 = WaveReadLaneAt(v, quad | 1);
  763. AH4 v2 = WaveReadLaneAt(v, quad | 2);
  764. AH4 v3 = WaveReadLaneAt(v, quad | 3);
  765. return SpdReduce4H(v0, v1, v2, v3);
  766. /*
  767. // if SM6.0 is not available, you can use the AMD shader intrinsics
  768. // the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
  769. // https://gpuopen.com/amd-gpu-services-ags-library/
  770. // works for DX11
  771. AH4 v0 = v;
  772. AH4 v1;
  773. v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
  774. v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
  775. v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
  776. v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
  777. AH4 v2;
  778. v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
  779. v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
  780. v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
  781. v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
  782. AH4 v3;
  783. v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
  784. v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
  785. v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
  786. v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
  787. return SpdReduce4H(v0, v1, v2, v3);
  788. */
  789. # endif
  790. return AH4(0.0, 0.0, 0.0, 0.0);
  791. }
  792. AH4 SpdReduceIntermediateH(AU2 i0, AU2 i1, AU2 i2, AU2 i3)
  793. {
  794. AH4 v0 = SpdLoadIntermediateH(i0.x, i0.y);
  795. AH4 v1 = SpdLoadIntermediateH(i1.x, i1.y);
  796. AH4 v2 = SpdLoadIntermediateH(i2.x, i2.y);
  797. AH4 v3 = SpdLoadIntermediateH(i3.x, i3.y);
  798. return SpdReduce4H(v0, v1, v2, v3);
  799. }
  800. AH4 SpdReduceLoad4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
  801. {
  802. AH4 v0 = SpdLoadH(ASU2(i0), slice);
  803. AH4 v1 = SpdLoadH(ASU2(i1), slice);
  804. AH4 v2 = SpdLoadH(ASU2(i2), slice);
  805. AH4 v3 = SpdLoadH(ASU2(i3), slice);
  806. return SpdReduce4H(v0, v1, v2, v3);
  807. }
  808. AH4 SpdReduceLoad4H(AU2 base, AU1 slice)
  809. {
  810. return SpdReduceLoad4H(AU2(base + AU2(0, 0)), AU2(base + AU2(0, 1)), AU2(base + AU2(1, 0)), AU2(base + AU2(1, 1)),
  811. slice);
  812. }
  813. AH4 SpdReduceLoadSourceImage4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
  814. {
  815. AH4 v0 = SpdLoadSourceImageH(ASU2(i0), slice);
  816. AH4 v1 = SpdLoadSourceImageH(ASU2(i1), slice);
  817. AH4 v2 = SpdLoadSourceImageH(ASU2(i2), slice);
  818. AH4 v3 = SpdLoadSourceImageH(ASU2(i3), slice);
  819. return SpdReduce4H(v0, v1, v2, v3);
  820. }
  821. AH4 SpdReduceLoadSourceImageH(AU2 base, AU1 slice)
  822. {
  823. # ifdef SPD_LINEAR_SAMPLER
  824. return SpdLoadSourceImageH(ASU2(base), slice);
  825. # else
  826. return SpdReduceLoadSourceImage4H(AU2(base + AU2(0, 0)), AU2(base + AU2(0, 1)), AU2(base + AU2(1, 0)),
  827. AU2(base + AU2(1, 1)), slice);
  828. # endif
  829. }
  830. void SpdDownsampleMips_0_1_IntrinsicsH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
  831. {
  832. AH4 v[4];
  833. ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
  834. ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
  835. v[0] = SpdReduceLoadSourceImageH(tex, slice);
  836. SpdStoreH(pix, v[0], 0, slice);
  837. tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
  838. pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
  839. v[1] = SpdReduceLoadSourceImageH(tex, slice);
  840. SpdStoreH(pix, v[1], 0, slice);
  841. tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
  842. pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
  843. v[2] = SpdReduceLoadSourceImageH(tex, slice);
  844. SpdStoreH(pix, v[2], 0, slice);
  845. tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
  846. pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
  847. v[3] = SpdReduceLoadSourceImageH(tex, slice);
  848. SpdStoreH(pix, v[3], 0, slice);
  849. if(mips <= 1)
  850. return;
  851. v[0] = SpdReduceQuadH(v[0]);
  852. v[1] = SpdReduceQuadH(v[1]);
  853. v[2] = SpdReduceQuadH(v[2]);
  854. v[3] = SpdReduceQuadH(v[3]);
  855. if((localInvocationIndex % 4) == 0)
  856. {
  857. SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2, y / 2), v[0], 1, slice);
  858. SpdStoreIntermediateH(x / 2, y / 2, v[0]);
  859. SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2 + 8, y / 2), v[1], 1, slice);
  860. SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]);
  861. SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2, y / 2 + 8), v[2], 1, slice);
  862. SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]);
  863. SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
  864. SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]);
  865. }
  866. }
  867. void SpdDownsampleMips_0_1_LDSH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
  868. {
  869. AH4 v[4];
  870. ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
  871. ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
  872. v[0] = SpdReduceLoadSourceImageH(tex, slice);
  873. SpdStoreH(pix, v[0], 0, slice);
  874. tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
  875. pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
  876. v[1] = SpdReduceLoadSourceImageH(tex, slice);
  877. SpdStoreH(pix, v[1], 0, slice);
  878. tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
  879. pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
  880. v[2] = SpdReduceLoadSourceImageH(tex, slice);
  881. SpdStoreH(pix, v[2], 0, slice);
  882. tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
  883. pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
  884. v[3] = SpdReduceLoadSourceImageH(tex, slice);
  885. SpdStoreH(pix, v[3], 0, slice);
  886. if(mips <= 1)
  887. return;
  888. for(int i = 0; i < 4; i++)
  889. {
  890. SpdStoreIntermediateH(x, y, v[i]);
  891. SpdWorkgroupShuffleBarrier();
  892. if(localInvocationIndex < 64u)
  893. {
  894. v[i] = SpdReduceIntermediateH(AU2(x * 2 + 0, y * 2 + 0), AU2(x * 2 + 1, y * 2 + 0),
  895. AU2(x * 2 + 0, y * 2 + 1), AU2(x * 2 + 1, y * 2 + 1));
  896. SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
  897. }
  898. SpdWorkgroupShuffleBarrier();
  899. }
  900. if(localInvocationIndex < 64u)
  901. {
  902. SpdStoreIntermediateH(x + 0, y + 0, v[0]);
  903. SpdStoreIntermediateH(x + 8, y + 0, v[1]);
  904. SpdStoreIntermediateH(x + 0, y + 8, v[2]);
  905. SpdStoreIntermediateH(x + 8, y + 8, v[3]);
  906. }
  907. }
  908. void SpdDownsampleMips_0_1H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
  909. {
  910. # ifdef SPD_NO_WAVE_OPERATIONS
  911. SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice);
  912. # else
  913. SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice);
  914. # endif
  915. }
  916. void SpdDownsampleMip_2H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  917. {
  918. # ifdef SPD_NO_WAVE_OPERATIONS
  919. if(localInvocationIndex < 64u)
  920. {
  921. AH4 v = SpdReduceIntermediateH(AU2(x * 2 + 0, y * 2 + 0), AU2(x * 2 + 1, y * 2 + 0), AU2(x * 2 + 0, y * 2 + 1),
  922. AU2(x * 2 + 1, y * 2 + 1));
  923. SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip, slice);
  924. // store to LDS, try to reduce bank conflicts
  925. // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
  926. // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  927. // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
  928. // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  929. // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
  930. // ...
  931. // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
  932. SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v);
  933. }
  934. # else
  935. AH4 v = SpdLoadIntermediateH(x, y);
  936. v = SpdReduceQuadH(v);
  937. // quad index 0 stores result
  938. if(localInvocationIndex % 4 == 0)
  939. {
  940. SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x / 2, y / 2), v, mip, slice);
  941. SpdStoreIntermediateH(x + (y / 2) % 2, y, v);
  942. }
  943. # endif
  944. }
  945. void SpdDownsampleMip_3H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  946. {
  947. # ifdef SPD_NO_WAVE_OPERATIONS
  948. if(localInvocationIndex < 16)
  949. {
  950. // x 0 x 0
  951. // 0 0 0 0
  952. // 0 x 0 x
  953. // 0 0 0 0
  954. AH4 v = SpdReduceIntermediateH(AU2(x * 4 + 0 + 0, y * 4 + 0), AU2(x * 4 + 2 + 0, y * 4 + 0),
  955. AU2(x * 4 + 0 + 1, y * 4 + 2), AU2(x * 4 + 2 + 1, y * 4 + 2));
  956. SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip, slice);
  957. // store to LDS
  958. // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
  959. // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  960. // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  961. // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  962. // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
  963. // ...
  964. // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
  965. // ...
  966. // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
  967. // ...
  968. SpdStoreIntermediateH(x * 4 + y, y * 4, v);
  969. }
  970. # else
  971. if(localInvocationIndex < 64u)
  972. {
  973. AH4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2);
  974. v = SpdReduceQuadH(v);
  975. // quad index 0 stores result
  976. if(localInvocationIndex % 4 == 0)
  977. {
  978. SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x / 2, y / 2), v, mip, slice);
  979. SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v);
  980. }
  981. }
  982. # endif
  983. }
  984. void SpdDownsampleMip_4H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  985. {
  986. # ifdef SPD_NO_WAVE_OPERATIONS
  987. if(localInvocationIndex < 4)
  988. {
  989. // x 0 0 0 x 0 0 0
  990. // ...
  991. // 0 x 0 0 0 x 0 0
  992. AH4 v = SpdReduceIntermediateH(AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
  993. AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
  994. SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip, slice);
  995. // store to LDS
  996. // x x x x 0 ...
  997. // 0 ...
  998. SpdStoreIntermediateH(x + y * 2, 0, v);
  999. }
  1000. # else
  1001. if(localInvocationIndex < 16)
  1002. {
  1003. AH4 v = SpdLoadIntermediateH(x * 4 + y, y * 4);
  1004. v = SpdReduceQuadH(v);
  1005. // quad index 0 stores result
  1006. if(localInvocationIndex % 4 == 0)
  1007. {
  1008. SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x / 2, y / 2), v, mip, slice);
  1009. SpdStoreIntermediateH(x / 2 + y, 0, v);
  1010. }
  1011. }
  1012. # endif
  1013. }
  1014. void SpdDownsampleMip_5H(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
  1015. {
  1016. # ifdef SPD_NO_WAVE_OPERATIONS
  1017. if(localInvocationIndex < 1)
  1018. {
  1019. // x x x x 0 ...
  1020. // 0 ...
  1021. AH4 v = SpdReduceIntermediateH(AU2(0, 0), AU2(1, 0), AU2(2, 0), AU2(3, 0));
  1022. SpdStoreH(ASU2(workGroupID.xy), v, mip, slice);
  1023. }
  1024. # else
  1025. if(localInvocationIndex < 4)
  1026. {
  1027. AH4 v = SpdLoadIntermediateH(localInvocationIndex, 0);
  1028. v = SpdReduceQuadH(v);
  1029. // quad index 0 stores result
  1030. if(localInvocationIndex % 4 == 0)
  1031. {
  1032. SpdStoreH(ASU2(workGroupID.xy), v, mip, slice);
  1033. }
  1034. }
  1035. # endif
  1036. }
  1037. void SpdDownsampleMips_6_7H(AU1 x, AU1 y, AU1 mips, AU1 slice)
  1038. {
  1039. ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0);
  1040. ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0);
  1041. AH4 v0 = SpdReduceLoad4H(tex, slice);
  1042. SpdStoreH(pix, v0, 6, slice);
  1043. tex = ASU2(x * 4 + 2, y * 4 + 0);
  1044. pix = ASU2(x * 2 + 1, y * 2 + 0);
  1045. AH4 v1 = SpdReduceLoad4H(tex, slice);
  1046. SpdStoreH(pix, v1, 6, slice);
  1047. tex = ASU2(x * 4 + 0, y * 4 + 2);
  1048. pix = ASU2(x * 2 + 0, y * 2 + 1);
  1049. AH4 v2 = SpdReduceLoad4H(tex, slice);
  1050. SpdStoreH(pix, v2, 6, slice);
  1051. tex = ASU2(x * 4 + 2, y * 4 + 2);
  1052. pix = ASU2(x * 2 + 1, y * 2 + 1);
  1053. AH4 v3 = SpdReduceLoad4H(tex, slice);
  1054. SpdStoreH(pix, v3, 6, slice);
  1055. if(mips < 8)
  1056. return;
  1057. // no barrier needed, working on values only from the same thread
  1058. AH4 v = SpdReduce4H(v0, v1, v2, v3);
  1059. SpdStoreH(ASU2(x, y), v, 7, slice);
  1060. SpdStoreIntermediateH(x, y, v);
  1061. }
  1062. void SpdDownsampleNextFourH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice)
  1063. {
  1064. if(mips <= baseMip)
  1065. return;
  1066. SpdWorkgroupShuffleBarrier();
  1067. SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice);
  1068. if(mips <= baseMip + 1)
  1069. return;
  1070. SpdWorkgroupShuffleBarrier();
  1071. SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);
  1072. if(mips <= baseMip + 2)
  1073. return;
  1074. SpdWorkgroupShuffleBarrier();
  1075. SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);
  1076. if(mips <= baseMip + 3)
  1077. return;
  1078. SpdWorkgroupShuffleBarrier();
  1079. SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice);
  1080. }
  1081. void SpdDownsampleH(AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 numWorkGroups, AU1 slice)
  1082. {
  1083. AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64);
  1084. AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
  1085. AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
  1086. SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice);
  1087. SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
  1088. if(mips < 7)
  1089. return;
  1090. if(SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
  1091. return;
  1092. SpdResetAtomicCounter(slice);
  1093. // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
  1094. SpdDownsampleMips_6_7H(x, y, mips, slice);
  1095. SpdDownsampleNextFourH(x, y, AU2(0, 0), localInvocationIndex, 8, mips, slice);
  1096. }
  1097. void SpdDownsampleH(AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 numWorkGroups, AU1 slice,
  1098. AU2 workGroupOffset)
  1099. {
  1100. SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
  1101. }
  1102. # endif // #ifdef A_HALF
  1103. #endif // #ifdef A_GPU