GpuVisibility.cpp 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198
  1. // Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
  2. // All rights reserved.
  3. // Code licensed under the BSD License.
  4. // http://www.anki3d.org/LICENSE
  5. #include <AnKi/Renderer/Utils/GpuVisibility.h>
  6. #include <AnKi/Renderer/Renderer.h>
  7. #include <AnKi/Scene/RenderStateBucket.h>
  8. #include <AnKi/Scene/GpuSceneArray.h>
  9. #include <AnKi/GpuMemory/GpuVisibleTransientMemoryPool.h>
  10. #include <AnKi/GpuMemory/RebarTransientMemoryPool.h>
  11. #include <AnKi/GpuMemory/GpuSceneBuffer.h>
  12. #include <AnKi/Collision/Functions.h>
  13. #include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
  14. #include <AnKi/GpuMemory/UnifiedGeometryBuffer.h>
  15. #include <AnKi/Core/StatsSet.h>
  16. #include <AnKi/Util/CVarSet.h>
  17. #include <AnKi/Util/Tracer.h>
  18. #include <AnKi/Core/App.h>
  19. namespace anki {
  20. constexpr U32 kMaxVisibleObjects = 30 * 1024;
  21. constexpr U32 kMaxVisiblePrimitives = 40'000'000;
  22. constexpr U32 kMaxVisibleMeshlets = kMaxVisiblePrimitives / kMaxPrimitivesPerMeshlet;
  23. static StatCounter g_gpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU vis mem",
  24. StatFlag::kBytes | StatFlag::kMainThreadUpdates | StatFlag::kZeroEveryFrame);
  25. static StatCounter g_maxGpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU vis mem: max ever used/frame",
  26. StatFlag::kBytes | StatFlag::kMainThreadUpdates);
  27. class GpuVisLimits
  28. {
  29. public:
  30. U32 m_maxVisibleLegacyRenderables = 0;
  31. U32 m_totalLegacyRenderables = 0;
  32. U32 m_maxVisibleMeshlets = 0;
  33. };
  34. static GpuVisLimits computeLimits(RenderingTechnique t)
  35. {
  36. GpuVisLimits out;
  37. const RenderStateBucketContainer& buckets = RenderStateBucketContainer::getSingleton();
  38. const U32 meshletUserCount = buckets.getBucketsActiveUserCountWithMeshletSupport(t);
  39. ANKI_ASSERT(meshletUserCount == 0 || (g_meshletRenderingCVar || GrManager::getSingleton().getDeviceCapabilities().m_meshShaders));
  40. out.m_totalLegacyRenderables = buckets.getBucketsActiveUserCountWithNoMeshletSupport(t);
  41. out.m_maxVisibleLegacyRenderables = min(out.m_totalLegacyRenderables, kMaxVisibleObjects);
  42. out.m_maxVisibleMeshlets = (meshletUserCount) ? min(kMaxVisibleMeshlets, buckets.getBucketsLod0MeshletCount(t)) : 0;
  43. return out;
  44. }
  45. class GpuVisMemoryStats : public RendererObject, public MakeSingletonSimple<GpuVisMemoryStats>
  46. {
  47. public:
  48. void informAboutAllocation(PtrSize size)
  49. {
  50. if(m_frameIdx != getRenderer().getFrameCount())
  51. {
  52. // First call in the frame, update the stat var
  53. m_frameIdx = getRenderer().getFrameCount();
  54. m_maxMemUsedInFrame = max(m_maxMemUsedInFrame, m_memUsedThisFrame);
  55. m_memUsedThisFrame = 0;
  56. g_maxGpuVisMemoryAllocatedStatVar.set(m_maxMemUsedInFrame);
  57. }
  58. m_memUsedThisFrame += size;
  59. }
  60. private:
  61. PtrSize m_memUsedThisFrame = 0;
  62. PtrSize m_maxMemUsedInFrame = 0;
  63. U64 m_frameIdx = kMaxU64;
  64. };
  65. template<typename T>
  66. static BufferView allocateStructuredBuffer(U32 count)
  67. {
  68. BufferView out = {};
  69. if(count > 0)
  70. {
  71. g_gpuVisMemoryAllocatedStatVar.increment(sizeof(T) * count);
  72. out = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<T>(count);
  73. GpuVisMemoryStats::getSingleton().informAboutAllocation(sizeof(T) * count);
  74. }
  75. return out;
  76. }
  77. Error GpuVisibility::init()
  78. {
  79. for(MutatorValue hzb = 0; hzb < 2; ++hzb)
  80. {
  81. for(MutatorValue gatherAabbs = 0; gatherAabbs < 2; ++gatherAabbs)
  82. {
  83. for(MutatorValue genHash = 0; genHash < 2; ++genHash)
  84. {
  85. for(MutatorValue gatherMeshlets = 0; gatherMeshlets < 2; ++gatherMeshlets)
  86. {
  87. for(MutatorValue gatherLegacy = 0; gatherLegacy < 2; ++gatherLegacy)
  88. {
  89. if(gatherLegacy == 0 && gatherMeshlets == 0)
  90. {
  91. continue; // Not allowed
  92. }
  93. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage1.ankiprogbin",
  94. {{"HZB_TEST", hzb},
  95. {"DISTANCE_TEST", 0},
  96. {"GATHER_AABBS", gatherAabbs},
  97. {"HASH_VISIBLES", genHash},
  98. {"GATHER_MESHLETS", gatherMeshlets},
  99. {"GATHER_LEGACY", gatherLegacy}},
  100. m_1stStageProg, m_frustumGrProgs[hzb][gatherAabbs][genHash][gatherMeshlets][gatherLegacy]));
  101. }
  102. }
  103. }
  104. }
  105. }
  106. for(MutatorValue gatherAabbs = 0; gatherAabbs < 2; ++gatherAabbs)
  107. {
  108. for(MutatorValue genHash = 0; genHash < 2; ++genHash)
  109. {
  110. for(MutatorValue gatherMeshlets = 0; gatherMeshlets < 2; ++gatherMeshlets)
  111. {
  112. for(MutatorValue gatherLegacy = 0; gatherLegacy < 2; ++gatherLegacy)
  113. {
  114. if(gatherLegacy == 0 && gatherMeshlets == 0)
  115. {
  116. continue; // Not allowed
  117. }
  118. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage1.ankiprogbin",
  119. {{"HZB_TEST", 0},
  120. {"DISTANCE_TEST", 1},
  121. {"GATHER_AABBS", gatherAabbs},
  122. {"HASH_VISIBLES", genHash},
  123. {"GATHER_MESHLETS", gatherMeshlets},
  124. {"GATHER_LEGACY", gatherLegacy}},
  125. m_1stStageProg, m_distGrProgs[gatherAabbs][genHash][gatherMeshlets][gatherLegacy]));
  126. }
  127. }
  128. }
  129. }
  130. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2And3.ankiprogbin",
  131. {{"HZB_TEST", 0}, {"PASSTHROUGH", 0}, {"MESH_SHADERS", 0}, {"STORE_MESHLETS_FAILED_HZB", 1}}, m_2ndStageProg,
  132. m_gatherGrProg, "Legacy"));
  133. for(MutatorValue hzb = 0; hzb < 2; ++hzb)
  134. {
  135. for(MutatorValue passthrough = 0; passthrough < 2; ++passthrough)
  136. {
  137. for(MutatorValue meshShaders = 0; meshShaders < 2; ++meshShaders)
  138. {
  139. for(MutatorValue storeMeshletsFailedHzb = 0; storeMeshletsFailedHzb < 2; ++storeMeshletsFailedHzb)
  140. {
  141. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2And3.ankiprogbin",
  142. {{"HZB_TEST", hzb},
  143. {"PASSTHROUGH", passthrough},
  144. {"MESH_SHADERS", meshShaders},
  145. {"STORE_MESHLETS_FAILED_HZB", storeMeshletsFailedHzb}},
  146. m_2ndStageProg, m_meshletGrProgs[hzb][passthrough][meshShaders][storeMeshletsFailedHzb],
  147. "Meshlets"));
  148. }
  149. }
  150. }
  151. }
  152. return Error::kNone;
  153. }
  154. void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out)
  155. {
  156. ANKI_ASSERT(in.m_lodReferencePoint.x() != kMaxF32);
  157. if(RenderStateBucketContainer::getSingleton().getBucketsActiveUserCount(in.m_technique) == 0) [[unlikely]]
  158. {
  159. // Early exit
  160. in = {};
  161. return;
  162. }
  163. RenderGraphBuilder& rgraph = *in.m_rgraph;
  164. class DistanceTestData
  165. {
  166. public:
  167. Vec3 m_pointOfTest;
  168. F32 m_testRadius;
  169. };
  170. class FrustumTestData
  171. {
  172. public:
  173. RenderTargetHandle m_hzbRt;
  174. Mat4 m_viewProjMat;
  175. UVec2 m_finalRenderTargetSize;
  176. };
  177. FrustumTestData* frustumTestData = nullptr;
  178. DistanceTestData* distTestData = nullptr;
  179. Bool bStoreMeshletsFailedHzb = false;
  180. if(distanceBased)
  181. {
  182. distTestData = newInstance<DistanceTestData>(getRenderer().getFrameMemoryPool());
  183. const DistanceGpuVisibilityInput& din = static_cast<DistanceGpuVisibilityInput&>(in);
  184. distTestData->m_pointOfTest = din.m_pointOfTest;
  185. distTestData->m_testRadius = din.m_testRadius;
  186. }
  187. else
  188. {
  189. frustumTestData = newInstance<FrustumTestData>(getRenderer().getFrameMemoryPool());
  190. const FrustumGpuVisibilityInput& fin = static_cast<FrustumGpuVisibilityInput&>(in);
  191. frustumTestData->m_viewProjMat = fin.m_viewProjectionMatrix;
  192. frustumTestData->m_finalRenderTargetSize = fin.m_viewportSize;
  193. if(fin.m_hzbRt)
  194. {
  195. frustumTestData->m_hzbRt = *fin.m_hzbRt;
  196. }
  197. bStoreMeshletsFailedHzb = fin.m_twoPhaseOcclusionCulling;
  198. }
  199. const Bool firstCallInFrame = m_persistentMemory.m_frameIdx != getRenderer().getFrameCount();
  200. if(firstCallInFrame)
  201. {
  202. m_persistentMemory.m_frameIdx = getRenderer().getFrameCount();
  203. }
  204. // OoM
  205. if(firstCallInFrame)
  206. {
  207. U32 data;
  208. PtrSize dataReadSize;
  209. getRenderer().getReadbackManager().readMostRecentData(m_outOfMemoryReadback, &data, sizeof(data), dataReadSize);
  210. if(dataReadSize == sizeof(U32) && data != 0)
  211. {
  212. CString who;
  213. switch(data)
  214. {
  215. case 0b1:
  216. who = "Stage 1";
  217. break;
  218. case 0b10:
  219. who = "Stage 2";
  220. break;
  221. case 0b11:
  222. who = "Both stages";
  223. break;
  224. default:
  225. ANKI_ASSERT(0);
  226. }
  227. ANKI_RESOURCE_LOGE("GPU visibility went out of memory: %s", who.cstr());
  228. }
  229. m_outOfMemoryReadbackBuffer = getRenderer().getReadbackManager().allocateStructuredBuffer<U32>(m_outOfMemoryReadback, 1);
  230. }
  231. // Get some limits
  232. const RenderStateBucketContainer& buckets = RenderStateBucketContainer::getSingleton();
  233. const U32 bucketCount = buckets.getBucketCount(in.m_technique);
  234. const GpuVisLimits limits = computeLimits(in.m_technique);
  235. const Bool bHwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kMeshShaders && limits.m_maxVisibleMeshlets > 0;
  236. const Bool bSwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kSoftware && limits.m_maxVisibleMeshlets > 0;
  237. const Bool bMeshletRendering = bHwMeshletRendering || bSwMeshletRendering;
  238. const Bool bLegacyRendering = limits.m_maxVisibleLegacyRenderables > 0;
  239. if(bStoreMeshletsFailedHzb)
  240. {
  241. ANKI_ASSERT(bMeshletRendering && frustumTestData->m_hzbRt.isValid());
  242. }
  243. // Allocate persistent memory for the frame
  244. if(firstCallInFrame)
  245. {
  246. GpuVisLimits maxLimits;
  247. for(RenderingTechnique t : EnumBitsIterable<RenderingTechnique, RenderingTechniqueBit>(RenderingTechniqueBit::kAllRaster))
  248. {
  249. const GpuVisLimits limits = computeLimits(t);
  250. maxLimits.m_maxVisibleLegacyRenderables = max(maxLimits.m_maxVisibleLegacyRenderables, limits.m_maxVisibleLegacyRenderables);
  251. maxLimits.m_maxVisibleMeshlets = max(maxLimits.m_maxVisibleMeshlets, limits.m_maxVisibleMeshlets);
  252. }
  253. m_persistentMemory.m_stage1.m_visibleRenderables =
  254. allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleLegacyRenderables);
  255. m_persistentMemory.m_stage1.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(maxLimits.m_maxVisibleMeshlets);
  256. m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(maxLimits.m_maxVisibleLegacyRenderables);
  257. m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs =
  258. allocateStructuredBuffer<DrawIndexedIndirectArgs>(maxLimits.m_maxVisibleLegacyRenderables);
  259. m_persistentMemory.m_stage2Meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
  260. m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb =
  261. allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleMeshlets);
  262. m_persistentMemory.m_stage3.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
  263. m_persistentMemory.m_dep = rgraph.importBuffer((bMeshletRendering) ? m_persistentMemory.m_stage1.m_visibleMeshlets
  264. : m_persistentMemory.m_stage1.m_visibleRenderables,
  265. BufferUsageBit::kNone);
  266. }
  267. // Compute the MDI sub-ranges
  268. if(limits.m_maxVisibleLegacyRenderables)
  269. {
  270. newArray<InstanceRange>(getRenderer().getFrameMemoryPool(), bucketCount, out.m_legacy.m_bucketIndirectArgsRanges);
  271. U32 ibucket = 0;
  272. U32 offset = 0;
  273. buckets.iterateBuckets(in.m_technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletCount) {
  274. out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance = offset;
  275. if(meshletCount == 0 && userCount > 0)
  276. {
  277. out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount =
  278. max(1u, U32(U64(userCount) * limits.m_maxVisibleLegacyRenderables / limits.m_totalLegacyRenderables));
  279. offset += out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
  280. }
  281. ++ibucket;
  282. });
  283. // The last element should point to the limit of the buffer
  284. InstanceRange& last = out.m_legacy.m_bucketIndirectArgsRanges.getBack();
  285. ANKI_ASSERT(limits.m_maxVisibleLegacyRenderables >= last.m_firstInstance);
  286. last.m_instanceCount = limits.m_maxVisibleLegacyRenderables - last.m_firstInstance;
  287. }
  288. // Allocate memory for stage 1
  289. class Stage1Mem
  290. {
  291. public:
  292. BufferView m_counters;
  293. BufferView m_visibleRenderables;
  294. BufferView m_visibleMeshlets;
  295. BufferView m_renderablePrefixSums;
  296. BufferView m_meshletPrefixSums;
  297. BufferView m_gpuVisIndirectDispatchArgs;
  298. BufferView m_visibleAabbIndices;
  299. BufferView m_hash;
  300. } stage1Mem;
  301. stage1Mem.m_counters = allocateStructuredBuffer<U32>(U32(GpuVisibilityCounter::kCount));
  302. if(in.m_limitMemory)
  303. {
  304. PtrSize newRange = sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables;
  305. if(newRange)
  306. {
  307. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage1.m_visibleRenderables.getRange());
  308. stage1Mem.m_visibleRenderables = BufferView(m_persistentMemory.m_stage1.m_visibleRenderables).setRange(newRange);
  309. }
  310. newRange = sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets;
  311. if(newRange)
  312. {
  313. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage1.m_visibleMeshlets.getRange());
  314. stage1Mem.m_visibleMeshlets = BufferView(m_persistentMemory.m_stage1.m_visibleMeshlets).setRange(newRange);
  315. }
  316. }
  317. else
  318. {
  319. stage1Mem.m_visibleRenderables = allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(limits.m_maxVisibleLegacyRenderables);
  320. stage1Mem.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(limits.m_maxVisibleMeshlets);
  321. }
  322. stage1Mem.m_renderablePrefixSums = allocateStructuredBuffer<U32>(bucketCount);
  323. stage1Mem.m_meshletPrefixSums = allocateStructuredBuffer<U32>(bucketCount);
  324. stage1Mem.m_gpuVisIndirectDispatchArgs = allocateStructuredBuffer<DispatchIndirectArgs>(U32(GpuVisibilityIndirectDispatches::kCount));
  325. if(in.m_gatherAabbIndices)
  326. {
  327. stage1Mem.m_visibleAabbIndices = allocateStructuredBuffer<U32>(buckets.getBucketsActiveUserCount(in.m_technique));
  328. }
  329. if(in.m_hashVisibles)
  330. {
  331. stage1Mem.m_hash = allocateStructuredBuffer<GpuVisibilityHash>(1);
  332. }
  333. // Allocate memory for stage 2
  334. class Stage2Mem
  335. {
  336. public:
  337. class
  338. {
  339. public:
  340. BufferView m_instanceRateRenderables;
  341. BufferView m_drawIndexedIndirectArgs;
  342. BufferView m_mdiDrawCounts;
  343. } m_legacy;
  344. class
  345. {
  346. public:
  347. BufferView m_indirectDrawArgs;
  348. BufferView m_dispatchMeshIndirectArgs;
  349. BufferView m_meshletInstances;
  350. BufferView m_meshletsFailedHzb;
  351. } m_meshlet;
  352. } stage2Mem;
  353. if(bLegacyRendering)
  354. {
  355. if(in.m_limitMemory)
  356. {
  357. PtrSize newRange = sizeof(UVec4) * limits.m_maxVisibleLegacyRenderables;
  358. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables.getRange());
  359. stage2Mem.m_legacy.m_instanceRateRenderables = BufferView(m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables).setRange(newRange);
  360. newRange = sizeof(DrawIndexedIndirectArgs) * limits.m_maxVisibleLegacyRenderables;
  361. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs.getRange());
  362. stage2Mem.m_legacy.m_drawIndexedIndirectArgs = BufferView(m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs).setRange(newRange);
  363. }
  364. else
  365. {
  366. stage2Mem.m_legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(limits.m_maxVisibleLegacyRenderables);
  367. stage2Mem.m_legacy.m_drawIndexedIndirectArgs = allocateStructuredBuffer<DrawIndexedIndirectArgs>(limits.m_maxVisibleLegacyRenderables);
  368. }
  369. stage2Mem.m_legacy.m_mdiDrawCounts = allocateStructuredBuffer<U32>(bucketCount);
  370. }
  371. if(bMeshletRendering)
  372. {
  373. if(bHwMeshletRendering)
  374. {
  375. stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
  376. }
  377. else
  378. {
  379. stage2Mem.m_meshlet.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
  380. }
  381. const U32 newCount = limits.m_maxVisibleMeshlets;
  382. if(in.m_limitMemory)
  383. {
  384. ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage2Meshlet.m_meshletInstances.getRange());
  385. stage2Mem.m_meshlet.m_meshletInstances =
  386. BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
  387. }
  388. else
  389. {
  390. stage2Mem.m_meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
  391. }
  392. if(bStoreMeshletsFailedHzb)
  393. {
  394. const U32 newCount = limits.m_maxVisibleMeshlets;
  395. if(in.m_limitMemory)
  396. {
  397. ANKI_ASSERT(newCount * sizeof(GpuVisibilityVisibleMeshletDesc) <= m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb.getRange());
  398. stage2Mem.m_meshlet.m_meshletsFailedHzb =
  399. BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb).setRange(newCount * sizeof(GpuVisibilityVisibleMeshletDesc));
  400. }
  401. else
  402. {
  403. stage2Mem.m_meshlet.m_meshletsFailedHzb = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(newCount);
  404. }
  405. }
  406. }
  407. // Stage 3 memory
  408. class Stage3Mem
  409. {
  410. public:
  411. BufferView m_indirectDrawArgs;
  412. BufferView m_dispatchMeshIndirectArgs;
  413. BufferView m_meshletInstances;
  414. } stage3Mem;
  415. if(bStoreMeshletsFailedHzb)
  416. {
  417. if(bHwMeshletRendering)
  418. {
  419. stage3Mem.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
  420. }
  421. else
  422. {
  423. stage3Mem.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
  424. }
  425. const U32 newCount = limits.m_maxVisibleMeshlets;
  426. if(in.m_limitMemory)
  427. {
  428. ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage3.m_meshletInstances.getRange());
  429. stage3Mem.m_meshletInstances =
  430. BufferView(m_persistentMemory.m_stage3.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
  431. }
  432. else
  433. {
  434. stage3Mem.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
  435. }
  436. }
  437. // Setup output
  438. out.m_legacy.m_renderableInstancesBuffer = stage2Mem.m_legacy.m_instanceRateRenderables;
  439. out.m_legacy.m_mdiDrawCountsBuffer = stage2Mem.m_legacy.m_mdiDrawCounts;
  440. out.m_legacy.m_drawIndexedIndirectArgsBuffer = stage2Mem.m_legacy.m_drawIndexedIndirectArgs;
  441. out.m_mesh.m_dispatchMeshIndirectArgsBuffer = stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs;
  442. out.m_mesh.m_drawIndirectArgs = stage2Mem.m_meshlet.m_indirectDrawArgs;
  443. out.m_mesh.m_meshletInstancesBuffer = stage2Mem.m_meshlet.m_meshletInstances;
  444. out.m_visibleAaabbIndicesBuffer = stage1Mem.m_visibleAabbIndices;
  445. out.m_visiblesHashBuffer = stage1Mem.m_hash;
  446. if(bHwMeshletRendering)
  447. {
  448. out.m_mesh.m_firstMeshletBuffer = stage1Mem.m_meshletPrefixSums;
  449. }
  450. if(bStoreMeshletsFailedHzb)
  451. {
  452. out.m_stage1And2Mem.m_meshletsFailedHzb = stage2Mem.m_meshlet.m_meshletsFailedHzb;
  453. out.m_stage1And2Mem.m_counters = stage1Mem.m_counters;
  454. out.m_stage1And2Mem.m_meshletPrefixSums = stage1Mem.m_meshletPrefixSums;
  455. out.m_stage1And2Mem.m_gpuVisIndirectDispatchArgs = stage1Mem.m_gpuVisIndirectDispatchArgs;
  456. out.m_stage3Mem.m_indirectDrawArgs = stage3Mem.m_indirectDrawArgs;
  457. out.m_stage3Mem.m_dispatchMeshIndirectArgs = stage3Mem.m_dispatchMeshIndirectArgs;
  458. out.m_stage3Mem.m_meshletInstances = stage3Mem.m_meshletInstances;
  459. }
// Use one buffer as a dependency. Doesn't matter which
  461. out.m_dependency =
  462. (in.m_limitMemory) ? m_persistentMemory.m_dep : rgraph.importBuffer(stage1Mem.m_gpuVisIndirectDispatchArgs, BufferUsageBit::kNone);
  463. // Zero some stuff
  464. const BufferHandle zeroMemDep = rgraph.importBuffer(stage1Mem.m_counters, BufferUsageBit::kNone);
  465. {
  466. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis zero: %s", in.m_passesName.cstr()));
  467. pass.newBufferDependency(zeroMemDep, BufferUsageBit::kUavCompute);
  468. pass.setWork([stage1Mem, stage2Mem, stage3Mem](RenderPassWorkContext& rpass) {
  469. ANKI_TRACE_SCOPED_EVENT(GpuVisZero);
  470. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  471. constexpr Bool debugZeroing = false; // For debugging purposes zero everything
  472. #define ANKI_ZERO(buff, alwaysZero) \
  473. if((alwaysZero || debugZeroing) && buff.isValid()) \
  474. { \
  475. cmdb.pushDebugMarker(#buff, Vec3(1.0f, 1.0f, 1.0f)); \
  476. fillBuffer(cmdb, buff, 0); \
  477. cmdb.popDebugMarker(); \
  478. }
  479. #define ANKI_ZERO_PART(buff, alwaysZero, sizeToZero) \
  480. if((alwaysZero || debugZeroing) && buff.isValid()) \
  481. { \
  482. cmdb.pushDebugMarker(#buff, Vec3(1.0f, 1.0f, 1.0f)); \
  483. fillBuffer(cmdb, (debugZeroing) ? buff : BufferView(buff).setRange(sizeToZero), 0); \
  484. cmdb.popDebugMarker(); \
  485. }
  486. ANKI_ZERO(stage1Mem.m_counters, true)
  487. ANKI_ZERO(stage1Mem.m_visibleRenderables, false)
  488. ANKI_ZERO(stage1Mem.m_visibleMeshlets, false)
  489. ANKI_ZERO(stage1Mem.m_renderablePrefixSums, true)
  490. ANKI_ZERO(stage1Mem.m_meshletPrefixSums, true)
  491. ANKI_ZERO(stage1Mem.m_gpuVisIndirectDispatchArgs, false)
  492. ANKI_ZERO_PART(stage1Mem.m_visibleAabbIndices, true, sizeof(U32))
  493. ANKI_ZERO(stage1Mem.m_hash, true)
  494. ANKI_ZERO(stage2Mem.m_legacy.m_instanceRateRenderables, false)
  495. ANKI_ZERO(stage2Mem.m_legacy.m_drawIndexedIndirectArgs, true)
  496. ANKI_ZERO(stage2Mem.m_legacy.m_mdiDrawCounts, true)
  497. ANKI_ZERO(stage2Mem.m_meshlet.m_indirectDrawArgs, true)
  498. ANKI_ZERO(stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs, true)
  499. ANKI_ZERO(stage2Mem.m_meshlet.m_meshletInstances, false)
  500. ANKI_ZERO(stage2Mem.m_meshlet.m_meshletsFailedHzb, false)
  501. ANKI_ZERO(stage3Mem.m_indirectDrawArgs, true)
  502. ANKI_ZERO(stage3Mem.m_dispatchMeshIndirectArgs, true)
  503. ANKI_ZERO(stage3Mem.m_meshletInstances, false)
  504. #undef ANKI_ZERO
  505. });
  506. }
  507. // 1st stage
  508. {
  509. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 1st stage: %s", in.m_passesName.cstr()));
  510. pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
  511. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kUavCompute);
  512. pass.newBufferDependency(zeroMemDep, BufferUsageBit::kUavCompute);
  513. if(frustumTestData && frustumTestData->m_hzbRt.isValid())
  514. {
  515. pass.newTextureDependency(frustumTestData->m_hzbRt, TextureUsageBit::kSrvCompute);
  516. }
  517. pass.setWork([this, frustumTestData, distTestData, lodReferencePoint = in.m_lodReferencePoint, lodDistances = in.m_lodDistances,
  518. technique = in.m_technique, stage1Mem, bLegacyRendering, bMeshletRendering](RenderPassWorkContext& rpass) {
  519. ANKI_TRACE_SCOPED_EVENT(GpuVis1stStage);
  520. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  521. const Bool gatherAabbIndices = stage1Mem.m_visibleAabbIndices.isValid();
  522. const Bool genHash = stage1Mem.m_hash.isValid();
  523. if(frustumTestData)
  524. {
  525. cmdb.bindShaderProgram(
  526. m_frustumGrProgs[frustumTestData->m_hzbRt.isValid()][gatherAabbIndices][genHash][bMeshletRendering][bLegacyRendering].get());
  527. }
  528. else
  529. {
  530. cmdb.bindShaderProgram(m_distGrProgs[gatherAabbIndices][genHash][bMeshletRendering][bLegacyRendering].get());
  531. }
  532. BufferView aabbsBuffer;
  533. U32 aabbCount = 0;
  534. switch(technique)
  535. {
  536. case RenderingTechnique::kGBuffer:
  537. aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getBufferView();
  538. aabbCount = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getElementCount();
  539. break;
  540. case RenderingTechnique::kDepth:
  541. aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getBufferView();
  542. aabbCount = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getElementCount();
  543. break;
  544. case RenderingTechnique::kForward:
  545. aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getBufferView();
  546. aabbCount = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getElementCount();
  547. break;
  548. default:
  549. ANKI_ASSERT(0);
  550. }
  551. cmdb.bindSrv(0, 0, aabbsBuffer);
  552. cmdb.bindSrv(1, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  553. cmdb.bindSrv(2, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  554. cmdb.bindSrv(3, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  555. cmdb.bindSrv(4, 0, GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
  556. cmdb.bindUav(0, 0, stage1Mem.m_counters);
  557. cmdb.bindUav(1, 0, (bLegacyRendering) ? stage1Mem.m_visibleRenderables : BufferView(getDummyGpuResources().m_buffer.get()));
  558. cmdb.bindUav(2, 0, (bMeshletRendering) ? stage1Mem.m_visibleMeshlets : BufferView(getDummyGpuResources().m_buffer.get()));
  559. cmdb.bindUav(3, 0, (bLegacyRendering) ? stage1Mem.m_renderablePrefixSums : BufferView(getDummyGpuResources().m_buffer.get()));
  560. cmdb.bindUav(4, 0, (bMeshletRendering) ? stage1Mem.m_meshletPrefixSums : BufferView(getDummyGpuResources().m_buffer.get()));
  561. cmdb.bindUav(5, 0, stage1Mem.m_gpuVisIndirectDispatchArgs);
  562. cmdb.bindUav(6, 0, m_outOfMemoryReadbackBuffer);
  563. if(gatherAabbIndices)
  564. {
  565. cmdb.bindUav(7, 0, stage1Mem.m_visibleAabbIndices);
  566. }
  567. if(genHash)
  568. {
  569. cmdb.bindUav(8, 0, stage1Mem.m_hash);
  570. }
  571. if(frustumTestData)
  572. {
  573. FrustumGpuVisibilityConsts* consts = allocateAndBindConstants<FrustumGpuVisibilityConsts>(cmdb, 0, 0);
  574. Array<Plane, 6> planes;
  575. extractClipPlanes(frustumTestData->m_viewProjMat, planes);
  576. for(U32 i = 0; i < 6; ++i)
  577. {
  578. consts->m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
  579. }
  580. ANKI_ASSERT(kMaxLodCount == 3);
  581. consts->m_maxLodDistances[0] = lodDistances[0];
  582. consts->m_maxLodDistances[1] = lodDistances[1];
  583. consts->m_maxLodDistances[2] = kMaxF32;
  584. consts->m_maxLodDistances[3] = kMaxF32;
  585. consts->m_lodReferencePoint = lodReferencePoint;
  586. consts->m_viewProjectionMat = frustumTestData->m_viewProjMat;
  587. consts->m_finalRenderTargetSize = Vec2(frustumTestData->m_finalRenderTargetSize);
  588. if(frustumTestData->m_hzbRt.isValid())
  589. {
  590. rpass.bindSrv(5, 0, frustumTestData->m_hzbRt);
  591. cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
  592. }
  593. }
  594. else
  595. {
  596. DistanceGpuVisibilityConstants consts;
  597. consts.m_pointOfTest = distTestData->m_pointOfTest;
  598. consts.m_testRadius = distTestData->m_testRadius;
  599. consts.m_maxLodDistances[0] = lodDistances[0];
  600. consts.m_maxLodDistances[1] = lodDistances[1];
  601. consts.m_maxLodDistances[2] = kMaxF32;
  602. consts.m_maxLodDistances[3] = kMaxF32;
  603. consts.m_lodReferencePoint = lodReferencePoint;
  604. cmdb.setFastConstants(&consts, sizeof(consts));
  605. }
  606. dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
  607. });
  608. } // end 1st stage
  609. // 2nd stage
  610. {
  611. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 2nd stage: %s", in.m_passesName.cstr()));
  612. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kIndirectCompute | BufferUsageBit::kUavCompute);
  613. if(frustumTestData && frustumTestData->m_hzbRt.isValid())
  614. {
  615. pass.newTextureDependency(frustumTestData->m_hzbRt, TextureUsageBit::kSrvCompute);
  616. }
  617. pass.setWork([this, stage1Mem, stage2Mem, bLegacyRendering, bMeshletRendering, bHwMeshletRendering, out, frustumTestData,
  618. lodReferencePoint = in.m_lodReferencePoint, bStoreMeshletsFailedHzb](RenderPassWorkContext& rpass) {
  619. ANKI_TRACE_SCOPED_EVENT(GpuVis2ndStage);
  620. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  621. if(bLegacyRendering)
  622. {
  623. cmdb.bindShaderProgram(m_gatherGrProg.get());
  624. cmdb.bindSrv(0, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  625. cmdb.bindSrv(1, 0, GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
  626. cmdb.bindSrv(2, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  627. cmdb.bindSrv(3, 0, stage1Mem.m_visibleRenderables);
  628. cmdb.bindSrv(4, 0, stage1Mem.m_counters);
  629. cmdb.bindSrv(5, 0, stage1Mem.m_renderablePrefixSums);
  630. WeakArray<UVec2> firstDrawIndirectArgAndCount =
  631. allocateAndBindSrvStructuredBuffer<UVec2>(cmdb, 6, 0, out.m_legacy.m_bucketIndirectArgsRanges.getSize());
  632. for(U32 ibucket = 0; ibucket < out.m_legacy.m_bucketIndirectArgsRanges.getSize(); ++ibucket)
  633. {
  634. firstDrawIndirectArgAndCount[ibucket].x() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance;
  635. firstDrawIndirectArgAndCount[ibucket].y() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
  636. }
  637. cmdb.bindUav(0, 0, stage2Mem.m_legacy.m_instanceRateRenderables);
  638. cmdb.bindUav(1, 0, stage2Mem.m_legacy.m_drawIndexedIndirectArgs);
  639. cmdb.bindUav(2, 0, stage2Mem.m_legacy.m_drawIndexedIndirectArgs);
  640. cmdb.bindUav(3, 0, stage2Mem.m_legacy.m_mdiDrawCounts);
  641. cmdb.bindUav(4, 0, m_outOfMemoryReadbackBuffer);
  642. cmdb.dispatchComputeIndirect(
  643. BufferView(stage1Mem.m_gpuVisIndirectDispatchArgs)
  644. .incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k2ndStageLegacy))
  645. .setRange(sizeof(DispatchIndirectArgs)));
  646. }
  647. if(bMeshletRendering)
  648. {
  649. const Bool hzbTex = frustumTestData && frustumTestData->m_hzbRt.isValid();
  650. const Bool passthrough = frustumTestData == nullptr;
  651. const Bool meshShaders = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
  652. cmdb.bindShaderProgram(m_meshletGrProgs[hzbTex][passthrough][meshShaders][bStoreMeshletsFailedHzb].get());
  653. cmdb.bindSrv(0, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  654. cmdb.bindSrv(1, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  655. cmdb.bindSrv(2, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  656. cmdb.bindSrv(3, 0, UnifiedGeometryBuffer::getSingleton().getBufferView());
  657. if(hzbTex)
  658. {
  659. rpass.bindSrv(4, 0, frustumTestData->m_hzbRt);
  660. cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
  661. }
  662. cmdb.bindUav(0, 0, stage1Mem.m_counters);
  663. cmdb.bindSrv(5, 0, stage1Mem.m_meshletPrefixSums);
  664. cmdb.bindSrv(6, 0, stage1Mem.m_visibleMeshlets);
  665. cmdb.bindUav(1, 0, (bHwMeshletRendering) ? stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs : stage2Mem.m_meshlet.m_indirectDrawArgs);
  666. cmdb.bindUav(2, 0, stage2Mem.m_meshlet.m_meshletInstances);
  667. cmdb.bindUav(3, 0, m_outOfMemoryReadbackBuffer);
  668. if(bStoreMeshletsFailedHzb)
  669. {
  670. cmdb.bindUav(4, 0, stage2Mem.m_meshlet.m_meshletsFailedHzb);
  671. cmdb.bindUav(5, 0, stage1Mem.m_gpuVisIndirectDispatchArgs);
  672. }
  673. if(!passthrough)
  674. {
  675. GpuVisibilityMeshletConstants consts;
  676. consts.m_viewProjectionMatrix = frustumTestData->m_viewProjMat;
  677. consts.m_cameraPos = lodReferencePoint;
  678. consts.m_viewportSizef = Vec2(frustumTestData->m_finalRenderTargetSize);
  679. cmdb.setFastConstants(&consts, sizeof(consts));
  680. }
  681. cmdb.dispatchComputeIndirect(
  682. BufferView(stage1Mem.m_gpuVisIndirectDispatchArgs)
  683. .incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k2ndStageMeshlets))
  684. .setRange(sizeof(DispatchIndirectArgs)));
  685. }
  686. });
  687. } // end 2nd stage
  688. }
  689. void GpuVisibility::populateRenderGraphStage3(FrustumGpuVisibilityInput& in, GpuVisibilityOutput& out)
  690. {
  691. RenderGraphBuilder& rgraph = *in.m_rgraph;
  692. const GpuVisLimits limits = computeLimits(in.m_technique);
  693. const Bool bHwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kMeshShaders && limits.m_maxVisibleMeshlets > 0;
  694. const Bool bSwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kSoftware && limits.m_maxVisibleMeshlets > 0;
  695. const Bool bMeshletRendering = bHwMeshletRendering || bSwMeshletRendering;
  696. if(!bMeshletRendering)
  697. {
  698. return;
  699. }
  700. // Set the output
  701. if(bHwMeshletRendering)
  702. {
  703. out.m_mesh.m_dispatchMeshIndirectArgsBuffer = out.m_stage3Mem.m_dispatchMeshIndirectArgs;
  704. }
  705. else
  706. {
  707. out.m_mesh.m_drawIndirectArgs = out.m_stage3Mem.m_indirectDrawArgs;
  708. }
  709. out.m_mesh.m_meshletInstancesBuffer = out.m_stage3Mem.m_meshletInstances;
  710. // Create the pass
  711. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 3rd stage: %s", in.m_passesName.cstr()));
  712. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kIndirectCompute | BufferUsageBit::kUavCompute);
  713. pass.newBufferDependency(m_persistentMemory.m_dep, BufferUsageBit::kIndirectCompute | BufferUsageBit::kUavCompute);
  714. pass.newTextureDependency(*in.m_hzbRt, TextureUsageBit::kSrvCompute);
  715. pass.setWork([this, hzbRt = *in.m_hzbRt, bHwMeshletRendering, stage1And2Mem = out.m_stage1And2Mem, stage3Mem = out.m_stage3Mem,
  716. in](RenderPassWorkContext& rpass) {
  717. ANKI_TRACE_SCOPED_EVENT(GpuVis3rdStage);
  718. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  719. const Bool hzbTex = true;
  720. const Bool passthrough = false;
  721. const Bool bStoreMeshletsFailedHzb = false;
  722. cmdb.bindShaderProgram(m_meshletGrProgs[hzbTex][passthrough][bHwMeshletRendering][bStoreMeshletsFailedHzb].get());
  723. cmdb.bindSrv(0, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  724. cmdb.bindSrv(1, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  725. cmdb.bindSrv(2, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  726. cmdb.bindSrv(3, 0, UnifiedGeometryBuffer::getSingleton().getBufferView());
  727. rpass.bindSrv(4, 0, hzbRt);
  728. cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
  729. cmdb.bindUav(0, 0, stage1And2Mem.m_counters);
  730. cmdb.bindSrv(5, 0, stage1And2Mem.m_meshletPrefixSums);
  731. cmdb.bindSrv(6, 0, stage1And2Mem.m_meshletsFailedHzb);
  732. cmdb.bindUav(1, 0, (bHwMeshletRendering) ? stage3Mem.m_dispatchMeshIndirectArgs : stage3Mem.m_indirectDrawArgs);
  733. cmdb.bindUav(2, 0, stage3Mem.m_meshletInstances);
  734. cmdb.bindUav(3, 0, m_outOfMemoryReadbackBuffer);
  735. GpuVisibilityMeshletConstants consts;
  736. consts.m_viewProjectionMatrix = in.m_viewProjectionMatrix;
  737. consts.m_cameraPos = in.m_lodReferencePoint;
  738. consts.m_viewportSizef = Vec2(in.m_viewportSize);
  739. cmdb.setFastConstants(&consts, sizeof(consts));
  740. cmdb.dispatchComputeIndirect(BufferView(stage1And2Mem.m_gpuVisIndirectDispatchArgs)
  741. .incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k3rdStageMeshlets))
  742. .setRange(sizeof(DispatchIndirectArgs)));
  743. });
  744. }
  745. Error GpuVisibilityNonRenderables::init()
  746. {
  747. ANKI_CHECK(ResourceManager::getSingleton().loadResource("ShaderBinaries/GpuVisibilityNonRenderables.ankiprogbin", m_prog));
  748. for(MutatorValue hzb = 0; hzb < 2; ++hzb)
  749. {
  750. for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
  751. {
  752. for(MutatorValue cpuFeedback = 0; cpuFeedback < 2; ++cpuFeedback)
  753. {
  754. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityNonRenderables.ankiprogbin",
  755. {{"HZB_TEST", hzb}, {"OBJECT_TYPE", MutatorValue(type)}, {"CPU_FEEDBACK", cpuFeedback}}, m_prog,
  756. m_grProgs[hzb][type][cpuFeedback]));
  757. }
  758. }
  759. }
  760. return Error::kNone;
  761. }
  762. void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderablesInput& in, GpuVisibilityNonRenderablesOutput& out)
  763. {
  764. ANKI_ASSERT(in.m_viewProjectionMat != Mat4::getZero());
  765. RenderGraphBuilder& rgraph = *in.m_rgraph;
  766. U32 objCount = 0;
  767. switch(in.m_objectType)
  768. {
  769. case GpuSceneNonRenderableObjectType::kLight:
  770. objCount = GpuSceneArrays::Light::getSingleton().getElementCount();
  771. break;
  772. case GpuSceneNonRenderableObjectType::kDecal:
  773. objCount = GpuSceneArrays::Decal::getSingleton().getElementCount();
  774. break;
  775. case GpuSceneNonRenderableObjectType::kFogDensityVolume:
  776. objCount = GpuSceneArrays::FogDensityVolume::getSingleton().getElementCount();
  777. break;
  778. case GpuSceneNonRenderableObjectType::kGlobalIlluminationProbe:
  779. objCount = GpuSceneArrays::GlobalIlluminationProbe::getSingleton().getElementCount();
  780. break;
  781. case GpuSceneNonRenderableObjectType::kReflectionProbe:
  782. objCount = GpuSceneArrays::ReflectionProbe::getSingleton().getElementCount();
  783. break;
  784. default:
  785. ANKI_ASSERT(0);
  786. }
  787. if(objCount == 0)
  788. {
  789. WeakArray<U32> count;
  790. out.m_visiblesBuffer = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(1, count);
  791. count[0] = 0;
  792. out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
  793. return;
  794. }
  795. if(in.m_cpuFeedbackBuffer.isValid())
  796. {
  797. ANKI_ASSERT(in.m_cpuFeedbackBuffer.getRange() == sizeof(U32) * (objCount * 2 + 1));
  798. }
  799. const Bool firstRunInFrame = m_lastFrameIdx != getRenderer().getFrameCount();
  800. if(firstRunInFrame)
  801. {
  802. // 1st run in this frame, do some bookkeeping
  803. m_lastFrameIdx = getRenderer().getFrameCount();
  804. m_counterBufferOffset = 0;
  805. m_counterBufferZeroingHandle = {};
  806. }
  807. U32 counterBufferElementSize;
  808. if(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
  809. {
  810. counterBufferElementSize = sizeof(GpuVisibilityNonRenderablesCounters);
  811. }
  812. else
  813. {
  814. counterBufferElementSize = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment,
  815. U32(sizeof(GpuVisibilityNonRenderablesCounters)));
  816. }
  817. if(!m_counterBuffer.isCreated() || m_counterBufferOffset + counterBufferElementSize > m_counterBuffer->getSize()) [[unlikely]]
  818. {
  819. // Counter buffer not created or not big enough, create a new one
  820. BufferInitInfo buffInit("GpuVisibilityNonRenderablesCounters");
  821. buffInit.m_size = (m_counterBuffer.isCreated()) ? m_counterBuffer->getSize() * 2 : counterBufferElementSize * kInitialCounterArraySize;
  822. buffInit.m_usage = BufferUsageBit::kUavCompute | BufferUsageBit::kSrvCompute | BufferUsageBit::kCopyDestination;
  823. m_counterBuffer = GrManager::getSingleton().newBuffer(buffInit);
  824. m_counterBufferZeroingHandle = rgraph.importBuffer(BufferView(m_counterBuffer.get()), buffInit.m_usage);
  825. NonGraphicsRenderPass& pass =
  826. rgraph.newNonGraphicsRenderPass(generateTempPassName("Non-renderables vis: Clear counter buff: %s", in.m_passesName.cstr()));
  827. pass.newBufferDependency(m_counterBufferZeroingHandle, BufferUsageBit::kUavCompute);
  828. pass.setWork([counterBuffer = m_counterBuffer](RenderPassWorkContext& rgraph) {
  829. ANKI_TRACE_SCOPED_EVENT(GpuVisNonRenderablesSetup);
  830. fillBuffer(*rgraph.m_commandBuffer, BufferView(counterBuffer.get()), 0);
  831. });
  832. m_counterBufferOffset = 0;
  833. }
  834. else if(!firstRunInFrame)
  835. {
  836. m_counterBufferOffset += counterBufferElementSize;
  837. }
  838. // Allocate memory for the result
  839. out.m_visiblesBuffer = allocateStructuredBuffer<U32>(objCount + 1);
  840. out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
  841. // Create the renderpass
  842. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("Non-renderables vis: %s", in.m_passesName.cstr()));
  843. pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
  844. pass.newBufferDependency(out.m_visiblesBufferHandle, BufferUsageBit::kUavCompute);
  845. if(in.m_hzbRt)
  846. {
  847. pass.newTextureDependency(*in.m_hzbRt, TextureUsageBit::kSrvCompute);
  848. }
  849. if(m_counterBufferZeroingHandle.isValid()) [[unlikely]]
  850. {
  851. pass.newBufferDependency(m_counterBufferZeroingHandle, BufferUsageBit::kSrvCompute | BufferUsageBit::kUavCompute);
  852. }
  853. pass.setWork([this, objType = in.m_objectType, feedbackBuffer = in.m_cpuFeedbackBuffer, viewProjectionMat = in.m_viewProjectionMat,
  854. visibleIndicesBuffHandle = out.m_visiblesBufferHandle, counterBuffer = m_counterBuffer, counterBufferOffset = m_counterBufferOffset,
  855. objCount](RenderPassWorkContext& rgraph) {
  856. ANKI_TRACE_SCOPED_EVENT(GpuVisNonRenderables);
  857. CommandBuffer& cmdb = *rgraph.m_commandBuffer;
  858. const Bool needsFeedback = feedbackBuffer.isValid();
  859. cmdb.bindShaderProgram(m_grProgs[0][objType][needsFeedback].get());
  860. BufferView objBuffer;
  861. switch(objType)
  862. {
  863. case GpuSceneNonRenderableObjectType::kLight:
  864. objBuffer = GpuSceneArrays::Light::getSingleton().getBufferView();
  865. break;
  866. case GpuSceneNonRenderableObjectType::kDecal:
  867. objBuffer = GpuSceneArrays::Decal::getSingleton().getBufferView();
  868. break;
  869. case GpuSceneNonRenderableObjectType::kFogDensityVolume:
  870. objBuffer = GpuSceneArrays::FogDensityVolume::getSingleton().getBufferView();
  871. break;
  872. case GpuSceneNonRenderableObjectType::kGlobalIlluminationProbe:
  873. objBuffer = GpuSceneArrays::GlobalIlluminationProbe::getSingleton().getBufferView();
  874. break;
  875. case GpuSceneNonRenderableObjectType::kReflectionProbe:
  876. objBuffer = GpuSceneArrays::ReflectionProbe::getSingleton().getBufferView();
  877. break;
  878. default:
  879. ANKI_ASSERT(0);
  880. }
  881. cmdb.bindSrv(0, 0, objBuffer);
  882. GpuVisibilityNonRenderableConstants consts;
  883. Array<Plane, 6> planes;
  884. extractClipPlanes(viewProjectionMat, planes);
  885. for(U32 i = 0; i < 6; ++i)
  886. {
  887. consts.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
  888. }
  889. cmdb.setFastConstants(&consts, sizeof(consts));
  890. rgraph.bindUav(0, 0, visibleIndicesBuffHandle);
  891. cmdb.bindUav(1, 0, BufferView(counterBuffer.get(), counterBufferOffset, sizeof(GpuVisibilityNonRenderablesCounters)));
  892. if(needsFeedback)
  893. {
  894. cmdb.bindUav(2, 0, feedbackBuffer);
  895. }
  896. dispatchPPCompute(cmdb, 64, 1, objCount, 1);
  897. });
  898. }
  899. Error GpuVisibilityAccelerationStructures::init()
  900. {
  901. ANKI_CHECK(
  902. loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructures.ankiprogbin", {}, m_visibilityProg, m_visibilityGrProg, "Visibility"));
  903. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructures.ankiprogbin", {}, m_visibilityProg,
  904. m_zeroRemainingInstancesGrProg, "ZeroRemainingInstances"));
  905. BufferInitInfo inf("GpuVisibilityAccelerationStructuresCounters");
  906. inf.m_size = sizeof(U32) * 2;
  907. inf.m_usage = BufferUsageBit::kUavCompute | BufferUsageBit::kSrvCompute | BufferUsageBit::kCopyDestination;
  908. m_counterBuffer = GrManager::getSingleton().newBuffer(inf);
  909. zeroBuffer(m_counterBuffer.get());
  910. return Error::kNone;
  911. }
  912. void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccelerationStructuresInput& in,
  913. GpuVisibilityAccelerationStructuresOutput& out)
  914. {
  915. in.validate();
  916. RenderGraphBuilder& rgraph = *in.m_rgraph;
  917. #if ANKI_ASSERTIONS_ENABLED
  918. ANKI_ASSERT(m_lastFrameIdx != getRenderer().getFrameCount());
  919. m_lastFrameIdx = getRenderer().getFrameCount();
  920. #endif
  921. // Allocate the transient buffers
  922. const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
  923. out.m_instancesBuffer = allocateStructuredBuffer<AccelerationStructureInstance>(aabbCount);
  924. out.m_dependency = rgraph.importBuffer(out.m_instancesBuffer, BufferUsageBit::kNone);
  925. out.m_renderablesBuffer = allocateStructuredBuffer<LodAndRenderableIndex>(aabbCount + 1);
  926. const BufferView zeroInstancesAndSbtBuildDispatchArgsBuff = allocateStructuredBuffer<DispatchIndirectArgs>(2);
  927. out.m_buildSbtIndirectArgsBuffer = BufferView(zeroInstancesAndSbtBuildDispatchArgsBuff).incrementOffset(sizeof(DispatchIndirectArgs));
  928. // Create vis pass
  929. {
  930. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("Accel vis: %s", in.m_passesName.cstr()));
  931. pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
  932. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kUavCompute);
  933. pass.setWork([this, viewProjMat = in.m_viewProjectionMatrix, lodDistances = in.m_lodDistances, pointOfTest = in.m_pointOfTest,
  934. testRadius = in.m_testRadius, instancesBuff = out.m_instancesBuffer, visRenderablesBuff = out.m_renderablesBuffer,
  935. zeroInstancesAndSbtBuildDispatchArgsBuff](RenderPassWorkContext& rgraph) {
  936. ANKI_TRACE_SCOPED_EVENT(GpuVisibilityAccelStruct);
  937. CommandBuffer& cmdb = *rgraph.m_commandBuffer;
  938. cmdb.bindShaderProgram(m_visibilityGrProg.get());
  939. GpuVisibilityAccelerationStructuresConstants consts;
  940. Array<Plane, 6> planes;
  941. extractClipPlanes(viewProjMat, planes);
  942. for(U32 i = 0; i < 6; ++i)
  943. {
  944. consts.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
  945. }
  946. consts.m_pointOfTest = pointOfTest;
  947. consts.m_testRadius = testRadius;
  948. ANKI_ASSERT(kMaxLodCount == 3);
  949. consts.m_maxLodDistances[0] = lodDistances[0];
  950. consts.m_maxLodDistances[1] = lodDistances[1];
  951. consts.m_maxLodDistances[2] = kMaxF32;
  952. consts.m_maxLodDistances[3] = kMaxF32;
  953. cmdb.setFastConstants(&consts, sizeof(consts));
  954. cmdb.bindSrv(0, 0, GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getBufferView());
  955. cmdb.bindSrv(1, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  956. cmdb.bindSrv(2, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  957. cmdb.bindSrv(3, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  958. cmdb.bindUav(0, 0, instancesBuff);
  959. cmdb.bindUav(1, 0, visRenderablesBuff);
  960. cmdb.bindUav(2, 0, BufferView(m_counterBuffer.get()));
  961. cmdb.bindUav(3, 0, zeroInstancesAndSbtBuildDispatchArgsBuff);
  962. const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
  963. dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
  964. });
  965. }
  966. // Zero remaining instances
  967. {
  968. NonGraphicsRenderPass& pass =
  969. rgraph.newNonGraphicsRenderPass(generateTempPassName("Accel vis zero remaining instances: %s", in.m_passesName.cstr()));
  970. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kUavCompute | BufferUsageBit::kIndirectCompute);
  971. pass.setWork([this, zeroInstancesAndSbtBuildDispatchArgsBuff, instancesBuff = out.m_instancesBuffer,
  972. visRenderablesBuff = out.m_renderablesBuffer](RenderPassWorkContext& rgraph) {
  973. ANKI_TRACE_SCOPED_EVENT(GpuVisibilityAccelStructZero);
  974. CommandBuffer& cmdb = *rgraph.m_commandBuffer;
  975. cmdb.bindShaderProgram(m_zeroRemainingInstancesGrProg.get());
  976. cmdb.bindSrv(0, 0, visRenderablesBuff);
  977. cmdb.bindUav(0, 0, instancesBuff);
  978. cmdb.dispatchComputeIndirect(BufferView(zeroInstancesAndSbtBuildDispatchArgsBuff).setRange(sizeof(DispatchIndirectArgs)));
  979. });
  980. }
  981. }
  982. } // end namespace anki