GpuVisibility.cpp 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186
  1. // Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
  2. // All rights reserved.
  3. // Code licensed under the BSD License.
  4. // http://www.anki3d.org/LICENSE
  5. #include <AnKi/Renderer/Utils/GpuVisibility.h>
  6. #include <AnKi/Renderer/Renderer.h>
  7. #include <AnKi/Scene/RenderStateBucket.h>
  8. #include <AnKi/Scene/GpuSceneArray.h>
  9. #include <AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h>
  10. #include <AnKi/Core/GpuMemory/RebarTransientMemoryPool.h>
  11. #include <AnKi/Core/GpuMemory/GpuSceneBuffer.h>
  12. #include <AnKi/Collision/Functions.h>
  13. #include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
  14. #include <AnKi/Core/GpuMemory/UnifiedGeometryBuffer.h>
  15. #include <AnKi/Core/StatsSet.h>
  16. #include <AnKi/Util/CVarSet.h>
  17. #include <AnKi/Core/App.h>
  18. namespace anki {
  19. constexpr U32 kMaxVisibleObjects = 30 * 1024;
  20. constexpr U32 kMaxVisiblePrimitives = 40'000'000;
  21. constexpr U32 kMaxVisibleMeshlets = kMaxVisiblePrimitives / kMaxPrimitivesPerMeshlet;
  22. static StatCounter g_gpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU vis mem",
  23. StatFlag::kBytes | StatFlag::kMainThreadUpdates | StatFlag::kZeroEveryFrame);
  24. static StatCounter g_maxGpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU vis mem: max ever used/frame",
  25. StatFlag::kBytes | StatFlag::kMainThreadUpdates);
  26. class GpuVisLimits
  27. {
  28. public:
  29. U32 m_maxVisibleLegacyRenderables = 0;
  30. U32 m_totalLegacyRenderables = 0;
  31. U32 m_maxVisibleMeshlets = 0;
  32. };
  33. static GpuVisLimits computeLimits(RenderingTechnique t)
  34. {
  35. GpuVisLimits out;
  36. const RenderStateBucketContainer& buckets = RenderStateBucketContainer::getSingleton();
  37. const U32 meshletUserCount = buckets.getBucketsActiveUserCountWithMeshletSupport(t);
  38. ANKI_ASSERT(meshletUserCount == 0 || (g_meshletRenderingCVar || GrManager::getSingleton().getDeviceCapabilities().m_meshShaders));
  39. out.m_totalLegacyRenderables = buckets.getBucketsActiveUserCountWithNoMeshletSupport(t);
  40. out.m_maxVisibleLegacyRenderables = min(out.m_totalLegacyRenderables, kMaxVisibleObjects);
  41. out.m_maxVisibleMeshlets = (meshletUserCount) ? min(kMaxVisibleMeshlets, buckets.getBucketsLod0MeshletCount(t)) : 0;
  42. return out;
  43. }
  44. class GpuVisMemoryStats : public RendererObject, public MakeSingletonSimple<GpuVisMemoryStats>
  45. {
  46. public:
  47. void informAboutAllocation(PtrSize size)
  48. {
  49. if(m_frameIdx != getRenderer().getFrameCount())
  50. {
  51. // First call in the frame, update the stat var
  52. m_frameIdx = getRenderer().getFrameCount();
  53. m_maxMemUsedInFrame = max(m_maxMemUsedInFrame, m_memUsedThisFrame);
  54. m_memUsedThisFrame = 0;
  55. g_maxGpuVisMemoryAllocatedStatVar.set(m_maxMemUsedInFrame);
  56. }
  57. m_memUsedThisFrame += size;
  58. }
  59. private:
  60. PtrSize m_memUsedThisFrame = 0;
  61. PtrSize m_maxMemUsedInFrame = 0;
  62. U64 m_frameIdx = kMaxU64;
  63. };
  64. template<typename T>
  65. static BufferView allocateStructuredBuffer(U32 count)
  66. {
  67. BufferView out = {};
  68. if(count > 0)
  69. {
  70. g_gpuVisMemoryAllocatedStatVar.increment(sizeof(T) * count);
  71. out = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<T>(count);
  72. GpuVisMemoryStats::getSingleton().informAboutAllocation(sizeof(T) * count);
  73. }
  74. return out;
  75. }
  76. Error GpuVisibility::init()
  77. {
  78. for(MutatorValue hzb = 0; hzb < 2; ++hzb)
  79. {
  80. for(MutatorValue gatherAabbs = 0; gatherAabbs < 2; ++gatherAabbs)
  81. {
  82. for(MutatorValue genHash = 0; genHash < 2; ++genHash)
  83. {
  84. for(MutatorValue gatherMeshlets = 0; gatherMeshlets < 2; ++gatherMeshlets)
  85. {
  86. for(MutatorValue gatherLegacy = 0; gatherLegacy < 2; ++gatherLegacy)
  87. {
  88. if(gatherLegacy == 0 && gatherMeshlets == 0)
  89. {
  90. continue; // Not allowed
  91. }
  92. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage1.ankiprogbin",
  93. {{"HZB_TEST", hzb},
  94. {"DISTANCE_TEST", 0},
  95. {"GATHER_AABBS", gatherAabbs},
  96. {"HASH_VISIBLES", genHash},
  97. {"GATHER_MESHLETS", gatherMeshlets},
  98. {"GATHER_LEGACY", gatherLegacy}},
  99. m_1stStageProg, m_frustumGrProgs[hzb][gatherAabbs][genHash][gatherMeshlets][gatherLegacy]));
  100. }
  101. }
  102. }
  103. }
  104. }
  105. for(MutatorValue gatherAabbs = 0; gatherAabbs < 2; ++gatherAabbs)
  106. {
  107. for(MutatorValue genHash = 0; genHash < 2; ++genHash)
  108. {
  109. for(MutatorValue gatherMeshlets = 0; gatherMeshlets < 2; ++gatherMeshlets)
  110. {
  111. for(MutatorValue gatherLegacy = 0; gatherLegacy < 2; ++gatherLegacy)
  112. {
  113. if(gatherLegacy == 0 && gatherMeshlets == 0)
  114. {
  115. continue; // Not allowed
  116. }
  117. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage1.ankiprogbin",
  118. {{"HZB_TEST", 0},
  119. {"DISTANCE_TEST", 1},
  120. {"GATHER_AABBS", gatherAabbs},
  121. {"HASH_VISIBLES", genHash},
  122. {"GATHER_MESHLETS", gatherMeshlets},
  123. {"GATHER_LEGACY", gatherLegacy}},
  124. m_1stStageProg, m_distGrProgs[gatherAabbs][genHash][gatherMeshlets][gatherLegacy]));
  125. }
  126. }
  127. }
  128. }
  129. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2And3.ankiprogbin",
  130. {{"HZB_TEST", 0}, {"PASSTHROUGH", 0}, {"MESH_SHADERS", 0}, {"STORE_MESHLETS_FAILED_HZB", 1}}, m_2ndStageProg,
  131. m_gatherGrProg, "Legacy"));
  132. for(MutatorValue hzb = 0; hzb < 2; ++hzb)
  133. {
  134. for(MutatorValue passthrough = 0; passthrough < 2; ++passthrough)
  135. {
  136. for(MutatorValue meshShaders = 0; meshShaders < 2; ++meshShaders)
  137. {
  138. for(MutatorValue storeMeshletsFailedHzb = 0; storeMeshletsFailedHzb < 2; ++storeMeshletsFailedHzb)
  139. {
  140. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2And3.ankiprogbin",
  141. {{"HZB_TEST", hzb},
  142. {"PASSTHROUGH", passthrough},
  143. {"MESH_SHADERS", meshShaders},
  144. {"STORE_MESHLETS_FAILED_HZB", storeMeshletsFailedHzb}},
  145. m_2ndStageProg, m_meshletGrProgs[hzb][passthrough][meshShaders][storeMeshletsFailedHzb],
  146. "Meshlets"));
  147. }
  148. }
  149. }
  150. }
  151. return Error::kNone;
  152. }
  153. void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out)
  154. {
  155. ANKI_ASSERT(in.m_lodReferencePoint.x() != kMaxF32);
  156. if(RenderStateBucketContainer::getSingleton().getBucketsActiveUserCount(in.m_technique) == 0) [[unlikely]]
  157. {
  158. // Early exit
  159. in = {};
  160. return;
  161. }
  162. RenderGraphBuilder& rgraph = *in.m_rgraph;
  163. class DistanceTestData
  164. {
  165. public:
  166. Vec3 m_pointOfTest;
  167. F32 m_testRadius;
  168. };
  169. class FrustumTestData
  170. {
  171. public:
  172. RenderTargetHandle m_hzbRt;
  173. Mat4 m_viewProjMat;
  174. UVec2 m_finalRenderTargetSize;
  175. };
  176. FrustumTestData* frustumTestData = nullptr;
  177. DistanceTestData* distTestData = nullptr;
  178. Bool bStoreMeshletsFailedHzb = false;
  179. if(distanceBased)
  180. {
  181. distTestData = newInstance<DistanceTestData>(getRenderer().getFrameMemoryPool());
  182. const DistanceGpuVisibilityInput& din = static_cast<DistanceGpuVisibilityInput&>(in);
  183. distTestData->m_pointOfTest = din.m_pointOfTest;
  184. distTestData->m_testRadius = din.m_testRadius;
  185. }
  186. else
  187. {
  188. frustumTestData = newInstance<FrustumTestData>(getRenderer().getFrameMemoryPool());
  189. const FrustumGpuVisibilityInput& fin = static_cast<FrustumGpuVisibilityInput&>(in);
  190. frustumTestData->m_viewProjMat = fin.m_viewProjectionMatrix;
  191. frustumTestData->m_finalRenderTargetSize = fin.m_viewportSize;
  192. if(fin.m_hzbRt)
  193. {
  194. frustumTestData->m_hzbRt = *fin.m_hzbRt;
  195. }
  196. bStoreMeshletsFailedHzb = fin.m_twoPhaseOcclusionCulling;
  197. }
  198. const Bool firstCallInFrame = m_persistentMemory.m_frameIdx != getRenderer().getFrameCount();
  199. if(firstCallInFrame)
  200. {
  201. m_persistentMemory.m_frameIdx = getRenderer().getFrameCount();
  202. }
  203. // OoM
  204. if(firstCallInFrame)
  205. {
  206. U32 data;
  207. PtrSize dataReadSize;
  208. getRenderer().getReadbackManager().readMostRecentData(m_outOfMemoryReadback, &data, sizeof(data), dataReadSize);
  209. if(dataReadSize == sizeof(U32) && data != 0)
  210. {
  211. CString who;
  212. switch(data)
  213. {
  214. case 0b1:
  215. who = "Stage 1";
  216. break;
  217. case 0b10:
  218. who = "Stage 2";
  219. break;
  220. case 0b11:
  221. who = "Both stages";
  222. break;
  223. default:
  224. ANKI_ASSERT(0);
  225. }
  226. ANKI_RESOURCE_LOGE("GPU visibility went out of memory: %s", who.cstr());
  227. }
  228. m_outOfMemoryReadbackBuffer = getRenderer().getReadbackManager().allocateStructuredBuffer<U32>(m_outOfMemoryReadback, 1);
  229. }
  230. // Get some limits
  231. const RenderStateBucketContainer& buckets = RenderStateBucketContainer::getSingleton();
  232. const U32 bucketCount = buckets.getBucketCount(in.m_technique);
  233. const GpuVisLimits limits = computeLimits(in.m_technique);
  234. const Bool bHwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kMeshShaders && limits.m_maxVisibleMeshlets > 0;
  235. const Bool bSwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kSoftware && limits.m_maxVisibleMeshlets > 0;
  236. const Bool bMeshletRendering = bHwMeshletRendering || bSwMeshletRendering;
  237. const Bool bLegacyRendering = limits.m_maxVisibleLegacyRenderables > 0;
  238. if(bStoreMeshletsFailedHzb)
  239. {
  240. ANKI_ASSERT(bMeshletRendering && frustumTestData->m_hzbRt.isValid());
  241. }
  242. // Allocate persistent memory for the frame
  243. if(firstCallInFrame)
  244. {
  245. GpuVisLimits maxLimits;
  246. for(RenderingTechnique t : EnumBitsIterable<RenderingTechnique, RenderingTechniqueBit>(RenderingTechniqueBit::kAllRaster))
  247. {
  248. const GpuVisLimits limits = computeLimits(t);
  249. maxLimits.m_maxVisibleLegacyRenderables = max(maxLimits.m_maxVisibleLegacyRenderables, limits.m_maxVisibleLegacyRenderables);
  250. maxLimits.m_maxVisibleMeshlets = max(maxLimits.m_maxVisibleMeshlets, limits.m_maxVisibleMeshlets);
  251. }
  252. m_persistentMemory.m_stage1.m_visibleRenderables =
  253. allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleLegacyRenderables);
  254. m_persistentMemory.m_stage1.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(maxLimits.m_maxVisibleMeshlets);
  255. m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(maxLimits.m_maxVisibleLegacyRenderables);
  256. m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs =
  257. allocateStructuredBuffer<DrawIndexedIndirectArgs>(maxLimits.m_maxVisibleLegacyRenderables);
  258. m_persistentMemory.m_stage2Meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
  259. m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb =
  260. allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleMeshlets);
  261. m_persistentMemory.m_stage3.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
  262. m_persistentMemory.m_dep = rgraph.importBuffer((bMeshletRendering) ? m_persistentMemory.m_stage1.m_visibleMeshlets
  263. : m_persistentMemory.m_stage1.m_visibleRenderables,
  264. BufferUsageBit::kNone);
  265. }
  266. // Compute the MDI sub-ranges
  267. if(limits.m_maxVisibleLegacyRenderables)
  268. {
  269. newArray<InstanceRange>(getRenderer().getFrameMemoryPool(), bucketCount, out.m_legacy.m_bucketIndirectArgsRanges);
  270. U32 ibucket = 0;
  271. U32 offset = 0;
  272. buckets.iterateBuckets(in.m_technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletCount) {
  273. out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance = offset;
  274. if(meshletCount == 0 && userCount > 0)
  275. {
  276. out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount =
  277. max(1u, U32(U64(userCount) * limits.m_maxVisibleLegacyRenderables / limits.m_totalLegacyRenderables));
  278. offset += out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
  279. }
  280. ++ibucket;
  281. });
  282. // The last element should point to the limit of the buffer
  283. InstanceRange& last = out.m_legacy.m_bucketIndirectArgsRanges.getBack();
  284. ANKI_ASSERT(limits.m_maxVisibleLegacyRenderables >= last.m_firstInstance);
  285. last.m_instanceCount = limits.m_maxVisibleLegacyRenderables - last.m_firstInstance;
  286. }
  287. // Allocate memory for stage 1
  288. class Stage1Mem
  289. {
  290. public:
  291. BufferView m_counters;
  292. BufferView m_visibleRenderables;
  293. BufferView m_visibleMeshlets;
  294. BufferView m_renderablePrefixSums;
  295. BufferView m_meshletPrefixSums;
  296. BufferView m_gpuVisIndirectDispatchArgs;
  297. BufferView m_visibleAabbIndices;
  298. BufferView m_hash;
  299. } stage1Mem;
  300. stage1Mem.m_counters = allocateStructuredBuffer<U32>(U32(GpuVisibilityCounter::kCount));
  301. if(in.m_limitMemory)
  302. {
  303. PtrSize newRange = sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables;
  304. if(newRange)
  305. {
  306. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage1.m_visibleRenderables.getRange());
  307. stage1Mem.m_visibleRenderables = BufferView(m_persistentMemory.m_stage1.m_visibleRenderables).setRange(newRange);
  308. }
  309. newRange = sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets;
  310. if(newRange)
  311. {
  312. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage1.m_visibleMeshlets.getRange());
  313. stage1Mem.m_visibleMeshlets = BufferView(m_persistentMemory.m_stage1.m_visibleMeshlets).setRange(newRange);
  314. }
  315. }
  316. else
  317. {
  318. stage1Mem.m_visibleRenderables = allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(limits.m_maxVisibleLegacyRenderables);
  319. stage1Mem.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(limits.m_maxVisibleMeshlets);
  320. }
  321. stage1Mem.m_renderablePrefixSums = allocateStructuredBuffer<U32>(bucketCount);
  322. stage1Mem.m_meshletPrefixSums = allocateStructuredBuffer<U32>(bucketCount);
  323. stage1Mem.m_gpuVisIndirectDispatchArgs = allocateStructuredBuffer<DispatchIndirectArgs>(U32(GpuVisibilityIndirectDispatches::kCount));
  324. if(in.m_gatherAabbIndices)
  325. {
  326. stage1Mem.m_visibleAabbIndices = allocateStructuredBuffer<U32>(buckets.getBucketsActiveUserCount(in.m_technique));
  327. }
  328. if(in.m_hashVisibles)
  329. {
  330. stage1Mem.m_hash = allocateStructuredBuffer<GpuVisibilityHash>(1);
  331. }
  332. // Allocate memory for stage 2
  333. class Stage2Mem
  334. {
  335. public:
  336. class
  337. {
  338. public:
  339. BufferView m_instanceRateRenderables;
  340. BufferView m_drawIndexedIndirectArgs;
  341. BufferView m_mdiDrawCounts;
  342. } m_legacy;
  343. class
  344. {
  345. public:
  346. BufferView m_indirectDrawArgs;
  347. BufferView m_dispatchMeshIndirectArgs;
  348. BufferView m_meshletInstances;
  349. BufferView m_meshletsFailedHzb;
  350. } m_meshlet;
  351. } stage2Mem;
  352. if(bLegacyRendering)
  353. {
  354. if(in.m_limitMemory)
  355. {
  356. PtrSize newRange = sizeof(UVec4) * limits.m_maxVisibleLegacyRenderables;
  357. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables.getRange());
  358. stage2Mem.m_legacy.m_instanceRateRenderables = BufferView(m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables).setRange(newRange);
  359. newRange = sizeof(DrawIndexedIndirectArgs) * limits.m_maxVisibleLegacyRenderables;
  360. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs.getRange());
  361. stage2Mem.m_legacy.m_drawIndexedIndirectArgs = BufferView(m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs).setRange(newRange);
  362. }
  363. else
  364. {
  365. stage2Mem.m_legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(limits.m_maxVisibleLegacyRenderables);
  366. stage2Mem.m_legacy.m_drawIndexedIndirectArgs = allocateStructuredBuffer<DrawIndexedIndirectArgs>(limits.m_maxVisibleLegacyRenderables);
  367. }
  368. stage2Mem.m_legacy.m_mdiDrawCounts = allocateStructuredBuffer<U32>(bucketCount);
  369. }
  370. if(bMeshletRendering)
  371. {
  372. if(bHwMeshletRendering)
  373. {
  374. stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
  375. }
  376. else
  377. {
  378. stage2Mem.m_meshlet.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
  379. }
  380. const U32 newCount = limits.m_maxVisibleMeshlets;
  381. if(in.m_limitMemory)
  382. {
  383. ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage2Meshlet.m_meshletInstances.getRange());
  384. stage2Mem.m_meshlet.m_meshletInstances =
  385. BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
  386. }
  387. else
  388. {
  389. stage2Mem.m_meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
  390. }
  391. if(bStoreMeshletsFailedHzb)
  392. {
  393. const U32 newCount = limits.m_maxVisibleMeshlets;
  394. if(in.m_limitMemory)
  395. {
  396. ANKI_ASSERT(newCount * sizeof(GpuVisibilityVisibleMeshletDesc) <= m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb.getRange());
  397. stage2Mem.m_meshlet.m_meshletsFailedHzb =
  398. BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb).setRange(newCount * sizeof(GpuVisibilityVisibleMeshletDesc));
  399. }
  400. else
  401. {
  402. stage2Mem.m_meshlet.m_meshletsFailedHzb = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(newCount);
  403. }
  404. }
  405. }
  406. // Stage 3 memory
  407. class Stage3Mem
  408. {
  409. public:
  410. BufferView m_indirectDrawArgs;
  411. BufferView m_dispatchMeshIndirectArgs;
  412. BufferView m_meshletInstances;
  413. } stage3Mem;
  414. if(bStoreMeshletsFailedHzb)
  415. {
  416. if(bHwMeshletRendering)
  417. {
  418. stage3Mem.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
  419. }
  420. else
  421. {
  422. stage3Mem.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
  423. }
  424. const U32 newCount = limits.m_maxVisibleMeshlets;
  425. if(in.m_limitMemory)
  426. {
  427. ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage3.m_meshletInstances.getRange());
  428. stage3Mem.m_meshletInstances =
  429. BufferView(m_persistentMemory.m_stage3.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
  430. }
  431. else
  432. {
  433. stage3Mem.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
  434. }
  435. }
  436. // Setup output
  437. out.m_legacy.m_renderableInstancesBuffer = stage2Mem.m_legacy.m_instanceRateRenderables;
  438. out.m_legacy.m_mdiDrawCountsBuffer = stage2Mem.m_legacy.m_mdiDrawCounts;
  439. out.m_legacy.m_drawIndexedIndirectArgsBuffer = stage2Mem.m_legacy.m_drawIndexedIndirectArgs;
  440. out.m_mesh.m_dispatchMeshIndirectArgsBuffer = stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs;
  441. out.m_mesh.m_drawIndirectArgs = stage2Mem.m_meshlet.m_indirectDrawArgs;
  442. out.m_mesh.m_meshletInstancesBuffer = stage2Mem.m_meshlet.m_meshletInstances;
  443. out.m_visibleAaabbIndicesBuffer = stage1Mem.m_visibleAabbIndices;
  444. out.m_visiblesHashBuffer = stage1Mem.m_hash;
  445. if(bHwMeshletRendering)
  446. {
  447. out.m_mesh.m_firstMeshletBuffer = stage1Mem.m_meshletPrefixSums;
  448. }
  449. if(bStoreMeshletsFailedHzb)
  450. {
  451. out.m_stage1And2Mem.m_meshletsFailedHzb = stage2Mem.m_meshlet.m_meshletsFailedHzb;
  452. out.m_stage1And2Mem.m_counters = stage1Mem.m_counters;
  453. out.m_stage1And2Mem.m_meshletPrefixSums = stage1Mem.m_meshletPrefixSums;
  454. out.m_stage1And2Mem.m_gpuVisIndirectDispatchArgs = stage1Mem.m_gpuVisIndirectDispatchArgs;
  455. out.m_stage3Mem.m_indirectDrawArgs = stage3Mem.m_indirectDrawArgs;
  456. out.m_stage3Mem.m_dispatchMeshIndirectArgs = stage3Mem.m_dispatchMeshIndirectArgs;
  457. out.m_stage3Mem.m_meshletInstances = stage3Mem.m_meshletInstances;
  458. }
  459. // Use one buffer as a dependency. Doesn't matter which
  460. out.m_dependency =
  461. (in.m_limitMemory) ? m_persistentMemory.m_dep : rgraph.importBuffer(stage1Mem.m_gpuVisIndirectDispatchArgs, BufferUsageBit::kNone);
  462. // Zero some stuff
  463. const BufferHandle zeroMemDep = rgraph.importBuffer(stage1Mem.m_counters, BufferUsageBit::kNone);
  464. {
  465. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis zero: %s", in.m_passesName.cstr()));
  466. pass.newBufferDependency(zeroMemDep, BufferUsageBit::kCopyDestination);
  467. pass.setWork([stage1Mem, stage2Mem, stage3Mem](RenderPassWorkContext& rpass) {
  468. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  469. constexpr Bool debugZeroing = false; // For debugging purposes zero everything
  470. #define ANKI_ZERO(buff, alwaysZero) \
  471. if((alwaysZero || debugZeroing) && buff.isValid()) \
  472. { \
  473. cmdb.pushDebugMarker(#buff, Vec3(1.0f, 1.0f, 1.0f)); \
  474. cmdb.fillBuffer(buff, 0); \
  475. cmdb.popDebugMarker(); \
  476. }
  477. #define ANKI_ZERO_PART(buff, alwaysZero, sizeToZero) \
  478. if((alwaysZero || debugZeroing) && buff.isValid()) \
  479. { \
  480. cmdb.pushDebugMarker(#buff, Vec3(1.0f, 1.0f, 1.0f)); \
  481. cmdb.fillBuffer((debugZeroing) ? buff : BufferView(buff).setRange(sizeToZero), 0); \
  482. cmdb.popDebugMarker(); \
  483. }
  484. ANKI_ZERO(stage1Mem.m_counters, true)
  485. ANKI_ZERO(stage1Mem.m_visibleRenderables, false)
  486. ANKI_ZERO(stage1Mem.m_visibleMeshlets, false)
  487. ANKI_ZERO(stage1Mem.m_renderablePrefixSums, true)
  488. ANKI_ZERO(stage1Mem.m_meshletPrefixSums, true)
  489. ANKI_ZERO(stage1Mem.m_gpuVisIndirectDispatchArgs, false)
  490. ANKI_ZERO_PART(stage1Mem.m_visibleAabbIndices, true, sizeof(U32))
  491. ANKI_ZERO(stage1Mem.m_hash, true)
  492. ANKI_ZERO(stage2Mem.m_legacy.m_instanceRateRenderables, false)
  493. ANKI_ZERO(stage2Mem.m_legacy.m_drawIndexedIndirectArgs, true)
  494. ANKI_ZERO(stage2Mem.m_legacy.m_mdiDrawCounts, true)
  495. ANKI_ZERO(stage2Mem.m_meshlet.m_indirectDrawArgs, true)
  496. ANKI_ZERO(stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs, true)
  497. ANKI_ZERO(stage2Mem.m_meshlet.m_meshletInstances, false)
  498. ANKI_ZERO(stage2Mem.m_meshlet.m_meshletsFailedHzb, false)
  499. ANKI_ZERO(stage3Mem.m_indirectDrawArgs, true)
  500. ANKI_ZERO(stage3Mem.m_dispatchMeshIndirectArgs, true)
  501. ANKI_ZERO(stage3Mem.m_meshletInstances, false)
  502. #undef ANKI_ZERO
  503. });
  504. }
  505. // 1st stage
  506. {
  507. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 1st stage: %s", in.m_passesName.cstr()));
  508. pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
  509. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kUavCompute);
  510. pass.newBufferDependency(zeroMemDep, BufferUsageBit::kUavCompute);
  511. if(frustumTestData && frustumTestData->m_hzbRt.isValid())
  512. {
  513. pass.newTextureDependency(frustumTestData->m_hzbRt, TextureUsageBit::kSrvCompute);
  514. }
  515. pass.setWork([this, frustumTestData, distTestData, lodReferencePoint = in.m_lodReferencePoint, lodDistances = in.m_lodDistances,
  516. technique = in.m_technique, stage1Mem, bLegacyRendering, bMeshletRendering](RenderPassWorkContext& rpass) {
  517. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  518. const Bool gatherAabbIndices = stage1Mem.m_visibleAabbIndices.isValid();
  519. const Bool genHash = stage1Mem.m_hash.isValid();
  520. if(frustumTestData)
  521. {
  522. cmdb.bindShaderProgram(
  523. m_frustumGrProgs[frustumTestData->m_hzbRt.isValid()][gatherAabbIndices][genHash][bMeshletRendering][bLegacyRendering].get());
  524. }
  525. else
  526. {
  527. cmdb.bindShaderProgram(m_distGrProgs[gatherAabbIndices][genHash][bMeshletRendering][bLegacyRendering].get());
  528. }
  529. BufferView aabbsBuffer;
  530. U32 aabbCount = 0;
  531. switch(technique)
  532. {
  533. case RenderingTechnique::kGBuffer:
  534. aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getBufferView();
  535. aabbCount = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getElementCount();
  536. break;
  537. case RenderingTechnique::kDepth:
  538. aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getBufferView();
  539. aabbCount = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getElementCount();
  540. break;
  541. case RenderingTechnique::kForward:
  542. aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getBufferView();
  543. aabbCount = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getElementCount();
  544. break;
  545. default:
  546. ANKI_ASSERT(0);
  547. }
  548. cmdb.bindSrv(0, 0, aabbsBuffer);
  549. cmdb.bindSrv(1, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  550. cmdb.bindSrv(2, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  551. cmdb.bindSrv(3, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  552. cmdb.bindSrv(4, 0, GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
  553. cmdb.bindUav(0, 0, stage1Mem.m_counters);
  554. cmdb.bindUav(1, 0, (bLegacyRendering) ? stage1Mem.m_visibleRenderables : BufferView(&getRenderer().getDummyBuffer()));
  555. cmdb.bindUav(2, 0, (bMeshletRendering) ? stage1Mem.m_visibleMeshlets : BufferView(&getRenderer().getDummyBuffer()));
  556. cmdb.bindUav(3, 0, (bLegacyRendering) ? stage1Mem.m_renderablePrefixSums : BufferView(&getRenderer().getDummyBuffer()));
  557. cmdb.bindUav(4, 0, (bMeshletRendering) ? stage1Mem.m_meshletPrefixSums : BufferView(&getRenderer().getDummyBuffer()));
  558. cmdb.bindUav(5, 0, stage1Mem.m_gpuVisIndirectDispatchArgs);
  559. cmdb.bindUav(6, 0, m_outOfMemoryReadbackBuffer);
  560. if(gatherAabbIndices)
  561. {
  562. cmdb.bindUav(7, 0, stage1Mem.m_visibleAabbIndices);
  563. }
  564. if(genHash)
  565. {
  566. cmdb.bindUav(8, 0, stage1Mem.m_hash);
  567. }
  568. if(frustumTestData)
  569. {
  570. FrustumGpuVisibilityConsts* consts = allocateAndBindConstants<FrustumGpuVisibilityConsts>(cmdb, 0, 0);
  571. Array<Plane, 6> planes;
  572. extractClipPlanes(frustumTestData->m_viewProjMat, planes);
  573. for(U32 i = 0; i < 6; ++i)
  574. {
  575. consts->m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
  576. }
  577. ANKI_ASSERT(kMaxLodCount == 3);
  578. consts->m_maxLodDistances[0] = lodDistances[0];
  579. consts->m_maxLodDistances[1] = lodDistances[1];
  580. consts->m_maxLodDistances[2] = kMaxF32;
  581. consts->m_maxLodDistances[3] = kMaxF32;
  582. consts->m_lodReferencePoint = lodReferencePoint;
  583. consts->m_viewProjectionMat = frustumTestData->m_viewProjMat;
  584. consts->m_finalRenderTargetSize = Vec2(frustumTestData->m_finalRenderTargetSize);
  585. if(frustumTestData->m_hzbRt.isValid())
  586. {
  587. rpass.bindSrv(5, 0, frustumTestData->m_hzbRt);
  588. cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
  589. }
  590. }
  591. else
  592. {
  593. DistanceGpuVisibilityConstants consts;
  594. consts.m_pointOfTest = distTestData->m_pointOfTest;
  595. consts.m_testRadius = distTestData->m_testRadius;
  596. consts.m_maxLodDistances[0] = lodDistances[0];
  597. consts.m_maxLodDistances[1] = lodDistances[1];
  598. consts.m_maxLodDistances[2] = kMaxF32;
  599. consts.m_maxLodDistances[3] = kMaxF32;
  600. consts.m_lodReferencePoint = lodReferencePoint;
  601. cmdb.setFastConstants(&consts, sizeof(consts));
  602. }
  603. dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
  604. });
  605. } // end 1st stage
  606. // 2nd stage
  607. {
  608. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 2nd stage: %s", in.m_passesName.cstr()));
  609. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kIndirectCompute | BufferUsageBit::kUavCompute);
  610. if(frustumTestData && frustumTestData->m_hzbRt.isValid())
  611. {
  612. pass.newTextureDependency(frustumTestData->m_hzbRt, TextureUsageBit::kSrvCompute);
  613. }
  614. pass.setWork([this, stage1Mem, stage2Mem, bLegacyRendering, bMeshletRendering, bHwMeshletRendering, out, frustumTestData,
  615. lodReferencePoint = in.m_lodReferencePoint, bStoreMeshletsFailedHzb](RenderPassWorkContext& rpass) {
  616. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  617. if(bLegacyRendering)
  618. {
  619. cmdb.bindShaderProgram(m_gatherGrProg.get());
  620. cmdb.bindSrv(0, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  621. cmdb.bindSrv(1, 0, GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
  622. cmdb.bindSrv(2, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  623. cmdb.bindSrv(3, 0, stage1Mem.m_visibleRenderables);
  624. cmdb.bindSrv(4, 0, stage1Mem.m_counters);
  625. cmdb.bindSrv(5, 0, stage1Mem.m_renderablePrefixSums);
  626. WeakArray<UVec2> firstDrawIndirectArgAndCount =
  627. allocateAndBindSrvStructuredBuffer<UVec2>(cmdb, 6, 0, out.m_legacy.m_bucketIndirectArgsRanges.getSize());
  628. for(U32 ibucket = 0; ibucket < out.m_legacy.m_bucketIndirectArgsRanges.getSize(); ++ibucket)
  629. {
  630. firstDrawIndirectArgAndCount[ibucket].x() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance;
  631. firstDrawIndirectArgAndCount[ibucket].y() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
  632. }
  633. cmdb.bindUav(0, 0, stage2Mem.m_legacy.m_instanceRateRenderables);
  634. cmdb.bindUav(1, 0, stage2Mem.m_legacy.m_drawIndexedIndirectArgs);
  635. cmdb.bindUav(2, 0, stage2Mem.m_legacy.m_drawIndexedIndirectArgs);
  636. cmdb.bindUav(3, 0, stage2Mem.m_legacy.m_mdiDrawCounts);
  637. cmdb.bindUav(4, 0, m_outOfMemoryReadbackBuffer);
  638. cmdb.dispatchComputeIndirect(
  639. BufferView(stage1Mem.m_gpuVisIndirectDispatchArgs)
  640. .incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k2ndStageLegacy))
  641. .setRange(sizeof(DispatchIndirectArgs)));
  642. }
  643. if(bMeshletRendering)
  644. {
  645. const Bool hzbTex = frustumTestData && frustumTestData->m_hzbRt.isValid();
  646. const Bool passthrough = frustumTestData == nullptr;
  647. const Bool meshShaders = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
  648. cmdb.bindShaderProgram(m_meshletGrProgs[hzbTex][passthrough][meshShaders][bStoreMeshletsFailedHzb].get());
  649. cmdb.bindSrv(0, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  650. cmdb.bindSrv(1, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  651. cmdb.bindSrv(2, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  652. cmdb.bindSrv(3, 0, UnifiedGeometryBuffer::getSingleton().getBufferView());
  653. if(hzbTex)
  654. {
  655. rpass.bindSrv(4, 0, frustumTestData->m_hzbRt);
  656. cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
  657. }
  658. cmdb.bindUav(0, 0, stage1Mem.m_counters);
  659. cmdb.bindSrv(5, 0, stage1Mem.m_meshletPrefixSums);
  660. cmdb.bindSrv(6, 0, stage1Mem.m_visibleMeshlets);
  661. cmdb.bindUav(1, 0, (bHwMeshletRendering) ? stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs : stage2Mem.m_meshlet.m_indirectDrawArgs);
  662. cmdb.bindUav(2, 0, stage2Mem.m_meshlet.m_meshletInstances);
  663. cmdb.bindUav(3, 0, m_outOfMemoryReadbackBuffer);
  664. if(bStoreMeshletsFailedHzb)
  665. {
  666. cmdb.bindUav(4, 0, stage2Mem.m_meshlet.m_meshletsFailedHzb);
  667. cmdb.bindUav(5, 0, stage1Mem.m_gpuVisIndirectDispatchArgs);
  668. }
  669. if(!passthrough)
  670. {
  671. GpuVisibilityMeshletConstants consts;
  672. consts.m_viewProjectionMatrix = frustumTestData->m_viewProjMat;
  673. consts.m_cameraPos = lodReferencePoint;
  674. consts.m_viewportSizef = Vec2(frustumTestData->m_finalRenderTargetSize);
  675. cmdb.setFastConstants(&consts, sizeof(consts));
  676. }
  677. cmdb.dispatchComputeIndirect(
  678. BufferView(stage1Mem.m_gpuVisIndirectDispatchArgs)
  679. .incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k2ndStageMeshlets))
  680. .setRange(sizeof(DispatchIndirectArgs)));
  681. }
  682. });
  683. } // end 2nd stage
  684. }
  685. void GpuVisibility::populateRenderGraphStage3(FrustumGpuVisibilityInput& in, GpuVisibilityOutput& out)
  686. {
  687. RenderGraphBuilder& rgraph = *in.m_rgraph;
  688. const GpuVisLimits limits = computeLimits(in.m_technique);
  689. const Bool bHwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kMeshShaders && limits.m_maxVisibleMeshlets > 0;
  690. const Bool bSwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kSoftware && limits.m_maxVisibleMeshlets > 0;
  691. const Bool bMeshletRendering = bHwMeshletRendering || bSwMeshletRendering;
  692. if(!bMeshletRendering)
  693. {
  694. return;
  695. }
  696. // Set the output
  697. if(bHwMeshletRendering)
  698. {
  699. out.m_mesh.m_dispatchMeshIndirectArgsBuffer = out.m_stage3Mem.m_dispatchMeshIndirectArgs;
  700. }
  701. else
  702. {
  703. out.m_mesh.m_drawIndirectArgs = out.m_stage3Mem.m_indirectDrawArgs;
  704. }
  705. out.m_mesh.m_meshletInstancesBuffer = out.m_stage3Mem.m_meshletInstances;
  706. // Create the pass
  707. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 3rd stage: %s", in.m_passesName.cstr()));
  708. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kIndirectCompute | BufferUsageBit::kUavCompute);
  709. pass.newBufferDependency(m_persistentMemory.m_dep, BufferUsageBit::kIndirectCompute | BufferUsageBit::kUavCompute);
  710. pass.newTextureDependency(*in.m_hzbRt, TextureUsageBit::kSrvCompute);
  711. pass.setWork([this, hzbRt = *in.m_hzbRt, bHwMeshletRendering, stage1And2Mem = out.m_stage1And2Mem, stage3Mem = out.m_stage3Mem,
  712. in](RenderPassWorkContext& rpass) {
  713. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  714. const Bool hzbTex = true;
  715. const Bool passthrough = false;
  716. const Bool bStoreMeshletsFailedHzb = false;
  717. cmdb.bindShaderProgram(m_meshletGrProgs[hzbTex][passthrough][bHwMeshletRendering][bStoreMeshletsFailedHzb].get());
  718. cmdb.bindSrv(0, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  719. cmdb.bindSrv(1, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  720. cmdb.bindSrv(2, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  721. cmdb.bindSrv(3, 0, UnifiedGeometryBuffer::getSingleton().getBufferView());
  722. rpass.bindSrv(4, 0, hzbRt);
  723. cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
  724. cmdb.bindUav(0, 0, stage1And2Mem.m_counters);
  725. cmdb.bindSrv(5, 0, stage1And2Mem.m_meshletPrefixSums);
  726. cmdb.bindSrv(6, 0, stage1And2Mem.m_meshletsFailedHzb);
  727. cmdb.bindUav(1, 0, (bHwMeshletRendering) ? stage3Mem.m_dispatchMeshIndirectArgs : stage3Mem.m_indirectDrawArgs);
  728. cmdb.bindUav(2, 0, stage3Mem.m_meshletInstances);
  729. cmdb.bindUav(3, 0, m_outOfMemoryReadbackBuffer);
  730. GpuVisibilityMeshletConstants consts;
  731. consts.m_viewProjectionMatrix = in.m_viewProjectionMatrix;
  732. consts.m_cameraPos = in.m_lodReferencePoint;
  733. consts.m_viewportSizef = Vec2(in.m_viewportSize);
  734. cmdb.setFastConstants(&consts, sizeof(consts));
  735. cmdb.dispatchComputeIndirect(BufferView(stage1And2Mem.m_gpuVisIndirectDispatchArgs)
  736. .incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k3rdStageMeshlets))
  737. .setRange(sizeof(DispatchIndirectArgs)));
  738. });
  739. }
  740. Error GpuVisibilityNonRenderables::init()
  741. {
  742. ANKI_CHECK(ResourceManager::getSingleton().loadResource("ShaderBinaries/GpuVisibilityNonRenderables.ankiprogbin", m_prog));
  743. for(MutatorValue hzb = 0; hzb < 2; ++hzb)
  744. {
  745. for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
  746. {
  747. for(MutatorValue cpuFeedback = 0; cpuFeedback < 2; ++cpuFeedback)
  748. {
  749. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityNonRenderables.ankiprogbin",
  750. {{"HZB_TEST", hzb}, {"OBJECT_TYPE", MutatorValue(type)}, {"CPU_FEEDBACK", cpuFeedback}}, m_prog,
  751. m_grProgs[hzb][type][cpuFeedback]));
  752. }
  753. }
  754. }
  755. return Error::kNone;
  756. }
  757. void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderablesInput& in, GpuVisibilityNonRenderablesOutput& out)
  758. {
  759. ANKI_ASSERT(in.m_viewProjectionMat != Mat4::getZero());
  760. RenderGraphBuilder& rgraph = *in.m_rgraph;
  761. U32 objCount = 0;
  762. switch(in.m_objectType)
  763. {
  764. case GpuSceneNonRenderableObjectType::kLight:
  765. objCount = GpuSceneArrays::Light::getSingleton().getElementCount();
  766. break;
  767. case GpuSceneNonRenderableObjectType::kDecal:
  768. objCount = GpuSceneArrays::Decal::getSingleton().getElementCount();
  769. break;
  770. case GpuSceneNonRenderableObjectType::kFogDensityVolume:
  771. objCount = GpuSceneArrays::FogDensityVolume::getSingleton().getElementCount();
  772. break;
  773. case GpuSceneNonRenderableObjectType::kGlobalIlluminationProbe:
  774. objCount = GpuSceneArrays::GlobalIlluminationProbe::getSingleton().getElementCount();
  775. break;
  776. case GpuSceneNonRenderableObjectType::kReflectionProbe:
  777. objCount = GpuSceneArrays::ReflectionProbe::getSingleton().getElementCount();
  778. break;
  779. default:
  780. ANKI_ASSERT(0);
  781. }
  782. if(objCount == 0)
  783. {
  784. WeakArray<U32> count;
  785. out.m_visiblesBuffer = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(1, count);
  786. count[0] = 0;
  787. out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
  788. return;
  789. }
  790. if(in.m_cpuFeedbackBuffer.isValid())
  791. {
  792. ANKI_ASSERT(in.m_cpuFeedbackBuffer.getRange() == sizeof(U32) * (objCount * 2 + 1));
  793. }
  794. const Bool firstRunInFrame = m_lastFrameIdx != getRenderer().getFrameCount();
  795. if(firstRunInFrame)
  796. {
  797. // 1st run in this frame, do some bookkeeping
  798. m_lastFrameIdx = getRenderer().getFrameCount();
  799. m_counterBufferOffset = 0;
  800. m_counterBufferZeroingHandle = {};
  801. }
  802. U32 counterBufferElementSize;
  803. if(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
  804. {
  805. counterBufferElementSize = sizeof(GpuVisibilityNonRenderablesCounters);
  806. }
  807. else
  808. {
  809. counterBufferElementSize = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment,
  810. U32(sizeof(GpuVisibilityNonRenderablesCounters)));
  811. }
  812. if(!m_counterBuffer.isCreated() || m_counterBufferOffset + counterBufferElementSize > m_counterBuffer->getSize()) [[unlikely]]
  813. {
  814. // Counter buffer not created or not big enough, create a new one
  815. BufferInitInfo buffInit("GpuVisibilityNonRenderablesCounters");
  816. buffInit.m_size = (m_counterBuffer.isCreated()) ? m_counterBuffer->getSize() * 2 : counterBufferElementSize * kInitialCounterArraySize;
  817. buffInit.m_usage = BufferUsageBit::kUavCompute | BufferUsageBit::kSrvCompute | BufferUsageBit::kCopyDestination;
  818. m_counterBuffer = GrManager::getSingleton().newBuffer(buffInit);
  819. m_counterBufferZeroingHandle = rgraph.importBuffer(BufferView(m_counterBuffer.get()), buffInit.m_usage);
  820. NonGraphicsRenderPass& pass =
  821. rgraph.newNonGraphicsRenderPass(generateTempPassName("Non-renderables vis: Clear counter buff: %s", in.m_passesName.cstr()));
  822. pass.newBufferDependency(m_counterBufferZeroingHandle, BufferUsageBit::kCopyDestination);
  823. pass.setWork([counterBuffer = m_counterBuffer](RenderPassWorkContext& rgraph) {
  824. rgraph.m_commandBuffer->fillBuffer(BufferView(counterBuffer.get()), 0);
  825. });
  826. m_counterBufferOffset = 0;
  827. }
  828. else if(!firstRunInFrame)
  829. {
  830. m_counterBufferOffset += counterBufferElementSize;
  831. }
  832. // Allocate memory for the result
  833. out.m_visiblesBuffer = allocateStructuredBuffer<U32>(objCount + 1);
  834. out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
  835. // Create the renderpass
  836. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("Non-renderables vis: %s", in.m_passesName.cstr()));
  837. pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
  838. pass.newBufferDependency(out.m_visiblesBufferHandle, BufferUsageBit::kUavCompute);
  839. if(in.m_hzbRt)
  840. {
  841. pass.newTextureDependency(*in.m_hzbRt, TextureUsageBit::kSrvCompute);
  842. }
  843. if(m_counterBufferZeroingHandle.isValid()) [[unlikely]]
  844. {
  845. pass.newBufferDependency(m_counterBufferZeroingHandle, BufferUsageBit::kSrvCompute | BufferUsageBit::kUavCompute);
  846. }
  847. pass.setWork([this, objType = in.m_objectType, feedbackBuffer = in.m_cpuFeedbackBuffer, viewProjectionMat = in.m_viewProjectionMat,
  848. visibleIndicesBuffHandle = out.m_visiblesBufferHandle, counterBuffer = m_counterBuffer, counterBufferOffset = m_counterBufferOffset,
  849. objCount](RenderPassWorkContext& rgraph) {
  850. CommandBuffer& cmdb = *rgraph.m_commandBuffer;
  851. const Bool needsFeedback = feedbackBuffer.isValid();
  852. cmdb.bindShaderProgram(m_grProgs[0][objType][needsFeedback].get());
  853. BufferView objBuffer;
  854. switch(objType)
  855. {
  856. case GpuSceneNonRenderableObjectType::kLight:
  857. objBuffer = GpuSceneArrays::Light::getSingleton().getBufferView();
  858. break;
  859. case GpuSceneNonRenderableObjectType::kDecal:
  860. objBuffer = GpuSceneArrays::Decal::getSingleton().getBufferView();
  861. break;
  862. case GpuSceneNonRenderableObjectType::kFogDensityVolume:
  863. objBuffer = GpuSceneArrays::FogDensityVolume::getSingleton().getBufferView();
  864. break;
  865. case GpuSceneNonRenderableObjectType::kGlobalIlluminationProbe:
  866. objBuffer = GpuSceneArrays::GlobalIlluminationProbe::getSingleton().getBufferView();
  867. break;
  868. case GpuSceneNonRenderableObjectType::kReflectionProbe:
  869. objBuffer = GpuSceneArrays::ReflectionProbe::getSingleton().getBufferView();
  870. break;
  871. default:
  872. ANKI_ASSERT(0);
  873. }
  874. cmdb.bindSrv(0, 0, objBuffer);
  875. GpuVisibilityNonRenderableConstants consts;
  876. Array<Plane, 6> planes;
  877. extractClipPlanes(viewProjectionMat, planes);
  878. for(U32 i = 0; i < 6; ++i)
  879. {
  880. consts.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
  881. }
  882. cmdb.setFastConstants(&consts, sizeof(consts));
  883. rgraph.bindUav(0, 0, visibleIndicesBuffHandle);
  884. cmdb.bindUav(1, 0, BufferView(counterBuffer.get(), counterBufferOffset, sizeof(GpuVisibilityNonRenderablesCounters)));
  885. if(needsFeedback)
  886. {
  887. cmdb.bindUav(2, 0, feedbackBuffer);
  888. }
  889. dispatchPPCompute(cmdb, 64, 1, objCount, 1);
  890. });
  891. }
  892. Error GpuVisibilityAccelerationStructures::init()
  893. {
  894. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructures.ankiprogbin", m_visibilityProg, m_visibilityGrProg));
  895. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructuresZeroRemainingInstances.ankiprogbin", m_zeroRemainingInstancesProg,
  896. m_zeroRemainingInstancesGrProg));
  897. BufferInitInfo inf("GpuVisibilityAccelerationStructuresCounters");
  898. inf.m_size = sizeof(U32) * 2;
  899. inf.m_usage = BufferUsageBit::kUavCompute | BufferUsageBit::kSrvCompute | BufferUsageBit::kCopyDestination;
  900. m_counterBuffer = GrManager::getSingleton().newBuffer(inf);
  901. zeroBuffer(m_counterBuffer.get());
  902. return Error::kNone;
  903. }
  904. void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccelerationStructuresInput& in,
  905. GpuVisibilityAccelerationStructuresOutput& out)
  906. {
  907. in.validate();
  908. RenderGraphBuilder& rgraph = *in.m_rgraph;
  909. #if ANKI_ASSERTIONS_ENABLED
  910. ANKI_ASSERT(m_lastFrameIdx != getRenderer().getFrameCount());
  911. m_lastFrameIdx = getRenderer().getFrameCount();
  912. #endif
  913. // Allocate the transient buffers
  914. const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
  915. out.m_instancesBuffer = allocateStructuredBuffer<AccelerationStructureInstance>(aabbCount);
  916. out.m_someBufferHandle = rgraph.importBuffer(out.m_instancesBuffer, BufferUsageBit::kUavCompute);
  917. out.m_renderableIndicesBuffer = allocateStructuredBuffer<U32>(aabbCount + 1);
  918. const BufferView zeroInstancesDispatchArgsBuff = allocateStructuredBuffer<DispatchIndirectArgs>(1);
  919. // Create vis pass
  920. {
  921. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("Accel vis: %s", in.m_passesName.cstr()));
  922. pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
  923. pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kUavCompute);
  924. pass.setWork([this, viewProjMat = in.m_viewProjectionMatrix, lodDistances = in.m_lodDistances, pointOfTest = in.m_pointOfTest,
  925. testRadius = in.m_testRadius, instancesBuff = out.m_instancesBuffer, indicesBuff = out.m_renderableIndicesBuffer,
  926. zeroInstancesDispatchArgsBuff](RenderPassWorkContext& rgraph) {
  927. CommandBuffer& cmdb = *rgraph.m_commandBuffer;
  928. cmdb.bindShaderProgram(m_visibilityGrProg.get());
  929. GpuVisibilityAccelerationStructuresConstants consts;
  930. Array<Plane, 6> planes;
  931. extractClipPlanes(viewProjMat, planes);
  932. for(U32 i = 0; i < 6; ++i)
  933. {
  934. consts.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
  935. }
  936. consts.m_pointOfTest = pointOfTest;
  937. consts.m_testRadius = testRadius;
  938. ANKI_ASSERT(kMaxLodCount == 3);
  939. consts.m_maxLodDistances[0] = lodDistances[0];
  940. consts.m_maxLodDistances[1] = lodDistances[1];
  941. consts.m_maxLodDistances[2] = kMaxF32;
  942. consts.m_maxLodDistances[3] = kMaxF32;
  943. cmdb.setFastConstants(&consts, sizeof(consts));
  944. cmdb.bindSrv(0, 0, GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getBufferView());
  945. cmdb.bindSrv(1, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  946. cmdb.bindSrv(2, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  947. cmdb.bindSrv(3, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  948. cmdb.bindUav(0, 0, instancesBuff);
  949. cmdb.bindUav(1, 0, indicesBuff);
  950. cmdb.bindUav(2, 0, BufferView(m_counterBuffer.get(), 0, sizeof(U32) * 2));
  951. cmdb.bindUav(3, 0, zeroInstancesDispatchArgsBuff);
  952. const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
  953. dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
  954. });
  955. }
  956. // Zero remaining instances
  957. {
  958. NonGraphicsRenderPass& pass =
  959. rgraph.newNonGraphicsRenderPass(generateTempPassName("Accel vis zero remaining instances: %s", in.m_passesName.cstr()));
  960. pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kUavCompute);
  961. pass.setWork([this, zeroInstancesDispatchArgsBuff, instancesBuff = out.m_instancesBuffer,
  962. indicesBuff = out.m_renderableIndicesBuffer](RenderPassWorkContext& rgraph) {
  963. CommandBuffer& cmdb = *rgraph.m_commandBuffer;
  964. cmdb.bindShaderProgram(m_zeroRemainingInstancesGrProg.get());
  965. cmdb.bindSrv(0, 0, indicesBuff);
  966. cmdb.bindUav(0, 0, instancesBuff);
  967. cmdb.dispatchComputeIndirect(zeroInstancesDispatchArgsBuff);
  968. });
  969. }
  970. }
  971. } // end namespace anki