GpuVisibility.cpp 43 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176
  1. // Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
  2. // All rights reserved.
  3. // Code licensed under the BSD License.
  4. // http://www.anki3d.org/LICENSE
  5. #include <AnKi/Renderer/Utils/GpuVisibility.h>
  6. #include <AnKi/Renderer/Renderer.h>
  7. #include <AnKi/Scene/RenderStateBucket.h>
  8. #include <AnKi/Scene/GpuSceneArray.h>
  9. #include <AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h>
  10. #include <AnKi/Core/GpuMemory/RebarTransientMemoryPool.h>
  11. #include <AnKi/Core/GpuMemory/GpuSceneBuffer.h>
  12. #include <AnKi/Collision/Functions.h>
  13. #include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
  14. #include <AnKi/Core/GpuMemory/UnifiedGeometryBuffer.h>
  15. #include <AnKi/Core/StatsSet.h>
  16. #include <AnKi/Core/CVarSet.h>
  17. namespace anki {
  18. constexpr U32 kMaxVisibleObjects = 30 * 1024;
  19. constexpr U32 kMaxVisiblePrimitives = 40'000'000;
  20. constexpr U32 kMaxVisibleMeshlets = kMaxVisiblePrimitives / kMaxPrimitivesPerMeshlet;
  21. static StatCounter g_gpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU vis mem",
  22. StatFlag::kBytes | StatFlag::kMainThreadUpdates | StatFlag::kZeroEveryFrame);
  23. static StatCounter g_maxGpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU vis mem: max ever used/frame",
  24. StatFlag::kBytes | StatFlag::kMainThreadUpdates);
  25. class GpuVisLimits
  26. {
  27. public:
  28. U32 m_maxVisibleLegacyRenderables = 0;
  29. U32 m_totalLegacyRenderables = 0;
  30. U32 m_maxVisibleMeshlets = 0;
  31. };
  32. static GpuVisLimits computeLimits(RenderingTechnique t)
  33. {
  34. GpuVisLimits out;
  35. const RenderStateBucketContainer& buckets = RenderStateBucketContainer::getSingleton();
  36. const U32 meshletUserCount = buckets.getBucketsActiveUserCountWithMeshletSupport(t);
  37. ANKI_ASSERT(meshletUserCount == 0 || (g_meshletRenderingCVar.get() || GrManager::getSingleton().getDeviceCapabilities().m_meshShaders));
  38. out.m_totalLegacyRenderables = buckets.getBucketsActiveUserCountWithNoMeshletSupport(t);
  39. out.m_maxVisibleLegacyRenderables = min(out.m_totalLegacyRenderables, kMaxVisibleObjects);
  40. out.m_maxVisibleMeshlets = (meshletUserCount) ? min(kMaxVisibleMeshlets, buckets.getBucketsLod0MeshletCount(t)) : 0;
  41. return out;
  42. }
  43. class GpuVisMemoryStats : public RendererObject, public MakeSingletonSimple<GpuVisMemoryStats>
  44. {
  45. public:
  46. void informAboutAllocation(PtrSize size)
  47. {
  48. if(m_frameIdx != getRenderer().getFrameCount())
  49. {
  50. // First call in the frame, update the stat var
  51. m_frameIdx = getRenderer().getFrameCount();
  52. m_maxMemUsedInFrame = max(m_maxMemUsedInFrame, m_memUsedThisFrame);
  53. m_memUsedThisFrame = 0;
  54. g_maxGpuVisMemoryAllocatedStatVar.set(m_maxMemUsedInFrame);
  55. }
  56. m_memUsedThisFrame += size;
  57. }
  58. private:
  59. PtrSize m_memUsedThisFrame = 0;
  60. PtrSize m_maxMemUsedInFrame = 0;
  61. U64 m_frameIdx = kMaxU64;
  62. };
  63. template<typename T>
  64. static BufferView allocateStructuredBuffer(U32 count)
  65. {
  66. BufferView out = {};
  67. if(count > 0)
  68. {
  69. g_gpuVisMemoryAllocatedStatVar.increment(sizeof(T) * count);
  70. out = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<T>(count);
  71. GpuVisMemoryStats::getSingleton().informAboutAllocation(sizeof(T) * count);
  72. }
  73. return out;
  74. }
  75. Error GpuVisibility::init()
  76. {
  77. for(MutatorValue hzb = 0; hzb < 2; ++hzb)
  78. {
  79. for(MutatorValue gatherAabbs = 0; gatherAabbs < 2; ++gatherAabbs)
  80. {
  81. for(MutatorValue genHash = 0; genHash < 2; ++genHash)
  82. {
  83. for(MutatorValue gatherMeshlets = 0; gatherMeshlets < 2; ++gatherMeshlets)
  84. {
  85. for(MutatorValue gatherLegacy = 0; gatherLegacy < 2; ++gatherLegacy)
  86. {
  87. if(gatherLegacy == 0 && gatherMeshlets == 0)
  88. {
  89. continue; // Not allowed
  90. }
  91. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage1.ankiprogbin",
  92. {{"HZB_TEST", hzb},
  93. {"DISTANCE_TEST", 0},
  94. {"GATHER_AABBS", gatherAabbs},
  95. {"HASH_VISIBLES", genHash},
  96. {"GATHER_MESHLETS", gatherMeshlets},
  97. {"GATHER_LEGACY", gatherLegacy}},
  98. m_1stStageProg, m_frustumGrProgs[hzb][gatherAabbs][genHash][gatherMeshlets][gatherLegacy]));
  99. }
  100. }
  101. }
  102. }
  103. }
  104. for(MutatorValue gatherAabbs = 0; gatherAabbs < 2; ++gatherAabbs)
  105. {
  106. for(MutatorValue genHash = 0; genHash < 2; ++genHash)
  107. {
  108. for(MutatorValue gatherMeshlets = 0; gatherMeshlets < 2; ++gatherMeshlets)
  109. {
  110. for(MutatorValue gatherLegacy = 0; gatherLegacy < 2; ++gatherLegacy)
  111. {
  112. if(gatherLegacy == 0 && gatherMeshlets == 0)
  113. {
  114. continue; // Not allowed
  115. }
  116. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage1.ankiprogbin",
  117. {{"HZB_TEST", 0},
  118. {"DISTANCE_TEST", 1},
  119. {"GATHER_AABBS", gatherAabbs},
  120. {"HASH_VISIBLES", genHash},
  121. {"GATHER_MESHLETS", gatherMeshlets},
  122. {"GATHER_LEGACY", gatherLegacy}},
  123. m_1stStageProg, m_distGrProgs[gatherAabbs][genHash][gatherMeshlets][gatherLegacy]));
  124. }
  125. }
  126. }
  127. }
  128. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2And3.ankiprogbin",
  129. {{"HZB_TEST", 0}, {"PASSTHROUGH", 0}, {"MESH_SHADERS", 0}, {"STORE_MESHLETS_FAILED_HZB", 1}}, m_2ndStageProg,
  130. m_gatherGrProg, "Legacy"));
  131. for(MutatorValue hzb = 0; hzb < 2; ++hzb)
  132. {
  133. for(MutatorValue passthrough = 0; passthrough < 2; ++passthrough)
  134. {
  135. for(MutatorValue meshShaders = 0; meshShaders < 2; ++meshShaders)
  136. {
  137. for(MutatorValue storeMeshletsFailedHzb = 0; storeMeshletsFailedHzb < 2; ++storeMeshletsFailedHzb)
  138. {
  139. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2And3.ankiprogbin",
  140. {{"HZB_TEST", hzb},
  141. {"PASSTHROUGH", passthrough},
  142. {"MESH_SHADERS", meshShaders},
  143. {"STORE_MESHLETS_FAILED_HZB", storeMeshletsFailedHzb}},
  144. m_2ndStageProg, m_meshletGrProgs[hzb][passthrough][meshShaders][storeMeshletsFailedHzb],
  145. "Meshlets"));
  146. }
  147. }
  148. }
  149. }
  150. return Error::kNone;
  151. }
  152. void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out)
  153. {
  154. ANKI_ASSERT(in.m_lodReferencePoint.x() != kMaxF32);
  155. if(RenderStateBucketContainer::getSingleton().getBucketsActiveUserCount(in.m_technique) == 0) [[unlikely]]
  156. {
  157. // Early exit
  158. in = {};
  159. return;
  160. }
  161. RenderGraphBuilder& rgraph = *in.m_rgraph;
  162. class DistanceTestData
  163. {
  164. public:
  165. Vec3 m_pointOfTest;
  166. F32 m_testRadius;
  167. };
  168. class FrustumTestData
  169. {
  170. public:
  171. RenderTargetHandle m_hzbRt;
  172. Mat4 m_viewProjMat;
  173. UVec2 m_finalRenderTargetSize;
  174. };
  175. FrustumTestData* frustumTestData = nullptr;
  176. DistanceTestData* distTestData = nullptr;
  177. Bool bStoreMeshletsFailedHzb = false;
  178. if(distanceBased)
  179. {
  180. distTestData = newInstance<DistanceTestData>(getRenderer().getFrameMemoryPool());
  181. const DistanceGpuVisibilityInput& din = static_cast<DistanceGpuVisibilityInput&>(in);
  182. distTestData->m_pointOfTest = din.m_pointOfTest;
  183. distTestData->m_testRadius = din.m_testRadius;
  184. }
  185. else
  186. {
  187. frustumTestData = newInstance<FrustumTestData>(getRenderer().getFrameMemoryPool());
  188. const FrustumGpuVisibilityInput& fin = static_cast<FrustumGpuVisibilityInput&>(in);
  189. frustumTestData->m_viewProjMat = fin.m_viewProjectionMatrix;
  190. frustumTestData->m_finalRenderTargetSize = fin.m_viewportSize;
  191. if(fin.m_hzbRt)
  192. {
  193. frustumTestData->m_hzbRt = *fin.m_hzbRt;
  194. }
  195. bStoreMeshletsFailedHzb = fin.m_twoPhaseOcclusionCulling;
  196. }
  197. const Bool firstCallInFrame = m_persistentMemory.m_frameIdx != getRenderer().getFrameCount();
  198. if(firstCallInFrame)
  199. {
  200. m_persistentMemory.m_frameIdx = getRenderer().getFrameCount();
  201. }
  202. // OoM
  203. if(firstCallInFrame)
  204. {
  205. U32 data;
  206. PtrSize dataReadSize;
  207. getRenderer().getReadbackManager().readMostRecentData(m_outOfMemoryReadback, &data, sizeof(data), dataReadSize);
  208. if(dataReadSize == sizeof(U32) && data != 0)
  209. {
  210. CString who;
  211. switch(data)
  212. {
  213. case 0b1:
  214. who = "Stage 1";
  215. break;
  216. case 0b10:
  217. who = "Stage 2";
  218. break;
  219. case 0b11:
  220. who = "Both stages";
  221. break;
  222. default:
  223. ANKI_ASSERT(0);
  224. }
  225. ANKI_RESOURCE_LOGE("GPU visibility went out of memory: %s", who.cstr());
  226. }
  227. m_outOfMemoryReadbackBuffer = getRenderer().getReadbackManager().allocateStructuredBuffer<U32>(m_outOfMemoryReadback, 1);
  228. }
  229. // Get some limits
  230. const RenderStateBucketContainer& buckets = RenderStateBucketContainer::getSingleton();
  231. const U32 bucketCount = buckets.getBucketCount(in.m_technique);
  232. const GpuVisLimits limits = computeLimits(in.m_technique);
  233. const Bool bHwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kMeshShaders && limits.m_maxVisibleMeshlets > 0;
  234. const Bool bSwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kSoftware && limits.m_maxVisibleMeshlets > 0;
  235. const Bool bMeshletRendering = bHwMeshletRendering || bSwMeshletRendering;
  236. const Bool bLegacyRendering = limits.m_maxVisibleLegacyRenderables > 0;
  237. if(bStoreMeshletsFailedHzb)
  238. {
  239. ANKI_ASSERT(bMeshletRendering && frustumTestData->m_hzbRt.isValid());
  240. }
  241. // Allocate persistent memory for the frame
  242. if(firstCallInFrame)
  243. {
  244. GpuVisLimits maxLimits;
  245. for(RenderingTechnique t : EnumBitsIterable<RenderingTechnique, RenderingTechniqueBit>(RenderingTechniqueBit::kAllRaster))
  246. {
  247. const GpuVisLimits limits = computeLimits(t);
  248. maxLimits.m_maxVisibleLegacyRenderables = max(maxLimits.m_maxVisibleLegacyRenderables, limits.m_maxVisibleLegacyRenderables);
  249. maxLimits.m_maxVisibleMeshlets = max(maxLimits.m_maxVisibleMeshlets, limits.m_maxVisibleMeshlets);
  250. }
  251. m_persistentMemory.m_stage1.m_visibleRenderables =
  252. allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleLegacyRenderables);
  253. m_persistentMemory.m_stage1.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(maxLimits.m_maxVisibleMeshlets);
  254. m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(maxLimits.m_maxVisibleLegacyRenderables);
  255. m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs =
  256. allocateStructuredBuffer<DrawIndexedIndirectArgs>(maxLimits.m_maxVisibleLegacyRenderables);
  257. m_persistentMemory.m_stage2Meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
  258. m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb =
  259. allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleMeshlets);
  260. m_persistentMemory.m_stage3.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
  261. m_persistentMemory.m_dep = rgraph.importBuffer((bMeshletRendering) ? m_persistentMemory.m_stage1.m_visibleMeshlets
  262. : m_persistentMemory.m_stage1.m_visibleRenderables,
  263. BufferUsageBit::kNone);
  264. }
  265. // Compute the MDI sub-ranges
  266. if(limits.m_maxVisibleLegacyRenderables)
  267. {
  268. newArray<InstanceRange>(getRenderer().getFrameMemoryPool(), bucketCount, out.m_legacy.m_bucketIndirectArgsRanges);
  269. U32 ibucket = 0;
  270. U32 offset = 0;
  271. buckets.iterateBuckets(in.m_technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletCount) {
  272. out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance = offset;
  273. if(meshletCount == 0 && userCount > 0)
  274. {
  275. out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount =
  276. max(1u, U32(U64(userCount) * limits.m_maxVisibleLegacyRenderables / limits.m_totalLegacyRenderables));
  277. offset += out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
  278. }
  279. ++ibucket;
  280. });
  281. // The last element should point to the limit of the buffer
  282. InstanceRange& last = out.m_legacy.m_bucketIndirectArgsRanges.getBack();
  283. ANKI_ASSERT(limits.m_maxVisibleLegacyRenderables >= last.m_firstInstance);
  284. last.m_instanceCount = limits.m_maxVisibleLegacyRenderables - last.m_firstInstance;
  285. }
  286. // Allocate memory for stage 1
  287. class Stage1Mem
  288. {
  289. public:
  290. BufferView m_counters;
  291. BufferView m_visibleRenderables;
  292. BufferView m_visibleMeshlets;
  293. BufferView m_renderablePrefixSums;
  294. BufferView m_meshletPrefixSums;
  295. BufferView m_gpuVisIndirectDispatchArgs;
  296. BufferView m_visibleAabbIndices;
  297. BufferView m_hash;
  298. } stage1Mem;
  299. stage1Mem.m_counters = allocateStructuredBuffer<U32>(U32(GpuVisibilityCounter::kCount));
  300. if(in.m_limitMemory)
  301. {
  302. PtrSize newRange = sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables;
  303. if(newRange)
  304. {
  305. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage1.m_visibleRenderables.getRange());
  306. stage1Mem.m_visibleRenderables = BufferView(m_persistentMemory.m_stage1.m_visibleRenderables).setRange(newRange);
  307. }
  308. newRange = sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets;
  309. if(newRange)
  310. {
  311. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage1.m_visibleMeshlets.getRange());
  312. stage1Mem.m_visibleMeshlets = BufferView(m_persistentMemory.m_stage1.m_visibleMeshlets).setRange(newRange);
  313. }
  314. }
  315. else
  316. {
  317. stage1Mem.m_visibleRenderables = allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(limits.m_maxVisibleLegacyRenderables);
  318. stage1Mem.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(limits.m_maxVisibleMeshlets);
  319. }
  320. stage1Mem.m_renderablePrefixSums = allocateStructuredBuffer<U32>(bucketCount);
  321. stage1Mem.m_meshletPrefixSums = allocateStructuredBuffer<U32>(bucketCount);
  322. stage1Mem.m_gpuVisIndirectDispatchArgs = allocateStructuredBuffer<DispatchIndirectArgs>(U32(GpuVisibilityIndirectDispatches::kCount));
  323. if(in.m_gatherAabbIndices)
  324. {
  325. stage1Mem.m_visibleAabbIndices = allocateStructuredBuffer<U32>(buckets.getBucketsActiveUserCount(in.m_technique));
  326. }
  327. if(in.m_hashVisibles)
  328. {
  329. stage1Mem.m_hash = allocateStructuredBuffer<GpuVisibilityHash>(1);
  330. }
  331. // Allocate memory for stage 2
  332. class Stage2Mem
  333. {
  334. public:
  335. class
  336. {
  337. public:
  338. BufferView m_instanceRateRenderables;
  339. BufferView m_drawIndexedIndirectArgs;
  340. BufferView m_mdiDrawCounts;
  341. } m_legacy;
  342. class
  343. {
  344. public:
  345. BufferView m_indirectDrawArgs;
  346. BufferView m_dispatchMeshIndirectArgs;
  347. BufferView m_meshletInstances;
  348. BufferView m_meshletsFailedHzb;
  349. } m_meshlet;
  350. } stage2Mem;
  351. if(bLegacyRendering)
  352. {
  353. if(in.m_limitMemory)
  354. {
  355. PtrSize newRange = sizeof(UVec4) * limits.m_maxVisibleLegacyRenderables;
  356. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables.getRange());
  357. stage2Mem.m_legacy.m_instanceRateRenderables = BufferView(m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables).setRange(newRange);
  358. newRange = sizeof(DrawIndexedIndirectArgs) * limits.m_maxVisibleLegacyRenderables;
  359. ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs.getRange());
  360. stage2Mem.m_legacy.m_drawIndexedIndirectArgs = BufferView(m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs).setRange(newRange);
  361. }
  362. else
  363. {
  364. stage2Mem.m_legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(limits.m_maxVisibleLegacyRenderables);
  365. stage2Mem.m_legacy.m_drawIndexedIndirectArgs = allocateStructuredBuffer<DrawIndexedIndirectArgs>(limits.m_maxVisibleLegacyRenderables);
  366. }
  367. stage2Mem.m_legacy.m_mdiDrawCounts = allocateStructuredBuffer<U32>(bucketCount);
  368. }
  369. if(bMeshletRendering)
  370. {
  371. if(bHwMeshletRendering)
  372. {
  373. stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
  374. }
  375. else
  376. {
  377. stage2Mem.m_meshlet.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
  378. }
  379. const U32 newCount = limits.m_maxVisibleMeshlets;
  380. if(in.m_limitMemory)
  381. {
  382. ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage2Meshlet.m_meshletInstances.getRange());
  383. stage2Mem.m_meshlet.m_meshletInstances =
  384. BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
  385. }
  386. else
  387. {
  388. stage2Mem.m_meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
  389. }
  390. if(bStoreMeshletsFailedHzb)
  391. {
  392. const U32 newCount = limits.m_maxVisibleMeshlets;
  393. if(in.m_limitMemory)
  394. {
  395. ANKI_ASSERT(newCount * sizeof(GpuVisibilityVisibleMeshletDesc) <= m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb.getRange());
  396. stage2Mem.m_meshlet.m_meshletsFailedHzb =
  397. BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb).setRange(newCount * sizeof(GpuVisibilityVisibleMeshletDesc));
  398. }
  399. else
  400. {
  401. stage2Mem.m_meshlet.m_meshletsFailedHzb = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(newCount);
  402. }
  403. }
  404. }
  405. // Stage 3 memory
  406. class Stage3Mem
  407. {
  408. public:
  409. BufferView m_indirectDrawArgs;
  410. BufferView m_dispatchMeshIndirectArgs;
  411. BufferView m_meshletInstances;
  412. } stage3Mem;
  413. if(bStoreMeshletsFailedHzb)
  414. {
  415. if(bHwMeshletRendering)
  416. {
  417. stage3Mem.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
  418. }
  419. else
  420. {
  421. stage3Mem.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
  422. }
  423. const U32 newCount = limits.m_maxVisibleMeshlets;
  424. if(in.m_limitMemory)
  425. {
  426. ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage3.m_meshletInstances.getRange());
  427. stage3Mem.m_meshletInstances =
  428. BufferView(m_persistentMemory.m_stage3.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
  429. }
  430. else
  431. {
  432. stage3Mem.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
  433. }
  434. }
  435. // Setup output
  436. out.m_legacy.m_renderableInstancesBuffer = stage2Mem.m_legacy.m_instanceRateRenderables;
  437. out.m_legacy.m_mdiDrawCountsBuffer = stage2Mem.m_legacy.m_mdiDrawCounts;
  438. out.m_legacy.m_drawIndexedIndirectArgsBuffer = stage2Mem.m_legacy.m_drawIndexedIndirectArgs;
  439. out.m_mesh.m_dispatchMeshIndirectArgsBuffer = stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs;
  440. out.m_mesh.m_drawIndirectArgs = stage2Mem.m_meshlet.m_indirectDrawArgs;
  441. out.m_mesh.m_meshletInstancesBuffer = stage2Mem.m_meshlet.m_meshletInstances;
  442. out.m_visibleAaabbIndicesBuffer = stage1Mem.m_visibleAabbIndices;
  443. out.m_visiblesHashBuffer = stage1Mem.m_hash;
  444. if(bHwMeshletRendering)
  445. {
  446. out.m_mesh.m_firstMeshletBuffer = stage1Mem.m_meshletPrefixSums;
  447. }
  448. if(bStoreMeshletsFailedHzb)
  449. {
  450. out.m_stage1And2Mem.m_meshletsFailedHzb = stage2Mem.m_meshlet.m_meshletsFailedHzb;
  451. out.m_stage1And2Mem.m_counters = stage1Mem.m_counters;
  452. out.m_stage1And2Mem.m_meshletPrefixSums = stage1Mem.m_meshletPrefixSums;
  453. out.m_stage1And2Mem.m_gpuVisIndirectDispatchArgs = stage1Mem.m_gpuVisIndirectDispatchArgs;
  454. out.m_stage3Mem.m_indirectDrawArgs = stage3Mem.m_indirectDrawArgs;
  455. out.m_stage3Mem.m_dispatchMeshIndirectArgs = stage3Mem.m_dispatchMeshIndirectArgs;
  456. out.m_stage3Mem.m_meshletInstances = stage3Mem.m_meshletInstances;
  457. }
  458. // Use one buffer as a depedency. Doesn't matter which
  459. out.m_dependency =
  460. (in.m_limitMemory) ? m_persistentMemory.m_dep : rgraph.importBuffer(stage1Mem.m_gpuVisIndirectDispatchArgs, BufferUsageBit::kNone);
  461. // Zero some stuff
  462. const BufferHandle zeroMemDep = rgraph.importBuffer(stage1Mem.m_counters, BufferUsageBit::kNone);
  463. {
  464. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis zero: %s", in.m_passesName.cstr()));
  465. pass.newBufferDependency(zeroMemDep, BufferUsageBit::kCopyDestination);
  466. pass.setWork([stage1Mem, stage2Mem, stage3Mem](RenderPassWorkContext& rpass) {
  467. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  468. constexpr Bool debugZeroing = false; // For debugging purposes zero everything
  469. #define ANKI_ZERO(buff, alwaysZero) \
  470. if((alwaysZero || debugZeroing) && buff.isValid()) \
  471. { \
  472. cmdb.pushDebugMarker(#buff, Vec3(1.0f, 1.0f, 1.0f)); \
  473. cmdb.fillBuffer(buff, 0); \
  474. cmdb.popDebugMarker(); \
  475. }
  476. ANKI_ZERO(stage1Mem.m_counters, true)
  477. ANKI_ZERO(stage1Mem.m_visibleRenderables, false)
  478. ANKI_ZERO(stage1Mem.m_visibleMeshlets, false)
  479. ANKI_ZERO(stage1Mem.m_renderablePrefixSums, true)
  480. ANKI_ZERO(stage1Mem.m_meshletPrefixSums, true)
  481. ANKI_ZERO(stage1Mem.m_gpuVisIndirectDispatchArgs, false)
  482. ANKI_ZERO(stage1Mem.m_visibleAabbIndices, false)
  483. ANKI_ZERO(stage1Mem.m_hash, true)
  484. ANKI_ZERO(stage2Mem.m_legacy.m_instanceRateRenderables, false)
  485. ANKI_ZERO(stage2Mem.m_legacy.m_drawIndexedIndirectArgs, true)
  486. ANKI_ZERO(stage2Mem.m_legacy.m_mdiDrawCounts, true)
  487. ANKI_ZERO(stage2Mem.m_meshlet.m_indirectDrawArgs, true)
  488. ANKI_ZERO(stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs, true)
  489. ANKI_ZERO(stage2Mem.m_meshlet.m_meshletInstances, false)
  490. ANKI_ZERO(stage2Mem.m_meshlet.m_meshletsFailedHzb, false)
  491. ANKI_ZERO(stage3Mem.m_indirectDrawArgs, true)
  492. ANKI_ZERO(stage3Mem.m_dispatchMeshIndirectArgs, true)
  493. ANKI_ZERO(stage3Mem.m_meshletInstances, false)
  494. #undef ANKI_ZERO
  495. });
  496. }
  497. // 1st stage
  498. {
  499. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 1st stage: %s", in.m_passesName.cstr()));
  500. pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
  501. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kUavCompute);
  502. pass.newBufferDependency(zeroMemDep, BufferUsageBit::kUavCompute);
  503. if(frustumTestData && frustumTestData->m_hzbRt.isValid())
  504. {
  505. pass.newTextureDependency(frustumTestData->m_hzbRt, TextureUsageBit::kSrvCompute);
  506. }
  507. pass.setWork([this, frustumTestData, distTestData, lodReferencePoint = in.m_lodReferencePoint, lodDistances = in.m_lodDistances,
  508. technique = in.m_technique, stage1Mem, bLegacyRendering, bMeshletRendering](RenderPassWorkContext& rpass) {
  509. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  510. const Bool gatherAabbIndices = stage1Mem.m_visibleAabbIndices.isValid();
  511. const Bool genHash = stage1Mem.m_hash.isValid();
  512. if(frustumTestData)
  513. {
  514. cmdb.bindShaderProgram(
  515. m_frustumGrProgs[frustumTestData->m_hzbRt.isValid()][gatherAabbIndices][genHash][bMeshletRendering][bLegacyRendering].get());
  516. }
  517. else
  518. {
  519. cmdb.bindShaderProgram(m_distGrProgs[gatherAabbIndices][genHash][bMeshletRendering][bLegacyRendering].get());
  520. }
  521. BufferView aabbsBuffer;
  522. U32 aabbCount = 0;
  523. switch(technique)
  524. {
  525. case RenderingTechnique::kGBuffer:
  526. aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getBufferView();
  527. aabbCount = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getElementCount();
  528. break;
  529. case RenderingTechnique::kDepth:
  530. aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getBufferView();
  531. aabbCount = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getElementCount();
  532. break;
  533. case RenderingTechnique::kForward:
  534. aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getBufferView();
  535. aabbCount = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getElementCount();
  536. break;
  537. default:
  538. ANKI_ASSERT(0);
  539. }
  540. cmdb.bindSrv(0, 0, aabbsBuffer);
  541. cmdb.bindSrv(1, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  542. cmdb.bindSrv(2, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  543. cmdb.bindSrv(3, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  544. cmdb.bindSrv(4, 0, GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
  545. cmdb.bindUav(0, 0, stage1Mem.m_counters);
  546. cmdb.bindUav(1, 0, (bLegacyRendering) ? stage1Mem.m_visibleRenderables : BufferView(&getRenderer().getDummyBuffer()));
  547. cmdb.bindUav(2, 0, (bMeshletRendering) ? stage1Mem.m_visibleMeshlets : BufferView(&getRenderer().getDummyBuffer()));
  548. cmdb.bindUav(3, 0, (bLegacyRendering) ? stage1Mem.m_renderablePrefixSums : BufferView(&getRenderer().getDummyBuffer()));
  549. cmdb.bindUav(4, 0, (bMeshletRendering) ? stage1Mem.m_meshletPrefixSums : BufferView(&getRenderer().getDummyBuffer()));
  550. cmdb.bindUav(5, 0, stage1Mem.m_gpuVisIndirectDispatchArgs);
  551. cmdb.bindUav(6, 0, m_outOfMemoryReadbackBuffer);
  552. if(gatherAabbIndices)
  553. {
  554. cmdb.bindUav(7, 0, stage1Mem.m_visibleAabbIndices);
  555. }
  556. if(genHash)
  557. {
  558. cmdb.bindUav(8, 0, stage1Mem.m_hash);
  559. }
  560. if(frustumTestData)
  561. {
  562. FrustumGpuVisibilityConsts* consts = allocateAndBindConstants<FrustumGpuVisibilityConsts>(cmdb, 0, 0);
  563. Array<Plane, 6> planes;
  564. extractClipPlanes(frustumTestData->m_viewProjMat, planes);
  565. for(U32 i = 0; i < 6; ++i)
  566. {
  567. consts->m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
  568. }
  569. ANKI_ASSERT(kMaxLodCount == 3);
  570. consts->m_maxLodDistances[0] = lodDistances[0];
  571. consts->m_maxLodDistances[1] = lodDistances[1];
  572. consts->m_maxLodDistances[2] = kMaxF32;
  573. consts->m_maxLodDistances[3] = kMaxF32;
  574. consts->m_lodReferencePoint = lodReferencePoint;
  575. consts->m_viewProjectionMat = frustumTestData->m_viewProjMat;
  576. consts->m_finalRenderTargetSize = Vec2(frustumTestData->m_finalRenderTargetSize);
  577. if(frustumTestData->m_hzbRt.isValid())
  578. {
  579. rpass.bindSrv(5, 0, frustumTestData->m_hzbRt);
  580. cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
  581. }
  582. }
  583. else
  584. {
  585. DistanceGpuVisibilityConstants consts;
  586. consts.m_pointOfTest = distTestData->m_pointOfTest;
  587. consts.m_testRadius = distTestData->m_testRadius;
  588. consts.m_maxLodDistances[0] = lodDistances[0];
  589. consts.m_maxLodDistances[1] = lodDistances[1];
  590. consts.m_maxLodDistances[2] = kMaxF32;
  591. consts.m_maxLodDistances[3] = kMaxF32;
  592. consts.m_lodReferencePoint = lodReferencePoint;
  593. cmdb.setFastConstants(&consts, sizeof(consts));
  594. }
  595. dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
  596. });
  597. } // end 1st stage
  598. // 2nd stage
  599. {
  600. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 2nd stage: %s", in.m_passesName.cstr()));
  601. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kIndirectCompute | BufferUsageBit::kUavCompute);
  602. if(frustumTestData && frustumTestData->m_hzbRt.isValid())
  603. {
  604. pass.newTextureDependency(frustumTestData->m_hzbRt, TextureUsageBit::kSrvCompute);
  605. }
  606. pass.setWork([this, stage1Mem, stage2Mem, bLegacyRendering, bMeshletRendering, bHwMeshletRendering, out, frustumTestData,
  607. lodReferencePoint = in.m_lodReferencePoint, bStoreMeshletsFailedHzb](RenderPassWorkContext& rpass) {
  608. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  609. if(bLegacyRendering)
  610. {
  611. cmdb.bindShaderProgram(m_gatherGrProg.get());
  612. cmdb.bindSrv(0, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  613. cmdb.bindSrv(1, 0, GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
  614. cmdb.bindSrv(2, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  615. cmdb.bindSrv(3, 0, stage1Mem.m_visibleRenderables);
  616. cmdb.bindSrv(4, 0, stage1Mem.m_counters);
  617. cmdb.bindSrv(5, 0, stage1Mem.m_renderablePrefixSums);
  618. WeakArray<UVec2> firstDrawIndirectArgAndCount =
  619. allocateAndBindSrvStructuredBuffer<UVec2>(cmdb, 6, 0, out.m_legacy.m_bucketIndirectArgsRanges.getSize());
  620. for(U32 ibucket = 0; ibucket < out.m_legacy.m_bucketIndirectArgsRanges.getSize(); ++ibucket)
  621. {
  622. firstDrawIndirectArgAndCount[ibucket].x() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance;
  623. firstDrawIndirectArgAndCount[ibucket].y() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
  624. }
  625. cmdb.bindUav(0, 0, stage2Mem.m_legacy.m_instanceRateRenderables);
  626. cmdb.bindUav(1, 0, stage2Mem.m_legacy.m_drawIndexedIndirectArgs);
  627. cmdb.bindUav(2, 0, stage2Mem.m_legacy.m_drawIndexedIndirectArgs);
  628. cmdb.bindUav(3, 0, stage2Mem.m_legacy.m_mdiDrawCounts);
  629. cmdb.bindUav(4, 0, m_outOfMemoryReadbackBuffer);
  630. cmdb.dispatchComputeIndirect(
  631. BufferView(stage1Mem.m_gpuVisIndirectDispatchArgs)
  632. .incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k2ndStageLegacy))
  633. .setRange(sizeof(DispatchIndirectArgs)));
  634. }
  635. if(bMeshletRendering)
  636. {
  637. const Bool hzbTex = frustumTestData && frustumTestData->m_hzbRt.isValid();
  638. const Bool passthrough = frustumTestData == nullptr;
  639. const Bool meshShaders = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
  640. cmdb.bindShaderProgram(m_meshletGrProgs[hzbTex][passthrough][meshShaders][bStoreMeshletsFailedHzb].get());
  641. cmdb.bindSrv(0, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  642. cmdb.bindSrv(1, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  643. cmdb.bindSrv(2, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  644. cmdb.bindSrv(3, 0, UnifiedGeometryBuffer::getSingleton().getBufferView());
  645. if(hzbTex)
  646. {
  647. rpass.bindSrv(4, 0, frustumTestData->m_hzbRt);
  648. cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
  649. }
  650. cmdb.bindUav(0, 0, stage1Mem.m_counters);
  651. cmdb.bindSrv(5, 0, stage1Mem.m_meshletPrefixSums);
  652. cmdb.bindSrv(6, 0, stage1Mem.m_visibleMeshlets);
  653. cmdb.bindUav(1, 0, (bHwMeshletRendering) ? stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs : stage2Mem.m_meshlet.m_indirectDrawArgs);
  654. cmdb.bindUav(2, 0, stage2Mem.m_meshlet.m_meshletInstances);
  655. cmdb.bindUav(3, 0, m_outOfMemoryReadbackBuffer);
  656. if(bStoreMeshletsFailedHzb)
  657. {
  658. cmdb.bindUav(4, 0, stage2Mem.m_meshlet.m_meshletsFailedHzb);
  659. cmdb.bindUav(5, 0, stage1Mem.m_gpuVisIndirectDispatchArgs);
  660. }
  661. if(!passthrough)
  662. {
  663. GpuVisibilityMeshletConstants consts;
  664. consts.m_viewProjectionMatrix = frustumTestData->m_viewProjMat;
  665. consts.m_cameraPos = lodReferencePoint;
  666. consts.m_viewportSizef = Vec2(frustumTestData->m_finalRenderTargetSize);
  667. cmdb.setFastConstants(&consts, sizeof(consts));
  668. }
  669. cmdb.dispatchComputeIndirect(
  670. BufferView(stage1Mem.m_gpuVisIndirectDispatchArgs)
  671. .incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k2ndStageMeshlets))
  672. .setRange(sizeof(DispatchIndirectArgs)));
  673. }
  674. });
  675. } // end 2nd stage
  676. }
  677. void GpuVisibility::populateRenderGraphStage3(FrustumGpuVisibilityInput& in, GpuVisibilityOutput& out)
  678. {
  679. RenderGraphBuilder& rgraph = *in.m_rgraph;
  680. const GpuVisLimits limits = computeLimits(in.m_technique);
  681. const Bool bHwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kMeshShaders && limits.m_maxVisibleMeshlets > 0;
  682. const Bool bSwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kSoftware && limits.m_maxVisibleMeshlets > 0;
  683. const Bool bMeshletRendering = bHwMeshletRendering || bSwMeshletRendering;
  684. if(!bMeshletRendering)
  685. {
  686. return;
  687. }
  688. // Set the output
  689. if(bHwMeshletRendering)
  690. {
  691. out.m_mesh.m_dispatchMeshIndirectArgsBuffer = out.m_stage3Mem.m_dispatchMeshIndirectArgs;
  692. }
  693. else
  694. {
  695. out.m_mesh.m_drawIndirectArgs = out.m_stage3Mem.m_indirectDrawArgs;
  696. }
  697. out.m_mesh.m_meshletInstancesBuffer = out.m_stage3Mem.m_meshletInstances;
  698. // Create the pass
  699. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 3rd stage: %s", in.m_passesName.cstr()));
  700. pass.newBufferDependency(out.m_dependency, BufferUsageBit::kIndirectCompute | BufferUsageBit::kUavCompute);
  701. pass.newBufferDependency(m_persistentMemory.m_dep, BufferUsageBit::kIndirectCompute | BufferUsageBit::kUavCompute);
  702. pass.newTextureDependency(*in.m_hzbRt, TextureUsageBit::kSrvCompute);
  703. pass.setWork([this, hzbRt = *in.m_hzbRt, bHwMeshletRendering, stage1And2Mem = out.m_stage1And2Mem, stage3Mem = out.m_stage3Mem,
  704. in](RenderPassWorkContext& rpass) {
  705. CommandBuffer& cmdb = *rpass.m_commandBuffer;
  706. const Bool hzbTex = true;
  707. const Bool passthrough = false;
  708. const Bool bStoreMeshletsFailedHzb = false;
  709. cmdb.bindShaderProgram(m_meshletGrProgs[hzbTex][passthrough][bHwMeshletRendering][bStoreMeshletsFailedHzb].get());
  710. cmdb.bindSrv(0, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  711. cmdb.bindSrv(1, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  712. cmdb.bindSrv(2, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  713. cmdb.bindSrv(3, 0, UnifiedGeometryBuffer::getSingleton().getBufferView());
  714. rpass.bindSrv(4, 0, hzbRt);
  715. cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
  716. cmdb.bindUav(0, 0, stage1And2Mem.m_counters);
  717. cmdb.bindSrv(5, 0, stage1And2Mem.m_meshletPrefixSums);
  718. cmdb.bindSrv(6, 0, stage1And2Mem.m_meshletsFailedHzb);
  719. cmdb.bindUav(1, 0, (bHwMeshletRendering) ? stage3Mem.m_dispatchMeshIndirectArgs : stage3Mem.m_indirectDrawArgs);
  720. cmdb.bindUav(2, 0, stage3Mem.m_meshletInstances);
  721. cmdb.bindUav(3, 0, m_outOfMemoryReadbackBuffer);
  722. GpuVisibilityMeshletConstants consts;
  723. consts.m_viewProjectionMatrix = in.m_viewProjectionMatrix;
  724. consts.m_cameraPos = in.m_lodReferencePoint;
  725. consts.m_viewportSizef = Vec2(in.m_viewportSize);
  726. cmdb.setFastConstants(&consts, sizeof(consts));
  727. cmdb.dispatchComputeIndirect(BufferView(stage1And2Mem.m_gpuVisIndirectDispatchArgs)
  728. .incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k3rdStageMeshlets))
  729. .setRange(sizeof(DispatchIndirectArgs)));
  730. });
  731. }
  732. Error GpuVisibilityNonRenderables::init()
  733. {
  734. ANKI_CHECK(ResourceManager::getSingleton().loadResource("ShaderBinaries/GpuVisibilityNonRenderables.ankiprogbin", m_prog));
  735. for(MutatorValue hzb = 0; hzb < 2; ++hzb)
  736. {
  737. for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
  738. {
  739. for(MutatorValue cpuFeedback = 0; cpuFeedback < 2; ++cpuFeedback)
  740. {
  741. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityNonRenderables.ankiprogbin",
  742. {{"HZB_TEST", hzb}, {"OBJECT_TYPE", MutatorValue(type)}, {"CPU_FEEDBACK", cpuFeedback}}, m_prog,
  743. m_grProgs[hzb][type][cpuFeedback]));
  744. }
  745. }
  746. }
  747. return Error::kNone;
  748. }
  749. void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderablesInput& in, GpuVisibilityNonRenderablesOutput& out)
  750. {
  751. ANKI_ASSERT(in.m_viewProjectionMat != Mat4::getZero());
  752. RenderGraphBuilder& rgraph = *in.m_rgraph;
  753. U32 objCount = 0;
  754. switch(in.m_objectType)
  755. {
  756. case GpuSceneNonRenderableObjectType::kLight:
  757. objCount = GpuSceneArrays::Light::getSingleton().getElementCount();
  758. break;
  759. case GpuSceneNonRenderableObjectType::kDecal:
  760. objCount = GpuSceneArrays::Decal::getSingleton().getElementCount();
  761. break;
  762. case GpuSceneNonRenderableObjectType::kFogDensityVolume:
  763. objCount = GpuSceneArrays::FogDensityVolume::getSingleton().getElementCount();
  764. break;
  765. case GpuSceneNonRenderableObjectType::kGlobalIlluminationProbe:
  766. objCount = GpuSceneArrays::GlobalIlluminationProbe::getSingleton().getElementCount();
  767. break;
  768. case GpuSceneNonRenderableObjectType::kReflectionProbe:
  769. objCount = GpuSceneArrays::ReflectionProbe::getSingleton().getElementCount();
  770. break;
  771. default:
  772. ANKI_ASSERT(0);
  773. }
  774. if(objCount == 0)
  775. {
  776. WeakArray<U32> count;
  777. out.m_visiblesBuffer = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(1, count);
  778. count[0] = 0;
  779. out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
  780. return;
  781. }
  782. if(in.m_cpuFeedbackBuffer.isValid())
  783. {
  784. ANKI_ASSERT(in.m_cpuFeedbackBuffer.getRange() == sizeof(U32) * (objCount * 2 + 1));
  785. }
  786. const Bool firstRunInFrame = m_lastFrameIdx != getRenderer().getFrameCount();
  787. if(firstRunInFrame)
  788. {
  789. // 1st run in this frame, do some bookkeeping
  790. m_lastFrameIdx = getRenderer().getFrameCount();
  791. m_counterBufferOffset = 0;
  792. m_counterBufferZeroingHandle = {};
  793. }
  794. U32 counterBufferElementSize;
  795. if(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
  796. {
  797. counterBufferElementSize = sizeof(GpuVisibilityNonRenderablesCounters);
  798. }
  799. else
  800. {
  801. counterBufferElementSize = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment,
  802. U32(sizeof(GpuVisibilityNonRenderablesCounters)));
  803. }
  804. if(!m_counterBuffer.isCreated() || m_counterBufferOffset + counterBufferElementSize > m_counterBuffer->getSize()) [[unlikely]]
  805. {
  806. // Counter buffer not created or not big enough, create a new one
  807. BufferInitInfo buffInit("GpuVisibilityNonRenderablesCounters");
  808. buffInit.m_size = (m_counterBuffer.isCreated()) ? m_counterBuffer->getSize() * 2 : counterBufferElementSize * kInitialCounterArraySize;
  809. buffInit.m_usage = BufferUsageBit::kUavCompute | BufferUsageBit::kSrvCompute | BufferUsageBit::kCopyDestination;
  810. m_counterBuffer = GrManager::getSingleton().newBuffer(buffInit);
  811. m_counterBufferZeroingHandle = rgraph.importBuffer(BufferView(m_counterBuffer.get()), buffInit.m_usage);
  812. NonGraphicsRenderPass& pass =
  813. rgraph.newNonGraphicsRenderPass(generateTempPassName("Non-renderables vis: Clear counter buff: %s", in.m_passesName.cstr()));
  814. pass.newBufferDependency(m_counterBufferZeroingHandle, BufferUsageBit::kCopyDestination);
  815. pass.setWork([counterBuffer = m_counterBuffer](RenderPassWorkContext& rgraph) {
  816. rgraph.m_commandBuffer->fillBuffer(BufferView(counterBuffer.get()), 0);
  817. });
  818. m_counterBufferOffset = 0;
  819. }
  820. else if(!firstRunInFrame)
  821. {
  822. m_counterBufferOffset += counterBufferElementSize;
  823. }
  824. // Allocate memory for the result
  825. out.m_visiblesBuffer = allocateStructuredBuffer<U32>(objCount + 1);
  826. out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
  827. // Create the renderpass
  828. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("Non-renderables vis: %s", in.m_passesName.cstr()));
  829. pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
  830. pass.newBufferDependency(out.m_visiblesBufferHandle, BufferUsageBit::kUavCompute);
  831. if(in.m_hzbRt)
  832. {
  833. pass.newTextureDependency(*in.m_hzbRt, TextureUsageBit::kSrvCompute);
  834. }
  835. if(m_counterBufferZeroingHandle.isValid()) [[unlikely]]
  836. {
  837. pass.newBufferDependency(m_counterBufferZeroingHandle, BufferUsageBit::kSrvCompute | BufferUsageBit::kUavCompute);
  838. }
  839. pass.setWork([this, objType = in.m_objectType, feedbackBuffer = in.m_cpuFeedbackBuffer, viewProjectionMat = in.m_viewProjectionMat,
  840. visibleIndicesBuffHandle = out.m_visiblesBufferHandle, counterBuffer = m_counterBuffer, counterBufferOffset = m_counterBufferOffset,
  841. objCount](RenderPassWorkContext& rgraph) {
  842. CommandBuffer& cmdb = *rgraph.m_commandBuffer;
  843. const Bool needsFeedback = feedbackBuffer.isValid();
  844. cmdb.bindShaderProgram(m_grProgs[0][objType][needsFeedback].get());
  845. BufferView objBuffer;
  846. switch(objType)
  847. {
  848. case GpuSceneNonRenderableObjectType::kLight:
  849. objBuffer = GpuSceneArrays::Light::getSingleton().getBufferView();
  850. break;
  851. case GpuSceneNonRenderableObjectType::kDecal:
  852. objBuffer = GpuSceneArrays::Decal::getSingleton().getBufferView();
  853. break;
  854. case GpuSceneNonRenderableObjectType::kFogDensityVolume:
  855. objBuffer = GpuSceneArrays::FogDensityVolume::getSingleton().getBufferView();
  856. break;
  857. case GpuSceneNonRenderableObjectType::kGlobalIlluminationProbe:
  858. objBuffer = GpuSceneArrays::GlobalIlluminationProbe::getSingleton().getBufferView();
  859. break;
  860. case GpuSceneNonRenderableObjectType::kReflectionProbe:
  861. objBuffer = GpuSceneArrays::ReflectionProbe::getSingleton().getBufferView();
  862. break;
  863. default:
  864. ANKI_ASSERT(0);
  865. }
  866. cmdb.bindSrv(0, 0, objBuffer);
  867. GpuVisibilityNonRenderableConstants consts;
  868. Array<Plane, 6> planes;
  869. extractClipPlanes(viewProjectionMat, planes);
  870. for(U32 i = 0; i < 6; ++i)
  871. {
  872. consts.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
  873. }
  874. cmdb.setFastConstants(&consts, sizeof(consts));
  875. rgraph.bindUav(0, 0, visibleIndicesBuffHandle);
  876. cmdb.bindUav(1, 0, BufferView(counterBuffer.get(), counterBufferOffset, sizeof(GpuVisibilityNonRenderablesCounters)));
  877. if(needsFeedback)
  878. {
  879. cmdb.bindUav(2, 0, feedbackBuffer);
  880. }
  881. dispatchPPCompute(cmdb, 64, 1, objCount, 1);
  882. });
  883. }
  884. Error GpuVisibilityAccelerationStructures::init()
  885. {
  886. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructures.ankiprogbin", m_visibilityProg, m_visibilityGrProg));
  887. ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructuresZeroRemainingInstances.ankiprogbin", m_zeroRemainingInstancesProg,
  888. m_zeroRemainingInstancesGrProg));
  889. BufferInitInfo inf("GpuVisibilityAccelerationStructuresCounters");
  890. inf.m_size = sizeof(U32) * 2;
  891. inf.m_usage = BufferUsageBit::kUavCompute | BufferUsageBit::kSrvCompute | BufferUsageBit::kCopyDestination;
  892. m_counterBuffer = GrManager::getSingleton().newBuffer(inf);
  893. zeroBuffer(m_counterBuffer.get());
  894. return Error::kNone;
  895. }
  896. void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccelerationStructuresInput& in,
  897. GpuVisibilityAccelerationStructuresOutput& out)
  898. {
  899. in.validate();
  900. RenderGraphBuilder& rgraph = *in.m_rgraph;
  901. #if ANKI_ASSERTIONS_ENABLED
  902. ANKI_ASSERT(m_lastFrameIdx != getRenderer().getFrameCount());
  903. m_lastFrameIdx = getRenderer().getFrameCount();
  904. #endif
  905. // Allocate the transient buffers
  906. const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
  907. out.m_instancesBuffer = allocateStructuredBuffer<AccelerationStructureInstance>(aabbCount);
  908. out.m_someBufferHandle = rgraph.importBuffer(out.m_instancesBuffer, BufferUsageBit::kUavCompute);
  909. out.m_renderableIndicesBuffer = allocateStructuredBuffer<U32>(aabbCount + 1);
  910. const BufferView zeroInstancesDispatchArgsBuff = allocateStructuredBuffer<DispatchIndirectArgs>(1);
  911. // Create vis pass
  912. {
  913. NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("Accel vis: %s", in.m_passesName.cstr()));
  914. pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
  915. pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kUavCompute);
  916. pass.setWork([this, viewProjMat = in.m_viewProjectionMatrix, lodDistances = in.m_lodDistances, pointOfTest = in.m_pointOfTest,
  917. testRadius = in.m_testRadius, instancesBuff = out.m_instancesBuffer, indicesBuff = out.m_renderableIndicesBuffer,
  918. zeroInstancesDispatchArgsBuff](RenderPassWorkContext& rgraph) {
  919. CommandBuffer& cmdb = *rgraph.m_commandBuffer;
  920. cmdb.bindShaderProgram(m_visibilityGrProg.get());
  921. GpuVisibilityAccelerationStructuresConstants consts;
  922. Array<Plane, 6> planes;
  923. extractClipPlanes(viewProjMat, planes);
  924. for(U32 i = 0; i < 6; ++i)
  925. {
  926. consts.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
  927. }
  928. consts.m_pointOfTest = pointOfTest;
  929. consts.m_testRadius = testRadius;
  930. ANKI_ASSERT(kMaxLodCount == 3);
  931. consts.m_maxLodDistances[0] = lodDistances[0];
  932. consts.m_maxLodDistances[1] = lodDistances[1];
  933. consts.m_maxLodDistances[2] = kMaxF32;
  934. consts.m_maxLodDistances[3] = kMaxF32;
  935. cmdb.setFastConstants(&consts, sizeof(consts));
  936. cmdb.bindSrv(0, 0, GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getBufferView());
  937. cmdb.bindSrv(1, 0, GpuSceneArrays::Renderable::getSingleton().getBufferView());
  938. cmdb.bindSrv(2, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
  939. cmdb.bindSrv(3, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
  940. cmdb.bindUav(0, 0, instancesBuff);
  941. cmdb.bindUav(1, 0, indicesBuff);
  942. cmdb.bindUav(2, 0, BufferView(m_counterBuffer.get(), 0, sizeof(U32) * 2));
  943. cmdb.bindUav(3, 0, zeroInstancesDispatchArgsBuff);
  944. const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
  945. dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
  946. });
  947. }
  948. // Zero remaining instances
  949. {
  950. NonGraphicsRenderPass& pass =
  951. rgraph.newNonGraphicsRenderPass(generateTempPassName("Accel vis zero remaining instances: %s", in.m_passesName.cstr()));
  952. pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kUavCompute);
  953. pass.setWork([this, zeroInstancesDispatchArgsBuff, instancesBuff = out.m_instancesBuffer,
  954. indicesBuff = out.m_renderableIndicesBuffer](RenderPassWorkContext& rgraph) {
  955. CommandBuffer& cmdb = *rgraph.m_commandBuffer;
  956. cmdb.bindShaderProgram(m_zeroRemainingInstancesGrProg.get());
  957. cmdb.bindSrv(0, 0, indicesBuff);
  958. cmdb.bindUav(0, 0, instancesBuff);
  959. cmdb.dispatchComputeIndirect(zeroInstancesDispatchArgsBuff);
  960. });
  961. }
  962. }
  963. } // end namespace anki