clusterizer.cpp 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773
  1. // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
  2. #include "meshoptimizer.h"
  3. #include <assert.h>
  4. #include <float.h>
  5. #include <math.h>
  6. #include <string.h>
  7. // The block below auto-detects SIMD ISA that can be used on the target platform
  8. #ifndef MESHOPTIMIZER_NO_SIMD
  9. #if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64))
  10. #define SIMD_SSE
  11. #include <emmintrin.h>
  12. #elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922)
  13. #define SIMD_NEON
  14. #include <arm_neon.h>
  15. #endif
  16. #endif // !MESHOPTIMIZER_NO_SIMD
  17. // This work is based on:
  18. // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
  19. // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
  20. // Jack Ritter. An Efficient Bounding Sphere. 1990
  21. // Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008
  22. // Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006
  23. namespace meshopt
  24. {
  25. // This must be <= 256 since meshlet indices are stored as bytes
  26. const size_t kMeshletMaxVertices = 256;
  27. // A reasonable limit is around 2*max_vertices or less
  28. const size_t kMeshletMaxTriangles = 512;
  29. // We keep a limited number of seed triangles and add a few triangles per finished meshlet
  30. const size_t kMeshletMaxSeeds = 256;
  31. const size_t kMeshletAddSeeds = 4;
  32. // To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree
  33. const int kMeshletMaxTreeDepth = 50;
// Vertex -> triangle adjacency stored as three flat arrays.
// For a vertex v, the triangles that reference it are data[offsets[v]] .. data[offsets[v] + counts[v] - 1].
struct TriangleAdjacency2
{
	// per vertex: number of index buffer entries that reference the vertex
	unsigned int* counts;
	// per vertex: position of the first entry for the vertex inside data
	unsigned int* offsets;
	// triangle indices grouped by vertex; one entry per index buffer element
	unsigned int* data;
};
  40. static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
  41. {
  42. size_t face_count = index_count / 3;
  43. // allocate arrays
  44. adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
  45. adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
  46. adjacency.data = allocator.allocate<unsigned int>(index_count);
  47. // fill triangle counts
  48. memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
  49. for (size_t i = 0; i < index_count; ++i)
  50. {
  51. assert(indices[i] < vertex_count);
  52. adjacency.counts[indices[i]]++;
  53. }
  54. // fill offset table
  55. unsigned int offset = 0;
  56. for (size_t i = 0; i < vertex_count; ++i)
  57. {
  58. adjacency.offsets[i] = offset;
  59. offset += adjacency.counts[i];
  60. }
  61. assert(offset == index_count);
  62. // fill triangle data
  63. for (size_t i = 0; i < face_count; ++i)
  64. {
  65. unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
  66. adjacency.data[adjacency.offsets[a]++] = unsigned(i);
  67. adjacency.data[adjacency.offsets[b]++] = unsigned(i);
  68. adjacency.data[adjacency.offsets[c]++] = unsigned(i);
  69. }
  70. // fix offsets that have been disturbed by the previous pass
  71. for (size_t i = 0; i < vertex_count; ++i)
  72. {
  73. assert(adjacency.offsets[i] >= adjacency.counts[i]);
  74. adjacency.offsets[i] -= adjacency.counts[i];
  75. }
  76. }
  77. static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
  78. {
  79. size_t face_count = index_count / 3;
  80. // sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices
  81. const unsigned int sparse_seen = 1u << 31;
  82. assert(index_count < sparse_seen);
  83. // allocate arrays
  84. adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
  85. adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
  86. adjacency.data = allocator.allocate<unsigned int>(index_count);
  87. // fill triangle counts
  88. for (size_t i = 0; i < index_count; ++i)
  89. assert(indices[i] < vertex_count);
  90. for (size_t i = 0; i < index_count; ++i)
  91. adjacency.counts[indices[i]] = 0;
  92. for (size_t i = 0; i < index_count; ++i)
  93. adjacency.counts[indices[i]]++;
  94. // fill offset table; uses sparse_seen bit to tag visited vertices
  95. unsigned int offset = 0;
  96. for (size_t i = 0; i < index_count; ++i)
  97. {
  98. unsigned int v = indices[i];
  99. if ((adjacency.counts[v] & sparse_seen) == 0)
  100. {
  101. adjacency.offsets[v] = offset;
  102. offset += adjacency.counts[v];
  103. adjacency.counts[v] |= sparse_seen;
  104. }
  105. }
  106. assert(offset == index_count);
  107. // fill triangle data
  108. for (size_t i = 0; i < face_count; ++i)
  109. {
  110. unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
  111. adjacency.data[adjacency.offsets[a]++] = unsigned(i);
  112. adjacency.data[adjacency.offsets[b]++] = unsigned(i);
  113. adjacency.data[adjacency.offsets[c]++] = unsigned(i);
  114. }
  115. // fix offsets that have been disturbed by the previous pass
  116. // also fix counts (that were marked with sparse_seen by the first pass)
  117. for (size_t i = 0; i < index_count; ++i)
  118. {
  119. unsigned int v = indices[i];
  120. if (adjacency.counts[v] & sparse_seen)
  121. {
  122. adjacency.counts[v] &= ~sparse_seen;
  123. assert(adjacency.offsets[v] >= adjacency.counts[v]);
  124. adjacency.offsets[v] -= adjacency.counts[v];
  125. }
  126. }
  127. }
  128. static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count)
  129. {
  130. // for sparse inputs, it's faster to only clear vertices referenced by the index buffer
  131. if (vertex_count <= index_count)
  132. memset(used, -1, vertex_count * sizeof(short));
  133. else
  134. for (size_t i = 0; i < index_count; ++i)
  135. {
  136. assert(indices[i] < vertex_count);
  137. used[indices[i]] = -1;
  138. }
  139. }
  140. static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)
  141. {
  142. static const float kAxes[7][3] = {
  143. // X, Y, Z
  144. {1, 0, 0},
  145. {0, 1, 0},
  146. {0, 0, 1},
  147. // XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length
  148. {0.57735026f, 0.57735026f, 0.57735026f},
  149. {-0.57735026f, 0.57735026f, 0.57735026f},
  150. {0.57735026f, -0.57735026f, 0.57735026f},
  151. {0.57735026f, 0.57735026f, -0.57735026f},
  152. };
  153. assert(count > 0);
  154. assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0]));
  155. size_t points_stride_float = points_stride / sizeof(float);
  156. size_t radii_stride_float = radii_stride / sizeof(float);
  157. // find extremum points along all axes; for each axis we get a pair of points with min/max coordinates
  158. size_t pmin[7], pmax[7];
  159. float tmin[7], tmax[7];
  160. for (size_t axis = 0; axis < axis_count; ++axis)
  161. {
  162. pmin[axis] = pmax[axis] = 0;
  163. tmin[axis] = FLT_MAX;
  164. tmax[axis] = -FLT_MAX;
  165. }
  166. for (size_t i = 0; i < count; ++i)
  167. {
  168. const float* p = points + i * points_stride_float;
  169. float r = radii[i * radii_stride_float];
  170. for (size_t axis = 0; axis < axis_count; ++axis)
  171. {
  172. const float* ax = kAxes[axis];
  173. float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2];
  174. float tpmin = tp - r, tpmax = tp + r;
  175. pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis];
  176. pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis];
  177. tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis];
  178. tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis];
  179. }
  180. }
  181. // find the pair of points with largest distance
  182. size_t paxis = 0;
  183. float paxisdr = 0;
  184. for (size_t axis = 0; axis < axis_count; ++axis)
  185. {
  186. const float* p1 = points + pmin[axis] * points_stride_float;
  187. const float* p2 = points + pmax[axis] * points_stride_float;
  188. float r1 = radii[pmin[axis] * radii_stride_float];
  189. float r2 = radii[pmax[axis] * radii_stride_float];
  190. float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
  191. float dr = sqrtf(d2) + r1 + r2;
  192. if (dr > paxisdr)
  193. {
  194. paxisdr = dr;
  195. paxis = axis;
  196. }
  197. }
  198. // use the longest segment as the initial sphere diameter
  199. const float* p1 = points + pmin[paxis] * points_stride_float;
  200. const float* p2 = points + pmax[paxis] * points_stride_float;
  201. float r1 = radii[pmin[paxis] * radii_stride_float];
  202. float r2 = radii[pmax[paxis] * radii_stride_float];
  203. float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]));
  204. float paxisk = paxisd > 0 ? (paxisd + r2 - r1) / (2 * paxisd) : 0.f;
  205. float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk};
  206. float radius = paxisdr / 2;
  207. // iteratively adjust the sphere up until all points fit
  208. for (size_t i = 0; i < count; ++i)
  209. {
  210. const float* p = points + i * points_stride_float;
  211. float r = radii[i * radii_stride_float];
  212. float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
  213. float d = sqrtf(d2);
  214. if (d + r > radius)
  215. {
  216. float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f;
  217. center[0] += k * (p[0] - center[0]);
  218. center[1] += k * (p[1] - center[1]);
  219. center[2] += k * (p[2] - center[2]);
  220. radius = (radius + d + r) / 2;
  221. }
  222. }
  223. result[0] = center[0];
  224. result[1] = center[1];
  225. result[2] = center[2];
  226. result[3] = radius;
  227. }
  228. struct Cone
  229. {
  230. float px, py, pz;
  231. float nx, ny, nz;
  232. };
  233. static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)
  234. {
  235. float cone = 1.f - spread * cone_weight;
  236. float cone_clamped = cone < 1e-3f ? 1e-3f : cone;
  237. return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped;
  238. }
  239. static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
  240. {
  241. Cone result = acc;
  242. float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count);
  243. result.px *= center_scale;
  244. result.py *= center_scale;
  245. result.pz *= center_scale;
  246. float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz;
  247. float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length);
  248. result.nx *= axis_scale;
  249. result.ny *= axis_scale;
  250. result.nz *= axis_scale;
  251. return result;
  252. }
  253. static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
  254. {
  255. (void)vertex_count;
  256. size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
  257. size_t face_count = index_count / 3;
  258. float mesh_area = 0;
  259. for (size_t i = 0; i < face_count; ++i)
  260. {
  261. unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
  262. assert(a < vertex_count && b < vertex_count && c < vertex_count);
  263. const float* p0 = vertex_positions + vertex_stride_float * a;
  264. const float* p1 = vertex_positions + vertex_stride_float * b;
  265. const float* p2 = vertex_positions + vertex_stride_float * c;
  266. float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
  267. float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
  268. float normalx = p10[1] * p20[2] - p10[2] * p20[1];
  269. float normaly = p10[2] * p20[0] - p10[0] * p20[2];
  270. float normalz = p10[0] * p20[1] - p10[1] * p20[0];
  271. float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
  272. float invarea = (area == 0.f) ? 0.f : 1.f / area;
  273. triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f;
  274. triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f;
  275. triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f;
  276. triangles[i].nx = normalx * invarea;
  277. triangles[i].ny = normaly * invarea;
  278. triangles[i].nz = normalz * invarea;
  279. mesh_area += area;
  280. }
  281. return mesh_area;
  282. }
  283. static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)
  284. {
  285. short& av = used[a];
  286. short& bv = used[b];
  287. short& cv = used[c];
  288. bool result = false;
  289. int used_extra = (av < 0) + (bv < 0) + (cv < 0);
  290. if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split)
  291. {
  292. meshlets[meshlet_offset] = meshlet;
  293. for (size_t j = 0; j < meshlet.vertex_count; ++j)
  294. used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;
  295. meshlet.vertex_offset += meshlet.vertex_count;
  296. meshlet.triangle_offset += meshlet.triangle_count * 3;
  297. meshlet.vertex_count = 0;
  298. meshlet.triangle_count = 0;
  299. result = true;
  300. }
  301. if (av < 0)
  302. {
  303. av = short(meshlet.vertex_count);
  304. meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
  305. }
  306. if (bv < 0)
  307. {
  308. bv = short(meshlet.vertex_count);
  309. meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
  310. }
  311. if (cv < 0)
  312. {
  313. cv = short(meshlet.vertex_count);
  314. meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
  315. }
  316. meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av;
  317. meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv;
  318. meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv;
  319. meshlet.triangle_count++;
  320. return result;
  321. }
  322. static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight)
  323. {
  324. unsigned int best_triangle = ~0u;
  325. int best_priority = 5;
  326. float best_score = FLT_MAX;
  327. for (size_t i = 0; i < meshlet.vertex_count; ++i)
  328. {
  329. unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
  330. unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
  331. size_t neighbors_size = adjacency.counts[index];
  332. for (size_t j = 0; j < neighbors_size; ++j)
  333. {
  334. unsigned int triangle = neighbors[j];
  335. unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
  336. int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
  337. assert(extra <= 2);
  338. int priority = -1;
  339. // triangles that don't add new vertices to meshlets are max. priority
  340. if (extra == 0)
  341. priority = 0;
  342. // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
  343. else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
  344. priority = 1;
  345. // if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow
  346. else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2)
  347. priority = 1 + extra;
  348. // otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count
  349. else
  350. priority = 2 + extra;
  351. // since topology-based priority is always more important than the score, we can skip scoring in some cases
  352. if (priority > best_priority)
  353. continue;
  354. const Cone& tri_cone = triangles[triangle];
  355. float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;
  356. float distance = sqrtf(dx * dx + dy * dy + dz * dz);
  357. float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
  358. float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);
  359. // note that topology-based priority is always more important than the score
  360. // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
  361. if (priority < best_priority || score < best_score)
  362. {
  363. best_triangle = triangle;
  364. best_priority = priority;
  365. best_score = score;
  366. }
  367. }
  368. }
  369. return best_triangle;
  370. }
  371. static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
  372. {
  373. unsigned int best_seeds[kMeshletAddSeeds];
  374. unsigned int best_live[kMeshletAddSeeds];
  375. float best_score[kMeshletAddSeeds];
  376. for (size_t i = 0; i < kMeshletAddSeeds; ++i)
  377. {
  378. best_seeds[i] = ~0u;
  379. best_live[i] = ~0u;
  380. best_score[i] = FLT_MAX;
  381. }
  382. for (size_t i = 0; i < meshlet.vertex_count; ++i)
  383. {
  384. unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
  385. unsigned int best_neighbor = ~0u;
  386. unsigned int best_neighbor_live = ~0u;
  387. // find the neighbor with the smallest live metric
  388. unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
  389. size_t neighbors_size = adjacency.counts[index];
  390. for (size_t j = 0; j < neighbors_size; ++j)
  391. {
  392. unsigned int triangle = neighbors[j];
  393. unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
  394. unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
  395. if (live < best_neighbor_live)
  396. {
  397. best_neighbor = triangle;
  398. best_neighbor_live = live;
  399. }
  400. }
  401. // add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate
  402. if (best_neighbor == ~0u)
  403. continue;
  404. float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz;
  405. float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz);
  406. for (size_t j = 0; j < kMeshletAddSeeds; ++j)
  407. {
  408. // non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices)
  409. if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j]))
  410. {
  411. best_seeds[j] = best_neighbor;
  412. best_live[j] = best_neighbor_live;
  413. best_score[j] = best_neighbor_score;
  414. break;
  415. }
  416. }
  417. }
  418. // add surviving seeds to the meshlet
  419. size_t seed_count = 0;
  420. for (size_t i = 0; i < kMeshletAddSeeds; ++i)
  421. if (best_seeds[i] != ~0u)
  422. seeds[seed_count++] = best_seeds[i];
  423. return seed_count;
  424. }
  425. static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags)
  426. {
  427. size_t result = 0;
  428. for (size_t i = 0; i < seed_count; ++i)
  429. {
  430. unsigned int index = seeds[i];
  431. seeds[result] = index;
  432. result += emitted_flags[index] == 0;
  433. }
  434. return result;
  435. }
  436. static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
  437. {
  438. unsigned int best_seed = ~0u;
  439. unsigned int best_live = ~0u;
  440. float best_score = FLT_MAX;
  441. for (size_t i = 0; i < seed_count; ++i)
  442. {
  443. unsigned int index = seeds[i];
  444. unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
  445. unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
  446. float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz;
  447. float score = sqrtf(dx * dx + dy * dy + dz * dz);
  448. if (live < best_live || (live == best_live && score < best_score))
  449. {
  450. best_seed = index;
  451. best_live = live;
  452. best_score = score;
  453. }
  454. }
  455. return best_seed;
  456. }
// Node of a kd-tree that is stored as a flat array; children are addressed by relative offsets
struct KDNode
{
	// branches use split (coordinate of the splitting plane along axis)
	// leaf entries use index (index of the stored point)
	union
	{
		float split;
		unsigned int index;
	};

	// leaves: axis = 3, children = number of points including this one
	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
	// kdtreeNearest sets children to 0 to deactivate a node and returns immediately for such nodes
	unsigned int axis : 2;
	unsigned int children : 30;
};
  469. static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot)
  470. {
  471. size_t m = 0;
  472. // invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
  473. for (size_t i = 0; i < count; ++i)
  474. {
  475. float v = points[indices[i] * stride + axis];
  476. // swap(m, i) unconditionally
  477. unsigned int t = indices[m];
  478. indices[m] = indices[i];
  479. indices[i] = t;
  480. // when v >= pivot, we swap i with m without advancing it, preserving invariants
  481. m += v < pivot;
  482. }
  483. return m;
  484. }
  485. static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)
  486. {
  487. assert(offset + count <= node_count);
  488. (void)node_count;
  489. KDNode& result = nodes[offset];
  490. result.index = indices[0];
  491. result.axis = 3;
  492. result.children = unsigned(count);
  493. // all remaining points are stored in nodes immediately following the leaf
  494. for (size_t i = 1; i < count; ++i)
  495. {
  496. KDNode& tail = nodes[offset + i];
  497. tail.index = indices[i];
  498. tail.axis = 3;
  499. tail.children = ~0u >> 2; // bogus value to prevent misuse
  500. }
  501. return offset + count;
  502. }
  503. static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth)
  504. {
  505. assert(count > 0);
  506. assert(offset < node_count);
  507. if (count <= leaf_size)
  508. return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
  509. float mean[3] = {};
  510. float vars[3] = {};
  511. float runc = 1, runs = 1;
  512. // gather statistics on the points in the subtree using Welford's algorithm
  513. for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
  514. {
  515. const float* point = points + indices[i] * stride;
  516. for (int k = 0; k < 3; ++k)
  517. {
  518. float delta = point[k] - mean[k];
  519. mean[k] += delta * runs;
  520. vars[k] += delta * (point[k] - mean[k]);
  521. }
  522. }
  523. // split axis is one where the variance is largest
  524. int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
  525. float split = mean[axis];
  526. size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
  527. // when the partition is degenerate simply consolidate the points into a single node
  528. // this also ensures recursion depth is bounded on pathological inputs
  529. if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth)
  530. return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
  531. KDNode& result = nodes[offset];
  532. result.split = split;
  533. result.axis = axis;
  534. // left subtree is right after our node
  535. size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1);
  536. // distance to the right subtree is represented explicitly
  537. assert(next_offset - offset > 1);
  538. result.children = unsigned(next_offset - offset - 1);
  539. return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1);
  540. }
  541. static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
  542. {
  543. const KDNode& node = nodes[root];
  544. if (node.children == 0)
  545. return;
  546. if (node.axis == 3)
  547. {
  548. // leaf
  549. bool inactive = true;
  550. for (unsigned int i = 0; i < node.children; ++i)
  551. {
  552. unsigned int index = nodes[root + i].index;
  553. if (emitted_flags[index])
  554. continue;
  555. inactive = false;
  556. const float* point = points + index * stride;
  557. float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];
  558. float distance = sqrtf(dx * dx + dy * dy + dz * dz);
  559. if (distance < limit)
  560. {
  561. result = index;
  562. limit = distance;
  563. }
  564. }
  565. // deactivate leaves that no longer have items to emit
  566. if (inactive)
  567. nodes[root].children = 0;
  568. }
  569. else
  570. {
  571. // branch; we order recursion to process the node that search position is in first
  572. float delta = position[node.axis] - node.split;
  573. unsigned int first = (delta <= 0) ? 0 : node.children;
  574. unsigned int second = first ^ node.children;
  575. // deactivate branches that no longer have items to emit to accelerate traversal
  576. // note that we do this *before* recursing which delays deactivation but keeps tail calls
  577. if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0)
  578. nodes[root].children = 0;
  579. // recursion depth is bounded by tree depth (which is limited by construction)
  580. kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
  581. // only process the other node if it can have a match based on closest distance so far
  582. if (fabsf(delta) <= limit)
  583. kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
  584. }
  585. }
  // Accumulator box used by boxMerge: min/max carry a 4th element so that each array can be
  // loaded and stored as a full 4-float SIMD register; only the first three elements are used for the area.
  struct BVHBoxT
  {
      float min[4];
      float max[4];
  };
  // Per-triangle bounding box with three floats per corner.
  // The SIMD variants of boxMerge read 4 floats starting at min and at max, so the read from max runs past
  // the end of the struct; arrays of BVHBox are therefore allocated with one extra padding element.
  struct BVHBox
  {
      float min[3];
      float max[3];
  };
  // boxMerge grows the accumulator `box` so that it also encloses `other` and returns the sum of pairwise
  // products of the merged extents (x*y + y*z + z*x, i.e. half the surface area of the merged box).
  // Three implementations are provided: SSE, NEON and a scalar fallback.
  #if defined(SIMD_SSE)
  static float boxMerge(BVHBoxT& box, const BVHBox& other)
  {
      __m128 min = _mm_loadu_ps(box.min);
      __m128 max = _mm_loadu_ps(box.max);
      // note: over-read is safe because BVHBox array is allocated with padding
      min = _mm_min_ps(min, _mm_loadu_ps(other.min));
      max = _mm_max_ps(max, _mm_loadu_ps(other.max));
      _mm_storeu_ps(box.min, min);
      _mm_storeu_ps(box.max, max);
      // extents of the merged box; the 4th lane holds over-read data and is never used in the result
      __m128 size = _mm_sub_ps(max, min);
      // rotate lanes to (y, z, x, x) so that the product below yields (x*y, y*z, z*x, _)
      __m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1));
      __m128 mul = _mm_mul_ps(size, size_yzx);
      // horizontal sum of the first three lanes into lane 0
      __m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1)));
      __m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2)));
      return _mm_cvtss_f32(sum_xyz);
  }
  #elif defined(SIMD_NEON)
  static float boxMerge(BVHBoxT& box, const BVHBox& other)
  {
      float32x4_t min = vld1q_f32(box.min);
      float32x4_t max = vld1q_f32(box.max);
      // note: over-read is safe because BVHBox array is allocated with padding
      min = vminq_f32(min, vld1q_f32(other.min));
      max = vmaxq_f32(max, vld1q_f32(other.max));
      vst1q_f32(box.min, min);
      vst1q_f32(box.max, max);
      // extents of the merged box; the 4th lane holds over-read data and is never used in the result
      float32x4_t size = vsubq_f32(max, min);
      // two extracts rotate the lanes to (y, z, x, y) so that the product below yields (x*y, y*z, z*x, _)
      float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2);
      float32x4_t mul = vmulq_f32(size, size_yzx);
      float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1);
      float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2);
      return sum_xyz;
  }
  #else
  static float boxMerge(BVHBoxT& box, const BVHBox& other)
  {
      // component-wise min/max over the three axes
      for (int k = 0; k < 3; ++k)
      {
          box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];
          box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];
      }
      float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
      return sx * sy + sx * sz + sy * sz;
  }
  #endif
  // Converts the raw bit pattern of a float into an unsigned key for radix sorting:
  // the keys compare as unsigned integers in the same order as the original floats.
  inline unsigned int radixFloat(unsigned int v)
  {
      // the arithmetic shift replicates the sign bit: all ones for a set sign bit, zero otherwise
      unsigned int sign_fill = unsigned(int(v) >> 31);

      // sign bit clear: flip only the sign bit; sign bit set: flip every bit
      unsigned int flip = sign_fill | 0x80000000;

      return v ^ flip;
  }
  649. static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count)
  650. {
  651. memset(hist, 0, sizeof(hist));
  652. const unsigned int* bits = reinterpret_cast<const unsigned int*>(data);
  653. // compute 3 10-bit histograms in parallel (dropping 2 LSB)
  654. for (size_t i = 0; i < count; ++i)
  655. {
  656. unsigned int id = radixFloat(bits[i]);
  657. hist[(id >> 2) & 1023][0]++;
  658. hist[(id >> 12) & 1023][1]++;
  659. hist[(id >> 22) & 1023][2]++;
  660. }
  661. unsigned int sum0 = 0, sum1 = 0, sum2 = 0;
  662. // replace histogram data with prefix histogram sums in-place
  663. for (int i = 0; i < 1024; ++i)
  664. {
  665. unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
  666. hist[i][0] = sum0;
  667. hist[i][1] = sum1;
  668. hist[i][2] = sum2;
  669. sum0 += hx;
  670. sum1 += hy;
  671. sum2 += hz;
  672. }
  673. assert(sum0 == count && sum1 == count && sum2 == count);
  674. }
  675. static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
  676. {
  677. const unsigned int* bits = reinterpret_cast<const unsigned int*>(keys);
  678. int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes
  679. for (size_t i = 0; i < count; ++i)
  680. {
  681. unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023;
  682. destination[hist[id][pass]++] = source[i];
  683. }
  684. }
  685. static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float)
  686. {
  687. (void)vertex_count;
  688. for (size_t i = 0; i < face_count; ++i)
  689. {
  690. unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
  691. assert(a < vertex_count && b < vertex_count && c < vertex_count);
  692. const float* va = vertex_positions + vertex_stride_float * a;
  693. const float* vb = vertex_positions + vertex_stride_float * b;
  694. const float* vc = vertex_positions + vertex_stride_float * c;
  695. BVHBox& box = boxes[i];
  696. for (int k = 0; k < 3; ++k)
  697. {
  698. box.min[k] = va[k] < vb[k] ? va[k] : vb[k];
  699. box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k];
  700. box.max[k] = va[k] > vb[k] ? va[k] : vb[k];
  701. box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k];
  702. centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f;
  703. }
  704. }
  705. }
  // Counts the vertices referenced by the triangles order[0..count) that are not yet marked in used[].
  // used[] must hold negative values for all referenced vertices on entry and is restored to -1 for them on exit.
  // When out is not NULL, out[i] receives the running count after triangle i has been processed.
  // Note that each of the three corners is tested before any of them is marked, so a triangle that repeats
  // an unmarked vertex contributes once per repeated corner.
  static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL)
  {
      size_t unique = 0;

      // count pass: mark vertices as they are first seen
      for (size_t i = 0; i < count; ++i)
      {
          unsigned int tri = order[i];
          unsigned int a = indices[tri * 3 + 0], b = indices[tri * 3 + 1], c = indices[tri * 3 + 2];

          if (used[a] < 0)
              unique++;
          if (used[b] < 0)
              unique++;
          if (used[c] < 0)
              unique++;

          used[c] = 1;
          used[b] = 1;
          used[a] = 1;

          if (out)
              out[i] = unsigned(unique);
      }

      // cleanup pass: reset used[] for future invocations
      for (size_t i = 0; i < count; ++i)
      {
          unsigned int tri = order[i];
          unsigned int a = indices[tri * 3 + 0], b = indices[tri * 3 + 1], c = indices[tri * 3 + 2];

          used[c] = -1;
          used[b] = -1;
          used[a] = -1;
      }

      return unique;
  }
  // Marks boundary[0..count) as a single meshlet: the first entry is set to 1 and the rest to 0.
  // count must be positive.
  static void bvhPackLeaf(unsigned char* boundary, size_t count)
  {
      assert(count > 0);

      // a 1 marks the start of a meshlet for future reassembly; the remaining triangles continue it
      unsigned char* rest = boundary + 1;

      *boundary = 1;
      memset(rest, 0, count - 1);
  }
  735. static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
  736. {
  737. for (size_t i = 0; i < count;)
  738. {
  739. size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
  740. if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices)
  741. {
  742. bvhPackLeaf(boundary + i, chunk);
  743. i += chunk;
  744. continue;
  745. }
  746. // chunk is vertex bound, split it into smaller meshlets
  747. assert(chunk > max_vertices / 3);
  748. bvhPackLeaf(boundary + i, max_vertices / 3);
  749. i += max_vertices / 3;
  750. }
  751. }
  // Returns true if count can be written as a sum of k values that each lie in [min..max], for some k.
  static bool bvhDivisible(size_t count, size_t min, size_t max)
  {
      // common case: when min <= max/2 the check reduces to count >= min, which avoids integer divisions
      if (min * 2 <= max)
          return count >= min;

      // count is representable if it is in range of [k*min..k*min+k*(max-min)] with k = count / min
      // equivalent to ceil(count / max) <= floor(count / min), but this form allows using idiv (see nv_cluster_builder)
      size_t groups = count / min;
      size_t leftover = count % min;

      return leftover <= groups * (max - min);
  }
  759. static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count)
  760. {
  761. BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}};
  762. BVHBoxT accumr = accuml;
  763. for (size_t i = 0; i < count; ++i)
  764. {
  765. float larea = boxMerge(accuml, boxes[order[i]]);
  766. float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]);
  767. areas[i] = larea;
  768. areas[i + count] = rarea;
  769. }
  770. }
  771. static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost)
  772. {
  773. bool aligned = count >= min * 2 && bvhDivisible(count, min, max);
  774. size_t end = aligned ? count - min : count - 1;
  775. float rmaxfill = 1.f / float(int(maxfill));
  776. // find best split that minimizes SAH
  777. size_t bestsplit = 0;
  778. float bestcost = FLT_MAX;
  779. for (size_t i = min - 1; i < end; i += step)
  780. {
  781. size_t lsplit = i + 1, rsplit = count - (i + 1);
  782. if (!bvhDivisible(lsplit, min, max))
  783. continue;
  784. if (aligned && !bvhDivisible(rsplit, min, max))
  785. continue;
  786. // areas[x] = inclusive surface area of boxes[0..x]
  787. // areas[count-1-x] = inclusive surface area of boxes[x..count-1]
  788. float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count];
  789. float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));
  790. if (cost > bestcost)
  791. continue;
  792. // use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count
  793. // using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice
  794. size_t lfill = vertices ? vertices[i] : lsplit;
  795. size_t rfill = vertices ? vertices[i] : rsplit;
  796. // fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo
  797. int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill);
  798. int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill);
  799. cost += fill * (float(lrest) * larea + float(rrest) * rarea);
  800. if (cost < bestcost)
  801. {
  802. bestcost = cost;
  803. bestsplit = i + 1;
  804. }
  805. }
  806. *out_cost = bestcost;
  807. return bestsplit;
  808. }
  // Stable two-way partition: copies order[0..count) into target so that ids with sides[id] == 0 occupy
  // target[0..split) and ids with sides[id] == 1 occupy target[split..count), each group keeping its relative order.
  // sides is indexed by the id stored in order, not by position.
  static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
  {
      // write cursors: [0] fills the left range starting at 0, [1] fills the right range starting at split
      size_t cursor[2] = {0, split};

      for (size_t i = 0; i < count; ++i)
      {
          unsigned int id = order[i];
          unsigned char side = sides[id];

          target[cursor[side ? 1 : 0]] = id;

          // branchless advance: exactly one of the two cursors moves when side is 0 or 1
          cursor[0] += 1;
          cursor[0] -= side;
          cursor[1] += side;
      }

      assert(cursor[0] == split && cursor[1] == count);
  }
  // Recursively splits a range of `count` triangles into meshlet-sized leaves and marks the first triangle of
  // every leaf in boundary[] (indexed by position within orderx).
  // orderx/ordery/orderz hold the same set of triangle ids sorted along each axis; they are partitioned in place
  // so that both halves of a split stay sorted along every axis.
  // scratch is reused for several purposes within one call (areas + vertex counts, then partition temp + sides),
  // so the statement order below matters. used[] is the vertex marker array expected by bvhCountVertices.
  // depth is the current recursion depth; once it reaches kMeshletMaxTreeDepth the range is packed linearly.
  static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
  {
      // the range already satisfies both limits: emit it as a single meshlet
      if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)
          return bvhPackLeaf(boundary, count);
      unsigned int* axes[3] = {orderx, ordery, orderz};
      // we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max
      size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1;
      // if we could not pack the meshlet, we must be vertex bound
      size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;
      size_t maxfill = count <= max_triangles ? max_vertices : max_triangles;
      // find best split that minimizes SAH
      int bestk = -1;
      size_t bestsplit = 0;
      float bestcost = FLT_MAX;
      for (int k = 0; k < 3; ++k)
      {
          // areas occupies the first 2 * count floats of scratch and is recomputed for every axis
          float* areas = static_cast<float*>(scratch);
          unsigned int* vertices = NULL;
          bvhComputeArea(areas, boxes, axes[k], count);
          if (count <= max_triangles)
          {
              // for vertex bound clusters, count number of unique vertices for each split
              // the counts are stored in scratch right after the two area arrays
              vertices = reinterpret_cast<unsigned int*>(areas + 2 * count);
              bvhCountVertices(axes[k], count, used, indices, vertices);
          }
          float axiscost = FLT_MAX;
          size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost);
          // bvhPivot returns 0 when no split was selected on this axis
          if (axissplit && axiscost < bestcost)
          {
              bestk = k;
              bestcost = axiscost;
              bestsplit = axissplit;
          }
      }
      // this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs
      if (bestk < 0 || depth >= kMeshletMaxTreeDepth)
          return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
      // mark sides of split for partitioning
      // sides starts after the first count unsigned ints of scratch (reserved for temp below) and is indexed by triangle id
      unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
      for (size_t i = 0; i < bestsplit; ++i)
          sides[axes[bestk][i]] = 0;
      for (size_t i = bestsplit; i < count; ++i)
          sides[axes[bestk][i]] = 1;
      // partition all axes into two sides, maintaining order
      // the best axis is already split at bestsplit by construction and is skipped
      unsigned int* temp = static_cast<unsigned int*>(scratch);
      for (int k = 0; k < 3; ++k)
      {
          if (k == bestk)
              continue;
          unsigned int* axis = axes[k];
          memcpy(temp, axis, sizeof(unsigned int) * count);
          bvhPartition(axis, temp, sides, bestsplit, count);
      }
      // recursion depth is bounded due to max depth check above
      // left half first, then right half; both reuse the same scratch buffer
      bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
      bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
  }
  879. } // namespace meshopt
  880. size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
  881. {
  882. using namespace meshopt;
  883. assert(index_count % 3 == 0);
  884. assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
  885. assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
  886. (void)kMeshletMaxVertices;
  887. (void)kMeshletMaxTriangles;
  888. // meshlet construction is limited by max vertices and max triangles per meshlet
  889. // the worst case is that the input is an unindexed stream since this equally stresses both limits
  890. // note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle
  891. size_t max_vertices_conservative = max_vertices - 2;
  892. size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative;
  893. size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;
  894. return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
  895. }
  // Builds meshlets from an indexed triangle list by growing each meshlet through triangle adjacency and
  // falling back to a kd-tree nearest-triangle search when no adjacent candidate is available.
  // - meshlets, meshlet_vertices, meshlet_triangles: output arrays filled through appendMeshlet; the final assert
  //   checks the resulting count against meshopt_buildMeshletsBound computed with min_triangles
  // - vertex_positions_stride: in bytes; must be a multiple of sizeof(float) in [12..256]
  // - max_vertices, min_triangles, max_triangles: meshlet limits, validated by the asserts below
  // - cone_weight: must be in [0..1]; forwarded to getNeighborTriangle
  // - split_factor: when > 0, a meshlet that already has min_triangles triangles is split early if the nearest
  //   remaining triangle is farther away than meshlet_expected_radius * split_factor
  // Returns the number of meshlets written; 0 when index_count is 0.
  size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
  {
      using namespace meshopt;
      assert(index_count % 3 == 0);
      assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
      assert(vertex_positions_stride % sizeof(float) == 0);
      assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
      assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
      assert(cone_weight >= 0 && cone_weight <= 1);
      assert(split_factor >= 0);
      if (index_count == 0)
          return 0;
      meshopt_Allocator allocator;
      TriangleAdjacency2 adjacency = {};
      // the sparse builder is chosen when there are more vertices than indices
      if (vertex_count > index_count && index_count < (1u << 31))
          buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);
      else
          buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
      // live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
      unsigned int* live_triangles = adjacency.counts;
      size_t face_count = index_count / 3;
      unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
      memset(emitted_flags, 0, face_count);
      // for each triangle, precompute centroid & normal to use for scoring
      Cone* triangles = allocator.allocate<Cone>(face_count);
      float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
      // assuming each meshlet is a square patch, expected radius is sqrt(expected area)
      float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f;
      float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;
      // build a kd-tree for nearest neighbor lookup
      unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);
      for (size_t i = 0; i < face_count; ++i)
          kdindices[i] = unsigned(i);
      KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
      kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0);
      // find a specific corner of the mesh to use as a starting point for meshlet flow
      // the corner is the component-wise minimum of all triangle positions
      float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;
      for (size_t i = 0; i < face_count; ++i)
      {
          const Cone& tri = triangles[i];
          cornerx = cornerx > tri.px ? tri.px : cornerx;
          cornery = cornery > tri.py ? tri.py : cornery;
          cornerz = cornerz > tri.pz ? tri.pz : cornerz;
      }
      // index of the vertex in the meshlet, -1 if the vertex isn't used
      short* used = allocator.allocate<short>(vertex_count);
      clearUsed(used, vertex_count, indices, index_count);
      // initial seed triangle is the one closest to the corner
      unsigned int initial_seed = ~0u;
      float initial_score = FLT_MAX;
      for (size_t i = 0; i < face_count; ++i)
      {
          const Cone& tri = triangles[i];
          float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz;
          float score = sqrtf(dx * dx + dy * dy + dz * dz);
          // the first triangle is always accepted so that a seed exists even if no score compares below FLT_MAX
          if (initial_seed == ~0u || score < initial_score)
          {
              initial_seed = unsigned(i);
              initial_score = score;
          }
      }
      // seed triangles to continue meshlet flow
      unsigned int seeds[kMeshletMaxSeeds] = {};
      size_t seed_count = 0;
      meshopt_Meshlet meshlet = {};
      size_t meshlet_offset = 0;
      Cone meshlet_cone_acc = {};
      // each iteration emits exactly one triangle; the loop ends when no unemitted triangle can be found
      for (;;)
      {
          Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);
          unsigned int best_triangle = ~0u;
          // for the first triangle, we don't have a meshlet cone yet, so we use the initial seed
          // to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring
          if (meshlet_offset == 0 && meshlet.triangle_count == 0)
              best_triangle = initial_seed;
          else
              best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);
          bool split = false;
          // when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
          if (best_triangle == ~0u)
          {
              float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
              unsigned int index = ~0u;
              float distance = FLT_MAX;
              kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance);
              best_triangle = index;
              // request an early split if the meshlet is large enough and the nearest triangle is too far away
              split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;
          }
          // the spatial search found nothing either: all triangles have been emitted
          if (best_triangle == ~0u)
              break;
          // number of vertices of the candidate that are not part of the current meshlet yet
          int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0);
          // if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow
          if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
          {
              seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags);
              // make room for up to kMeshletAddSeeds new seeds by truncating the list if necessary
              seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds;
              seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz);
              unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz);
              // we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency
              best_triangle = best_seed != ~0u ? best_seed : best_triangle;
          }
          unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
          assert(a < vertex_count && b < vertex_count && c < vertex_count);
          // add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
          if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split))
          {
              meshlet_offset++;
              memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
          }
          // remove emitted triangle from adjacency data
          // this makes sure that we spend less time traversing these lists on subsequent iterations
          // live triangle counts are updated as a byproduct of these adjustments
          for (size_t k = 0; k < 3; ++k)
          {
              unsigned int index = indices[best_triangle * 3 + k];
              unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
              size_t neighbors_size = adjacency.counts[index];
              for (size_t i = 0; i < neighbors_size; ++i)
              {
                  unsigned int tri = neighbors[i];
                  if (tri == best_triangle)
                  {
                      // swap-remove: overwrite the entry with the last one and shrink the list
                      neighbors[i] = neighbors[neighbors_size - 1];
                      adjacency.counts[index]--;
                      break;
                  }
              }
          }
          // update aggregated meshlet cone data for scoring subsequent triangles
          meshlet_cone_acc.px += triangles[best_triangle].px;
          meshlet_cone_acc.py += triangles[best_triangle].py;
          meshlet_cone_acc.pz += triangles[best_triangle].pz;
          meshlet_cone_acc.nx += triangles[best_triangle].nx;
          meshlet_cone_acc.ny += triangles[best_triangle].ny;
          meshlet_cone_acc.nz += triangles[best_triangle].nz;
          assert(!emitted_flags[best_triangle]);
          emitted_flags[best_triangle] = 1;
      }
      // flush the last, partially filled meshlet
      if (meshlet.triangle_count)
          meshlets[meshlet_offset++] = meshlet;
      assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));
      assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
      return meshlet_offset;
  }
  1040. size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
  1041. {
  1042. return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);
  1043. }
  1044. size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
  1045. {
  1046. using namespace meshopt;
  1047. assert(index_count % 3 == 0);
  1048. assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
  1049. assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
  1050. meshopt_Allocator allocator;
  1051. // index of the vertex in the meshlet, -1 if the vertex isn't used
  1052. short* used = allocator.allocate<short>(vertex_count);
  1053. clearUsed(used, vertex_count, indices, index_count);
  1054. meshopt_Meshlet meshlet = {};
  1055. size_t meshlet_offset = 0;
  1056. for (size_t i = 0; i < index_count; i += 3)
  1057. {
  1058. unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
  1059. assert(a < vertex_count && b < vertex_count && c < vertex_count);
  1060. // appends triangle to the meshlet and writes previous meshlet to the output if full
  1061. meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles);
  1062. }
  1063. if (meshlet.triangle_count)
  1064. meshlets[meshlet_offset++] = meshlet;
  1065. assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
  1066. assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
  1067. return meshlet_offset;
  1068. }
  // Builds meshlets by sorting triangles by centroid along each axis, recursively splitting the sorted ranges
  // with bvhSplit, and then packing triangles into meshlets in the resulting order.
  // - meshlets, meshlet_vertices, meshlet_triangles: output arrays filled through appendMeshlet
  // - vertex_positions_stride: in bytes; must be a multiple of sizeof(float) in [12..256]
  // - max_vertices, min_triangles, max_triangles: meshlet limits, validated by the asserts below
  // - fill_weight: forwarded to bvhSplit, where it scales the fill penalty of candidate splits
  // Returns the number of meshlets written; 0 when index_count is 0.
  size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
  {
      using namespace meshopt;
      assert(index_count % 3 == 0);
      assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
      assert(vertex_positions_stride % sizeof(float) == 0);
      assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
      assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
      if (index_count == 0)
          return 0;
      size_t face_count = index_count / 3;
      size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
      meshopt_Allocator allocator;
      // 3 floats plus 1 uint for sorting, or
      // 2 floats plus 1 uint for pivoting, or
      // 1 uint plus 1 byte for partitioning
      float* scratch = allocator.allocate<float>(face_count * 4);
      // compute bounding boxes and centroids for sorting
      // centroids are written to the first 3 * face_count floats of scratch, one array per axis
      BVHBox* boxes = allocator.allocate<BVHBox>(face_count + 1); // padding for SIMD
      bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);
      // zero the padding element so that SIMD over-reads past the last box see defined data
      memset(boxes + face_count, 0, sizeof(BVHBox));
      unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);
      // temp occupies the last quarter of scratch, after the three centroid arrays
      unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;
      for (int k = 0; k < 3; ++k)
      {
          unsigned int* order = axes + k * face_count;
          const float* keys = scratch + k * face_count;
          unsigned int hist[1024][3];
          computeHistogram(hist, keys, face_count);
          // 3-pass radix sort computes the resulting order into axes
          for (size_t i = 0; i < face_count; ++i)
              temp[i] = unsigned(i);
          // the passes ping-pong between temp and order so that the final result lands in order
          radixPass(order, temp, keys, face_count, hist, 0);
          radixPass(temp, order, keys, face_count, hist, 1);
          radixPass(order, temp, keys, face_count, hist, 2);
      }
      // index of the vertex in the meshlet, -1 if the vertex isn't used
      short* used = allocator.allocate<short>(vertex_count);
      clearUsed(used, vertex_count, indices, index_count);
      unsigned char* boundary = allocator.allocate<unsigned char>(face_count);
      // from here on the centroids are no longer needed and scratch is reused by bvhSplit
      bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
      // compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound
      size_t meshlet_count = 0;
      for (size_t i = 0; i < face_count; ++i)
      {
          assert(boundary[i] <= 1);
          meshlet_count += boundary[i];
      }
      size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);
      // pack triangles into meshlets according to the order and boundaries marked by bvhSplit
      meshopt_Meshlet meshlet = {};
      size_t meshlet_offset = 0;
      // number of boundary marks that have not been consumed yet
      size_t meshlet_pending = meshlet_count;
      for (size_t i = 0; i < face_count; ++i)
      {
          assert(boundary[i] <= 1);
          // the mark on the very first triangle does not start a new meshlet since nothing precedes it
          bool split = i > 0 && boundary[i] == 1;
          // while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space
          if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound)
              split = false;
          // triangles are consumed in the order of the first (X) axis array after bvhSplit has partitioned it
          unsigned int index = axes[i];
          assert(index < face_count);
          unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
          // appends triangle to the meshlet and writes previous meshlet to the output if full
          meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);
          meshlet_pending -= boundary[i];
      }
      // flush the last, partially filled meshlet
      if (meshlet.triangle_count)
          meshlets[meshlet_offset++] = meshlet;
      assert(meshlet_offset <= meshlet_bound);
      assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
      return meshlet_offset;
  }
  1142. meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
  1143. {
  1144. using namespace meshopt;
  1145. assert(index_count % 3 == 0);
  1146. assert(index_count / 3 <= kMeshletMaxTriangles);
  1147. assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
  1148. assert(vertex_positions_stride % sizeof(float) == 0);
  1149. (void)vertex_count;
  1150. size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
  1151. // compute triangle normals and gather triangle corners
  1152. float normals[kMeshletMaxTriangles][3];
  1153. float corners[kMeshletMaxTriangles][3][3];
  1154. size_t triangles = 0;
  1155. for (size_t i = 0; i < index_count; i += 3)
  1156. {
  1157. unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
  1158. assert(a < vertex_count && b < vertex_count && c < vertex_count);
  1159. const float* p0 = vertex_positions + vertex_stride_float * a;
  1160. const float* p1 = vertex_positions + vertex_stride_float * b;
  1161. const float* p2 = vertex_positions + vertex_stride_float * c;
  1162. float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
  1163. float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
  1164. float normalx = p10[1] * p20[2] - p10[2] * p20[1];
  1165. float normaly = p10[2] * p20[0] - p10[0] * p20[2];
  1166. float normalz = p10[0] * p20[1] - p10[1] * p20[0];
  1167. float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
  1168. // no need to include degenerate triangles - they will be invisible anyway
  1169. if (area == 0.f)
  1170. continue;
  1171. // record triangle normals & corners for future use; normal and corner 0 define a plane equation
  1172. normals[triangles][0] = normalx / area;
  1173. normals[triangles][1] = normaly / area;
  1174. normals[triangles][2] = normalz / area;
  1175. memcpy(corners[triangles][0], p0, 3 * sizeof(float));
  1176. memcpy(corners[triangles][1], p1, 3 * sizeof(float));
  1177. memcpy(corners[triangles][2], p2, 3 * sizeof(float));
  1178. triangles++;
  1179. }
  1180. meshopt_Bounds bounds = {};
  1181. // degenerate cluster, no valid triangles => trivial reject (cone data is 0)
  1182. if (triangles == 0)
  1183. return bounds;
  1184. const float rzero = 0.f;
  1185. // compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
  1186. float psphere[4] = {};
  1187. computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7);
  1188. float center[3] = {psphere[0], psphere[1], psphere[2]};
  1189. // treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
  1190. float nsphere[4] = {};
  1191. computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3);
  1192. float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
  1193. float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
  1194. float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;
  1195. axis[0] *= invaxislength;
  1196. axis[1] *= invaxislength;
  1197. axis[2] *= invaxislength;
  1198. // compute a tight cone around all normals, mindp = cos(angle/2)
  1199. float mindp = 1.f;
  1200. for (size_t i = 0; i < triangles; ++i)
  1201. {
  1202. float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];
  1203. mindp = (dp < mindp) ? dp : mindp;
  1204. }
  1205. // fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones
  1206. bounds.center[0] = center[0];
  1207. bounds.center[1] = center[1];
  1208. bounds.center[2] = center[2];
  1209. bounds.radius = psphere[3];
  1210. // degenerate cluster, normal cone is larger than a hemisphere => trivial accept
  1211. // note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
  1212. // we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
  1213. if (mindp <= 0.1f)
  1214. {
  1215. bounds.cone_cutoff = 1;
  1216. bounds.cone_cutoff_s8 = 127;
  1217. return bounds;
  1218. }
  1219. float maxt = 0;
  1220. // we need to find the point on center-t*axis ray that lies in negative half-space of all triangles
  1221. for (size_t i = 0; i < triangles; ++i)
  1222. {
  1223. // dot(center-t*axis-corner, trinormal) = 0
  1224. // dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
  1225. float cx = center[0] - corners[i][0][0];
  1226. float cy = center[1] - corners[i][0][1];
  1227. float cz = center[2] - corners[i][0][2];
  1228. float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];
  1229. float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];
  1230. // dn should be larger than mindp cutoff above
  1231. assert(dn > 0.f);
  1232. float t = dc / dn;
  1233. maxt = (t > maxt) ? t : maxt;
  1234. }
  1235. // cone apex should be in the negative half-space of all cluster triangles by construction
  1236. bounds.cone_apex[0] = center[0] - axis[0] * maxt;
  1237. bounds.cone_apex[1] = center[1] - axis[1] * maxt;
  1238. bounds.cone_apex[2] = center[2] - axis[2] * maxt;
  1239. // note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
  1240. bounds.cone_axis[0] = axis[0];
  1241. bounds.cone_axis[1] = axis[1];
  1242. bounds.cone_axis[2] = axis[2];
  1243. // cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
  1244. // which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
  1245. bounds.cone_cutoff = sqrtf(1 - mindp * mindp);
  1246. // quantize axis & cutoff to 8-bit SNORM format
  1247. bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));
  1248. bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));
  1249. bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));
  1250. // for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error
  1251. float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);
  1252. float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);
  1253. float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);
  1254. // note that we need to round this up instead of rounding to nearest, hence +1
  1255. int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);
  1256. bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8);
  1257. return bounds;
  1258. }
  1259. meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
  1260. {
  1261. using namespace meshopt;
  1262. assert(triangle_count <= kMeshletMaxTriangles);
  1263. assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
  1264. assert(vertex_positions_stride % sizeof(float) == 0);
  1265. unsigned int indices[kMeshletMaxTriangles * 3];
  1266. for (size_t i = 0; i < triangle_count * 3; ++i)
  1267. {
  1268. unsigned int index = meshlet_vertices[meshlet_triangles[i]];
  1269. assert(index < vertex_count);
  1270. indices[i] = index;
  1271. }
  1272. return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
  1273. }
  1274. meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride)
  1275. {
  1276. using namespace meshopt;
  1277. assert(positions_stride >= 12 && positions_stride <= 256);
  1278. assert(positions_stride % sizeof(float) == 0);
  1279. assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL);
  1280. assert(radii_stride % sizeof(float) == 0);
  1281. meshopt_Bounds bounds = {};
  1282. if (count == 0)
  1283. return bounds;
  1284. const float rzero = 0.f;
  1285. float psphere[4] = {};
  1286. computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7);
  1287. bounds.center[0] = psphere[0];
  1288. bounds.center[1] = psphere[1];
  1289. bounds.center[2] = psphere[2];
  1290. bounds.radius = psphere[3];
  1291. return bounds;
  1292. }
  1293. void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)
  1294. {
  1295. using namespace meshopt;
  1296. assert(triangle_count <= kMeshletMaxTriangles);
  1297. assert(vertex_count <= kMeshletMaxVertices);
  1298. unsigned char* indices = meshlet_triangles;
  1299. unsigned int* vertices = meshlet_vertices;
  1300. // cache tracks vertex timestamps (corresponding to triangle index! all 3 vertices are added at the same time and never removed)
  1301. unsigned char cache[kMeshletMaxVertices];
  1302. memset(cache, 0, vertex_count);
  1303. // note that we start from a value that means all vertices aren't in cache
  1304. unsigned char cache_last = 128;
  1305. const unsigned char cache_cutoff = 3; // 3 triangles = ~5..9 vertices depending on reuse
  1306. for (size_t i = 0; i < triangle_count; ++i)
  1307. {
  1308. int next = -1;
  1309. int next_match = -1;
  1310. for (size_t j = i; j < triangle_count; ++j)
  1311. {
  1312. unsigned char a = indices[j * 3 + 0], b = indices[j * 3 + 1], c = indices[j * 3 + 2];
  1313. assert(a < vertex_count && b < vertex_count && c < vertex_count);
  1314. // score each triangle by how many vertices are in cache
  1315. // note: the distance is computed using unsigned 8-bit values, so cache timestamp overflow is handled gracefully
  1316. int aok = (unsigned char)(cache_last - cache[a]) < cache_cutoff;
  1317. int bok = (unsigned char)(cache_last - cache[b]) < cache_cutoff;
  1318. int cok = (unsigned char)(cache_last - cache[c]) < cache_cutoff;
  1319. if (aok + bok + cok > next_match)
  1320. {
  1321. next = (int)j;
  1322. next_match = aok + bok + cok;
  1323. // note that we could end up with all 3 vertices in the cache, but 2 is enough for ~strip traversal
  1324. if (next_match >= 2)
  1325. break;
  1326. }
  1327. }
  1328. assert(next >= 0);
  1329. unsigned char a = indices[next * 3 + 0], b = indices[next * 3 + 1], c = indices[next * 3 + 2];
  1330. // shift triangles before the next one forward so that we always keep an ordered partition
  1331. // note: this could have swapped triangles [i] and [next] but that distorts the order and may skew the output sequence
  1332. memmove(indices + (i + 1) * 3, indices + i * 3, (next - i) * 3 * sizeof(unsigned char));
  1333. indices[i * 3 + 0] = a;
  1334. indices[i * 3 + 1] = b;
  1335. indices[i * 3 + 2] = c;
  1336. // cache timestamp is the same between all vertices of each triangle to reduce overflow
  1337. cache_last++;
  1338. cache[a] = cache_last;
  1339. cache[b] = cache_last;
  1340. cache[c] = cache_last;
  1341. }
  1342. // reorder meshlet vertices for access locality assuming index buffer is scanned sequentially
  1343. unsigned int order[kMeshletMaxVertices];
  1344. short remap[kMeshletMaxVertices];
  1345. memset(remap, -1, vertex_count * sizeof(short));
  1346. size_t vertex_offset = 0;
  1347. for (size_t i = 0; i < triangle_count * 3; ++i)
  1348. {
  1349. short& r = remap[indices[i]];
  1350. if (r < 0)
  1351. {
  1352. r = short(vertex_offset);
  1353. order[vertex_offset] = vertices[indices[i]];
  1354. vertex_offset++;
  1355. }
  1356. indices[i] = (unsigned char)r;
  1357. }
  1358. assert(vertex_offset <= vertex_count);
  1359. memcpy(vertices, order, vertex_offset * sizeof(unsigned int));
  1360. }
  1361. #undef SIMD_SSE
  1362. #undef SIMD_NEON