vcacheoptimizer.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
  2. #include "meshoptimizer.h"
  3. #include <assert.h>
  4. #include <string.h>
  5. // This work is based on:
  6. // Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
  7. // Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
  8. namespace meshopt
  9. {
// Maximum simulated cache size the score tables below support (positions 0..15)
const size_t kCacheSizeMax = 16;
// Maximum vertex valence (count of not-yet-emitted triangles) the live[] table distinguishes
const size_t kValenceMax = 8;
// Score lookup tables for the greedy optimizer:
// cache[] is indexed by (1 + cache position), where position -1 means "not in cache";
// live[] is indexed by the vertex's remaining triangle count, clamped to kValenceMax.
struct VertexScoreTable
{
float cache[1 + kCacheSizeMax];
float live[1 + kValenceMax];
};
// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
static const VertexScoreTable kVertexScoreTable = {
{0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
{0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
};
// Tuned to minimize the encoded index buffer size
static const VertexScoreTable kVertexScoreTableStrip = {
{0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
{0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
};
  27. struct TriangleAdjacency
  28. {
  29. unsigned int* counts;
  30. unsigned int* offsets;
  31. unsigned int* data;
  32. };
  33. static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
  34. {
  35. size_t face_count = index_count / 3;
  36. // allocate arrays
  37. adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
  38. adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
  39. adjacency.data = allocator.allocate<unsigned int>(index_count);
  40. // fill triangle counts
  41. memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
  42. for (size_t i = 0; i < index_count; ++i)
  43. {
  44. assert(indices[i] < vertex_count);
  45. adjacency.counts[indices[i]]++;
  46. }
  47. // fill offset table
  48. unsigned int offset = 0;
  49. for (size_t i = 0; i < vertex_count; ++i)
  50. {
  51. adjacency.offsets[i] = offset;
  52. offset += adjacency.counts[i];
  53. }
  54. assert(offset == index_count);
  55. // fill triangle data
  56. for (size_t i = 0; i < face_count; ++i)
  57. {
  58. unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
  59. adjacency.data[adjacency.offsets[a]++] = unsigned(i);
  60. adjacency.data[adjacency.offsets[b]++] = unsigned(i);
  61. adjacency.data[adjacency.offsets[c]++] = unsigned(i);
  62. }
  63. // fix offsets that have been disturbed by the previous pass
  64. for (size_t i = 0; i < vertex_count; ++i)
  65. {
  66. assert(adjacency.offsets[i] >= adjacency.counts[i]);
  67. adjacency.offsets[i] -= adjacency.counts[i];
  68. }
  69. }
  70. static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
  71. {
  72. // check dead-end stack
  73. while (dead_end_top)
  74. {
  75. unsigned int vertex = dead_end[--dead_end_top];
  76. if (live_triangles[vertex] > 0)
  77. return vertex;
  78. }
  79. // input order
  80. while (input_cursor < vertex_count)
  81. {
  82. if (live_triangles[input_cursor] > 0)
  83. return input_cursor;
  84. ++input_cursor;
  85. }
  86. return ~0u;
  87. }
  88. static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
  89. {
  90. unsigned int best_candidate = ~0u;
  91. int best_priority = -1;
  92. for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate)
  93. {
  94. unsigned int vertex = *next_candidate;
  95. // otherwise we don't need to process it
  96. if (live_triangles[vertex] > 0)
  97. {
  98. int priority = 0;
  99. // will it be in cache after fanning?
  100. if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size)
  101. {
  102. priority = timestamp - cache_timestamps[vertex]; // position in cache
  103. }
  104. if (priority > best_priority)
  105. {
  106. best_candidate = vertex;
  107. best_priority = priority;
  108. }
  109. }
  110. }
  111. return best_candidate;
  112. }
  113. static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
  114. {
  115. assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));
  116. unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
  117. return table->cache[1 + cache_position] + table->live[live_triangles_clamped];
  118. }
  119. static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
  120. {
  121. // input order
  122. while (input_cursor < face_count)
  123. {
  124. if (!emitted_flags[input_cursor])
  125. return input_cursor;
  126. ++input_cursor;
  127. }
  128. return ~0u;
  129. }
  130. } // namespace meshopt
// Reorders triangles to improve vertex cache locality using a greedy
// score-driven algorithm (Forsyth-style): repeatedly emit the highest-scoring
// triangle, update the simulated LRU cache and vertex/triangle scores, and
// fall back to input order on dead ends. The score table selects the tuning
// (ACMR-oriented or strip-oriented). destination may alias indices.
void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
{
using namespace meshopt;
assert(index_count % 3 == 0);
meshopt_Allocator allocator;
// guard for empty meshes
if (index_count == 0 || vertex_count == 0)
return;
// support in-place optimization
if (destination == indices)
{
unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
indices = indices_copy;
}
// simulated cache size; must not exceed the score table's cache[] range
unsigned int cache_size = 16;
assert(cache_size <= kCacheSizeMax);
size_t face_count = index_count / 3;
// build adjacency information (vertex -> list of incident triangles)
TriangleAdjacency adjacency = {};
buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
// live triangle counts: number of not-yet-emitted triangles per vertex
unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
// emitted flags: one byte per triangle, set once the triangle is output
unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
memset(emitted_flags, 0, face_count);
// compute initial vertex scores (all vertices start outside the cache: position -1)
float* vertex_scores = allocator.allocate<float>(vertex_count);
for (size_t i = 0; i < vertex_count; ++i)
vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);
// compute triangle scores as the sum of the three corner vertex scores
float* triangle_scores = allocator.allocate<float>(face_count);
for (size_t i = 0; i < face_count; ++i)
{
unsigned int a = indices[i * 3 + 0];
unsigned int b = indices[i * 3 + 1];
unsigned int c = indices[i * 3 + 2];
triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
}
// double-buffered simulated cache; +4 slack lets a full cache absorb a new triangle
unsigned int cache_holder[2 * (kCacheSizeMax + 4)];
unsigned int* cache = cache_holder;
unsigned int* cache_new = cache_holder + kCacheSizeMax + 4;
size_t cache_count = 0;
unsigned int current_triangle = 0;
unsigned int input_cursor = 1; // triangle 0 is emitted first, so restarts scan from 1
unsigned int output_triangle = 0;
while (current_triangle != ~0u)
{
assert(output_triangle < face_count);
unsigned int a = indices[current_triangle * 3 + 0];
unsigned int b = indices[current_triangle * 3 + 1];
unsigned int c = indices[current_triangle * 3 + 2];
// output indices
destination[output_triangle * 3 + 0] = a;
destination[output_triangle * 3 + 1] = b;
destination[output_triangle * 3 + 2] = c;
output_triangle++;
// update emitted flags; zeroing the score keeps this triangle from being picked again
emitted_flags[current_triangle] = true;
triangle_scores[current_triangle] = 0;
// new triangle's vertices move to the front of the cache
size_t cache_write = 0;
cache_new[cache_write++] = a;
cache_new[cache_write++] = b;
cache_new[cache_write++] = c;
// old cache entries follow, skipping duplicates of a/b/c (branchless compaction)
for (size_t i = 0; i < cache_count; ++i)
{
unsigned int index = cache[i];
cache_new[cache_write] = index;
cache_write += (index != a && index != b && index != c);
}
// swap the double buffers; entries past cache_size are treated as evicted below
unsigned int* cache_temp = cache;
cache = cache_new, cache_new = cache_temp;
cache_count = cache_write > cache_size ? cache_size : cache_write;
// update live triangle counts
live_triangles[a]--;
live_triangles[b]--;
live_triangles[c]--;
// remove emitted triangle from adjacency data (swap-with-last removal)
// this makes sure that we spend less time traversing these lists on subsequent iterations
for (size_t k = 0; k < 3; ++k)
{
unsigned int index = indices[current_triangle * 3 + k];
unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
size_t neighbors_size = adjacency.counts[index];
for (size_t i = 0; i < neighbors_size; ++i)
{
unsigned int tri = neighbors[i];
if (tri == current_triangle)
{
neighbors[i] = neighbors[neighbors_size - 1];
adjacency.counts[index]--;
break;
}
}
}
unsigned int best_triangle = ~0u;
float best_score = 0;
// update cache positions, vertex scores and triangle scores, and find next best triangle;
// iterating over cache_write (not cache_count) also rescores just-evicted vertices
for (size_t i = 0; i < cache_write; ++i)
{
unsigned int index = cache[i];
// no need to update scores if we are never going to use this vertex
if (adjacency.counts[index] == 0)
continue;
// vertices past cache_size were evicted: score them at position -1
int cache_position = i >= cache_size ? -1 : int(i);
// update vertex score and propagate only the delta to incident triangles
float score = vertexScore(table, cache_position, live_triangles[index]);
float score_diff = score - vertex_scores[index];
vertex_scores[index] = score;
// update scores of vertex triangles, tracking the running best (branchless select)
const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];
for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
{
unsigned int tri = *it;
assert(!emitted_flags[tri]);
float tri_score = triangle_scores[tri] + score_diff;
assert(tri_score > 0);
best_triangle = best_score < tri_score ? tri : best_triangle;
best_score = best_score < tri_score ? tri_score : best_score;
triangle_scores[tri] = tri_score;
}
}
// step through input triangles in order if we hit a dead-end
current_triangle = best_triangle;
if (current_triangle == ~0u)
{
current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
}
}
assert(input_cursor == face_count);
assert(output_triangle == face_count);
}
  267. void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
  268. {
  269. meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
  270. }
  271. void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
  272. {
  273. meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
  274. }
// Reorders triangles for a FIFO cache of the given size using the Tipsify
// algorithm (Sander/Nehab/Barczak): fan all triangles around the current
// vertex, then pick the next vertex from the just-pushed neighbors, falling
// back to a dead-end stack and finally input order. destination may alias indices.
void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(cache_size >= 3);
meshopt_Allocator allocator;
// guard for empty meshes
if (index_count == 0 || vertex_count == 0)
return;
// support in-place optimization
if (destination == indices)
{
unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
indices = indices_copy;
}
size_t face_count = index_count / 3;
// build adjacency information (vertex -> list of incident triangles)
TriangleAdjacency adjacency = {};
buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
// live triangle counts: number of not-yet-emitted triangles per vertex
unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
// cache time stamps: a vertex is "in cache" iff timestamp - stamp <= cache_size
unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
// dead-end stack; sized index_count since each emitted triangle pushes 3 vertices
unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
unsigned int dead_end_top = 0;
// emitted flags: one byte per triangle, set once the triangle is output
unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
memset(emitted_flags, 0, face_count);
unsigned int current_vertex = 0;
// start past cache_size so the zero-initialized timestamps read as "not in cache"
unsigned int timestamp = cache_size + 1;
unsigned int input_cursor = 1; // vertex to restart from in case of dead-end
unsigned int output_triangle = 0;
while (current_vertex != ~0u)
{
const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
// emit all vertex neighbors
const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];
for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
{
unsigned int triangle = *it;
if (!emitted_flags[triangle])
{
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
// output indices
destination[output_triangle * 3 + 0] = a;
destination[output_triangle * 3 + 1] = b;
destination[output_triangle * 3 + 2] = c;
output_triangle++;
// update dead-end stack
dead_end[dead_end_top + 0] = a;
dead_end[dead_end_top + 1] = b;
dead_end[dead_end_top + 2] = c;
dead_end_top += 3;
// update live triangle counts
live_triangles[a]--;
live_triangles[b]--;
live_triangles[c]--;
// update cache info
// if vertex is not in cache, put it in cache (advancing the FIFO timestamp)
if (timestamp - cache_timestamps[a] > cache_size)
cache_timestamps[a] = timestamp++;
if (timestamp - cache_timestamps[b] > cache_size)
cache_timestamps[b] = timestamp++;
if (timestamp - cache_timestamps[c] > cache_size)
cache_timestamps[c] = timestamp++;
// update emitted flags
emitted_flags[triangle] = true;
}
}
// next candidates are the ones we pushed to dead-end stack just now
const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
// get next vertex: best neighbor first, then dead-end stack / input order
current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
if (current_vertex == ~0u)
{
current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
}
}
assert(output_triangle == face_count);
}