vcachetuner.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587
  1. #include "../src/meshoptimizer.h"
  2. #include "objparser.h"
  3. #include <algorithm>
  4. #include <functional>
  5. #include <vector>
  6. #include <cmath>
  7. #include <cstdint>
  8. #include <cstdio>
  9. #include <cstring>
  10. const int kCacheSizeMax = 16;
  11. const int kValenceMax = 8;
  12. namespace meshopt
  13. {
  14. extern thread_local float kVertexScoreTableCache[1 + kCacheSizeMax];
  15. extern thread_local float kVertexScoreTableLive[1 + kValenceMax];
  16. } // namespace meshopt
  17. struct { int cache, warp, triangle; } profiles[] =
  18. {
  19. {14, 64, 128}, // AMD GCN
  20. {32, 32, 32}, // NVidia Pascal
  21. // { 16, 32, 32 }, // NVidia Kepler, Maxwell
  22. // { 128, 0, 0 }, // Intel
  23. };
  24. const int Profile_Count = sizeof(profiles) / sizeof(profiles[0]);
  25. struct pcg32_random_t
  26. {
  27. uint64_t state;
  28. uint64_t inc;
  29. };
  30. #define PCG32_INITIALIZER { 0x853c49e6748fea9bULL, 0xda3e39cb94b95bdbULL }
  31. uint32_t pcg32_random_r(pcg32_random_t* rng)
  32. {
  33. uint64_t oldstate = rng->state;
  34. // Advance internal state
  35. rng->state = oldstate * 6364136223846793005ULL + (rng->inc | 1);
  36. // Calculate output function (XSH RR), uses old state for max ILP
  37. uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
  38. uint32_t rot = oldstate >> 59u;
  39. return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
  40. }
  41. pcg32_random_t rngstate = PCG32_INITIALIZER;
  42. float rand01()
  43. {
  44. return pcg32_random_r(&rngstate) / float(1ull << 32);
  45. }
  46. uint32_t rand32()
  47. {
  48. return pcg32_random_r(&rngstate);
  49. }
  50. struct State
  51. {
  52. float cache[kCacheSizeMax];
  53. float live[kValenceMax];
  54. };
  55. struct Mesh
  56. {
  57. size_t vertex_count;
  58. std::vector<unsigned int> indices;
  59. float atvr_base[Profile_Count];
  60. };
  61. Mesh gridmesh(unsigned int N)
  62. {
  63. Mesh result;
  64. result.vertex_count = (N + 1) * (N + 1);
  65. result.indices.reserve(N * N * 6);
  66. for (unsigned int y = 0; y < N; ++y)
  67. for (unsigned int x = 0; x < N; ++x)
  68. {
  69. result.indices.push_back((y + 0) * (N + 1) + (x + 0));
  70. result.indices.push_back((y + 0) * (N + 1) + (x + 1));
  71. result.indices.push_back((y + 1) * (N + 1) + (x + 0));
  72. result.indices.push_back((y + 1) * (N + 1) + (x + 0));
  73. result.indices.push_back((y + 0) * (N + 1) + (x + 1));
  74. result.indices.push_back((y + 1) * (N + 1) + (x + 1));
  75. }
  76. return result;
  77. }
  78. Mesh objmesh(const char* path)
  79. {
  80. ObjFile file;
  81. if (!objParseFile(file, path))
  82. {
  83. printf("Error loading %s: file not found\n", path);
  84. return Mesh();
  85. }
  86. if (!objValidate(file))
  87. {
  88. printf("Error loading %s: invalid file data\n", path);
  89. return Mesh();
  90. }
  91. size_t total_indices = file.f_size / 3;
  92. struct Vertex
  93. {
  94. float px, py, pz;
  95. float nx, ny, nz;
  96. float tx, ty;
  97. };
  98. std::vector<Vertex> vertices(total_indices);
  99. for (size_t i = 0; i < total_indices; ++i)
  100. {
  101. int vi = file.f[i * 3 + 0];
  102. int vti = file.f[i * 3 + 1];
  103. int vni = file.f[i * 3 + 2];
  104. Vertex v =
  105. {
  106. file.v[vi * 3 + 0],
  107. file.v[vi * 3 + 1],
  108. file.v[vi * 3 + 2],
  109. vni >= 0 ? file.vn[vni * 3 + 0] : 0,
  110. vni >= 0 ? file.vn[vni * 3 + 1] : 0,
  111. vni >= 0 ? file.vn[vni * 3 + 2] : 0,
  112. vti >= 0 ? file.vt[vti * 3 + 0] : 0,
  113. vti >= 0 ? file.vt[vti * 3 + 1] : 0,
  114. };
  115. vertices[i] = v;
  116. }
  117. Mesh result;
  118. std::vector<unsigned int> remap(total_indices);
  119. size_t total_vertices = meshopt_generateVertexRemap(&remap[0], NULL, total_indices, &vertices[0], total_indices, sizeof(Vertex));
  120. result.indices.resize(total_indices);
  121. meshopt_remapIndexBuffer(&result.indices[0], NULL, total_indices, &remap[0]);
  122. result.vertex_count = total_vertices;
  123. return result;
  124. }
  125. void compute_atvr(const State& state, const Mesh& mesh, float result[Profile_Count])
  126. {
  127. memcpy(meshopt::kVertexScoreTableCache + 1, state.cache, kCacheSizeMax * sizeof(float));
  128. memcpy(meshopt::kVertexScoreTableLive + 1, state.live, kValenceMax * sizeof(float));
  129. std::vector<unsigned int> indices(mesh.indices.size());
  130. meshopt_optimizeVertexCache(&indices[0], &mesh.indices[0], mesh.indices.size(), mesh.vertex_count);
  131. for (int profile = 0; profile < Profile_Count; ++profile)
  132. result[profile] = meshopt_analyzeVertexCache(&indices[0], indices.size(), mesh.vertex_count, profiles[profile].cache, profiles[profile].warp, profiles[profile].triangle).atvr;
  133. }
  134. float fitness_score(const State& state, const std::vector<Mesh>& meshes)
  135. {
  136. float result = 0;
  137. float count = 0;
  138. for (auto& mesh : meshes)
  139. {
  140. float atvr[Profile_Count];
  141. compute_atvr(state, mesh, atvr);
  142. for (int profile = 0; profile < Profile_Count; ++profile)
  143. {
  144. result += mesh.atvr_base[profile] / atvr[profile];
  145. count += 1;
  146. }
  147. }
  148. return result / count;
  149. }
  150. float rndcache()
  151. {
  152. return rand01();
  153. }
  154. float rndlive()
  155. {
  156. return rand01();
  157. }
  158. std::vector<State> gen0(size_t count)
  159. {
  160. std::vector<State> result;
  161. for (size_t i = 0; i < count; ++i)
  162. {
  163. State state = {};
  164. for (int j = 0; j < kCacheSizeMax; ++j)
  165. state.cache[j] = rndcache();
  166. for (int j = 0; j < kValenceMax; ++j)
  167. state.live[j] = rndlive();
  168. result.push_back(state);
  169. }
  170. return result;
  171. }
  172. size_t rndindex(const std::vector<float>& prob)
  173. {
  174. float r = rand01();
  175. for (size_t i = 0; i < prob.size(); ++i)
  176. {
  177. r -= prob[i];
  178. if (r <= 0)
  179. return i;
  180. }
  181. return prob.size() - 1;
  182. }
  183. State mutate(const State& state)
  184. {
  185. State result = state;
  186. if (rand01() < 0.7f)
  187. {
  188. size_t idxcache = std::min(int(rand01() * kCacheSizeMax + 0.5f), int(kCacheSizeMax - 1));
  189. result.cache[idxcache] = rndcache();
  190. }
  191. if (rand01() < 0.7f)
  192. {
  193. size_t idxlive = std::min(int(rand01() * kValenceMax + 0.5f), int(kValenceMax - 1));
  194. result.live[idxlive] = rndlive();
  195. }
  196. if (rand01() < 0.2f)
  197. {
  198. uint32_t mask = rand32();
  199. for (size_t i = 0; i < kCacheSizeMax; ++i)
  200. if (mask & (1 << i))
  201. result.cache[i] *= 0.9f + 0.2f * rand01();
  202. }
  203. if (rand01() < 0.2f)
  204. {
  205. uint32_t mask = rand32();
  206. for (size_t i = 0; i < kValenceMax; ++i)
  207. if (mask & (1 << i))
  208. result.live[i] *= 0.9f + 0.2f * rand01();
  209. }
  210. if (rand01() < 0.05f)
  211. {
  212. uint32_t mask = rand32();
  213. for (size_t i = 0; i < kCacheSizeMax; ++i)
  214. if (mask & (1 << i))
  215. result.cache[i] = rndcache();
  216. }
  217. if (rand01() < 0.05f)
  218. {
  219. uint32_t mask = rand32();
  220. for (size_t i = 0; i < kValenceMax; ++i)
  221. if (mask & (1 << i))
  222. result.live[i] = rndlive();
  223. }
  224. return result;
  225. }
  226. bool accept(float fitnew, float fitold, float temp)
  227. {
  228. if (fitnew >= fitold)
  229. return true;
  230. if (temp == 0)
  231. return false;
  232. float prob = exp2((fitnew - fitold) / temp);
  233. return rand01() < prob;
  234. }
  235. std::pair<State, float> genN_SA(std::vector<State>& seed, const std::vector<Mesh>& meshes, size_t steps)
  236. {
  237. std::vector<State> result;
  238. result.reserve(seed.size() * (1 + steps));
  239. // perform several parallel steps of mutation for each temperature
  240. for (size_t i = 0; i < seed.size(); ++i)
  241. {
  242. result.push_back(seed[i]);
  243. for (size_t s = 0; s < steps; ++s)
  244. result.push_back(mutate(seed[i]));
  245. }
  246. // compute fitness for all temperatures & mutations in parallel
  247. std::vector<float> resultfit(result.size());
  248. #pragma omp parallel for
  249. for (size_t i = 0; i < result.size(); ++i)
  250. {
  251. resultfit[i] = fitness_score(result[i], meshes);
  252. }
  253. // perform annealing for each temperature
  254. std::vector<float> seedfit(seed.size());
  255. for (size_t i = 0; i < seed.size(); ++i)
  256. {
  257. size_t offset = i * (1 + steps);
  258. seedfit[i] = resultfit[offset];
  259. float temp = (float(i) / float(seed.size() - 1)) / 0.1f;
  260. for (size_t s = 0; s < steps; ++s)
  261. {
  262. if (accept(resultfit[offset + s + 1], seedfit[i], temp))
  263. {
  264. seedfit[i] = resultfit[offset + s + 1];
  265. seed[i] = result[offset + s + 1];
  266. }
  267. }
  268. }
  269. // perform promotion from each temperature to the next one
  270. for (size_t i = seed.size() - 1; i > 0; --i)
  271. {
  272. if (seedfit[i] > seedfit[i - 1])
  273. {
  274. seedfit[i - 1] = seedfit[i];
  275. seed[i - 1] = seed[i];
  276. }
  277. }
  278. return std::make_pair(seed[0], seedfit[0]);
  279. }
  280. std::pair<State, float> genN_GA(std::vector<State>& seed, const std::vector<Mesh>& meshes, float crossover, float mutate)
  281. {
  282. std::vector<State> result;
  283. result.reserve(seed.size());
  284. std::vector<float> seedprob(seed.size());
  285. #pragma omp parallel for
  286. for (size_t i = 0; i < seed.size(); ++i)
  287. {
  288. seedprob[i] = fitness_score(seed[i], meshes);
  289. }
  290. State best = {};
  291. float bestfit = 0;
  292. float probsum = 0;
  293. for (size_t i = 0; i < seed.size(); ++i)
  294. {
  295. float score = seedprob[i];
  296. probsum += score;
  297. if (score > bestfit)
  298. {
  299. best = seed[i];
  300. bestfit = score;
  301. }
  302. }
  303. for (auto& prob : seedprob)
  304. {
  305. prob /= probsum;
  306. }
  307. std::vector<unsigned int> seedidx;
  308. seedidx.reserve(seed.size());
  309. for (size_t i = 0; i < seed.size(); ++i)
  310. seedidx.push_back(i);
  311. std::sort(seedidx.begin(), seedidx.end(), [&](size_t l, size_t r) { return seedprob[l] < seedprob[r]; });
  312. while (result.size() < seed.size() / 4)
  313. {
  314. size_t idx = seedidx.back();
  315. seedidx.pop_back();
  316. result.push_back(seed[idx]);
  317. }
  318. while (result.size() < seed.size())
  319. {
  320. State s0 = seed[rndindex(seedprob)];
  321. State s1 = seed[rndindex(seedprob)];
  322. State state = s0;
  323. // crossover
  324. if (rand01() < crossover)
  325. {
  326. size_t idxcache = std::min(int(rand01() * kCacheSizeMax + 0.5f), 15);
  327. memcpy(state.cache + idxcache, s1.cache + idxcache, (kCacheSizeMax - idxcache) * sizeof(float));
  328. }
  329. if (rand01() < crossover)
  330. {
  331. size_t idxlive = std::min(int(rand01() * kValenceMax + 0.5f), 7);
  332. memcpy(state.live + idxlive, s1.live + idxlive, (kValenceMax - idxlive) * sizeof(float));
  333. }
  334. // mutate
  335. if (rand01() < mutate)
  336. {
  337. size_t idxcache = std::min(int(rand01() * kCacheSizeMax + 0.5f), 15);
  338. state.cache[idxcache] = rndcache();
  339. }
  340. if (rand01() < mutate)
  341. {
  342. size_t idxlive = std::min(int(rand01() * kValenceMax + 0.5f), 7);
  343. state.live[idxlive] = rndlive();
  344. }
  345. result.push_back(state);
  346. }
  347. seed.swap(result);
  348. return std::make_pair(best, bestfit);
  349. }
  350. bool load_state(const char* path, std::vector<State>& result)
  351. {
  352. FILE* file = fopen(path, "rb");
  353. if (!file)
  354. return false;
  355. State state;
  356. result.clear();
  357. while (fread(&state, sizeof(State), 1, file) == 1)
  358. result.push_back(state);
  359. fclose(file);
  360. return true;
  361. }
  362. bool save_state(const char* path, const std::vector<State>& result)
  363. {
  364. FILE* file = fopen(path, "wb");
  365. if (!file)
  366. return false;
  367. for (auto& state : result)
  368. {
  369. if (fwrite(&state, sizeof(State), 1, file) != 1)
  370. {
  371. fclose(file);
  372. return false;
  373. }
  374. }
  375. return fclose(file) == 0;
  376. }
  377. void dump_state(const State& state)
  378. {
  379. printf("cache:");
  380. for (int i = 0; i < kCacheSizeMax; ++i)
  381. {
  382. printf(" %.3f", state.cache[i]);
  383. }
  384. printf("\n");
  385. printf("live:");
  386. for (int i = 0; i < kValenceMax; ++i)
  387. {
  388. printf(" %.3f", state.live[i]);
  389. }
  390. printf("\n");
  391. }
  392. int main(int argc, char** argv)
  393. {
  394. bool annealing = false;
  395. State baseline;
  396. memcpy(baseline.cache, meshopt::kVertexScoreTableCache + 1, kCacheSizeMax * sizeof(float));
  397. memcpy(baseline.live, meshopt::kVertexScoreTableLive + 1, kValenceMax * sizeof(float));
  398. std::vector<Mesh> meshes;
  399. meshes.push_back(gridmesh(50));
  400. for (int i = 1; i < argc; ++i)
  401. meshes.push_back(objmesh(argv[i]));
  402. size_t total_triangles = 0;
  403. for (auto& mesh : meshes)
  404. {
  405. compute_atvr(baseline, mesh, mesh.atvr_base);
  406. total_triangles += mesh.indices.size() / 3;
  407. }
  408. std::vector<State> pop;
  409. size_t gen = 0;
  410. if (load_state("mutator.state", pop))
  411. {
  412. printf("Loaded %d state vectors\n", int(pop.size()));
  413. }
  414. else
  415. {
  416. pop = gen0(annealing ? 32 : 1000);
  417. }
  418. printf("%d meshes, %.1fM triangles\n", int(meshes.size()), double(total_triangles) / 1e6);
  419. float atvr_0[Profile_Count];
  420. float atvr_N[Profile_Count];
  421. compute_atvr(baseline, meshes[0], atvr_0);
  422. compute_atvr(baseline, meshes.back(), atvr_N);
  423. printf("baseline: grid %f %f %s %f %f\n", atvr_0[0], atvr_0[1], argv[argc - 1], atvr_N[0], atvr_N[1]);
  424. for (;;)
  425. {
  426. auto best = annealing ? genN_SA(pop, meshes, 31) : genN_GA(pop, meshes, 0.7f, 0.3f);
  427. gen++;
  428. compute_atvr(best.first, meshes[0], atvr_0);
  429. compute_atvr(best.first, meshes.back(), atvr_N);
  430. printf("%d: fitness %f; grid %f %f %s %f %f\n", int(gen), best.second, atvr_0[0], atvr_0[1], argv[argc - 1], atvr_N[0], atvr_N[1]);
  431. if (gen % 100 == 0)
  432. {
  433. char buf[128];
  434. sprintf(buf, "gcloud logging write vcache-log \"fitness %f; grid %f %f %s %f %f\"", best.second, atvr_0[0], atvr_0[1], argv[argc - 1], atvr_N[0], atvr_N[1]);
  435. system(buf);
  436. }
  437. dump_state(best.first);
  438. if (save_state("mutator.state-temp", pop) && rename("mutator.state-temp", "mutator.state") == 0)
  439. {
  440. }
  441. else
  442. {
  443. printf("ERROR: Can't save state\n");
  444. }
  445. }
  446. }