// vcachetester.cpp (13 KB)
  1. #ifdef _WIN32
  2. #include <assert.h>
  3. #include <d3d11.h>
  4. #include <d3dcompiler.h>
  5. #include <stdio.h>
  6. #include <cassert>
  7. #include <cmath>
  8. #include <algorithm>
  9. #include <vector>
  10. #include "../src/meshoptimizer.h"
  11. #include "fast_obj.h"
  12. #pragma comment(lib, "d3d11.lib")
  13. #pragma comment(lib, "d3dcompiler.lib")
  14. #pragma comment(lib, "dxgi.lib")
  // Appends triangle-list indices for the grid cells [x0, x1) x [y0, y1) to `indices`.
  // Vertex rows are (width + 1) indices apart; each cell contributes two triangles.
  // When `prefetch` is set, one degenerate triangle (x, x, x + 1) per column is emitted
  // ahead of the cells; those indices are the raw column numbers with no row offset.
  void stripGen(std::vector<unsigned int>& indices, int x0, int x1, int y0, int y1, int width, bool prefetch)
  {
      if (prefetch)
      {
          for (int col = x0; col < x1; ++col)
          {
              const unsigned int here = col;
              const unsigned int next = col + 1;

              indices.push_back(here);
              indices.push_back(here);
              indices.push_back(next);
          }
      }

      for (int row = y0; row < y1; ++row)
      {
          const int stride = width + 1;
          const int upper = stride * (row + 0);
          const int lower = stride * (row + 1);

          for (int col = x0; col < x1; ++col)
          {
              const unsigned int topLeft = upper + col;
              const unsigned int topRight = upper + (col + 1);
              const unsigned int bottomLeft = lower + col;
              const unsigned int bottomRight = lower + (col + 1);

              // two triangles per cell, sharing the topRight-bottomLeft diagonal
              const unsigned int cell[6] = {topLeft, bottomLeft, topRight, topRight, bottomLeft, bottomRight};
              indices.insert(indices.end(), cell, cell + 6);
          }
      }
  }
  39. void gridGen(std::vector<unsigned int>& indices, int x0, int x1, int y0, int y1, int width, int cacheSize, bool prefetch)
  40. {
  41. if (x1 - x0 + 1 < cacheSize)
  42. {
  43. bool prefetchStrip = 2 * (x1 - x0) + 1 > cacheSize && prefetch;
  44. stripGen(indices, x0, x1, y0, y1, width, prefetchStrip);
  45. }
  46. else
  47. {
  48. int xm = x0 + cacheSize - 2;
  49. gridGen(indices, x0, xm, y0, y1, width, cacheSize, prefetch);
  50. gridGen(indices, xm, x1, y0, y1, width, cacheSize, prefetch);
  51. }
  52. }
  53. unsigned int queryVSInvocations(ID3D11Device* device, ID3D11DeviceContext* context, const unsigned int* indices, size_t index_count)
  54. {
  55. if (index_count == 0)
  56. return 0;
  57. ID3D11Buffer* ib = 0;
  58. {
  59. D3D11_BUFFER_DESC bd = {};
  60. bd.Usage = D3D11_USAGE_DYNAMIC;
  61. bd.ByteWidth = index_count * 4;
  62. bd.BindFlags = D3D11_BIND_INDEX_BUFFER;
  63. bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
  64. device->CreateBuffer(&bd, 0, &ib);
  65. D3D11_MAPPED_SUBRESOURCE ms;
  66. context->Map(ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &ms);
  67. memcpy(ms.pData, indices, index_count * 4);
  68. context->Unmap(ib, 0);
  69. }
  70. context->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
  71. context->IASetIndexBuffer(ib, DXGI_FORMAT_R32_UINT, 0);
  72. D3D11_QUERY_DESC qdesc = {D3D11_QUERY_PIPELINE_STATISTICS};
  73. ID3D11Query* query = 0;
  74. device->CreateQuery(&qdesc, &query);
  75. context->Begin(query);
  76. context->DrawIndexed(index_count, 0, 0);
  77. context->End(query);
  78. D3D11_QUERY_DATA_PIPELINE_STATISTICS stats = {};
  79. while (S_FALSE == context->GetData(query, &stats, sizeof(stats), 0))
  80. ;
  81. query->Release();
  82. ib->Release();
  83. assert(stats.IAVertices == index_count);
  84. return stats.VSInvocations;
  85. }
  86. void setupShaders(ID3D11Device* device, ID3D11DeviceContext* context)
  87. {
  88. // load and compile the two shaders
  89. const char* shaders =
  90. "#define ATTRIBUTES 5\n"
  91. "struct Foo { float4 v[ATTRIBUTES]; };"
  92. "float4 VS(uint index: SV_VertexId, out Foo foo: FOO): SV_Position { uint i = index % 3; [unroll] for (int j = 0; j < ATTRIBUTES; j++) foo.v[j] = j; return float4(i != 0, i != 2, 0, 1); }"
  93. "float4 PS(Foo foo: FOO): SV_Target { float4 result = 0; [unroll] for (int j = 0; j < ATTRIBUTES; j++) result += foo.v[j]; return result; }";
  94. ID3DBlob* vsblob = 0;
  95. ID3DBlob* psblob = 0;
  96. D3DCompile(shaders, strlen(shaders), 0, 0, 0, "VS", "vs_5_0", 0, 0, &vsblob, 0);
  97. D3DCompile(shaders, strlen(shaders), 0, 0, 0, "PS", "ps_5_0", 0, 0, &psblob, 0);
  98. ID3D11VertexShader* vs = 0;
  99. ID3D11PixelShader* ps = 0;
  100. device->CreateVertexShader(vsblob->GetBufferPointer(), vsblob->GetBufferSize(), 0, &vs);
  101. device->CreatePixelShader(psblob->GetBufferPointer(), psblob->GetBufferSize(), 0, &ps);
  102. context->VSSetShader(vs, 0, 0);
  103. context->PSSetShader(ps, 0, 0);
  104. }
  105. template <typename Cache>
  106. void inspectCache(Cache cache)
  107. {
  108. unsigned int max_cache_size = 200;
  109. unsigned int grid_size = 100;
  110. for (unsigned int cache_size = 3; cache_size <= max_cache_size; cache_size += 1)
  111. {
  112. std::vector<unsigned int> grid1;
  113. gridGen(grid1, 0, grid_size, 0, grid_size, grid_size, cache_size, true);
  114. std::vector<unsigned int> grid2;
  115. gridGen(grid2, 0, grid_size, 0, grid_size, grid_size, cache_size, false);
  116. std::vector<unsigned int> grid3;
  117. gridGen(grid3, 0, grid_size, 0, grid_size, grid_size, grid_size * 4, false); // this generates a simple indexed grid without striping/degenerate triangles
  118. meshopt_optimizeVertexCacheFifo(&grid3[0], &grid3[0], grid3.size(), (grid_size + 1) * (grid_size + 1), cache_size);
  119. std::vector<unsigned int> grid4;
  120. gridGen(grid4, 0, grid_size, 0, grid_size, grid_size, grid_size * 4, false); // this generates a simple indexed grid without striping/degenerate triangles
  121. meshopt_optimizeVertexCache(&grid4[0], &grid4[0], grid4.size(), (grid_size + 1) * (grid_size + 1));
  122. unsigned int invocations1 = cache(&grid1[0], grid1.size());
  123. unsigned int invocations2 = cache(&grid2[0], grid2.size());
  124. unsigned int invocations3 = cache(&grid3[0], grid3.size());
  125. unsigned int invocations4 = cache(&grid4[0], grid4.size());
  126. unsigned int ideal_invocations = (grid_size + 1) * (grid_size + 1);
  127. printf("%d, %f, %f, %f, %f\n", cache_size,
  128. double(invocations1) / double(ideal_invocations),
  129. double(invocations2) / double(ideal_invocations),
  130. double(invocations3) / double(ideal_invocations),
  131. double(invocations4) / double(ideal_invocations));
  132. }
  133. }
  134. void testCache(IDXGIAdapter* adapter)
  135. {
  136. ID3D11Device* device = 0;
  137. ID3D11DeviceContext* context = 0;
  138. D3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, 0, 0, 0, 0, D3D11_SDK_VERSION, &device, 0, &context);
  139. setupShaders(device, context);
  140. inspectCache([&](const unsigned int* indices, size_t index_count) { return queryVSInvocations(device, context, indices, index_count); });
  141. }
  142. void testCacheSequence(IDXGIAdapter* adapter, int argc, char** argv)
  143. {
  144. ID3D11Device* device = 0;
  145. ID3D11DeviceContext* context = 0;
  146. D3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, 0, 0, 0, 0, D3D11_SDK_VERSION, &device, 0, &context);
  147. setupShaders(device, context);
  148. std::vector<unsigned int> ib;
  149. for (int i = 2; i < argc; ++i)
  150. {
  151. char* end;
  152. int i0 = strtol(argv[i], &end, 10);
  153. if (end[0] == '-')
  154. {
  155. int i1 = strtol(end + 1, &end, 10);
  156. if (end[0] != 0)
  157. {
  158. printf("Unrecognized index range: %s\n", argv[i]);
  159. return;
  160. }
  161. if (i0 < i1)
  162. {
  163. for (int ii = i0; ii <= i1; ++ii)
  164. ib.push_back(ii);
  165. }
  166. else
  167. {
  168. for (int ii = i0; ii >= i1; --ii)
  169. ib.push_back(ii);
  170. }
  171. }
  172. else if (end[0] == '*')
  173. {
  174. int i1 = strtol(end + 1, &end, 10);
  175. if (end[0] != 0 || i1 == 0)
  176. {
  177. printf("Unrecognized index range: %s\n", argv[i]);
  178. return;
  179. }
  180. for (int ii = 0; ii < i1; ++ii)
  181. ib.push_back(i0);
  182. }
  183. else if (end[0] == 'x')
  184. {
  185. int i1 = strtol(end + 1, &end, 10);
  186. if (end[0] != 0)
  187. {
  188. printf("Unrecognized index range: %s\n", argv[i]);
  189. return;
  190. }
  191. stripGen(ib, 0, i0, 0, i1, i0, true);
  192. }
  193. else if (end[0] == 0)
  194. {
  195. ib.push_back(i0);
  196. }
  197. else
  198. {
  199. printf("Unrecognized index range: %s\n", argv[i]);
  200. return;
  201. }
  202. }
  203. if (ib.size() % 3)
  204. ib.resize(ib.size() - ib.size() % 3);
  205. std::vector<bool> xformed(ib.size());
  206. for (size_t i = 0; i < ib.size(); i += 3)
  207. {
  208. unsigned int inv0 = i == 0 ? 0 : queryVSInvocations(device, context, ib.data(), i);
  209. unsigned int inv1 = queryVSInvocations(device, context, ib.data(), i + 3);
  210. assert(inv0 <= inv1);
  211. assert(inv0 + 3 >= inv1);
  212. switch (inv1 - inv0)
  213. {
  214. case 0:
  215. xformed[i + 0] = xformed[i + 1] = xformed[i + 2] = false;
  216. break;
  217. case 3:
  218. xformed[i + 0] = xformed[i + 1] = xformed[i + 2] = true;
  219. break;
  220. case 1:
  221. case 2:
  222. {
  223. unsigned int a = ib[i + 0];
  224. unsigned int b = ib[i + 1];
  225. unsigned int c = ib[i + 2];
  226. ib[i + 0] = ib[i + 1] = ib[i + 2] = a;
  227. unsigned int inva = queryVSInvocations(device, context, ib.data(), i + 3);
  228. ib[i + 1] = ib[i + 2] = b;
  229. unsigned int invb = queryVSInvocations(device, context, ib.data(), i + 3);
  230. ib[i + 2] = c;
  231. unsigned int invc = queryVSInvocations(device, context, ib.data(), i + 3);
  232. assert(inv0 <= inva && inva <= inv1);
  233. assert(inv0 <= invb && invb <= inv1);
  234. assert(inv0 <= invc && invc <= inv1);
  235. if (inv1 - inv0 == 1 && a == c && inva == inv1 && invb == inv0 && invc == inv1)
  236. {
  237. xformed[i + 0] = false;
  238. xformed[i + 1] = false;
  239. xformed[i + 2] = true;
  240. }
  241. else
  242. {
  243. assert(inva <= invb);
  244. assert(invb <= invc);
  245. xformed[i + 0] = inva == inv0 + 1;
  246. xformed[i + 1] = invb == inva + 1;
  247. xformed[i + 2] = invc == invb + 1;
  248. }
  249. break;
  250. }
  251. }
  252. }
  253. unsigned int xformed_total = 0;
  254. for (size_t i = 0; i < ib.size(); ++i)
  255. xformed_total += xformed[i];
  256. printf("// Sequence: %d indices", int(ib.size()));
  257. for (size_t i = 0; i < ib.size(); ++i)
  258. {
  259. if (i % 12 == 0)
  260. {
  261. printf("\n// %3d*3:", int(i / 3));
  262. }
  263. if (xformed[i])
  264. printf(" %3d*", ib[i]);
  265. else
  266. printf(" %3d ", ib[i]);
  267. }
  268. printf("\n");
  269. std::vector<unsigned int> cached;
  270. for (size_t i = 0; i < ib.size(); ++i)
  271. {
  272. unsigned int index = ib[i];
  273. unsigned int inv0 = queryVSInvocations(device, context, ib.data(), ib.size());
  274. ib.push_back(index);
  275. ib.push_back(index);
  276. ib.push_back(index);
  277. unsigned int inv1 = queryVSInvocations(device, context, ib.data(), ib.size());
  278. ib.resize(ib.size() - 3);
  279. if (inv1 == inv0)
  280. cached.push_back(index);
  281. }
  282. std::sort(cached.begin(), cached.end());
  283. cached.erase(std::unique(cached.begin(), cached.end()), cached.end());
  284. printf("// Cached :");
  285. for (size_t i = 0; i < cached.size(); ++i)
  286. printf(" %d", cached[i]);
  287. printf(" (%d)\n", int(cached.size()));
  288. unsigned int invocations = queryVSInvocations(device, context, ib.data(), ib.size());
  289. printf("// Invocations: %d\n", invocations);
  290. assert(xformed_total == invocations);
  291. }
  292. void testCacheMeshes(IDXGIAdapter* adapter, int argc, char** argv)
  293. {
  294. ID3D11Device* device = 0;
  295. ID3D11DeviceContext* context = 0;
  296. D3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, 0, 0, 0, 0, D3D11_SDK_VERSION, &device, 0, &context);
  297. setupShaders(device, context);
  298. bool stat = false;
  299. double atvr_sum = 0;
  300. double atvr_count = 0;
  301. unsigned int total_invocations = 0;
  302. unsigned int total_vertices = 0;
  303. for (int i = 1; i < argc; ++i)
  304. {
  305. const char* path = argv[i];
  306. if (strcmp(path, "--stat") == 0)
  307. {
  308. stat = true;
  309. continue;
  310. }
  311. fastObjMesh* obj = fast_obj_read(path);
  312. if (!obj)
  313. {
  314. printf("Error loading %s: file not found\n", path);
  315. continue;
  316. }
  317. std::vector<unsigned int> ib1;
  318. size_t index_offset = 0;
  319. for (unsigned int i = 0; i < obj->face_count; ++i)
  320. {
  321. for (unsigned int j = 0; j < obj->face_vertices[i]; ++j)
  322. {
  323. fastObjIndex gi = obj->indices[index_offset + j];
  324. // triangulate polygon on the fly; offset-3 is always the first polygon vertex
  325. if (j >= 3)
  326. {
  327. unsigned int i0 = ib1[ib1.size() - 3];
  328. unsigned int i1 = ib1[ib1.size() - 1];
  329. ib1.push_back(i0);
  330. ib1.push_back(i1);
  331. }
  332. ib1.push_back(gi.p);
  333. }
  334. index_offset += obj->face_vertices[i];
  335. }
  336. unsigned int vertex_count = obj->position_count;
  337. unsigned int index_count = ib1.size();
  338. unsigned int invocations1 = queryVSInvocations(device, context, ib1.data(), index_count);
  339. if (stat)
  340. {
  341. std::vector<unsigned int> ib2(ib1.size());
  342. meshopt_optimizeVertexCache(&ib2[0], &ib1[0], ib1.size(), vertex_count);
  343. unsigned int invocations = queryVSInvocations(device, context, ib2.data(), index_count);
  344. atvr_sum += double(invocations) / double(vertex_count);
  345. atvr_count += 1;
  346. total_invocations += invocations;
  347. total_vertices += vertex_count;
  348. }
  349. else
  350. {
  351. printf("%s: baseline %f\n", path, double(invocations1) / double(vertex_count));
  352. std::vector<unsigned int> ib3(ib1.size());
  353. meshopt_optimizeVertexCache(&ib3[0], &ib1[0], ib1.size(), vertex_count);
  354. unsigned int invocations3 = queryVSInvocations(device, context, ib3.data(), index_count);
  355. printf("%s: forsyth %f\n", path, double(invocations3) / double(vertex_count));
  356. for (unsigned int cache_size = 12; cache_size <= 24; ++cache_size)
  357. {
  358. std::vector<unsigned int> ib2(ib1.size());
  359. meshopt_optimizeVertexCacheFifo(&ib2[0], &ib1[0], ib1.size(), vertex_count, cache_size);
  360. unsigned int invocations2 = queryVSInvocations(device, context, ib2.data(), index_count);
  361. printf("%s: tipsify(%d) %f\n", path, cache_size, double(invocations2) / double(vertex_count));
  362. }
  363. }
  364. }
  365. if (stat)
  366. {
  367. printf("ATVR: average %f cumulative %f; %d vertices\n", atvr_sum / atvr_count, double(total_invocations) / double(total_vertices), total_vertices);
  368. }
  369. }
  370. int main(int argc, char** argv)
  371. {
  372. IDXGIFactory1* factory = 0;
  373. CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&factory);
  374. IDXGIAdapter* adapter = NULL;
  375. for (unsigned int index = 0; SUCCEEDED(factory->EnumAdapters(index, &adapter)); ++index)
  376. {
  377. DXGI_ADAPTER_DESC ad = {};
  378. adapter->GetDesc(&ad);
  379. if (ad.VendorId == 0x1414 && ad.DeviceId == 0x8c)
  380. continue; // Skip Microsoft Basic Render Driver
  381. printf("// GPU %d: %S (Vendor %04x Device %04x)\n", index, ad.Description, ad.VendorId, ad.DeviceId);
  382. if (argc == 1)
  383. {
  384. testCache(adapter);
  385. }
  386. else if (argc > 1 && strcmp(argv[1], "--") == 0)
  387. {
  388. testCacheSequence(adapter, argc, argv);
  389. }
  390. else
  391. {
  392. testCacheMeshes(adapter, argc, argv);
  393. }
  394. }
  395. }
  396. #endif