#ifdef _WIN32 #include #include #include #include #include #include #include #include #include "../src/meshoptimizer.h" #include "fast_obj.h" #pragma comment(lib, "d3d11.lib") #pragma comment(lib, "d3dcompiler.lib") #pragma comment(lib, "dxgi.lib") void stripGen(std::vector& indices, int x0, int x1, int y0, int y1, int width, bool prefetch) { if (prefetch) { for (int x = x0; x < x1; x++) { indices.push_back(x + 0); indices.push_back(x + 0); indices.push_back(x + 1); } } for (int y = y0; y < y1; y++) { for (int x = x0; x < x1; x++) { indices.push_back((width + 1) * (y + 0) + (x + 0)); indices.push_back((width + 1) * (y + 1) + (x + 0)); indices.push_back((width + 1) * (y + 0) + (x + 1)); indices.push_back((width + 1) * (y + 0) + (x + 1)); indices.push_back((width + 1) * (y + 1) + (x + 0)); indices.push_back((width + 1) * (y + 1) + (x + 1)); } } } void gridGen(std::vector& indices, int x0, int x1, int y0, int y1, int width, int cacheSize, bool prefetch) { if (x1 - x0 + 1 < cacheSize) { bool prefetchStrip = 2 * (x1 - x0) + 1 > cacheSize && prefetch; stripGen(indices, x0, x1, y0, y1, width, prefetchStrip); } else { int xm = x0 + cacheSize - 2; gridGen(indices, x0, xm, y0, y1, width, cacheSize, prefetch); gridGen(indices, xm, x1, y0, y1, width, cacheSize, prefetch); } } unsigned int queryVSInvocations(ID3D11Device* device, ID3D11DeviceContext* context, const unsigned int* indices, size_t index_count) { if (index_count == 0) return 0; ID3D11Buffer* ib = 0; { D3D11_BUFFER_DESC bd = {}; bd.Usage = D3D11_USAGE_DYNAMIC; bd.ByteWidth = index_count * 4; bd.BindFlags = D3D11_BIND_INDEX_BUFFER; bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; device->CreateBuffer(&bd, 0, &ib); D3D11_MAPPED_SUBRESOURCE ms; context->Map(ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &ms); memcpy(ms.pData, indices, index_count * 4); context->Unmap(ib, 0); } context->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST); context->IASetIndexBuffer(ib, DXGI_FORMAT_R32_UINT, 0); D3D11_QUERY_DESC qdesc = {D3D11_QUERY_PIPELINE_STATISTICS}; ID3D11Query* query = 0; device->CreateQuery(&qdesc, &query); context->Begin(query); context->DrawIndexed(index_count, 0, 0); context->End(query); D3D11_QUERY_DATA_PIPELINE_STATISTICS stats = {}; while (S_FALSE == context->GetData(query, &stats, sizeof(stats), 0)) ; query->Release(); ib->Release(); assert(stats.IAVertices == index_count); return stats.VSInvocations; } void setupShaders(ID3D11Device* device, ID3D11DeviceContext* context) { // load and compile the two shaders const char* shaders = "#define ATTRIBUTES 5\n" "struct Foo { float4 v[ATTRIBUTES]; };" "float4 VS(uint index: SV_VertexId, out Foo foo: FOO): SV_Position { uint i = index % 3; [unroll] for (int j = 0; j < ATTRIBUTES; j++) foo.v[j] = j; return float4(i != 0, i != 2, 0, 1); }" "float4 PS(Foo foo: FOO): SV_Target { float4 result = 0; [unroll] for (int j = 0; j < ATTRIBUTES; j++) result += foo.v[j]; return result; }"; ID3DBlob* vsblob = 0; ID3DBlob* psblob = 0; D3DCompile(shaders, strlen(shaders), 0, 0, 0, "VS", "vs_5_0", 0, 0, &vsblob, 0); D3DCompile(shaders, strlen(shaders), 0, 0, 0, "PS", "ps_5_0", 0, 0, &psblob, 0); ID3D11VertexShader* vs = 0; ID3D11PixelShader* ps = 0; device->CreateVertexShader(vsblob->GetBufferPointer(), vsblob->GetBufferSize(), 0, &vs); device->CreatePixelShader(psblob->GetBufferPointer(), psblob->GetBufferSize(), 0, &ps); context->VSSetShader(vs, 0, 0); context->PSSetShader(ps, 0, 0); } template void inspectCache(Cache cache) { unsigned int max_cache_size = 200; unsigned int grid_size = 100; for (unsigned int cache_size = 3; cache_size <= max_cache_size; cache_size += 1) { std::vector grid1; gridGen(grid1, 0, grid_size, 0, grid_size, grid_size, cache_size, true); std::vector grid2; gridGen(grid2, 0, grid_size, 0, grid_size, grid_size, cache_size, false); std::vector grid3; gridGen(grid3, 0, grid_size, 0, grid_size, grid_size, grid_size * 4, false); // this generates a simple indexed grid without striping/degenerate triangles meshopt_optimizeVertexCacheFifo(&grid3[0], &grid3[0], grid3.size(), (grid_size + 1) * (grid_size + 1), cache_size); std::vector grid4; gridGen(grid4, 0, grid_size, 0, grid_size, grid_size, grid_size * 4, false); // this generates a simple indexed grid without striping/degenerate triangles meshopt_optimizeVertexCache(&grid4[0], &grid4[0], grid4.size(), (grid_size + 1) * (grid_size + 1)); unsigned int invocations1 = cache(&grid1[0], grid1.size()); unsigned int invocations2 = cache(&grid2[0], grid2.size()); unsigned int invocations3 = cache(&grid3[0], grid3.size()); unsigned int invocations4 = cache(&grid4[0], grid4.size()); unsigned int ideal_invocations = (grid_size + 1) * (grid_size + 1); printf("%d, %f, %f, %f, %f\n", cache_size, double(invocations1) / double(ideal_invocations), double(invocations2) / double(ideal_invocations), double(invocations3) / double(ideal_invocations), double(invocations4) / double(ideal_invocations)); } } void testCache(IDXGIAdapter* adapter) { ID3D11Device* device = 0; ID3D11DeviceContext* context = 0; D3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, 0, 0, 0, 0, D3D11_SDK_VERSION, &device, 0, &context); setupShaders(device, context); inspectCache([&](const unsigned int* indices, size_t index_count) { return queryVSInvocations(device, context, indices, index_count); }); } void testCacheSequence(IDXGIAdapter* adapter, int argc, char** argv) { ID3D11Device* device = 0; ID3D11DeviceContext* context = 0; D3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, 0, 0, 0, 0, D3D11_SDK_VERSION, &device, 0, &context); setupShaders(device, context); std::vector ib; for (int i = 2; i < argc; ++i) { char* end; int i0 = strtol(argv[i], &end, 10); if (end[0] == '-') { int i1 = strtol(end + 1, &end, 10); if (end[0] != 0) { printf("Unrecognized index range: %s\n", argv[i]); return; } if (i0 < i1) { for (int ii = i0; ii <= i1; ++ii) ib.push_back(ii); } else { for (int ii = i0; ii >= i1; --ii) ib.push_back(ii); } } else if (end[0] == '*') { int i1 = strtol(end + 1, &end, 10); if (end[0] != 0 || i1 == 0) { printf("Unrecognized index range: %s\n", argv[i]); return; } for (int ii = 0; ii < i1; ++ii) ib.push_back(i0); } else if (end[0] == 'x') { int i1 = strtol(end + 1, &end, 10); if (end[0] != 0) { printf("Unrecognized index range: %s\n", argv[i]); return; } stripGen(ib, 0, i0, 0, i1, i0, true); } else if (end[0] == 0) { ib.push_back(i0); } else { printf("Unrecognized index range: %s\n", argv[i]); return; } } if (ib.size() % 3) ib.resize(ib.size() - ib.size() % 3); std::vector xformed(ib.size()); for (size_t i = 0; i < ib.size(); i += 3) { unsigned int inv0 = i == 0 ? 0 : queryVSInvocations(device, context, ib.data(), i); unsigned int inv1 = queryVSInvocations(device, context, ib.data(), i + 3); assert(inv0 <= inv1); assert(inv0 + 3 >= inv1); switch (inv1 - inv0) { case 0: xformed[i + 0] = xformed[i + 1] = xformed[i + 2] = false; break; case 3: xformed[i + 0] = xformed[i + 1] = xformed[i + 2] = true; break; case 1: case 2: { unsigned int a = ib[i + 0]; unsigned int b = ib[i + 1]; unsigned int c = ib[i + 2]; ib[i + 0] = ib[i + 1] = ib[i + 2] = a; unsigned int inva = queryVSInvocations(device, context, ib.data(), i + 3); ib[i + 1] = ib[i + 2] = b; unsigned int invb = queryVSInvocations(device, context, ib.data(), i + 3); ib[i + 2] = c; unsigned int invc = queryVSInvocations(device, context, ib.data(), i + 3); assert(inv0 <= inva && inva <= inv1); assert(inv0 <= invb && invb <= inv1); assert(inv0 <= invc && invc <= inv1); if (inv1 - inv0 == 1 && a == c && inva == inv1 && invb == inv0 && invc == inv1) { xformed[i + 0] = false; xformed[i + 1] = false; xformed[i + 2] = true; } else { assert(inva <= invb); assert(invb <= invc); xformed[i + 0] = inva == inv0 + 1; xformed[i + 1] = invb == inva + 1; xformed[i + 2] = invc == invb + 1; } break; } } } unsigned int xformed_total = 0; for (size_t i = 0; i < ib.size(); ++i) xformed_total += xformed[i]; printf("// Sequence: %d indices", int(ib.size())); for (size_t i = 0; i < ib.size(); ++i) { if (i % 12 == 0) { printf("\n// %3d*3:", int(i / 3)); } if (xformed[i]) printf(" %3d*", ib[i]); else printf(" %3d ", ib[i]); } printf("\n"); std::vector cached; for (size_t i = 0; i < ib.size(); ++i) { unsigned int index = ib[i]; unsigned int inv0 = queryVSInvocations(device, context, ib.data(), ib.size()); ib.push_back(index); ib.push_back(index); ib.push_back(index); unsigned int inv1 = queryVSInvocations(device, context, ib.data(), ib.size()); ib.resize(ib.size() - 3); if (inv1 == inv0) cached.push_back(index); } std::sort(cached.begin(), cached.end()); cached.erase(std::unique(cached.begin(), cached.end()), cached.end()); printf("// Cached :"); for (size_t i = 0; i < cached.size(); ++i) printf(" %d", cached[i]); printf(" (%d)\n", int(cached.size())); unsigned int invocations = queryVSInvocations(device, context, ib.data(), ib.size()); printf("// Invocations: %d\n", invocations); assert(xformed_total == invocations); } void testCacheMeshes(IDXGIAdapter* adapter, int argc, char** argv) { ID3D11Device* device = 0; ID3D11DeviceContext* context = 0; D3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, 0, 0, 0, 0, D3D11_SDK_VERSION, &device, 0, &context); setupShaders(device, context); bool stat = false; double atvr_sum = 0; double atvr_count = 0; unsigned int total_invocations = 0; unsigned int total_vertices = 0; for (int i = 1; i < argc; ++i) { const char* path = argv[i]; if (strcmp(path, "--stat") == 0) { stat = true; continue; } fastObjMesh* obj = fast_obj_read(path); if (!obj) { printf("Error loading %s: file not found\n", path); continue; } std::vector ib1; size_t index_offset = 0; for (unsigned int i = 0; i < obj->face_count; ++i) { for (unsigned int j = 0; j < obj->face_vertices[i]; ++j) { fastObjIndex gi = obj->indices[index_offset + j]; // triangulate polygon on the fly; offset-3 is always the first polygon vertex if (j >= 3) { unsigned int i0 = ib1[ib1.size() - 3]; unsigned int i1 = ib1[ib1.size() - 1]; ib1.push_back(i0); ib1.push_back(i1); } ib1.push_back(gi.p); } index_offset += obj->face_vertices[i]; } unsigned int vertex_count = obj->position_count; unsigned int index_count = ib1.size(); unsigned int invocations1 = queryVSInvocations(device, context, ib1.data(), index_count); if (stat) { std::vector ib2(ib1.size()); meshopt_optimizeVertexCache(&ib2[0], &ib1[0], ib1.size(), vertex_count); unsigned int invocations = queryVSInvocations(device, context, ib2.data(), index_count); atvr_sum += double(invocations) / double(vertex_count); atvr_count += 1; total_invocations += invocations; total_vertices += vertex_count; } else { printf("%s: baseline %f\n", path, double(invocations1) / double(vertex_count)); std::vector ib3(ib1.size()); meshopt_optimizeVertexCache(&ib3[0], &ib1[0], ib1.size(), vertex_count); unsigned int invocations3 = queryVSInvocations(device, context, ib3.data(), index_count); printf("%s: forsyth %f\n", path, double(invocations3) / double(vertex_count)); for (unsigned int cache_size = 12; cache_size <= 24; ++cache_size) { std::vector ib2(ib1.size()); meshopt_optimizeVertexCacheFifo(&ib2[0], &ib1[0], ib1.size(), vertex_count, cache_size); unsigned int invocations2 = queryVSInvocations(device, context, ib2.data(), index_count); printf("%s: tipsify(%d) %f\n", path, cache_size, double(invocations2) / double(vertex_count)); } } } if (stat) { printf("ATVR: average %f cumulative %f; %d vertices\n", atvr_sum / atvr_count, double(total_invocations) / double(total_vertices), total_vertices); } } int main(int argc, char** argv) { IDXGIFactory1* factory = 0; CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&factory); IDXGIAdapter* adapter = NULL; for (unsigned int index = 0; SUCCEEDED(factory->EnumAdapters(index, &adapter)); ++index) { DXGI_ADAPTER_DESC ad = {}; adapter->GetDesc(&ad); if (ad.VendorId == 0x1414 && ad.DeviceId == 0x8c) continue; // Skip Microsoft Basic Render Driver printf("// GPU %d: %S (Vendor %04x Device %04x)\n", index, ad.Description, ad.VendorId, ad.DeviceId); if (argc == 1) { testCache(adapter); } else if (argc > 1 && strcmp(argv[1], "--") == 0) { testCacheSequence(adapter, argc, argv); } else { testCacheMeshes(adapter, argc, argv); } } } #endif