Browse Source

ShaderGenerator: Use float3x4 for hardware skinning

This is a 25% reduction in the number of uniform vectors used. It is particularly relevant on DX9, which would otherwise allow at most 63 joints but now allows 84.
rdb 5 years ago
parent
commit
7aa6a2dfab

+ 57 - 24
panda/src/dxgsg9/dxShaderContext9.cxx

@@ -901,41 +901,74 @@ update_tables(GSG *gsg, const GeomVertexDataPipelineReader *data_reader) {
   if (loc >= 0) {
     ConstantRegister &reg = _register_map[(size_t)loc];
 
-    // reg.count is the number of registers, which is 4 per matrix.  However,
-    // due to optimization, the last row of the last matrix may be cut off.
-    size_t num_matrices = (reg.count + 3) / 4;
-    LMatrix4f *matrices = (LMatrix4f *)alloca(num_matrices * sizeof(LMatrix4f));
-
-    size_t i = 0;
+    float *data;
     const TransformTable *table = data_reader->get_transform_table();
-    if (table != nullptr) {
-      bool transpose = (_shader->get_language() == Shader::SL_Cg);
-      size_t num_transforms = std::min(num_matrices, table->get_num_transforms());
-      for (; i < num_transforms; ++i) {
+    if (!_shader->_transform_table_reduced) {
+      // reg.count is the number of registers, which is 4 per matrix.  However,
+      // due to optimization, the last row of the last matrix may be cut off.
+      size_t num_matrices = (reg.count + 3) / 4;
+      data = (float *)alloca(num_matrices * sizeof(LMatrix4f));
+      LMatrix4f *matrices = (LMatrix4f *)data;
+
+      size_t i = 0;
+      if (table != nullptr) {
+        bool transpose = (_shader->get_language() == Shader::SL_Cg);
+        size_t num_transforms = std::min(num_matrices, table->get_num_transforms());
+        for (; i < num_transforms; ++i) {
 #ifdef STDFLOAT_DOUBLE
-        LMatrix4 matrix;
-        table->get_transform(i)->get_matrix(matrix);
-        if (transpose) {
-          matrix.transpose_in_place();
-        }
-        matrices[i] = LCAST(float, matrix);
+          LMatrix4 matrix;
+          table->get_transform(i)->get_matrix(matrix);
+          if (transpose) {
+            matrix.transpose_in_place();
+          }
+          matrices[i] = LCAST(float, matrix);
 #else
-        table->get_transform(i)->get_matrix(matrices[i]);
-        if (transpose) {
-          matrices[i].transpose_in_place();
-        }
+          table->get_transform(i)->get_matrix(matrices[i]);
+          if (transpose) {
+            matrices[i].transpose_in_place();
+          }
 #endif
+        }
+      }
+      for (; i < num_matrices; ++i) {
+        matrices[i] = LMatrix4f::ident_mat();
       }
     }
-    for (; i < num_matrices; ++i) {
-      matrices[i] = LMatrix4f::ident_mat();
+    else {
+      // Reduced 3x4 matrix, used by shader generator
+      size_t num_matrices = (reg.count + 2) / 3;
+      data = (float *)alloca(num_matrices * sizeof(LVecBase4f) * 3);
+      LVecBase4f *vectors = (LVecBase4f *)data;
+
+      size_t i = 0;
+      if (table != nullptr) {
+        size_t num_transforms = std::min(num_matrices, table->get_num_transforms());
+        for (; i < num_transforms; ++i) {
+          LMatrix4f matrix;
+#ifdef STDFLOAT_DOUBLE
+          LMatrix4d matrixd;
+          table->get_transform(i)->get_matrix(matrixd);
+          matrix = LCAST(float, matrixd);
+#else
+          table->get_transform(i)->get_matrix(matrix);
+#endif
+          vectors[i * 3 + 0] = matrix.get_col(0);
+          vectors[i * 3 + 1] = matrix.get_col(1);
+          vectors[i * 3 + 2] = matrix.get_col(2);
+        }
+      }
+      for (; i < num_matrices; ++i) {
+        vectors[i * 3 + 0].set(1, 0, 0, 0);
+        vectors[i * 3 + 1].set(0, 1, 0, 0);
+        vectors[i * 3 + 2].set(0, 0, 1, 0);
+      }
     }
 
     if (reg.vreg >= 0) {
-      gsg->_d3d_device->SetVertexShaderConstantF(reg.vreg, (float *)matrices, reg.count);
+      gsg->_d3d_device->SetVertexShaderConstantF(reg.vreg, data, reg.count);
     }
     if (reg.freg >= 0) {
-      gsg->_d3d_device->SetPixelShaderConstantF(reg.freg, (float *)matrices, reg.count);
+      gsg->_d3d_device->SetPixelShaderConstantF(reg.freg, data, reg.count);
     }
   }
 

+ 47 - 15
panda/src/glstuff/glShaderContext_src.cxx

@@ -2521,28 +2521,60 @@ issue_parameters(int altered) {
  */
 void CLP(ShaderContext)::
 update_transform_table(const TransformTable *table) {
-  LMatrix4f *matrices = (LMatrix4f *)alloca(_transform_table_size * 64);
+  size_t num_matrices = (size_t)_transform_table_size;
 
-  size_t i = 0;
-  if (table != nullptr) {
-    size_t num_transforms = min((size_t)_transform_table_size, table->get_num_transforms());
-    for (; i < num_transforms; ++i) {
+  if (!_shader->_transform_table_reduced) {
+    LMatrix4f *matrices = (LMatrix4f *)alloca(num_matrices * sizeof(LMatrix4f));
+
+    size_t i = 0;
+    if (table != nullptr) {
+      size_t num_transforms = min(num_matrices, table->get_num_transforms());
+      for (; i < num_transforms; ++i) {
 #ifdef STDFLOAT_DOUBLE
-      LMatrix4 matrix;
-      table->get_transform(i)->get_matrix(matrix);
-      matrices[i] = LCAST(float, matrix);
+        LMatrix4 matrix;
+        table->get_transform(i)->get_matrix(matrix);
+        matrices[i] = LCAST(float, matrix);
 #else
-      table->get_transform(i)->get_matrix(matrices[i]);
+        table->get_transform(i)->get_matrix(matrices[i]);
 #endif
+      }
     }
+    for (; i < num_matrices; ++i) {
+      matrices[i] = LMatrix4f::ident_mat();
+    }
+    _glgsg->_glUniformMatrix4fv(_transform_table_index, _transform_table_size,
+                                (_shader->get_language() == Shader::SL_Cg),
+                                (float *)matrices);
   }
-  for (; i < (size_t)_transform_table_size; ++i) {
-    matrices[i] = LMatrix4f::ident_mat();
+  else {
+    // Reduced 3x4 matrix, used by shader generator
+    LVecBase4f *vectors = (LVecBase4f *)alloca(_transform_table_size * sizeof(LVecBase4f) * 3);
+
+    size_t i = 0;
+    if (table != nullptr) {
+      size_t num_transforms = std::min(num_matrices, table->get_num_transforms());
+      for (; i < num_transforms; ++i) {
+        LMatrix4f matrix;
+#ifdef STDFLOAT_DOUBLE
+        LMatrix4d matrixd;
+        table->get_transform(i)->get_matrix(matrixd);
+        matrix = LCAST(float, matrixd);
+#else
+        table->get_transform(i)->get_matrix(matrix);
+#endif
+        vectors[i * 3 + 0] = matrix.get_col(0);
+        vectors[i * 3 + 1] = matrix.get_col(1);
+        vectors[i * 3 + 2] = matrix.get_col(2);
+      }
+    }
+    for (; i < num_matrices; ++i) {
+      vectors[i * 3 + 0].set(1, 0, 0, 0);
+      vectors[i * 3 + 1].set(0, 1, 0, 0);
+      vectors[i * 3 + 2].set(0, 0, 1, 0);
+    }
+    _glgsg->_glUniformMatrix3x4fv(_transform_table_index, _transform_table_size,
+                                  GL_FALSE, (float *)vectors);
   }
-
-  _glgsg->_glUniformMatrix4fv(_transform_table_index, _transform_table_size,
-                              (_shader->get_language() == Shader::SL_Cg),
-                              (float *)matrices);
 }
 
 /**

+ 10 - 0
panda/src/gobj/shader.cxx

@@ -1724,6 +1724,7 @@ bind_parameter(const Parameter &param) {
 
       _transform_table_loc = param._location;
       _transform_table_size = num_elements;
+      _transform_table_reduced = false;
       return true;
     }
     if (pieces[1] == "SliderTable") {
@@ -2415,8 +2416,17 @@ bind_parameter(const Parameter &param) {
       }
 
       if (pieces[1] == "transforms") {
+        const ::ShaderType::Matrix *matrix = element_type->as_matrix();
+        if (matrix == nullptr ||
+            matrix->get_num_rows() < 3 ||
+            matrix->get_num_columns() != 4 ||
+            matrix->get_scalar_type() != ScalarType::ST_float) {
+          return report_parameter_error(name, type, "expected float3x4[] or float4x4[]");
+        }
+
         _transform_table_loc = param._location;
         _transform_table_size = num_elements;
+        _transform_table_reduced = (matrix->get_num_rows() == 3);
       }
       else if (pieces[1] == "sliders") {
         _slider_table_loc = param._location;

+ 1 - 0
panda/src/gobj/shader.h

@@ -477,6 +477,7 @@ public:
   int _frame_number_loc = -1;
   int _transform_table_loc = -1;
   uint32_t _transform_table_size = 0;
+  bool _transform_table_reduced = false;
   int _slider_table_loc = -1;
   uint32_t _slider_table_size = 0;
 

+ 3 - 3
panda/src/pgraphnodes/shaderGenerator.cxx

@@ -810,7 +810,7 @@ synthesize_shader(const RenderState *rs, const GeomVertexAnimationSpec &anim) {
     } else {
       num_transforms = key._anim_spec.get_num_transforms();
     }
-    text << "\t uniform float4x4 tbl_transforms[" << num_transforms << "],\n";
+    text << "\t uniform float3x4 tbl_transforms[" << num_transforms << "],\n";
     text << "\t in float4 vtx_transform_weight : BLENDWEIGHT,\n";
     if (key._anim_spec.get_indexed_transforms()) {
       text << "\t in uint4 vtx_transform_index : BLENDINDICES,\n";
@@ -828,7 +828,7 @@ synthesize_shader(const RenderState *rs, const GeomVertexAnimationSpec &anim) {
       text << "\t const uint4 vtx_transform_index = uint4(0, 1, 2, 3);\n";
     }
 
-    text << "\t float4x4 vtxmat = tbl_transforms[vtx_transform_index.x] * vtx_transform_weight.x";
+    text << "\t float3x4 vtxmat = tbl_transforms[vtx_transform_index.x] * vtx_transform_weight.x";
     if (key._anim_spec.get_num_transforms() > 1) {
       text << "\n\t                 + tbl_transforms[vtx_transform_index.y] * vtx_transform_weight.y";
     }
@@ -839,7 +839,7 @@ synthesize_shader(const RenderState *rs, const GeomVertexAnimationSpec &anim) {
       text << "\n\t                 + tbl_transforms[vtx_transform_index.w] * vtx_transform_weight.w";
     }
     text << ";\n";
-    text << "\t vtx_position = mul(vtxmat, vtx_position);\n";
+    text << "\t vtx_position = float4(mul(vtxmat, vtx_position), 1);\n";
     if (need_world_normal || need_eye_normal) {
       text << "\t vtx_normal = mul((float3x3)vtxmat, vtx_normal);\n";
     }