shaderMethods.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2023 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. #ifndef DFPSR_RENDER_SHADER_METHODS
  24. #define DFPSR_RENDER_SHADER_METHODS
  25. #include <cstdint>
  26. #include "../../math/FVector.h"
  27. #include "../../math/scalar.h"
  28. #include "../../base/simd3D.h"
  29. #include "../../image/ImageRgbaU8.h"
  30. #include "shaderTypes.h"
  31. #include "../constants.h"
  32. namespace dsr {
  33. namespace shaderMethods {
  34. // Returns the linear interpolation of the values using corresponding weight ratios for A, B and C in 4 pixels at the same time.
  35. inline F32x4 interpolate(const FVector3D &vertexData, const F32x4x3 &vertexWeights) {
  36. F32x4 vMA = vertexData.x * vertexWeights.v1;
  37. F32x4 vMB = vertexData.y * vertexWeights.v2;
  38. F32x4 vMC = vertexData.z * vertexWeights.v3;
  39. return vMA + vMB + vMC;
  40. }
  41. inline Rgba_F32 interpolateVertexColor(const FVector3D &red, const FVector3D &green, const FVector3D &blue, const FVector3D &alpha, const F32x4x3 &vertexWeights) {
  42. return Rgba_F32(
  43. interpolate(red, vertexWeights),
  44. interpolate(green, vertexWeights),
  45. interpolate(blue, vertexWeights),
  46. interpolate(alpha, vertexWeights)
  47. );
  48. }
  49. // Returns (colorA * weightA + colorB * weightB) / 256 as bytes
  50. // weightA and weightB should contain pairs of the same 16-bit weights for each of the 4 pixels in the corresponding A and B colors
  51. inline U32x4 weightColors(const U32x4 &colorA, const U16x8 &weightA, const U32x4 &colorB, const U16x8 &weightB) {
  52. U32x4 lowMask(0x00FF00FFu);
  53. U16x8 lowColorA = U16x8(colorA & lowMask);
  54. U16x8 lowColorB = U16x8(colorB & lowMask);
  55. U32x4 highMask(0xFF00FF00u);
  56. U16x8 highColorA = U16x8((colorA & highMask) >> 8);
  57. U16x8 highColorB = U16x8((colorB & highMask) >> 8);
  58. U32x4 lowColor = (((lowColorA * weightA) + (lowColorB * weightB))).get_U32();
  59. U32x4 highColor = (((highColorA * weightA) + (highColorB * weightB))).get_U32();
  60. return (((lowColor >> 8) & lowMask) | (highColor & highMask));
  61. }
  62. // The more significant bits must be zero so that the lower bits can fill the space.
  63. // lowBits[x] < 2^16
  64. inline U16x8 repeatAs16Bits(const U32x4 &lowBits) {
  65. return U16x8(lowBits | (lowBits << 16));
  66. }
  67. // Returns 256 - weight
  68. inline U16x8 invertWeight(const U16x8 &weight) {
  69. return U16x8(0x01000100u) - weight;
  70. }
  71. inline U32x4 mix_L(const U32x4 &colorA, const U32x4 &colorB, const U32x4 &weight) {
  72. // Get inverse weights
  73. U16x8 weightB = repeatAs16Bits(weight);
  74. U16x8 weightA = invertWeight(weightB);
  75. // Multiply
  76. return weightColors(colorA, weightA, colorB, weightB);
  77. }
  78. inline U32x4 mix_BL(const U32x4 &colorA, const U32x4 &colorB, const U32x4 &colorC, const U32x4 &colorD, const U32x4 &weightX, const U32x4 &weightY) {
  79. // Get inverse weights
  80. U16x8 weightXR = repeatAs16Bits(weightX);
  81. U16x8 weightYB = repeatAs16Bits(weightY);
  82. U16x8 weightXL = invertWeight(weightXR);
  83. U16x8 weightYT = invertWeight(weightYB);
  84. // Multiply
  85. return weightColors(weightColors(colorA, weightXL, colorB, weightXR), weightYT, weightColors(colorC, weightXL, colorD, weightXR), weightYB);
  86. }
  87. // Single layer sampling methods
  88. inline U32x4 sample_U32(SafePointer<uint32_t> data, const TextureRgbaLayer *source, const U32x4 &col, const U32x4 &row) {
  89. U32x4 pixelOffset((source->startOffset + col + (row << source->widthShift))); // PixelOffset = Start + Column + Row * Width
  90. return gather(data, pixelOffset);
  91. }
  92. // How many mip levels down from here should be sampled for the given texture coordinates
  93. template<int maxOffset>
  94. inline int getMipLevelOffset(const TextureRgbaLayer *source, const F32x4 &u, const F32x4 &v) {
  95. FVector4D ua = u.get();
  96. FVector4D va = v.get();
  97. float offsetUX = fabs(ua.x - ua.y);
  98. float offsetUY = fabs(ua.x - ua.z);
  99. float offsetVX = fabs(va.x - va.y);
  100. float offsetVY = fabs(va.x - va.z);
  101. float offsetU = max(offsetUX, offsetUY) * source->width;
  102. float offsetV = max(offsetVX, offsetVY) * source->height;
  103. float offset = max(offsetU, offsetV);
  104. // This log2 approximation has to be adapted if the number of mip levels changes.
  105. static_assert(MIP_BIN_COUNT == 5, "Changing MIP_BIN_COUNT must also adapt shaderMethods::getMipLevelOffset");
  106. int result = 0;
  107. if (offset > 2.0f) { result = 1; }
  108. if (offset > 4.0f) { result = 2; }
  109. if (offset > 8.0f) { result = 3; }
  110. if (offset > 16.0f) { result = 4; }
  111. return result;
  112. }
  113. inline int getMipLevel(const TextureRgba *source, const F32x4 &u, const F32x4 &v) {
  114. return getMipLevelOffset<MIP_BIN_COUNT - 1>(source->mips, u, v);
  115. }
  116. // Single layer sampling method
  117. template<Interpolation INTERPOLATION>
  118. inline U32x4 sample_U32(SafePointer<uint32_t> data, const TextureRgbaLayer *source, const F32x4 &u, const F32x4 &v) {
  119. if (INTERPOLATION == Interpolation::BL) {
  120. U32x4 subPixelOffset = U32x4(1073741952); // 2 to the power of 30 + 128, adjusting to a safe part of the unsigned integer and adding half a pixel for the bi-linear interpolation.
  121. U32x4 subPixLowX(truncateToU32(u * source->subWidth) + subPixelOffset); // SubPixelLowX = u * (Width * 256) + 128
  122. U32x4 subPixLowY(truncateToU32(v * source->subHeight) + subPixelOffset); // SubPixelLowY = v * (Height * 256) + 128
  123. U32x4 weightX = subPixLowX & 255; // WeightX = SubPixelLowX % 256
  124. U32x4 weightY = subPixLowY & 255; // WeightY = SubPixelLowY % 256
  125. U32x4 pixLowX(subPixLowX >> 8); // PixelLowX = SubPixelLowX / 256
  126. U32x4 pixLowY(subPixLowY >> 8); // PixelLowY = SubPixelLowY / 256
  127. U32x4 wMask(source->widthMask);
  128. U32x4 hMask(source->heightMask);
  129. U32x4 colLow(pixLowX & wMask); // ColumnLow = PixelLowX % Width
  130. U32x4 rowLow(pixLowY & hMask); // RowLow = PixelLowY % Height
  131. U32x4 colHigh(((colLow + 1) & wMask)); // ColumnHigh = (ColumnLow + 1) % Width
  132. U32x4 rowHigh(((rowLow + 1) & hMask)); // RowHigh = (RowLow + 1) % Height
  133. // Sample colors in the 4 closest pixels
  134. U32x4 colorA(sample_U32(data, source, colLow, rowLow));
  135. U32x4 colorB(sample_U32(data, source, colHigh, rowLow));
  136. U32x4 colorC(sample_U32(data, source, colLow, rowHigh));
  137. U32x4 colorD(sample_U32(data, source, colHigh, rowHigh));
  138. // Take a weighted average
  139. return shaderMethods::mix_BL(colorA, colorB, colorC, colorD, weightX, weightY);
  140. } else { // Interpolation::NN or unhandled
  141. // TODO: Test nearest neighbor sampling.
  142. F32x4 subPixelOffset = F32x4(1073741824.0f);
  143. // TODO: Use multiply and add instructions.
  144. U32x4 pixX(truncateToU32(u * source->width + subPixelOffset)); // PixelX = U * Width
  145. U32x4 pixY(truncateToU32(v * source->height + subPixelOffset)); // PixelY = V * Height
  146. U32x4 col(pixX & source->widthMask); // Column = PixelX % Width
  147. U32x4 row(pixY & source->heightMask); // Row = PixelY % Height
  148. return sample_U32(data, source, col, row);
  149. }
  150. }
  151. template<Interpolation INTERPOLATION, bool HIGH_QUALITY>
  152. inline Rgba_F32 sample_F32(SafePointer<uint32_t> data, const TextureRgbaLayer *source, const F32x4 &u, const F32x4 &v) {
  153. if (INTERPOLATION == Interpolation::BL) {
  154. if (HIGH_QUALITY) { // High quality interpolation
  155. F32x4 subPixelOffset = F32x4(4194304.5f); // A large power of two and half a pixel's offset for bi-linear interpolation.
  156. F32x4 pixX = u * source->width + subPixelOffset; // PixelX = ULow * Width
  157. F32x4 pixY = v * source->height + subPixelOffset; // PixelY = VLow * Height
  158. // Truncation can be used as floor for positive input
  159. U32x4 pixLowX(truncateToU32(pixX)); // PixelLowX = floor(PixelX)
  160. U32x4 pixLowY(truncateToU32(pixY)); // PixelLowY = floor(PixelY)
  161. U32x4 wMask(source->widthMask);
  162. U32x4 hMask(source->heightMask);
  163. U32x4 colLow(pixLowX & wMask); // ColumnLow = PixelLowX % Width
  164. U32x4 rowLow(pixLowY & hMask); // RowLow = PixelLowY % Height
  165. U32x4 colHigh(((colLow + 1) & wMask)); // ColumnHigh = (ColumnLow + 1) % Width
  166. U32x4 rowHigh(((rowLow + 1) & hMask)); // RowHigh = (RowLow + 1) % Height
  167. // Sample colors in the 4 closest pixels
  168. Rgba_F32 colorA(Rgba_F32(sample_U32(data, source, colLow, rowLow)));
  169. Rgba_F32 colorB(Rgba_F32(sample_U32(data, source, colHigh, rowLow)));
  170. Rgba_F32 colorC(Rgba_F32(sample_U32(data, source, colLow, rowHigh)));
  171. Rgba_F32 colorD(Rgba_F32(sample_U32(data, source, colHigh, rowHigh)));
  172. F32x4 weightX = pixX - floatFromU32(pixLowX);
  173. F32x4 weightY = pixY - floatFromU32(pixLowY);
  174. F32x4 invWeightX = 1.0f - weightX;
  175. F32x4 invWeightY = 1.0f - weightY;
  176. return (colorA * invWeightX + colorB * weightX) * invWeightY + (colorC * invWeightX + colorD * weightX) * weightY;
  177. } else { // Fast interpolation
  178. return Rgba_F32(sample_U32<Interpolation::BL>(data, source, u, v));
  179. }
  180. } else { // Interpolation::NN or unhandled
  181. return Rgba_F32(sample_U32<Interpolation::NN>(data, source, u, v));
  182. }
  183. }
  184. // Multi layer sampling method
  185. template<Interpolation INTERPOLATION, bool DISABLE_MIPMAP>
  186. inline U32x4 sample_U32(const TextureRgba *source, const F32x4 &u, const F32x4 &v) {
  187. if (DISABLE_MIPMAP) {
  188. return sample_U32<INTERPOLATION>(source->data, &(source->mips[0]), u, v);
  189. } else {
  190. int mipLevel = getMipLevel(source, u, v);
  191. return sample_U32<INTERPOLATION>(source->data, &(source->mips[mipLevel]), u, v);
  192. }
  193. }
  194. template<Interpolation INTERPOLATION, bool DISABLE_MIPMAP, bool HIGH_QUALITY>
  195. inline Rgba_F32 sample_F32(const TextureRgba *source, const F32x4 &u, const F32x4 &v) {
  196. if (DISABLE_MIPMAP) {
  197. return sample_F32<INTERPOLATION, HIGH_QUALITY>(source->data, &(source->mips[0]), u, v);
  198. } else {
  199. int mipLevel = getMipLevel(source, u, v);
  200. return sample_F32<INTERPOLATION, HIGH_QUALITY>(source->data, &(source->mips[mipLevel]), u, v);
  201. }
  202. }
  203. }
  204. }
  205. #endif