bc6_encode_kernel.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. //=====================================================================
  2. // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files(the "Software"), to deal
  6. // in the Software without restriction, including without limitation the rights
  7. // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
  8. // copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions :
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  20. // THE SOFTWARE.
  21. //
  22. //=====================================================================
  23. #ifndef BC6_ENCODE_KERNEL_H
  24. #define BC6_ENCODE_KERNEL_H
  25. #pragma warning(disable:4505) // disable warnings on unreferenced local function has been removed
  26. #include "common_def.h"
  // ---------------------------------------------------------------------
  // Encoder-wide limits, thresholds and helper macros.
  // ---------------------------------------------------------------------
  #define MAX_TRACE 10
  #define MAX_ENTRIES_QUANT_TRACE 16
  #define BlockX 4
  #define BlockY 4
  #define BYTEPP 4
  #define COMPRESSED_BLOCK_SIZE 16 // Size of a compressed block in bytes
  #define MAX_DIMENSION_BIG 4
  #define MAX_SUBSET_SIZE 16 // Largest possible size for an individual subset
  #define NUM_BLOCK_TYPES 8 // Number of block types in the format
  #define MAX_SUBSETS 3 // Maximum number of possible subsets
  #define MAX_PARTITIONS 64 // Maximum number of partition types
  #define MAX_ENTRIES 64
  #define MAX_TRY 20
  #define BC6_FLT_MAX_EXP 128
  #define MAX_PARTITIONS_TABLE (1+64+64)
  #define DIMENSION 4
  #define MAX_CLUSTERS_BIG 16
  #define EPSILON 0.000001
  #define MAX_CLUSTERS_QUANT_TRACE 8
  // Image quality will increase as this number gets larger and end-to-end performance time will reduce
  #define MAX_INDEX_BITS 4
  #define HIGHQULITY_THRESHOLD 0.7F
  #define qFAST_THRESHOLD 0.5F
  // f16 negative precision limit value.
  // Parenthesized so the leading minus cannot merge with a neighbouring operator.
  #define F16NEGPREC_LIMIT_VAL (-2048.0f)
  #define LOG_CL_RANGE 5
  #define LOG_CL_BASE 2
  #define BIT_BASE 5
  #define BIT_RANGE 9
  #define MAX_CLUSTERS 8
  // Offset of a bit count relative to BIT_BASE.
  // The argument is parenthesized so expressions such as BTT(a | b) expand correctly.
  #define BTT(bits) ((bits) - BIT_BASE)
  // Offset of a cluster value relative to LOG_CL_BASE (argument parenthesized as above).
  #define CLT(cl) ((cl) - LOG_CL_BASE)
  // Value with the low n bits set.
  #define MASK(n) ((1<<(n))-1)
  // Sign-extends the low nb bits of x: when bit (nb-1) is set, all bits from nb upwards are set.
  // Uses -(1 << nb) instead of (~0) << nb because left-shifting a negative value is undefined in C.
  // Note: x is evaluated twice.
  #define SIGN_EXTEND_TYPELESS(x,nb) ((((x)&(1<<((nb)-1)))?(-(1<<(nb))):0)|(x))
  #define CMP_HALF_MAX 65504.0f // positive half max
  61. #ifndef ASPM_GPU
  62. #include <bitset>
  63. #include <assert.h>
  64. //typedef uint8_t byte;
  65. #else
  66. //typedef bitset uint8_t;
  67. //typedef uint8 byte;
  68. #endif
  // BC6 block geometry: texel width and height of one block, and the
  // compressed block size (presumably in bytes, like COMPRESSED_BLOCK_SIZE).
  #define BC6BlockX        4
  #define BC6BlockY        4
  #define BC6CompBlockSize 16
  72. typedef struct {
  73. CGU_INT k;
  74. CGU_FLOAT d;
  75. } BC6H_TRACE;
  // ---- BC6H layout constants ----
  #define NCHANNELS               3
  #define MAX_END_POINTS          2
  #define MAX_BC6H_MODES          14
  #define MAX_BC6H_PARTITIONS     32
  #define MAX_TWOREGION_MODES     10
  #define COMPRESSED_BLOCK_SIZE   16 // Size of a compressed block in bytes
  #define ONE_REGION_INDEX_OFFSET 65 // bit location to start saving color index values for single region shape
  #define TWO_REGION_INDEX_OFFSET 82 // bit location to start saving color index values for two region shapes
  #define MIN_MODE_FOR_ONE_REGION 11 // Two region shapes use modes 1..10 and single region shapes use 11..14

  // ---- Endpoint accessors ----
  // Each macro indexes ep[region][endpoint][i]; a variable named `i` must be
  // in scope at the point of use.
  #define R_0(ep) (ep)[0][0][i]
  #define R_1(ep) (ep)[0][1][i]
  #define R_2(ep) (ep)[1][0][i]
  #define R_3(ep) (ep)[1][1][i]

  #define FLT16_MAX 0x7bff

  // ---- Feature switches ----
  // USE_SHAKERHD is only enabled when not building for ASPM_GPU.
  #if !defined(ASPM_GPU)
  #define USE_SHAKERHD
  #endif
  #define USE_NEWRAMP
  94. typedef struct {
  95. CGU_FLOAT A[NCHANNELS];
  96. CGU_FLOAT B[NCHANNELS];
  97. } END_Points;
  98. typedef struct {
  99. CGU_FLOAT x, y, z;
  100. } BC6H_Vec3f;
  101. typedef struct {
  102. CGU_INT nbits; // Number of bits
  103. CGU_INT prec[3]; // precission of the Qunatized RGB endpoints
  104. CGU_INT transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
  105. CGU_INT modebits; // number of mode bits
  106. CGU_INT IndexPrec; // Index Precision
  107. CGU_INT mode; // Mode value to save
  108. CGU_INT lowestPrec; // Step size of each precesion incriment
  109. } ModePartitions;
  110. __constant ModePartitions ModePartition[MAX_BC6H_MODES + 1] = {
  111. {0, {0,0,0}, 0, 0, 0, 0, 0}, // Mode = Invaild
  112. // Two region Partition
  113. { 10, {5,5,5}, 1, 2, 3, 0x00, 31 }, // Mode = 1
  114. { 7, {6,6,6}, 1, 2, 3, 0x01, 248}, // Mode = 2
  115. { 11, {5,4,4}, 1, 5, 3, 0x02, 15 }, // Mode = 3
  116. { 11, {4,5,4}, 1, 5, 3, 0x06, 15 }, // Mode = 4
  117. { 11, {4,4,5}, 1, 5, 3, 0x0a, 15 }, // Mode = 5
  118. { 9, {5,5,5}, 1, 5, 3, 0x0e, 62 }, // Mode = 6
  119. { 8, {6,5,5}, 1, 5, 3, 0x12, 124}, // Mode = 7
  120. { 8, {5,6,5}, 1, 5, 3, 0x16, 124}, // Mode = 8
  121. { 8, {5,5,6}, 1, 5, 3, 0x1a, 124}, // Mode = 9
  122. { 6, {6,6,6}, 0, 5, 3, 0x1e, 496}, // Mode = 10
  123. // One region Partition
  124. {10, {10,10,10}, 0, 5, 4, 0x03, 31}, // Mode = 11
  125. {11, {9,9,9 }, 1, 5, 4, 0x07, 15}, // Mode = 12
  126. {12, {8,8,8 }, 1, 5, 4, 0x0b, 7 }, // Mode = 13
  127. {16, {4,4,4 }, 1, 5, 4, 0x0f, 1 } // Mode = 14
  128. };
  129. //================================================
  130. // Mode Pathern order to try on endpoints
  131. // The order can be rearranged to set which modes gets processed first
  132. // for now it is set in order.
  133. //================================================
  134. __constant CGU_INT8 ModeFitOrder[MAX_BC6H_MODES + 1] = {
  135. 0, //0: N/A
  136. // ---- 2 region lower bits ---
  137. 1, // 10 5 5 5
  138. 2, // 7 6 6 6
  139. 3, // 11 5 4 5
  140. 4, // 11 4 5 4
  141. 5, // 11 4 4 5
  142. 6, // 9 5 5 5
  143. 7, // 8 6 5 5
  144. 8, // 8 5 6 5
  145. 9, // 8 5 5 6
  146. 10, // 6 6 6 6
  147. //------ 1 region high bits ---
  148. 11, // 10 10 10 10
  149. 12, // 11 9 9 9
  150. 13, // 12 8 8 8
  151. 14 // 16 4 4 4
  152. };
  153. // The Region2FixUps are for our index[subset = 2][16][3] locations
  154. // indexed by shape region 2
  155. __constant CGU_INT g_Region2FixUp[32] = {
  156. 7, 3, 11, 7,
  157. 3, 11, 9, 5,
  158. 2, 12, 7, 3,
  159. 11, 7, 11, 3,
  160. 7, 1, 0, 1,
  161. 0, 1, 0, 7,
  162. 0, 1, 1, 0,
  163. 4, 4, 1, 0,
  164. };
  165. // Indexed by all shape regions
  166. // Partition Set Fixups for region 1 note region 0 is always at 0
  167. // that means normally we use 3 bits to define an index value
  168. // if its at the fix up location then its one bit less
  169. __constant CGU_INT g_indexfixups[32] = {
  170. 15,15,15,15,
  171. 15,15,15,15,
  172. 15,15,15,15,
  173. 15,15,15,15,
  174. 15, 2, 8, 2,
  175. 2, 8, 8,15,
  176. 2, 8, 2, 2,
  177. 8, 8, 2, 2,
  178. };
  179. typedef struct {
  180. CGU_INT8 region; // one or two
  181. CGU_INT8 m_mode; // m
  182. CGU_INT8 d_shape_index; // d
  183. CGU_INT rw; // endpt[0].A[0]
  184. CGU_INT rx; // endpt[0].B[0]
  185. CGU_INT ry; // endpt[1].A[0]
  186. CGU_INT rz; // endpt[1].B[0]
  187. CGU_INT gw; // endpt[0].A[1]
  188. CGU_INT gx; // endpt[0].B[1]
  189. CGU_INT gy; // endpt[1].A[1]
  190. CGU_INT gz; // endpt[1].B[1]
  191. CGU_INT bw; // endpt[0].A[2]
  192. CGU_INT bx; // endpt[0].B[2]
  193. CGU_INT by; // endpt[1].A[2]
  194. CGU_INT bz; // endpt[1].B[2]
  195. union {
  196. CGU_UINT8 indices[4][4]; // Indices data after header block
  197. CGU_UINT8 indices16[16];
  198. };
  199. union {
  200. CGU_FLOAT din[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; // Original data input as floats
  201. unsigned char cdin[256]; // as uchar to match float
  202. };
  203. END_Points EC[MAX_END_POINTS]; // compressed endpoints expressed as endpt[0].A[] and endpt[1].B[]
  204. END_Points E[MAX_END_POINTS]; // decompressed endpoints
  205. CGU_BOOL issigned; // Format is 16 bit signed floating point
  206. CGU_BOOL istransformed; // region two: all modes = true except mode=10
  207. short wBits; // number of bits for the root endpoint
  208. short tBits[NCHANNELS]; // number of bits used for the transformed endpoints
  209. CGU_INT format; // floating point format are we using for decompression
  210. BC6H_Vec3f Paletef[2][16];
  211. CGU_INT index; // for debugging
  212. CGU_FLOAT fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
  213. CGU_FLOAT cur_best_fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
  214. CGU_INT shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE];
  215. CGU_INT cur_best_shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE];
  216. CGU_INT entryCount[MAX_SUBSETS];
  217. CGU_INT cur_best_entryCount[MAX_SUBSETS];
  218. CGU_FLOAT partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
  219. CGU_FLOAT cur_best_partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
  220. CGU_BOOL optimized; // were end points optimized during final encoding
  221. } BC6H_Encode_local;
  222. #ifndef ASPM_GPU
  223. class BitHeader {
  224. public:
  225. BitHeader(const CGU_UINT8 in[], CGU_INT sizeinbytes) {
  226. m_bits.reset();
  227. m_sizeinbytes = sizeinbytes;
  228. if ((in != NULL) && (sizeinbytes <= 16)) {
  229. // Init bits set with given data
  230. CGU_INT bitpos = 0;
  231. for (CGU_INT i = 0; i < sizeinbytes; i++) {
  232. CGU_INT bit = 1;
  233. for (CGU_INT j = 0; j < 8; j++) {
  234. m_bits[bitpos] = in[i] & bit ? 1 : 0;
  235. bit = bit << 1;
  236. bitpos++;
  237. }
  238. }
  239. }
  240. }
  241. ~BitHeader() {
  242. }
  243. void transferbits(CGU_UINT8 in[], CGU_INT sizeinbytes) {
  244. if ((sizeinbytes <= m_sizeinbytes) && (in != NULL)) {
  245. // Init bits set with given data
  246. memset(in, 0, sizeinbytes);
  247. CGU_INT bitpos = 0;
  248. for (CGU_INT i = 0; i < sizeinbytes; i++) {
  249. CGU_INT bit = 1;
  250. for (CGU_INT j = 0; j < 8; j++) {
  251. if (m_bits[bitpos]) in[i] |= bit;
  252. bit = bit << 1;
  253. bitpos++;
  254. }
  255. }
  256. }
  257. }
  258. CGU_INT getvalue(CGU_INT start, CGU_INT bitsize) {
  259. CGU_INT value = 0;
  260. CGU_INT end = start + bitsize - 1;
  261. for (; end >= start; end--) {
  262. value |= m_bits[end] ? 1 : 0;
  263. if (end > start) value <<= 1;
  264. }
  265. return value;
  266. }
  267. void setvalue(CGU_INT start, CGU_INT bitsize, CGU_INT value, CGU_INT maskshift = 0) {
  268. CGU_INT end = start + bitsize - 1;
  269. CGU_INT mask = 0x1 << maskshift;
  270. for (; start <= end; start++) {
  271. m_bits[start] = (value&mask) ? 1 : 0;
  272. mask <<= 1;
  273. }
  274. }
  275. std::bitset<128> m_bits; // 16 bytes max
  276. CGU_INT m_sizeinbytes;
  277. };
  278. //==================== DECODER CODE ======================
  #define MAXENDPOINTS 2
  #define U16MAX 0xffff
  #define S16MAX 0x7fff
  // Sign-extends the low `tbits` bits of w to a signed int: when bit (tbits-1)
  // is set, all bits from tbits upwards are set.
  // Uses -(1 << tbits) instead of (~0) << tbits to avoid left-shifting a
  // negative value, and a C-style cast so the macro is valid in both C and C++.
  // Note: w is evaluated twice.
  #define SIGN_EXTEND(w,tbits) (((((signed)(w))&(1<<((tbits)-1)))?(-(1<<(tbits))):0)|((signed)(w)))
  // 16-bit float format selectors (presumably values for the `format` field - confirm).
  enum { UNSIGNED_F16 = 1, SIGNED_F16 = 2 };
  // Region selectors (presumably one-region vs two-region shapes - confirm).
  enum {
      BC6_ONE = 0,
      BC6_TWO = 1
  };
  // Colour channel indices.
  enum {
      C_RED   = 0,
      C_GREEN = 1,
      C_BLUE  = 2
  };
  // Three-component integer vector.
  struct BC6H_Vec3 {
      int x;
      int y;
      int z;
  };
  299. struct AMD_BC6H_Format {
  300. unsigned short region; // one or two
  301. unsigned short m_mode; // m
  302. int d_shape_index; // d
  303. int rw; // endpt[0].A[0]
  304. int rx; // endpt[0].B[0]
  305. int ry; // endpt[1].A[0]
  306. int rz; // endpt[1].B[0]
  307. int gw; // endpt[0].A[1]
  308. int gx; // endpt[0].B[1]
  309. int gy; // endpt[1].A[1]
  310. int gz; // endpt[1].B[1]
  311. int bw; // endpt[0].A[2]
  312. int bx; // endpt[0].B[2]
  313. int by; // endpt[1].A[2]
  314. int bz; // endpt[1].B[2]
  315. union {
  316. CGU_UINT8 indices[4][4]; // Indices data after header block
  317. CGU_UINT8 indices16[16];
  318. };
  319. float din[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; // Original data input
  320. END_Points EC[MAXENDPOINTS]; // compressed endpoints expressed as endpt[0].A[] and endpt[1].B[]
  321. END_Points E[MAXENDPOINTS]; // decompressed endpoints
  322. bool issigned; // Format is 16 bit signed floating point
  323. bool istransformed; // region two: all modes = true except mode=10
  324. short wBits; // number of bits for the root endpoint
  325. short tBits[NCHANNELS]; // number of bits used for the transformed endpoints
  326. int format; // floating point format are we using for decompression
  327. BC6H_Vec3 Palete[2][16];
  328. BC6H_Vec3f Paletef[2][16];
  329. int index; // for debugging
  330. float fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
  331. float cur_best_fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
  332. int shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE];
  333. int cur_best_shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE];
  334. int entryCount[MAX_SUBSETS];
  335. int cur_best_entryCount[MAX_SUBSETS];
  336. float partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
  337. float cur_best_partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
  338. bool optimized; // were end points optimized during final encoding
  339. };
  340. // =================================== END OF DECODER CODE ========================================================
  341. #endif
  342. //-------------------------------------------------
  343. // Set by Host : Read only in kernel
  344. //-------------------------------------------------
  345. typedef struct {
  346. // Setup at initialization time
  347. CGU_FLOAT m_quality;
  348. CGU_FLOAT m_performance;
  349. CGU_FLOAT m_errorThreshold;
  350. CGU_DWORD m_validModeMask;
  351. CGU_BOOL m_imageNeedsAlpha;
  352. CGU_BOOL m_colourRestrict;
  353. CGU_BOOL m_alphaRestrict;
  354. CGU_BOOL m_isSigned;
  355. } CMP_BC6HOptions;
  356. typedef struct {
  357. // These are quality parameters used to select when to use the high precision quantizer
  358. // and shaker paths
  359. CGU_FLOAT m_quantizerRangeThreshold;
  360. CGU_FLOAT m_shakerRangeThreshold;
  361. CGU_FLOAT m_partitionSearchSize;
  362. // Setup at initialization time
  363. CGU_FLOAT m_quality;
  364. CGU_FLOAT m_performance;
  365. CGU_FLOAT m_errorThreshold;
  366. CGU_DWORD m_validModeMask;
  367. CGU_BOOL m_imageNeedsAlpha;
  368. CGU_BOOL m_colourRestrict;
  369. CGU_BOOL m_alphaRestrict;
  370. CGU_BOOL m_isSigned;
  371. // Source image info : must be set prior to use in kernel
  372. CGU_UINT32 m_src_width;
  373. CGU_UINT32 m_src_height;
  374. CGU_UINT32 m_src_stride;
  375. } BC6H_Encode;
  376. CMP_STATIC void SetDefaultBC6Options(BC6H_Encode *BC6Encode) {
  377. if (BC6Encode) {
  378. BC6Encode->m_quality = 1.0f;
  379. BC6Encode->m_quantizerRangeThreshold = 0.0f;
  380. BC6Encode->m_shakerRangeThreshold = 0.0f;
  381. BC6Encode->m_partitionSearchSize = 0.20f;
  382. BC6Encode->m_performance = 0.0f;
  383. BC6Encode->m_errorThreshold = 0.0f;
  384. BC6Encode->m_validModeMask = 0;
  385. BC6Encode->m_imageNeedsAlpha = 0;
  386. BC6Encode->m_colourRestrict = 0;
  387. BC6Encode->m_alphaRestrict = 0;
  388. BC6Encode->m_isSigned = 0;
  389. BC6Encode->m_src_width = 4;
  390. BC6Encode->m_src_height = 4;
  391. BC6Encode->m_src_stride = 0;
  392. }
  393. }
  394. #endif