BC6HEncode_EncodeBlockCS.hlsl 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279
  1. // RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
  2. // check input.
  3. // CHECK: flattenedThreadIdInGroup
  4. // CHECK: groupId
  5. // check intrinsic used.
  6. // CHECK: textureLoad
  7. // CHECK: bufferLoad
  8. // CHECK: IMin
  9. // CHECK: IMax
  10. // CHECK: dot3
  11. // CHECK: bufferStore
  12. //--------------------------------------------------------------------------------------
  13. // File: BC6HEncode.hlsl
  14. //
  15. // The Compute Shader for BC6H Encoder
  16. //
  17. // Copyright (c) Microsoft Corporation. All rights reserved.
  18. //--------------------------------------------------------------------------------------
  19. //#define REF_DEVICE
  // General-purpose constants. UINTLENGTH and NCHANNELS are not referenced by the
  // entry points visible in this part of the file.
  20. #define UINTLENGTH 32
  21. #define NCHANNELS 3
  // Values g_format is compared against (see the cbCS comment on g_format).
  // They equal the numeric values of DXGI_FORMAT_BC6H_SF16 / DXGI_FORMAT_BC6H_UF16.
  22. #define SIGNED_F16 96
  23. #define UNSIGNED_F16 95
  // Bit patterns of the largest finite positive float and the most negative finite float.
  24. #define MAX_FLOAT asfloat(0x7F7FFFFF)
  25. #define MIN_FLOAT asfloat(0xFF7FFFFF)
  // Largest and smallest 32-bit signed integers; used as identity values when
  // seeding min/max endpoint searches.
  26. #define MAX_INT asint(0x7FFFFFFF)
  27. #define MIN_INT asint(0x80000000)
  // Per-dispatch parameters shared by all entry points in this file.
  28. cbuffer cbCS : register( b0 )
  29. {
  // NOTE(review): presumably the input texture width in texels; not referenced by
  // the entry points visible here.
  30. uint g_tex_width;
  // Number of 4x4 blocks per row; used to split a linear block ID into (block_x, block_y).
  31. uint g_num_block_x;
  32. uint g_format; //either SIGNED_F16 for DXGI_FORMAT_BC6H_SF16 or UNSIGNED_F16 for DXGI_FORMAT_BC6H_UF16
  // Zero-based slot into the candidateMode* tables evaluated by TryModeLE10CS.
  33. uint g_mode_id;
  // Block ID of the first block handled by this dispatch; added to the group-derived block index.
  34. uint g_start_block_id;
  // Threads whose block ID is >= this value exit early (unless REF_DEVICE is defined).
  35. uint g_num_total_blocks;
  36. };
  // Per-mode lookup tables, 14 slots each. Slots 10..13 are the ones TryModeG10CS
  // evaluates (threadInBlock + 10); TryModeLE10CS selects its slot with g_mode_id.
  //
  // candidateModeMemory is not referenced by the entry points visible here.
  // NOTE(review): presumably the mode bit pattern written into the encoded block —
  // confirm against block_package.
  37. static const uint candidateModeMemory[14] = { 0x00, 0x01,
  38. 0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F };
  // One-based mode number stored as best_mode; main() indexes the tables with best_mode - 1.
  39. static const uint candidateModeFlag[14] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
  // True when the mode stores endpoints as differences: callers subtract the first
  // quantized endpoint from the remaining endpoint(s) before finish_quantize*.
  40. static const bool candidateModeTransformed[14] = { true, true, true, true, true, true, true, true, true, false, false, true, true, true };
  // .x is the precision passed to quantize()/unquantize(); the whole uint4 is passed to
  // finish_quantize*/start_unquantize.
  // NOTE(review): .yzw are presumably the per-channel bit widths of the stored deltas — confirm.
  41. static const uint4 candidateModePrec[14] = { uint4(10,5,5,5), uint4(7,6,6,6),
  42. uint4(11,5,4,4), uint4(11,4,5,4), uint4(11,4,4,5), uint4(9,5,5,5),
  43. uint4(8,6,5,5), uint4(8,5,6,5), uint4(8,5,5,6), uint4(6,6,6,6),
  44. uint4(10,10,10,10), uint4(11,9,9,9), uint4(12,8,8,8), uint4(16,4,4,4) };
  45. /*static const uint4x4 candidateSection[32] =
  46. {
  47. {0,0,1,1, 0,0,1,1, 0,0,1,1, 0,0,1,1}, {0,0,0,1, 0,0,0,1, 0,0,0,1, 0,0,0,1}, {0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1}, {0,0,0,1, 0,0,1,1, 0,0,1,1, 0,1,1,1},
  48. {0,0,0,0, 0,0,0,1, 0,0,0,1, 0,0,1,1}, {0,0,1,1, 0,1,1,1, 0,1,1,1, 1,1,1,1}, {0,0,0,1, 0,0,1,1, 0,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,1, 0,0,1,1, 0,1,1,1},
  49. {0,0,0,0, 0,0,0,0, 0,0,0,1, 0,0,1,1}, {0,0,1,1, 0,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,1, 0,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 0,0,0,1, 0,1,1,1},
  50. {0,0,0,1, 0,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 1,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 0,0,0,0, 1,1,1,1},
  51. {0,0,0,0, 1,0,0,0, 1,1,1,0, 1,1,1,1}, {0,1,1,1, 0,0,0,1, 0,0,0,0, 0,0,0,0}, {0,0,0,0, 0,0,0,0, 1,0,0,0, 1,1,1,0}, {0,1,1,1, 0,0,1,1, 0,0,0,1, 0,0,0,0},
  52. {0,0,1,1, 0,0,0,1, 0,0,0,0, 0,0,0,0}, {0,0,0,0, 1,0,0,0, 1,1,0,0, 1,1,1,0}, {0,0,0,0, 0,0,0,0, 1,0,0,0, 1,1,0,0}, {0,1,1,1, 0,0,1,1, 0,0,1,1, 0,0,0,1},
  53. {0,0,1,1, 0,0,0,1, 0,0,0,1, 0,0,0,0}, {0,0,0,0, 1,0,0,0, 1,0,0,0, 1,1,0,0}, {0,1,1,0, 0,1,1,0, 0,1,1,0, 0,1,1,0}, {0,0,1,1, 0,1,1,0, 0,1,1,0, 1,1,0,0},
  54. {0,0,0,1, 0,1,1,1, 1,1,1,0, 1,0,0,0}, {0,0,0,0, 1,1,1,1, 1,1,1,1, 0,0,0,0}, {0,1,1,1, 0,0,0,1, 1,0,0,0, 1,1,1,0}, {0,0,1,1, 1,0,0,1, 1,0,0,1, 1,1,0,0}
  55. };*/
  // Partition masks, one per candidate partition (32 total). Bit i of an entry is 1
  // when pixel i of the 4x4 block (i = y * 4 + x) belongs to region 1, and 0 when it
  // belongs to region 0. This is the bit-packed form of the commented-out
  // candidateSection table above (e.g. {0,0,1,1, 0,0,1,1, ...} -> 0xCCCC).
  56. static const uint candidateSectionBit[32] =
  57. {
  58. 0xCCCC, 0x8888, 0xEEEE, 0xECC8,
  59. 0xC880, 0xFEEC, 0xFEC8, 0xEC80,
  60. 0xC800, 0xFFEC, 0xFE80, 0xE800,
  61. 0xFFE8, 0xFF00, 0xFFF0, 0xF000,
  62. 0xF710, 0x008E, 0x7100, 0x08CE,
  63. 0x008C, 0x7310, 0x3100, 0x8CCE,
  64. 0x088C, 0x3110, 0x6666, 0x366C,
  65. 0x17E8, 0x0FF0, 0x718E, 0x399C
  66. };
  // For each of the 32 partitions, the pixel index (2, 8 or 15) used as the reference
  // pixel of region 1 when deciding whether that region's endpoints must be swapped.
  // Region 0 always uses pixel 0. main() also uses this value to choose the bit
  // offsets of the packed indices.
  67. static const uint candidateFixUpIndex1D[32] =
  68. {
  69. 15,15,15,15,
  70. 15,15,15,15,
  71. 15,15,15,15,
  72. 15,15,15,15,
  73. 15, 2, 8, 2,
  74. 2, 8, 8,15,
  75. 2, 8, 2, 2,
  76. 8, 8, 2, 2
  77. };
  // Maps a projection position in 0..63 (uint(dot * 63.49999 / span_norm_sqr)) to a
  // palette index in 0..7. Used for the two-region modes together with
  // generate_palette_unquantized8.
  // NOTE(review): the numbers below are presumably the interpolation weights (out of
  // 64) of the 8 palette entries — confirm against the BC6H specification.
  78. //0, 9, 18, 27, 37, 46, 55, 64
  79. static const uint aStep1[64] = {0,0,0,0,0,1,1,1,
  80. 1,1,1,1,1,1,2,2,
  81. 2,2,2,2,2,2,2,3,
  82. 3,3,3,3,3,3,3,3,
  83. 3,4,4,4,4,4,4,4,
  84. 4,4,5,5,5,5,5,5,
  85. 5,5,5,6,6,6,6,6,
  86. 6,6,6,6,7,7,7,7};
  // Maps a projection position in 0..63 (uint(dot * 63.49999 / span_norm_sqr)) to a
  // palette index in 0..15. Used for the single-region modes together with
  // generate_palette_unquantized16.
  // NOTE(review): the numbers below are presumably the interpolation weights (out of
  // 64) of the 16 palette entries — confirm against the BC6H specification.
  87. //0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
  88. static const uint aStep2[64] = { 0, 0, 0, 1, 1, 1, 1, 2,
  89. 2, 2, 2, 2, 3, 3, 3, 3,
  90. 4, 4, 4, 4, 5, 5, 5, 5,
  91. 6, 6, 6, 6, 6, 7, 7, 7,
  92. 7, 8, 8, 8, 8, 9, 9, 9,
  93. 9,10,10,10,10,10,11,11,
  94. 11,11,12,12,12,12,13,13,
  95. 13,13,14,14,14,14,15,15};
  // Threads per group for every entry point; also the size of the shared_temp array.
  96. #define THREAD_GROUP_SIZE 64
  // A compressed block covers BLOCK_SIZE_X x BLOCK_SIZE_Y = 16 texels.
  97. #define BLOCK_SIZE_Y 4
  98. #define BLOCK_SIZE_X 4
  99. #define BLOCK_SIZE (BLOCK_SIZE_Y * BLOCK_SIZE_X)
  // Prototypes only; the definitions are not in this part of the file.
  // Overloads taking "int2x3 endPoint[2]" are the two-region variants (used by
  // TryModeLE10CS); the plain "int2x3 endPoint" overloads are the single-region
  // variants (used by TryModeG10CS).
  100. //Forward declaration
  // float -> half bit pattern per channel; half2float (below) is the inverse direction.
  101. uint3 float2half( float3 pixel_f );
  // Converts half bit patterns into the integer domain the endpoint search works in.
  102. int3 start_quantize( uint3 pixel_h );
  103. void quantize( inout int2x3 endPoint, uint prec );
  // The finish_quantize* helpers report through bBadQuantize; callers replace the
  // candidate's error with 1e20f when it is set.
  104. void finish_quantize_0( inout bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed );
  105. void finish_quantize_1( inout bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed );
  106. void finish_quantize( out bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed );
  107. void start_unquantize( inout int2x3 endPoint[2], uint4 prec, bool transformed );
  108. void start_unquantize( inout int2x3 endPoint, uint4 prec, bool transformed );
  109. void unquantize( inout int2x3 color, uint prec );
  110. uint3 finish_unquantize( int3 color );
  // Palette lookups: the "8" variant is called with aStep1 indices (0..7), the "16"
  // variant with aStep2 indices (0..15).
  111. void generate_palette_unquantized8( out uint3 palette, int3 low, int3 high, int i );
  112. void generate_palette_unquantized16( out uint3 palette, int3 low, int3 high, int i );
  113. float3 half2float(uint3 color_h );
  114. void block_package( inout uint4 block, int2x3 endPoint[2], uint mode_type, uint partition_index );
  115. void block_package( inout uint4 block, int2x3 endPoint, uint mode_type );
  // Exchanges the contents of two int3 values in place.
  //   lhs, rhs: the values to exchange; on return each holds the other's former value.
  void swap(inout int3 lhs, inout int3 rhs)
  {
      const int3 previousLhs = lhs;
      lhs = rhs;
      rhs = previousLhs;
  }
  // Source texture; the entry points Load() texels by integer coordinate at mip 0 and use .rgb.
  122. Texture2D<float4> g_Input : register( t0 );
  // Per-block result of a previous pass, indexed by block ID:
  // x = error (float bits), y = best mode (one-based), z = best partition.
  123. StructuredBuffer<uint4> g_InBuff : register( t1 );
  // Per-block output of the current pass, indexed by block ID.
  124. RWStructuredBuffer<uint4> g_OutBuff : register( u0 );
  // Per-thread scratch record kept in group-shared memory.
  125. struct SharedData
  126. {
  // RGB of the texel loaded from g_Input.
  127. float3 pixel;
  // start_quantize( float2half( pixel ) ): the texel in the integer domain used for endpoint search.
  128. int3 pixel_ph;
  // half2float( float2half( pixel ) ): the texel after a round trip through half precision;
  // the error is measured against this. main() also reuses .xy as raw storage for packed index bits.
  129. float3 pixel_hr;
  // Sum of squared differences for the candidate this thread evaluated.
  130. float error;
  // One-based mode number (candidateModeFlag) and partition index of that candidate.
  131. uint best_mode;
  132. uint best_partition;
  // Running component-wise minimum / maximum during reductions; later the chosen endpoints.
  133. int3 endPoint_low;
  134. int3 endPoint_high;
  135. };
  // One record per thread of the group, indexed by SV_GroupIndex.
  136. groupshared SharedData shared_temp[THREAD_GROUP_SIZE];
  // TryModeG10CS: for each 4x4 block, evaluates the four single-region candidate modes
  // (table slots 10..13, mode flags 11..14) and writes
  // uint4( asuint(smallest error), its mode flag, 0, 0 ) to g_OutBuff[blockID].
  //
  //   GI      - flattened thread index within the group (0..63).
  //   groupID - only .x is used; each group covers 4 consecutive blocks.
  //
  // 16 threads cooperate on one block. GroupMemoryBarrierWithGroupSync() is compiled in
  // only when REF_DEVICE is defined; otherwise the shared-memory reductions below run
  // without an explicit barrier.
  // NOTE(review): presumably this relies on the 16 threads of a block executing in
  // lockstep — confirm for the target hardware.
  137. [numthreads( THREAD_GROUP_SIZE, 1, 1 )]
  138. void TryModeG10CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID )
  139. {
  140. const uint MAX_USED_THREAD = 16;
  141. uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
  142. uint blockInGroup = GI / MAX_USED_THREAD;
  143. uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
  // threadBase is the shared_temp slot of this block's thread 0; threadInBlock is 0..15.
  144. uint threadBase = blockInGroup * MAX_USED_THREAD;
  145. uint threadInBlock = GI - threadBase;
  // The early exit is compiled out under REF_DEVICE, where the group barriers are compiled in.
  146. #ifndef REF_DEVICE
  147. if (blockID >= g_num_total_blocks)
  148. {
  149. return;
  150. }
  151. #endif
  152. uint block_y = blockID / g_num_block_x;
  153. uint block_x = blockID - block_y * g_num_block_x;
  154. uint base_x = block_x * BLOCK_SIZE_X;
  155. uint base_y = block_y * BLOCK_SIZE_Y;
  // Each thread loads one texel of the block and seeds the min/max reduction with it.
  // (The condition is always true here because threadInBlock is 0..15.)
  156. if (threadInBlock < 16)
  157. {
  158. shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb;
  159. uint3 pixel_h = float2half( shared_temp[GI].pixel );
  160. shared_temp[GI].pixel_hr = half2float(pixel_h);
  161. shared_temp[GI].pixel_ph = start_quantize( pixel_h );
  162. shared_temp[GI].endPoint_low = shared_temp[GI].pixel_ph;
  163. shared_temp[GI].endPoint_high = shared_temp[GI].pixel_ph;
  164. }
  165. #ifdef REF_DEVICE
  166. GroupMemoryBarrierWithGroupSync();
  167. #endif
  // Tree reduction (strides 8, 4, 2, 1): afterwards slot threadBase holds the
  // component-wise minimum and maximum of all 16 pixel_ph values.
  168. if (threadInBlock < 8)
  169. {
  170. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
  171. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
  172. }
  173. #ifdef REF_DEVICE
  174. GroupMemoryBarrierWithGroupSync();
  175. #endif
  176. if (threadInBlock < 4)
  177. {
  178. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
  179. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
  180. }
  181. #ifdef REF_DEVICE
  182. GroupMemoryBarrierWithGroupSync();
  183. #endif
  184. if (threadInBlock < 2)
  185. {
  186. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
  187. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
  188. }
  189. #ifdef REF_DEVICE
  190. GroupMemoryBarrierWithGroupSync();
  191. #endif
  192. if (threadInBlock < 1)
  193. {
  194. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
  195. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
  196. }
  197. #ifdef REF_DEVICE
  198. GroupMemoryBarrierWithGroupSync();
  199. #endif
  200. //try mode_type 11:14 (one candidate mode per thread, further below)
  // Thread 0 first fixes the endpoint order: if pixel 0 projects into the upper half of
  // the low->high axis (position > 32 of 0..63), the endpoints are swapped and written
  // back. NOTE(review): presumably so that pixel 0's palette index fits with its top
  // bit dropped — confirm against the BC6H format.
  201. if ( threadInBlock == 0 )
  202. {
  203. int2x3 endPoint;
  204. // find_axis
  205. endPoint[0] = shared_temp[threadBase + 0].endPoint_low;
  206. endPoint[1] = shared_temp[threadBase + 0].endPoint_high;
  207. //compute_index
  208. float3 span = endPoint[1] - endPoint[0];// fixed a bug in v0.2
  209. float span_norm_sqr = dot( span, span );// fixed a bug in v0.2
  210. float dotProduct = dot( span, shared_temp[threadBase + 0].pixel_ph - endPoint[0] );// fixed a bug in v0.2
  211. if ( span_norm_sqr > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr ) > 32 )
  212. {
  213. swap(endPoint[0], endPoint[1]);
  214. shared_temp[GI].endPoint_low = endPoint[0];
  215. shared_temp[GI].endPoint_high = endPoint[1];
  216. }
  217. }
  218. #ifdef REF_DEVICE
  219. GroupMemoryBarrierWithGroupSync();
  220. #endif
  // Threads 0..3 each evaluate one candidate mode (table slot threadInBlock + 10).
  221. if (threadInBlock < 4)
  222. {
  223. int2x3 endPoint;
  224. endPoint[0] = shared_temp[threadBase + 0].endPoint_low;
  225. endPoint[1] = shared_temp[threadBase + 0].endPoint_high;
  226. float3 span = endPoint[1] - endPoint[0];
  227. float span_norm_sqr = dot( span, span );
  228. uint4 prec = candidateModePrec[threadInBlock + 10];
  // Quantize a copy of the endpoints; the unquantized ones still define the projection axis.
  229. int2x3 endPoint_q = endPoint;
  230. quantize( endPoint_q, prec.x );
  231. bool transformed = candidateModeTransformed[threadInBlock + 10];
  // Transformed modes store the second endpoint as a difference from the first.
  232. if (transformed)
  233. {
  234. endPoint_q[1] -= endPoint_q[0];
  235. }
  236. bool bBadQuantize;
  237. finish_quantize( bBadQuantize, endPoint_q, prec, transformed );
  // Reconstruct the endpoints from their quantized form, as the error is measured on
  // the reconstructed colors.
  238. start_unquantize( endPoint_q, prec, transformed );
  239. unquantize( endPoint_q, prec.x );
  // Sum of squared differences between each reconstructed pixel and its
  // half-precision original (pixel_hr).
  240. float error = 0;
  241. [loop]for ( uint j = 0; j < 16; j ++ )
  242. {
  243. float dotProduct = dot( span, shared_temp[threadBase + j].pixel_ph - endPoint[0] );// fixed a bug in v0.2
  // Projection clamped to the axis: <= 0 gives index 0, >= |span|^2 gives aStep2[63],
  // otherwise the 0..63 position is mapped to a 16-level index through aStep2.
  244. uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
  245. : ( ( dotProduct < span_norm_sqr ) ? aStep2[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep2[63] );
  246. uint3 pixel_rh;
  247. generate_palette_unquantized16( pixel_rh, endPoint_q[0], endPoint_q[1], index );
  248. float3 pixel_r = half2float( pixel_rh );
  249. pixel_r -= shared_temp[threadBase + j].pixel_hr;
  250. error += dot(pixel_r, pixel_r);
  251. }
  // A mode flagged by finish_quantize gets a huge error so it cannot win.
  252. if ( bBadQuantize )
  253. error = 1e20f;
  254. shared_temp[GI].error = error;
  255. shared_temp[GI].best_mode = candidateModeFlag[threadInBlock + 10];
  256. }
  257. #ifdef REF_DEVICE
  258. GroupMemoryBarrierWithGroupSync();
  259. #endif
  // Reduce the four candidates (strides 2, 1) to the one with the smallest error.
  260. if (threadInBlock < 2)
  261. {
  262. if ( shared_temp[GI].error > shared_temp[GI + 2].error )
  263. {
  264. shared_temp[GI].error = shared_temp[GI + 2].error;
  265. shared_temp[GI].best_mode = shared_temp[GI + 2].best_mode;
  266. }
  267. }
  268. #ifdef REF_DEVICE
  269. GroupMemoryBarrierWithGroupSync();
  270. #endif
  271. if (threadInBlock < 1)
  272. {
  273. if ( shared_temp[GI].error > shared_temp[GI + 1].error )
  274. {
  275. shared_temp[GI].error = shared_temp[GI + 1].error;
  276. shared_temp[GI].best_mode = shared_temp[GI + 1].best_mode;
  277. }
  // Thread 0 publishes the winner; the partition field (z) is 0 for single-region modes.
  278. g_OutBuff[blockID] = uint4(asuint(shared_temp[GI].error), shared_temp[GI].best_mode, 0, 0);
  279. }
  280. }
  // TryModeLE10CS: for each 4x4 block, evaluates all 32 partitions of the two-region
  // candidate mode selected by g_mode_id (one partition per thread) and writes to
  // g_OutBuff[blockID] whichever is better: the previous result in g_InBuff[blockID]
  // or uint4( asuint(error), mode flag, partition, 0 ) of the best partition found here.
  //
  //   GI      - flattened thread index within the group (0..63).
  //   groupID - only .x is used; each group covers 2 consecutive blocks.
  //
  // 32 threads cooperate on one block. GroupMemoryBarrierWithGroupSync() is compiled in
  // only when REF_DEVICE is defined; otherwise the shared-memory steps below run
  // without an explicit barrier.
  // NOTE(review): presumably this relies on the 32 threads of a block executing in
  // lockstep — confirm for the target hardware.
  281. [numthreads( THREAD_GROUP_SIZE, 1, 1 )]
  282. void TryModeLE10CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID )
  283. {
  284. const uint MAX_USED_THREAD = 32;
  285. uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
  286. uint blockInGroup = GI / MAX_USED_THREAD;
  287. uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
  // threadBase is the shared_temp slot of this block's thread 0; threadInBlock is 0..31.
  288. uint threadBase = blockInGroup * MAX_USED_THREAD;
  289. uint threadInBlock = GI - threadBase;
  // Both early exits are compiled out under REF_DEVICE, where the group barriers are compiled in.
  290. #ifndef REF_DEVICE
  291. if (blockID >= g_num_total_blocks)
  292. {
  293. return;
  294. }
  // The previous pass already reached a near-zero error: keep its result unchanged.
  // Every thread of the block performs the same copy.
  295. if (asfloat(g_InBuff[blockID].x) < 1e-6f)
  296. {
  297. g_OutBuff[blockID] = g_InBuff[blockID];
  298. return;
  299. }
  300. #endif
  301. uint block_y = blockID / g_num_block_x;
  302. uint block_x = blockID - block_y * g_num_block_x;
  303. uint base_x = block_x * BLOCK_SIZE_X;
  304. uint base_y = block_y * BLOCK_SIZE_Y;
  // The first 16 threads of the block each load one texel into shared memory.
  305. if (threadInBlock < 16)
  306. {
  307. shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb;
  308. uint3 pixel_h = float2half( shared_temp[GI].pixel );
  309. shared_temp[GI].pixel_hr = half2float(pixel_h);
  310. shared_temp[GI].pixel_ph = start_quantize( pixel_h );
  311. }
  312. #ifdef REF_DEVICE
  313. GroupMemoryBarrierWithGroupSync();
  314. #endif
  315. //try mode_type 1:10 (each thread evaluates partition number threadInBlock)
  // (The condition is always true here because threadInBlock is 0..31.)
  316. if (threadInBlock < 32)
  317. {
  // endPoint[region][0] = component-wise minimum, endPoint[region][1] = maximum,
  // seeded with the identity values for min / max.
  318. int2x3 endPoint[2];
  319. // find_axis
  320. endPoint[0][0] = MAX_INT;
  321. endPoint[0][1] = MIN_INT;
  322. endPoint[1][0] = MAX_INT;
  323. endPoint[1][1] = MIN_INT;
  // Bit i of the mask assigns pixel i to region 1 (set) or region 0 (clear).
  324. uint bit = candidateSectionBit[threadInBlock];
  325. for ( uint i = 0; i < 16; i ++ )
  326. {
  327. int3 pixel_ph = shared_temp[threadBase + i].pixel_ph;
  328. if ( (bit >> i) & 1 ) //It gets error when using "candidateSection" as "endPoint_ph" index
  329. {
  330. endPoint[1][0] = min( endPoint[1][0], pixel_ph );
  331. endPoint[1][1] = max( endPoint[1][1], pixel_ph );
  332. }
  333. else
  334. {
  335. endPoint[0][0] = min( endPoint[0][0], pixel_ph );
  336. endPoint[0][1] = max( endPoint[0][1], pixel_ph );
  337. }
  338. }
  339. //compute_index
  340. float3 span[2];// fixed a bug in v0.2
  341. float span_norm_sqr[2];// fixed a bug in v0.2
  // Per region: if the reference pixel (pixel 0 for region 0, the fix-up pixel for
  // region 1) projects into the upper half of the axis (position > 32 of 0..63),
  // reverse the axis by negating the span and swapping the endpoints.
  342. [unroll]
  343. for (uint p = 0; p < 2; ++ p)
  344. {
  345. span[p] = endPoint[p][1] - endPoint[p][0];
  346. span_norm_sqr[p] = dot( span[p], span[p] );
  347. float dotProduct = dot( span[p], shared_temp[threadBase + (0 == p ? 0 : candidateFixUpIndex1D[threadInBlock])].pixel_ph - endPoint[p][0] );// fixed a bug in v0.2
  348. if ( span_norm_sqr[p] > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr[p] ) > 32 )
  349. {
  350. span[p] = -span[p];
  351. swap(endPoint[p][0], endPoint[p][1]);
  352. }
  353. }
  // Quantize a copy of the endpoints; the unquantized ones still define the projection axes.
  354. uint4 prec = candidateModePrec[g_mode_id];
  355. int2x3 endPoint_q[2] = endPoint;
  356. quantize( endPoint_q[0], prec.x );
  357. quantize( endPoint_q[1], prec.x );
  358. bool transformed = candidateModeTransformed[g_mode_id];
  // Transformed modes store the other three endpoints as differences from endPoint_q[0][0].
  // endPoint_q[0][0] itself is left untouched, so all three subtractions use the same base.
  359. if (transformed)
  360. {
  361. endPoint_q[0][1] -= endPoint_q[0][0];
  362. endPoint_q[1][0] -= endPoint_q[0][0];
  363. endPoint_q[1][1] -= endPoint_q[0][0];
  364. }
  // bBadQuantize is inout for both helpers, so it must start out false.
  365. bool bBadQuantize = false;
  366. finish_quantize_0( bBadQuantize, endPoint_q[0], prec, transformed );
  367. finish_quantize_1( bBadQuantize, endPoint_q[1], prec, transformed );
  // Reconstruct the endpoints from their quantized form, as the error is measured on
  // the reconstructed colors.
  368. start_unquantize( endPoint_q, prec, transformed );
  369. unquantize( endPoint_q[0], prec.x );
  370. unquantize( endPoint_q[1], prec.x );
  // Sum of squared differences between each reconstructed pixel and its
  // half-precision original (pixel_hr), using the axis of the pixel's own region.
  371. float error = 0;
  372. for ( uint j = 0; j < 16; j ++ )
  373. {
  374. uint3 pixel_rh;
  375. if ((bit >> j) & 1)
  376. {
  377. float dotProduct = dot( span[1], shared_temp[threadBase + j].pixel_ph - endPoint[1][0] );// fixed a bug in v0.2
  // Projection clamped to the axis, then mapped to an 8-level index through aStep1.
  378. uint index = ( span_norm_sqr[1] <= 0 || dotProduct <= 0 ) ? 0
  379. : ( ( dotProduct < span_norm_sqr[1] ) ? aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr[1] ) ] : aStep1[63] );
  380. generate_palette_unquantized8( pixel_rh, endPoint_q[1][0], endPoint_q[1][1], index );
  381. }
  382. else
  383. {
  384. float dotProduct = dot( span[0], shared_temp[threadBase + j].pixel_ph - endPoint[0][0] );// fixed a bug in v0.2
  385. uint index = ( span_norm_sqr[0] <= 0 || dotProduct <= 0 ) ? 0
  386. : ( ( dotProduct < span_norm_sqr[0] ) ? aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr[0] ) ] : aStep1[63] );
  387. generate_palette_unquantized8( pixel_rh, endPoint_q[0][0], endPoint_q[0][1], index );
  388. }
  389. float3 pixel_r = half2float( pixel_rh );
  390. pixel_r -= shared_temp[threadBase + j].pixel_hr;
  391. error += dot(pixel_r, pixel_r);
  392. }
  // A partition flagged by finish_quantize_0/1 gets a huge error so it cannot win.
  393. if ( bBadQuantize )
  394. error = 1e20f;
  395. shared_temp[GI].error = error;
  396. shared_temp[GI].best_mode = candidateModeFlag[g_mode_id];
  397. shared_temp[GI].best_partition = threadInBlock;
  398. }
  399. #ifdef REF_DEVICE
  400. GroupMemoryBarrierWithGroupSync();
  401. #endif
  // Tree reduction (strides 16, 8, 4, 2, 1): afterwards slot threadBase holds the
  // partition with the smallest error. On ties the lower slot's entry is kept.
  402. if (threadInBlock < 16)
  403. {
  404. if ( shared_temp[GI].error > shared_temp[GI + 16].error )
  405. {
  406. shared_temp[GI].error = shared_temp[GI + 16].error;
  407. shared_temp[GI].best_mode = shared_temp[GI + 16].best_mode;
  408. shared_temp[GI].best_partition = shared_temp[GI + 16].best_partition;
  409. }
  410. }
  411. #ifdef REF_DEVICE
  412. GroupMemoryBarrierWithGroupSync();
  413. #endif
  414. if (threadInBlock < 8)
  415. {
  416. if ( shared_temp[GI].error > shared_temp[GI + 8].error )
  417. {
  418. shared_temp[GI].error = shared_temp[GI + 8].error;
  419. shared_temp[GI].best_mode = shared_temp[GI + 8].best_mode;
  420. shared_temp[GI].best_partition = shared_temp[GI + 8].best_partition;
  421. }
  422. }
  423. #ifdef REF_DEVICE
  424. GroupMemoryBarrierWithGroupSync();
  425. #endif
  426. if (threadInBlock < 4)
  427. {
  428. if ( shared_temp[GI].error > shared_temp[GI + 4].error )
  429. {
  430. shared_temp[GI].error = shared_temp[GI + 4].error;
  431. shared_temp[GI].best_mode = shared_temp[GI + 4].best_mode;
  432. shared_temp[GI].best_partition = shared_temp[GI + 4].best_partition;
  433. }
  434. }
  435. #ifdef REF_DEVICE
  436. GroupMemoryBarrierWithGroupSync();
  437. #endif
  438. if (threadInBlock < 2)
  439. {
  440. if ( shared_temp[GI].error > shared_temp[GI + 2].error )
  441. {
  442. shared_temp[GI].error = shared_temp[GI + 2].error;
  443. shared_temp[GI].best_mode = shared_temp[GI + 2].best_mode;
  444. shared_temp[GI].best_partition = shared_temp[GI + 2].best_partition;
  445. }
  446. }
  447. #ifdef REF_DEVICE
  448. GroupMemoryBarrierWithGroupSync();
  449. #endif
  450. if (threadInBlock < 1)
  451. {
  452. if ( shared_temp[GI].error > shared_temp[GI + 1].error )
  453. {
  454. shared_temp[GI].error = shared_temp[GI + 1].error;
  455. shared_temp[GI].best_mode = shared_temp[GI + 1].best_mode;
  456. shared_temp[GI].best_partition = shared_temp[GI + 1].best_partition;
  457. }
  // Thread 0 publishes the new result only if it strictly beats the previous pass;
  // otherwise the previous result is carried forward.
  458. if (asfloat(g_InBuff[blockID].x) > shared_temp[GI].error)
  459. {
  460. g_OutBuff[blockID] = uint4(asuint(shared_temp[GI].error), shared_temp[GI].best_mode, shared_temp[GI].best_partition, 0);
  461. }
  462. else
  463. {
  464. g_OutBuff[blockID] = g_InBuff[blockID];
  465. }
  466. }
  467. }
  468. [numthreads( THREAD_GROUP_SIZE, 1, 1 )]
  469. void main(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID)
  470. {
  471. const uint MAX_USED_THREAD = 32;
  472. uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
  473. uint blockInGroup = GI / MAX_USED_THREAD;
  474. uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
  475. uint threadBase = blockInGroup * MAX_USED_THREAD;
  476. uint threadInBlock = GI - threadBase;
  477. #ifndef REF_DEVICE
  478. if (blockID >= g_num_total_blocks)
  479. {
  480. return;
  481. }
  482. #endif
  483. uint block_y = blockID / g_num_block_x;
  484. uint block_x = blockID - block_y * g_num_block_x;
  485. uint base_x = block_x * BLOCK_SIZE_X;
  486. uint base_y = block_y * BLOCK_SIZE_Y;
  487. if (threadInBlock < 16)
  488. {
  489. shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb;
  490. uint3 pixel_h = float2half( shared_temp[GI].pixel );
  491. shared_temp[GI].pixel_ph = start_quantize( pixel_h );
  492. }
  493. #ifdef REF_DEVICE
  494. GroupMemoryBarrierWithGroupSync();
  495. #endif
  496. uint best_mode = g_InBuff[blockID].y;
  497. uint best_partition = g_InBuff[blockID].z;
  498. uint4 block = 0;
  499. if (threadInBlock < 32)
  500. {
  501. int2x3 endPoint;
  502. endPoint[0] = MAX_INT;
  503. endPoint[1] = MIN_INT;
  504. int3 pixel_ph = shared_temp[threadBase + (threadInBlock & 0xF)].pixel_ph;
  505. if (threadInBlock < 16)
  506. {
  507. if (best_mode > 10)
  508. {
  509. endPoint[0] = endPoint[1] = pixel_ph;
  510. }
  511. else
  512. {
  513. uint bits = candidateSectionBit[best_partition];
  514. if (0 == ((bits >> threadInBlock) & 1))
  515. {
  516. endPoint[0] = endPoint[1] = pixel_ph;
  517. }
  518. }
  519. }
  520. else
  521. {
  522. if (best_mode <= 10)
  523. {
  524. uint bits = candidateSectionBit[best_partition];
  525. if (1 == ((bits >> (threadInBlock & 0xF)) & 1))
  526. {
  527. endPoint[0] = endPoint[1] = pixel_ph;
  528. }
  529. }
  530. }
  531. shared_temp[GI].endPoint_low = endPoint[0];
  532. shared_temp[GI].endPoint_high = endPoint[1];
  533. }
  534. #ifdef REF_DEVICE
  535. GroupMemoryBarrierWithGroupSync();
  536. #endif
  537. if ((threadInBlock & 0xF) < 8)
  538. {
  539. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
  540. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
  541. }
  542. #ifdef REF_DEVICE
  543. GroupMemoryBarrierWithGroupSync();
  544. #endif
  545. if ((threadInBlock & 0xF) < 4)
  546. {
  547. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
  548. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
  549. }
  550. #ifdef REF_DEVICE
  551. GroupMemoryBarrierWithGroupSync();
  552. #endif
  553. if ((threadInBlock & 0xF) < 2)
  554. {
  555. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
  556. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
  557. }
  558. #ifdef REF_DEVICE
  559. GroupMemoryBarrierWithGroupSync();
  560. #endif
  561. if ((threadInBlock & 0xF) < 1)
  562. {
  563. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
  564. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
  565. }
  566. #ifdef REF_DEVICE
  567. GroupMemoryBarrierWithGroupSync();
  568. #endif
  569. if (threadInBlock < 2)
  570. {
  571. // find_axis
  572. int2x3 endPoint;
  573. endPoint[0] = shared_temp[threadBase + threadInBlock * 16].endPoint_low;
  574. endPoint[1] = shared_temp[threadBase + threadInBlock * 16].endPoint_high;
  575. uint fixup = 0;
  576. if ((1 == threadInBlock) && (best_mode <= 10))
  577. {
  578. fixup = candidateFixUpIndex1D[best_partition];
  579. }
  580. float3 span = endPoint[1] - endPoint[0];
  581. float span_norm_sqr = dot( span, span );
  582. float dotProduct = dot( span, shared_temp[threadBase + fixup].pixel_ph - endPoint[0] );
  583. if ( span_norm_sqr > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr ) > 32 )
  584. {
  585. swap(endPoint[0], endPoint[1]);
  586. }
  587. shared_temp[GI].endPoint_low = endPoint[0];
  588. shared_temp[GI].endPoint_high = endPoint[1];
  589. }
  590. #ifdef REF_DEVICE
  591. GroupMemoryBarrierWithGroupSync();
  592. #endif
  593. if (threadInBlock < 16)
  594. {
  595. uint bits;
  596. if (best_mode > 10)
  597. {
  598. bits = 0;
  599. }
  600. else
  601. {
  602. bits = candidateSectionBit[best_partition];
  603. }
  604. float3 span;
  605. float dotProduct;
  606. if ((bits >> threadInBlock) & 1)
  607. {
  608. span = shared_temp[threadBase + 1].endPoint_high - shared_temp[threadBase + 1].endPoint_low;
  609. dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel_ph - shared_temp[threadBase + 1].endPoint_low );
  610. }
  611. else
  612. {
  613. span = shared_temp[threadBase + 0].endPoint_high - shared_temp[threadBase + 0].endPoint_low;
  614. dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel_ph - shared_temp[threadBase + 0].endPoint_low );
  615. }
  616. float span_norm_sqr = dot( span, span );
  617. if (best_mode > 10)
  618. {
  619. uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
  620. : ( ( dotProduct < span_norm_sqr ) ? aStep2[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep2[63] );
  621. if (threadInBlock == 0)
  622. {
  623. block.z |= index << 1;
  624. }
  625. else if (threadInBlock < 8)
  626. {
  627. block.z |= index << (threadInBlock * 4);
  628. }
  629. else
  630. {
  631. block.w |= index << ((threadInBlock - 8) * 4);
  632. }
  633. }
  634. else
  635. {
  636. uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
  637. : ( ( dotProduct < span_norm_sqr ) ? aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep1[63] );
  638. uint fixup = candidateFixUpIndex1D[best_partition];
  639. int2 offset = int2((fixup != 2), (fixup == 15));
  640. if (threadInBlock == 0)
  641. {
  642. block.z |= index << 18;
  643. }
  644. else if (threadInBlock < 3)
  645. {
  646. block.z |= index << (20 + (threadInBlock - 1) * 3);
  647. }
  648. else if (threadInBlock < 5)
  649. {
  650. block.z |= index << (25 + (threadInBlock - 3) * 3 + offset.x);
  651. }
  652. else if (threadInBlock == 5)
  653. {
  654. block.w |= index >> !offset.x;
  655. if (!offset.x)
  656. {
  657. block.z |= index << 31;
  658. }
  659. }
  660. else if (threadInBlock < 9)
  661. {
  662. block.w |= index << (2 + (threadInBlock - 6) * 3 + offset.x);
  663. }
  664. else
  665. {
  666. block.w |= index << (11 + (threadInBlock - 9) * 3 + offset.y);
  667. }
  668. }
  669. shared_temp[GI].pixel_hr.xy = asfloat(block.zw);
  670. }
  671. #ifdef REF_DEVICE
  672. GroupMemoryBarrierWithGroupSync();
  673. #endif
  674. if (threadInBlock < 8)
  675. {
  676. shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 8].pixel_hr.xy));
  677. }
  678. #ifdef REF_DEVICE
  679. GroupMemoryBarrierWithGroupSync();
  680. #endif
  681. if (threadInBlock < 4)
  682. {
  683. shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 4].pixel_hr.xy));
  684. }
  685. #ifdef REF_DEVICE
  686. GroupMemoryBarrierWithGroupSync();
  687. #endif
  688. if (threadInBlock < 2)
  689. {
  690. shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 2].pixel_hr.xy));
  691. }
  692. #ifdef REF_DEVICE
  693. GroupMemoryBarrierWithGroupSync();
  694. #endif
  695. if (threadInBlock < 1)
  696. {
  697. shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 1].pixel_hr.xy));
  698. block.zw = asuint(shared_temp[GI].pixel_hr.xy);
  699. }
  700. #ifdef REF_DEVICE
  701. GroupMemoryBarrierWithGroupSync();
  702. #endif
  703. bool transformed = candidateModeTransformed[best_mode - 1];
  704. uint4 prec = candidateModePrec[best_mode - 1];
  705. if (threadInBlock == 2)
  706. {
  707. int2x3 endPoint_q;
  708. endPoint_q[0] = shared_temp[threadBase + 0].endPoint_low;
  709. endPoint_q[1] = shared_temp[threadBase + 0].endPoint_high;
  710. quantize( endPoint_q, prec.x );
  711. if (transformed)
  712. {
  713. endPoint_q[1] -= endPoint_q[0];
  714. }
  715. shared_temp[GI].endPoint_low = endPoint_q[0];
  716. shared_temp[GI].endPoint_high = endPoint_q[1];
  717. }
  718. #ifdef REF_DEVICE
  719. GroupMemoryBarrierWithGroupSync();
  720. #endif
  721. if (threadInBlock == 3)
  722. {
  723. int3 ep0 = shared_temp[threadBase + 2].endPoint_low;
  724. int2x3 endPoint_q;
  725. endPoint_q[0] = shared_temp[threadBase + 1].endPoint_low;
  726. endPoint_q[1] = shared_temp[threadBase + 1].endPoint_high;
  727. if (best_mode <= 10)
  728. {
  729. quantize( endPoint_q, prec.x );
  730. if (transformed)
  731. {
  732. endPoint_q[0] -= ep0;
  733. endPoint_q[1] -= ep0;
  734. }
  735. shared_temp[GI].endPoint_low = endPoint_q[0];
  736. shared_temp[GI].endPoint_high = endPoint_q[1];
  737. }
  738. }
  739. #ifdef REF_DEVICE
  740. GroupMemoryBarrierWithGroupSync();
  741. #endif
  742. if (threadInBlock < 2)
  743. {
  744. int2x3 endPoint_q;
  745. endPoint_q[0] = shared_temp[threadBase + threadInBlock + 2].endPoint_low;
  746. endPoint_q[1] = shared_temp[threadBase + threadInBlock + 2].endPoint_high;
  747. bool bBadQuantize = false;
  748. if (threadInBlock == 0)
  749. {
  750. if (best_mode > 10)
  751. {
  752. finish_quantize( bBadQuantize, endPoint_q, prec, transformed );
  753. }
  754. else
  755. {
  756. finish_quantize_0( bBadQuantize, endPoint_q, prec, transformed );
  757. }
  758. }
  759. else // if (threadInBlock == 1)
  760. {
  761. if (best_mode <= 10)
  762. {
  763. finish_quantize_1( bBadQuantize, endPoint_q, prec, transformed );
  764. }
  765. }
  766. shared_temp[GI].endPoint_low = endPoint_q[0];
  767. shared_temp[GI].endPoint_high = endPoint_q[1];
  768. }
  769. #ifdef REF_DEVICE
  770. GroupMemoryBarrierWithGroupSync();
  771. #endif
  772. if ( threadInBlock == 0 )
  773. {
  774. int2x3 endPoint_q[2];
  775. endPoint_q[0][0] = shared_temp[threadBase + 0].endPoint_low;
  776. endPoint_q[0][1] = shared_temp[threadBase + 0].endPoint_high;
  777. endPoint_q[1][0] = shared_temp[threadBase + 1].endPoint_low;
  778. endPoint_q[1][1] = shared_temp[threadBase + 1].endPoint_high;
  779. if ( best_mode > 10 )
  780. {
  781. block_package( block, endPoint_q[0], best_mode );
  782. }
  783. else
  784. {
  785. block_package( block, endPoint_q, best_mode, best_partition );
  786. }
  787. g_OutBuff[blockID] = block;
  788. }
  789. }
  // Converts three IEEE-754 single-precision values into 16-bit half-float bit
  // patterns (one per lane, held in the low 16 bits of each uint).
  //   * exponent below 2^-24 (0x33800000): result is 0, sign included
  //   * exponent in [2^-24, 2^-14): encoded as a half denormal
  //   * Inf, NaN, or exponent above 2^15 (0x47000000): clamped to +/-0x7bff,
  //     the largest finite half
  //   * otherwise: exponent re-biased, mantissa truncated to 10 bits (no rounding)
  uint3 float2half( float3 endPoint_f )
  {
      const uint3 bits = asuint( endPoint_f );
      const uint3 signHalf = ( bits & 0x80000000 ) >> 16;
      const uint3 exponent = bits & 0x7F800000;
      const uint3 mantissa = bits & 0x007FFFFF;

      // Half denormal: restore the implicit leading one, then shift right by an
      // amount that grows as the exponent approaches 2^-24.
      const uint3 denormShift = 23 - ( ( exponent - 0x33800000 ) >> 23 );
      const uint3 asDenormal = signHalf | ( ( mantissa + 0x00800000 ) >> denormShift );

      // Out-of-range magnitudes saturate to the largest finite half.
      const uint3 asSaturated = signHalf | 0x7bff;

      // Normal half: 0x38000000 is the exponent bias difference (127 - 15) << 23.
      const uint3 asNormal = signHalf | ( ( ( exponent - 0x38000000 ) | mantissa ) >> 13 );

      const uint3 largeOrNormal = ( exponent == 0x7F800000 || exponent > 0x47000000 ) ? asSaturated : asNormal;
      const uint3 nonZero = ( exponent < 0x38800000 ) ? asDenormal : largeOrNormal;
      return ( exponent < 0x33800000 ) ? 0 : nonZero;
  }
  // Maps half-float bit patterns onto the encoder's integer working scale.
  //
  // UNSIGNED_F16: value = (h << 6) / 31.
  // Any other format (signed): the 15-bit magnitude m = h & 0x7fff is scaled as
  // (m << 5) / 31 and the sign bit (0x8000) is re-applied as an integer negation.
  // The largest finite half magnitude, 0x7bff, is pinned to exactly 0x7fff so
  // that it reaches full scale instead of the truncated quotient 0x7ffe.
  //
  // Fix: the negative branch used to compare the full pattern against 0x7bff,
  // which can never match once the sign bit is set, so 0xfbff produced -0x7ffe
  // rather than the intended -0x7fff (0xffff8001). The comparison now uses the
  // magnitude, making positive and negative inputs symmetric.
  int3 start_quantize( uint3 pixel_h )
  {
      if ( g_format == UNSIGNED_F16 )
      {
          return asint( ( pixel_h << 6 ) / 31 );
      }

      const uint3 magnitude = pixel_h & 0x00007fff;
      const int3 scaled = ( magnitude == 0x7bff ) ? 0x7fff : asint( ( magnitude << 5 ) / 31 );
      return ( pixel_h < 0x8000 ) ? scaled : -scaled;
  }
  // Reduces every component of both endpoints to `prec` bits, in place.
  //
  // UNSIGNED_F16: components are treated as 16-bit values; 0 stays 0, 0xFFFF
  // maps to the largest prec-bit code, everything else is (x << prec) >> 16.
  // Nothing changes when prec >= 15.
  //
  // Otherwise (signed): the magnitude is treated as a 15-bit value and reduced
  // to (prec - 1) bits the same way, with 0x7FFF mapping to the largest
  // magnitude; the sign is then re-applied. Nothing changes when prec >= 16.
  void quantize( inout int2x3 endPoint, uint prec )
  {
      const int iprec = asint( prec );
      if ( g_format == UNSIGNED_F16 )
      {
          const bool2x3 keep = ( iprec >= 15 ) | ( endPoint == 0 );
          const int maxCode = ( 1 << iprec ) - 1;
          const int2x3 scaled = ( endPoint << iprec ) >> 16;
          const int2x3 reduced = ( endPoint == asint(0xFFFF) ) ? maxCode : scaled;
          endPoint = keep ? endPoint : reduced;
      }
      else
      {
          const bool2x3 keep = ( iprec >= 16 ) | ( endPoint == 0 );
          const int magBits = iprec - 1;
          const int maxMag = ( 1 << magBits ) - 1;

          // Non-negative components.
          const int2x3 posScaled = ( endPoint << magBits ) >> 15;
          const int2x3 posReduced = ( endPoint == asint(0x7FFF) ) ? maxMag : posScaled;

          // Negative components: quantize the magnitude, then negate again.
          const int2x3 negScaled = -( ( -endPoint << magBits ) >> 15 );
          const int2x3 negReduced = ( -endPoint == asint(0x7FFF) ) ? -maxMag : negScaled;

          const int2x3 reduced = ( endPoint >= 0 ) ? posReduced : negReduced;
          endPoint = keep ? endPoint : reduced;
      }
  }
  // Final range check and bit masking for one endpoint pair whose row 0 is
  // stored with prec.x bits and whose row 1 uses prec.yzw bits (R/G/B).
  //
  // Not transformed: both rows are simply masked to prec.x bits.
  // Transformed: row 0 is masked to prec.x bits; row 1 must fit a signed
  // prec.yzw-bit field. Values that do not fit are clamped (positive to
  // 2^(p-1)-1, negative to the bit pattern 2^(p-1)) and bBadQuantize is set;
  // in-range negatives are masked to p bits.
  // bBadQuantize is only ever raised here, never cleared.
  void finish_quantize_0( inout bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed )
  {
      if ( !transformed )
      {
          endPoint &= ( ( 1 << prec.x ) - 1 );
          return;
      }

      const bool3 nonNegative = endPoint[1] >= 0;
      const bool3 tooLarge = endPoint[1] >= ( 1 << ( prec.yzw - 1 ) );
      const bool3 tooSmall = -endPoint[1] > ( 1 << ( prec.yzw - 1 ) );

      const bool3 outOfRange = nonNegative ? tooLarge : tooSmall;
      bBadQuantize = bBadQuantize || any(outOfRange);

      endPoint[0] = endPoint[0] & ( ( 1 << prec.x ) - 1 );

      const int3 positiveResult = tooLarge ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1];
      const int3 negativeResult = tooSmall ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) );
      endPoint[1] = nonNegative ? positiveResult : negativeResult;
  }
  // Final range check and bit masking for an endpoint pair in which BOTH rows
  // use prec.yzw bits (R/G/B) when transformed.
  //
  // Not transformed: both rows are masked to prec.x bits.
  // Transformed: each row must fit a signed prec.yzw-bit field. Values that do
  // not fit are clamped (positive to 2^(p-1)-1, negative to the bit pattern
  // 2^(p-1)) and bBadQuantize is set; in-range negatives are masked to p bits.
  // bBadQuantize is only ever raised here, never cleared.
  void finish_quantize_1( inout bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed )
  {
      if ( !transformed )
      {
          endPoint &= ( ( 1 << prec.x ) - 1 );
          return;
      }

      // Range tests for both rows, taken before either row is rewritten.
      const bool3 nonNegative0 = endPoint[0] >= 0;
      const bool3 tooLarge0 = endPoint[0] >= ( 1 << ( prec.yzw - 1 ) );
      const bool3 tooSmall0 = -endPoint[0] > ( 1 << ( prec.yzw - 1 ) );

      const bool3 nonNegative1 = endPoint[1] >= 0;
      const bool3 tooLarge1 = endPoint[1] >= ( 1 << ( prec.yzw - 1 ) );
      const bool3 tooSmall1 = -endPoint[1] > ( 1 << ( prec.yzw - 1 ) );

      bool2x3 outOfRange;
      outOfRange[0] = nonNegative0 ? tooLarge0 : tooSmall0;
      outOfRange[1] = nonNegative1 ? tooLarge1 : tooSmall1;
      bBadQuantize = bBadQuantize || any(outOfRange);

      const int3 positive0 = tooLarge0 ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[0];
      const int3 negative0 = tooSmall0 ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[0] & ( ( 1 << prec.yzw ) - 1 ) );
      endPoint[0] = nonNegative0 ? positive0 : negative0;

      const int3 positive1 = tooLarge1 ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1];
      const int3 negative1 = tooSmall1 ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) );
      endPoint[1] = nonNegative1 ? positive1 : negative1;
  }
  // Final range check and bit masking for a single endpoint pair.
  //
  // Unlike finish_quantize_0 / finish_quantize_1, bBadQuantize is an `out`
  // parameter here: it is always overwritten, never accumulated.
  //
  // Not transformed: both rows are masked to prec.x bits; bBadQuantize = false.
  // Transformed: row 0 is masked to prec.x bits; row 1 must fit a signed
  // prec.yzw-bit field (R/G/B). Values that do not fit are clamped (positive to
  // 2^(p-1)-1, negative to the bit pattern 2^(p-1)) and bBadQuantize reports
  // whether any component was out of range; in-range negatives are masked to
  // p bits.
  void finish_quantize( out bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed )
  {
      if ( !transformed )
      {
          endPoint &= ( ( 1 << prec.x ) - 1 );
          bBadQuantize = false;
          return;
      }

      const bool3 nonNegative = endPoint[1] >= 0;
      const bool3 tooLarge = endPoint[1] >= ( 1 << ( prec.yzw - 1 ) );
      const bool3 tooSmall = -endPoint[1] > ( 1 << ( prec.yzw - 1 ) );

      const bool3 outOfRange = nonNegative ? tooLarge : tooSmall;
      bBadQuantize = any( outOfRange );

      endPoint[0] = endPoint[0] & ( ( 1 << prec.x ) - 1 );

      const int3 positiveResult = tooLarge ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1];
      const int3 negativeResult = tooSmall ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) );
      endPoint[1] = nonNegative ? positiveResult : negativeResult;
  }
  // Sign-extends each lane of `color` from a prec-bit two's-complement field:
  // when bit (prec - 1) is set, the lower bits are kept and 2^(prec-1) is
  // subtracted; otherwise the lane is left untouched.
  void SIGN_EXTEND( uint3 prec, inout int3 color )
  {
      const uint3 signBit = 1 << (prec - 1);
      const uint3 lowerBits = signBit - 1;
      color = (color & signBit) ? (color & lowerBits) - signBit : color;
  }
  // Sign-extends one endpoint pair in place.
  // Row 0 (prec.x bits) is extended only for SIGNED_F16 data; row 1 (prec.yzw
  // bits per channel) is extended for SIGNED_F16 data or whenever the pair is
  // stored in transformed form.
  void sign_extend( bool transformed, uint4 prec, inout int2x3 endPoint )
  {
      const bool signedFormat = ( g_format == SIGNED_F16 );
      if ( signedFormat )
      {
          SIGN_EXTEND( prec.x, endPoint[0] );
      }
      if ( signedFormat || transformed )
      {
          SIGN_EXTEND( prec.yzw, endPoint[1] );
      }
  }
  // Sign-extends two endpoint pairs in place.
  // endPoint[0][0] (prec.x bits) is extended only for SIGNED_F16 data; the
  // remaining three endpoints (prec.yzw bits per channel) are extended for
  // SIGNED_F16 data or whenever the endpoints are stored in transformed form.
  void sign_extend( bool transformed, uint4 prec, inout int2x3 endPoint[2] )
  {
      const bool signedFormat = ( g_format == SIGNED_F16 );
      if ( signedFormat )
      {
          SIGN_EXTEND( prec.x, endPoint[0][0] );
      }
      if ( signedFormat || transformed )
      {
          SIGN_EXTEND( prec.yzw, endPoint[0][1] );
          SIGN_EXTEND( prec.yzw, endPoint[1][0] );
          SIGN_EXTEND( prec.yzw, endPoint[1][1] );
      }
  }
  // First unquantization step for two endpoint pairs: sign-extend the stored
  // fields, then, for transformed endpoints, add endPoint[0][0] back onto the
  // other three endpoints.
  void start_unquantize( inout int2x3 endPoint[2], uint4 prec, bool transformed )
  {
      sign_extend( transformed, prec, endPoint );
      if ( !transformed )
      {
          return;
      }

      // endPoint[0][0] itself is not modified below, so it can be read once.
      const int3 base = endPoint[0][0];
      endPoint[0][1] += base;
      endPoint[1][0] += base;
      endPoint[1][1] += base;
  }
  // First unquantization step for a single endpoint pair: sign-extend the
  // stored fields, then, for a transformed pair, add row 0 back onto row 1.
  void start_unquantize( inout int2x3 endPoint, uint4 prec, bool transformed )
  {
      sign_extend( transformed, prec, endPoint );
      if ( transformed )
      {
          endPoint[1] += endPoint[0];
      }
  }
  // Expands prec-bit endpoint components back to the encoder's working range,
  // in place.
  //
  // UNSIGNED_F16 (only when prec < 15): 0 stays 0, the largest prec-bit code
  // becomes 0xFFFF, everything else is ((x << 16) + 0x8000) >> prec.
  //
  // Otherwise (only when prec < 16): works on the magnitude; 0 stays 0,
  // magnitudes at or above the largest (prec-1)-bit code become 0x7FFF,
  // everything else is ((x << 15) + 0x4000) >> (prec - 1); the sign is then
  // re-applied.
  void unquantize( inout int2x3 color, uint prec )
  {
      const int iprec = asint( prec );
      if ( g_format == UNSIGNED_F16 )
      {
          if ( prec < 15 )
          {
              const int2x3 expanded = ( ( color << 16 ) + 0x8000 ) >> iprec;
              const int2x3 nonZero = ( color == ( ( 1 << iprec ) - 1 ) ) ? 0xFFFF : expanded;
              color = ( color != 0 ) ? nonZero : color;
          }
      }
      else
      {
          if ( prec < 16 )
          {
              const bool2x3 wasNegative = color < 0;
              color = abs( color );

              const int magBits = iprec - 1;
              const int2x3 expanded = ( ( color << 15 ) + 0x4000 ) >> magBits;
              const int2x3 nonZero = ( color >= ( ( 1 << magBits ) - 1 ) ) ? 0x7FFF : expanded;
              color = ( color != 0 ) ? nonZero : color;

              color = wasNegative ? -color : color;
          }
      }
  }
  // Converts an interpolated working-range color into half-float bit patterns.
  //
  // UNSIGNED_F16: (color * 31) >> 6.
  // Otherwise: the magnitude is scaled as (|color| * 31) >> 5 and the result is
  // emitted in sign-magnitude form (bit 15 = sign). A negative input whose
  // scaled magnitude rounds down to 0 yields plain 0, without the sign bit.
  uint3 finish_unquantize( int3 color )
  {
      if ( g_format == UNSIGNED_F16 )
      {
          return asuint( ( color * 31 ) >> 6 );
      }

      // Step 1: scale, keeping two's-complement sign.
      const int3 scaledIfPositive = ( color * 31 ) >> 5;
      const int3 scaledIfNegative = -( ( -color * 31 ) >> 5 );
      int3 scaled = ( color < 0 ) ? scaledIfNegative : scaledIfPositive;

      // Step 2: switch still-negative lanes to sign-magnitude.
      const int3 signMagnitude = ( -scaled ) | 0x8000;
      scaled = ( scaled < 0 ) ? signMagnitude : scaled;

      return asuint( scaled );
  }
  // Produces entry `i` (0..7) of the 8-entry palette between `low` and `high`.
  // The two endpoints are blended with a 6-bit fixed-point weight (w / 64,
  // rounded by adding 32 before the shift) and the result is passed through
  // finish_unquantize.
  void generate_palette_unquantized8( out uint3 palette, int3 low, int3 high, int i )
  {
      static const int kWeights8[] = {0, 9, 18, 27, 37, 46, 55, 64};
      const int highWeight = kWeights8[i];
      const int lowWeight = 64 - highWeight;
      const int3 blended = ( low * lowWeight + high * highWeight + 32 ) >> 6;
      palette = finish_unquantize( blended );
  }
  // Produces entry `i` (0..15) of the 16-entry palette between `low` and `high`.
  // The two endpoints are blended with a 6-bit fixed-point weight (w / 64,
  // rounded by adding 32 before the shift) and the result is passed through
  // finish_unquantize.
  void generate_palette_unquantized16( out uint3 palette, int3 low, int3 high, int i )
  {
      static const int kWeights16[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};
      const int highWeight = kWeights16[i];
      const int lowWeight = 64 - highWeight;
      const int3 blended = ( low * lowWeight + high * highWeight + 32 ) >> 6;
      palette = finish_unquantize( blended );
  }
  // Expands three 16-bit half-float bit patterns into single-precision floats.
  //   * exponent field == 0 (zero / denormal): the 10-bit mantissa is converted
  //     numerically as mantissa / 2^24, and the sign bit is OR-ed back in
  //   * otherwise: the exponent is re-biased and the fields are shifted into
  //     their float positions
  float3 half2float(uint3 color_h )
  {
      const uint3 signFloat = ( color_h & 0x8000 ) << 16;
      const uint3 exponent = color_h & 0x7C00;
      const uint3 mantissa = color_h & 0x03FF;

      // 16777216 = 2^24, the scale of the least significant denormal bit.
      const uint3 denormalBits = signFloat | asuint( float3(mantissa) / 16777216 );

      // 0x1C000 = 0x1FC00 - 0x3C00, i.e. (127 - 15) << 10: the bias difference
      // expressed at the half exponent's bit position.
      const uint3 normalBits = signFloat | ( ( ( exponent + 0x1C000 ) | mantissa ) << 13 );

      return ( exponent == 0 ) ? asfloat( denormalBits ) : asfloat( normalBits );
  }
  // Packs the header of a block that carries two endpoint pairs (the caller
  // uses this overload when best_mode <= 10).
  //
  //   block           - in/out 128-bit block as four 32-bit words. block.x is
  //                     overwritten by the matching branch; block.y and block.z
  //                     are only OR-ed into, so bits already present there are
  //                     kept; block.w is not touched here.
  //   endPoint        - endPoint[0][0], endPoint[0][1], endPoint[1][0] and
  //                     endPoint[1][1]; the caller fills them from
  //                     shared_temp[threadBase + 0/1].endPoint_low/high.
  //   mode_type       - compared against candidateModeFlag[0..9] to pick the
  //                     bit layout; if none matches, only the partition bits
  //                     are written.
  //   partition_index - OR-ed into block.z starting at bit 13.
  //
  // Every branch starts from candidateModeMemory[k] in block.x and then
  // scatters masked/shifted endpoint bits over block.x/y/z. The field widths
  // quoted per branch are read off the masks in that branch.
  // NOTE(review): the layouts presumably follow the BC6H mode table - confirm
  // against the format specification before changing any shift or mask.
  977. void block_package( inout uint4 block, int2x3 endPoint[2], uint mode_type, uint partition_index )
  978. {
  979. block.z |= partition_index << 13;
  // Slot 0: endPoint[0][0] 10 bits per channel; endPoint[0][1] R/G/B fields 5/5/5 bits.
  980. if ( mode_type == candidateModeFlag[0])
  981. {
  982. block.x = candidateModeMemory[0];
  983. block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
  984. block.x |= ( endPoint[1][0].g >> 2 ) & 0x00000004;
  985. block.x |= ( endPoint[1][0].b >> 1 ) & 0x00000008;
  986. block.x |= endPoint[1][1].b & 0x00000010;
  987. block.y |= ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
  988. block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
  989. block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
  990. block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
  991. block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 );
  992. block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
  993. block.yz |= ( ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000) ) | ( ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040) );
  994. block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
  995. }
  // Slot 1: endPoint[0][0] 7 bits per channel; endPoint[0][1] R/G/B fields 6/6/6 bits.
  996. else if ( mode_type == candidateModeFlag[1])
  997. {
  998. block.x = candidateModeMemory[1];
  999. block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00000FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x003F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
  1000. block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 );
  1001. block.x |= ( ( endPoint[1][0].g >> 3 ) & 0x00000004 ) | ( ( endPoint[1][0].g << 20 ) & 0x01000000 );
  1002. block.x |= ( endPoint[1][1].g >> 1 ) & 0x00000018;
  1003. block.x |= ( ( endPoint[1][1].b << 21 ) & 0x00800000 ) | ( ( endPoint[1][1].b << 12 ) & 0x00003000 );
  1004. block.x |= ( ( endPoint[1][0].b << 17 ) & 0x00400000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
  1005. block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E);
  1006. block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
  1007. block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80);
  1008. block.y |= ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ) | ( ( endPoint[1][1].b >> 3 ) & 0x00000001 );
  1009. block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
  1010. }
  // Slot 2: endPoint[0][0] 11 bits per channel (bit 10 goes to block.y); endPoint[0][1] R/G/B fields 5/4/4 bits.
  1011. else if ( mode_type == candidateModeFlag[2])
  1012. {
  1013. block.x = candidateModeMemory[2];
  1014. block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
  1015. block.y |= ( endPoint[0][0].r >> 2 ) & 0x00000100;
  1016. block.y |= ( endPoint[0][0].g << 7 ) & 0x00020000;
  1017. block.y |= ( ( endPoint[0][0].b << 17 ) & 0x08000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
  1018. block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x07800000 );
  1019. block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
  1020. block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
  1021. block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
  1022. block.yz |= ( ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000) ) | ( ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040) );
  1023. block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
  1024. }
  // Slot 3: endPoint[0][0] 11 bits per channel (bit 10 goes to block.y); endPoint[0][1] R/G/B fields 4/5/4 bits.
  1025. else if ( mode_type == candidateModeFlag[3])
  1026. {
  1027. block.x = candidateModeMemory[3];
  1028. block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
  1029. block.y |= ( endPoint[0][0].r >> 3 ) & 0x00000080;
  1030. block.y |= ( endPoint[0][0].g << 8 ) & 0x00040000;
  1031. block.y |= ( ( endPoint[0][0].b << 17 ) & 0x08000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
  1032. block.y |= ( ( endPoint[0][1].r << 3 ) & 0x00000078 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x07800000 );
  1033. block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000001E);
  1034. block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
  1035. block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 );
  1036. block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000780);
  1037. block.yz |= ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000);
  1038. block.z |= ( ( endPoint[1][0].g << 7 ) & 0x00000800 );
  1039. block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
  1040. block.z |= ( endPoint[1][1].b << 4 ) & 0x00000040;
  1041. block.z |= ( endPoint[1][1].b << 5 ) & 0x00000020;
  1042. }
  // Slot 4: endPoint[0][0] 11 bits per channel (bit 10 goes to block.y); endPoint[0][1] R/G/B fields 4/4/5 bits.
  1043. else if ( mode_type == candidateModeFlag[4])
  1044. {
  1045. block.x = candidateModeMemory[4];
  1046. block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
  1047. block.y |= ( endPoint[0][0].r >> 3 ) & 0x00000080;
  1048. block.y |= ( endPoint[0][0].g << 7 ) & 0x00020000;
  1049. block.y |= ( ( endPoint[0][0].b << 18 ) & 0x10000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
  1050. block.y |= ( ( endPoint[0][1].r << 3 ) & 0x00000078 ) | ( ( endPoint[0][1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
  1051. block.y |= ( ( endPoint[1][0].g << 9 ) & 0x00001E00 ) | ( ( endPoint[1][0].b << 4 ) & 0x00000100 );
  1052. block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
  1053. block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000780);
  1054. block.yz |= ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000060);
  1055. block.z |= ( endPoint[1][0].r << 1 ) & 0x0000001E;
  1056. block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
  1057. block.z |= ( ( endPoint[1][1].b << 7 ) & 0x00000800 ) | ( ( endPoint[1][1].b << 9 ) & 0x00001000 );
  1058. }
  // Slot 5: endPoint[0][0] 9 bits per channel; endPoint[0][1] R/G/B fields 5/5/5 bits.
  1059. else if ( mode_type == candidateModeFlag[5])
  1060. {
  1061. block.x = candidateModeMemory[5];
  1062. block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00003FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x00FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000);
  1063. block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000003;
  1064. block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
  1065. block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
  1066. block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
  1067. block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
  1068. block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 );
  1069. block.y |= ( ( endPoint[1][1].b << 27 ) & 0x10000000 );
  1070. block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
  1071. block.yz |= ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040);
  1072. block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
  1073. block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 );
  1074. }
  // Slot 6: endPoint[0][0] 8 bits per channel; endPoint[0][1] R/G/B fields 6/5/5 bits.
  1075. else if ( mode_type == candidateModeFlag[6])
  1076. {
  1077. block.x = candidateModeMemory[6];
  1078. block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
  1079. block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001;
  1080. block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
  1081. block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000);
  1082. block.x |= ( ( endPoint[1][1].g << 9 ) & 0x00002000 ) | ( ( endPoint[1][1].b << 21 ) & 0x00800000);
  1083. block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E);
  1084. block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
  1085. block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80);
  1086. block.y |= ( ( endPoint[1][1].b >> 2 ) & 0x00000006 );
  1087. block.y |= ( ( endPoint[1][1].b << 27 ) & 0x10000000 ) | ( ( endPoint[1][1].b << 18 ) & 0x00040000 );
  1088. block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
  1089. }
  // Slot 7: endPoint[0][0] 8 bits per channel; endPoint[0][1] R/G/B fields 5/6/5 bits.
  1090. else if ( mode_type == candidateModeFlag[7])
  1091. {
  1092. block.x = candidateModeMemory[7];
  1093. block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
  1094. block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001;
  1095. block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
  1096. block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
  1097. block.x |= ( ( endPoint[1][0].g << 18 ) & 0x00800000 );
  1098. block.x |= ( ( endPoint[1][1].b << 13 ) & 0x00002000 );
  1099. block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
  1100. block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
  1101. block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
  1102. block.y |= ( ( endPoint[1][1].g >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 );
  1103. block.y |= ( endPoint[1][1].b << 27 ) & 0x10000000;
  1104. block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
  1105. block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 ) | ( ( endPoint[1][1].b << 4 ) & 0x00000040 );
  1106. }
  // Slot 8: endPoint[0][0] 8 bits per channel; endPoint[0][1] R/G/B fields 5/5/6 bits.
  1107. else if ( mode_type == candidateModeFlag[8])
  1108. {
  1109. block.x = candidateModeMemory[8];
  1110. block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
  1111. block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001;
  1112. block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 );
  1113. block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
  1114. block.x |= ( ( endPoint[1][0].b << 18 ) & 0x00800000 );
  1115. block.x |= ( endPoint[1][1].b << 12 ) & 0x00002000;
  1116. block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
  1117. block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 );
  1118. block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
  1119. block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
  1120. block.y |= ( endPoint[1][1].b << 18 ) & 0x00040000;
  1121. block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
  1122. block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 ) | ( ( endPoint[1][1].b << 4 ) & 0x00000040 );
  1123. }
  // Slot 9: endPoint[0][0] 6 bits per channel; endPoint[0][1] R/G/B fields 6/6/6 bits.
  1124. else if ( mode_type == candidateModeFlag[9])
  1125. {
  1126. block.x = candidateModeMemory[9];
  1127. block.x |= ( ( endPoint[0][0].r << 5 ) & 0x000007E0 ) | ( ( endPoint[0][0].g << 15 ) & 0x001F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0x7E000000 );
  1128. block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 );
  1129. block.x |= ( ( endPoint[1][0].g << 16 ) & 0x00200000 ) | ( ( endPoint[1][0].g << 20 ) & 0x01000000 );
  1130. block.x |= ( ( endPoint[1][0].b << 17 ) & 0x00400000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
  1131. block.x |= ( ( endPoint[1][1].b << 21 ) & 0x00800000 ) | ( ( endPoint[1][1].b << 12 ) & 0x00003000 );
  1132. block.x |= ( ( endPoint[1][1].g << 26 ) & 0x80000000 ) | ( ( endPoint[1][1].g << 7 ) & 0x00000800 );
  1133. block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E);
  1134. block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80);
  1135. block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
  1136. block.y |= ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ) | ( ( endPoint[1][1].b >> 3 ) & 0x00000001 );
  1137. block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
  1138. }
  1139. }
  // Packs the header of a block that carries a single endpoint pair (the
  // caller uses this overload when best_mode > 10).
  //
  //   block     - in/out 128-bit block as four 32-bit words. block.x is
  //               overwritten; block.y and block.z are only OR-ed into, so bits
  //               already present there are kept; block.w is not touched here.
  //   endPoint  - endPoint[0] and endPoint[1]; the caller fills them from
  //               shared_temp[threadBase + 0].endPoint_low/high.
  //   mode_type - compared against candidateModeFlag[10..13] to pick the bit
  //               layout; if none matches, only the shared low 10 bits of
  //               endPoint[0] are written and no mode bits are set.
  //
  // The field widths quoted per branch are read off the masks in that branch.
  // NOTE(review): the layouts presumably follow the BC6H mode table - confirm
  // against the format specification before changing any shift or mask.
  1140. void block_package( inout uint4 block, int2x3 endPoint, uint mode_type )
  1141. {
  // Common to every branch: the low 10 bits of each endPoint[0] channel
  // (R at bit 5, G at bit 15, B at bit 25 and spilling into block.y bits 0..2).
  1142. block.x = ( ( endPoint[0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0].b << 25 ) & 0xFE000000 );
  1143. block.y |= ( endPoint[0].b >> 7 ) & 0x00000007;
  // Slot 10: endPoint[1] carries 10 bits per channel (B's top bit lands in block.z bit 0).
  1144. if ( mode_type == candidateModeFlag[10])
  1145. {
  1146. block.x |= candidateModeMemory[10];
  1147. block.y |= ( ( endPoint[1].r << 3 ) & 0x00001FF8 ) | ( ( endPoint[1].g << 13 ) & 0x007FE000 ) | ( ( endPoint[1].b << 23 ) & 0xFF800000 );
  1148. block.z |= ( endPoint[1].b >> 9 ) & 0x00000001;
  1149. }
  // Slot 11: endPoint[0] gains bit 10 per channel; endPoint[1] carries 9 bits per channel.
  1150. else if (mode_type == candidateModeFlag[11])
  1151. {
  1152. block.x |= candidateModeMemory[11];
  1153. block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 ) | ( ( endPoint[0].g << 12 ) & 0x00400000 );
  1154. block.y |= ( ( endPoint[1].r << 3 ) & 0x00000FF8 ) | ( ( endPoint[1].g << 13 ) & 0x003FE000 ) | ( ( endPoint[1].b << 23 ) & 0xFF800000 );
  1155. block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;
  1156. }
  // Slot 12: endPoint[0] gains bits 10 and 11 per channel (bit 11 is stored
  // below bit 10); endPoint[1] carries 8 bits per channel.
  1157. else if (mode_type == candidateModeFlag[12])// violate the spec in [0].low
  1158. {
  1159. block.x |= candidateModeMemory[12];
  1160. block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 ) | ( ( endPoint[0].g << 12 ) & 0x00400000 );
  1161. block.y |= ( ( endPoint[0].r << 0 ) & 0x00000800 ) | ( ( endPoint[0].g << 10 ) & 0x00200000 );
  1162. block.y |= ( endPoint[0].b << 20 ) & 0x80000000;
  1163. block.y |= ( ( endPoint[1].r << 3 ) & 0x000007F8 ) | ( ( endPoint[1].g << 13 ) & 0x001FE000 ) | ( ( endPoint[1].b << 23 ) & 0x7F800000 );
  1164. block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;
  1165. }
  // Slot 13: endPoint[0] gains bits 10..15 per channel (B's bit 15 lands in
  // block.z bit 0); endPoint[1] carries 4 bits per channel.
  1166. else if (mode_type == candidateModeFlag[13])
  1167. {
  1168. block.x |= candidateModeMemory[13];
  1169. block.y |= ( ( endPoint[0].r >> 3 ) & 0x00001F80 ) | ( ( endPoint[0].g << 7 ) & 0x007E0000 ) | ( ( endPoint[0].b << 17 ) & 0xF8000000 );
  1170. block.y |= ( ( endPoint[1].r << 3 ) & 0x00000078 ) | ( ( endPoint[1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[1].b << 23 ) & 0x07800000 );
  1171. block.z |= ( endPoint[0].b >> 15 ) & 0x00000001;
  1172. }
  1173. }