// BC7Encode_EncodeBlockCS.hlsl
  1. // RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
  2. // CHECK: flattenedThreadIdInGroup
  3. // CHECK: groupId
  4. // CHECK: bufferLoad
  5. // CHECK: textureLoad
  6. // CHECK: UMax
  7. // CHECK: UMin
  8. // CHECK: barrier
  9. // CHECK: IMad
  10. // CHECK: barrier
  11. // CHECK: bufferStore
  12. //--------------------------------------------------------------------------------------
  13. // File: BC7Encode.hlsl
  14. //
  15. // The Compute Shader for BC7 Encoder
  16. //
  17. // Copyright (c) Microsoft Corporation. All rights reserved.
  18. //--------------------------------------------------------------------------------------
  // Build switches and numeric constants shared by the kernels in this file.
  //
  // REF_DEVICE: while defined, every "#ifdef REF_DEVICE" group barrier is compiled in
  // and the "#ifndef REF_DEVICE" early return on out-of-range block ids is compiled out.
  #define REF_DEVICE

  #define CHAR_LENGTH 8           // bits per colour channel
  #define NCHANNELS   4           // channels per pixel
  #define BC7_UNORM   98          // NOTE(review): presumably DXGI_FORMAT_BC7_UNORM - confirm
  #define MAX_UINT    0xFFFFFFFF  // seed for the per-channel minimum reduction
  #define MIN_UINT    0           // seed for the per-channel maximum reduction
  // Two-subset partition masks, one entry per partition 0-63.
  // The encoder tests an entry as (bits >> i) & 1 for pixel i of the 4x4 block:
  // a set bit puts the pixel in subset 1, a clear bit in subset 0.
  static const uint candidateSectionBit[64] =
  {
      0xCCCC, 0x8888, 0xEEEE, 0xECC8, 0xC880, 0xFEEC, 0xFEC8, 0xEC80, // partitions  0- 7
      0xC800, 0xFFEC, 0xFE80, 0xE800, 0xFFE8, 0xFF00, 0xFFF0, 0xF000, // partitions  8-15
      0xF710, 0x008E, 0x7100, 0x08CE, 0x008C, 0x7310, 0x3100, 0x8CCE, // partitions 16-23
      0x088C, 0x3110, 0x6666, 0x366C, 0x17E8, 0x0FF0, 0x718E, 0x399C, // partitions 24-31
      0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a, // partitions 32-39
      0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x660,  // partitions 40-47
      0x272,  0x4e4,  0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c, // partitions 48-55
      0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0xfcc,  0x7744, 0xee22, // partitions 56-63
  };
  // Three-subset partition masks for partitions 64-127 (the encoder indexes this
  // table with partition - 64).
  // For pixel i of the 4x4 block the encoder checks, in this order:
  //   bit (i + 16) set -> subset 2
  //   bit i set        -> subset 1
  //   otherwise        -> subset 0
  static const uint candidateSectionBit2[64] =
  {
      0xf60008cc, 0x73008cc8, 0x3310cc80, 0xceec00,   // partitions  64- 67
      0xcc003300, 0xcc0000cc, 0xccff00,   0x3300cccc, // partitions  68- 71
      0xf0000f00, 0xf0000ff0, 0xff0000f0, 0x88884444, // partitions  72- 75
      0x88886666, 0xcccc2222, 0xec80136c, 0x7310008c, // partitions  76- 79
      0xc80036c8, 0x310008ce, 0xccc03330, 0xcccf000,  // partitions  80- 83
      0xee0000ee, 0x77008888, 0xcc0022c0, 0x33004430, // partitions  84- 87
      0xcc0c22,   0xfc880344, 0x6606996,  0x66009960, // partitions  88- 91
      0xc88c0330, 0xf9000066, 0xcc0c22c,  0x73108c00, // partitions  92- 95
      0xec801300, 0x8cec400,  0xec80004c, 0x44442222, // partitions  96- 99
      0xf0000f0,  0x49242492, 0x42942942, 0xc30c30c,  // partitions 100-103
      0x3c0c03c,  0xff0000aa, 0x5500aa00, 0xcccc3030, // partitions 104-107
      0xc0cc0c0,  0x66669090, 0xff0a00a,  0x5550aaa0, // partitions 108-111
      0xf0000aaa, 0xe0ee0e0,  0x88887070, 0x99906660, // partitions 112-115
      0xe00e0ee0, 0x88880770, 0xf0000666, 0x99006600, // partitions 116-119
      0xff000066, 0xc00c0cc0, 0xcccc0330, 0x90006000, // partitions 120-123
      0x8088080,  0xeeee1010, 0xfff0000a, 0x731008ce, // partitions 124-127
  };
  // Fix-up (anchor) pixel positions per partition.
  //   .x : pixel index the encoder uses as the reference pixel of subset 1
  //   .y : pixel index the encoder uses as the reference pixel of subset 2
  // Subset 0 always uses pixel 0 and is therefore not stored.
  // Entries 0-63 belong to the two-subset partitions, entries 64-127 to the
  // three-subset partitions.
  static const uint2 candidateFixUpIndex1D[128] =
  {
      // Partitions 0-63: only .x is meaningful, .y must not be used.
      {15, 0}, {15, 0}, {15, 0}, {15, 0}, //   0-  3
      {15, 0}, {15, 0}, {15, 0}, {15, 0}, //   4-  7
      {15, 0}, {15, 0}, {15, 0}, {15, 0}, //   8- 11
      {15, 0}, {15, 0}, {15, 0}, {15, 0}, //  12- 15
      {15, 0}, { 2, 0}, { 8, 0}, { 2, 0}, //  16- 19
      { 2, 0}, { 8, 0}, { 8, 0}, {15, 0}, //  20- 23
      { 2, 0}, { 8, 0}, { 2, 0}, { 2, 0}, //  24- 27
      { 8, 0}, { 8, 0}, { 2, 0}, { 2, 0}, //  28- 31
      {15, 0}, {15, 0}, { 6, 0}, { 8, 0}, //  32- 35
      { 2, 0}, { 8, 0}, {15, 0}, {15, 0}, //  36- 39
      { 2, 0}, { 8, 0}, { 2, 0}, { 2, 0}, //  40- 43
      { 2, 0}, {15, 0}, {15, 0}, { 6, 0}, //  44- 47
      { 6, 0}, { 2, 0}, { 6, 0}, { 8, 0}, //  48- 51
      {15, 0}, {15, 0}, { 2, 0}, { 2, 0}, //  52- 55
      {15, 0}, {15, 0}, {15, 0}, {15, 0}, //  56- 59
      {15, 0}, { 2, 0}, { 2, 0}, {15, 0}, //  60- 63

      // Partitions 64-127: both components are meaningful.
      { 3,15}, { 3, 8}, {15, 8}, {15, 3}, //  64- 67
      { 8,15}, { 3,15}, {15, 3}, {15, 8}, //  68- 71
      { 8,15}, { 8,15}, { 6,15}, { 6,15}, //  72- 75
      { 6,15}, { 5,15}, { 3,15}, { 3, 8}, //  76- 79
      { 3,15}, { 3, 8}, { 8,15}, {15, 3}, //  80- 83
      { 3,15}, { 3, 8}, { 6,15}, {10, 8}, //  84- 87
      { 5, 3}, { 8,15}, { 8, 6}, { 6,10}, //  88- 91
      { 8,15}, { 5,15}, {15,10}, {15, 8}, //  92- 95
      { 8,15}, {15, 3}, { 3,15}, { 5,10}, //  96- 99
      { 6,10}, {10, 8}, { 8, 9}, {15,10}, // 100-103
      {15, 6}, { 3,15}, {15, 8}, { 5,15}, // 104-107
      // The spec does not mark the first fix-up index for the next row, so 15 is
      // used for all four entries; this appears to be correct.
      {15, 3}, {15, 6}, {15, 6}, {15, 8}, // 108-111
      { 3,15}, {15, 3}, { 5,15}, { 5,15}, // 112-115
      { 5,15}, { 8,15}, { 5,15}, {10,15}, // 116-119
      { 5,15}, {10,15}, { 8,15}, {13,15}, // 120-123
      {15, 3}, {12,15}, { 3,15}, { 3, 8}, // 124-127
  };
  // Same fix-up positions as candidateFixUpIndex1D, except that for the
  // three-subset partitions (entries 64-127) each pair is stored in ascending
  // order (.x <= .y) instead of (subset 1, subset 2) order.
  static const uint2 candidateFixUpIndex1DOrdered[128] =
  {
      // Partitions 0-63: only .x is meaningful, .y must not be used.
      {15, 0}, {15, 0}, {15, 0}, {15, 0}, //   0-  3
      {15, 0}, {15, 0}, {15, 0}, {15, 0}, //   4-  7
      {15, 0}, {15, 0}, {15, 0}, {15, 0}, //   8- 11
      {15, 0}, {15, 0}, {15, 0}, {15, 0}, //  12- 15
      {15, 0}, { 2, 0}, { 8, 0}, { 2, 0}, //  16- 19
      { 2, 0}, { 8, 0}, { 8, 0}, {15, 0}, //  20- 23
      { 2, 0}, { 8, 0}, { 2, 0}, { 2, 0}, //  24- 27
      { 8, 0}, { 8, 0}, { 2, 0}, { 2, 0}, //  28- 31
      {15, 0}, {15, 0}, { 6, 0}, { 8, 0}, //  32- 35
      { 2, 0}, { 8, 0}, {15, 0}, {15, 0}, //  36- 39
      { 2, 0}, { 8, 0}, { 2, 0}, { 2, 0}, //  40- 43
      { 2, 0}, {15, 0}, {15, 0}, { 6, 0}, //  44- 47
      { 6, 0}, { 2, 0}, { 6, 0}, { 8, 0}, //  48- 51
      {15, 0}, {15, 0}, { 2, 0}, { 2, 0}, //  52- 55
      {15, 0}, {15, 0}, {15, 0}, {15, 0}, //  56- 59
      {15, 0}, { 2, 0}, { 2, 0}, {15, 0}, //  60- 63

      // Partitions 64-127: pairs sorted ascending.
      { 3,15}, { 3, 8}, { 8,15}, { 3,15}, //  64- 67
      { 8,15}, { 3,15}, { 3,15}, { 8,15}, //  68- 71
      { 8,15}, { 8,15}, { 6,15}, { 6,15}, //  72- 75
      { 6,15}, { 5,15}, { 3,15}, { 3, 8}, //  76- 79
      { 3,15}, { 3, 8}, { 8,15}, { 3,15}, //  80- 83
      { 3,15}, { 3, 8}, { 6,15}, { 8,10}, //  84- 87
      { 3, 5}, { 8,15}, { 6, 8}, { 6,10}, //  88- 91
      { 8,15}, { 5,15}, {10,15}, { 8,15}, //  92- 95
      { 8,15}, { 3,15}, { 3,15}, { 5,10}, //  96- 99
      { 6,10}, { 8,10}, { 8, 9}, {10,15}, // 100-103
      { 6,15}, { 3,15}, { 8,15}, { 5,15}, // 104-107
      // The spec does not mark the first fix-up index for the next row, so 15 is
      // used for all four entries; this appears to be correct.
      { 3,15}, { 6,15}, { 6,15}, { 8,15}, // 108-111
      { 3,15}, { 3,15}, { 5,15}, { 5,15}, // 112-115
      { 5,15}, { 8,15}, { 5,15}, {10,15}, // 116-119
      { 5,15}, {10,15}, { 8,15}, {13,15}, // 120-123
      { 3,15}, {12,15}, { 3,15}, { 3, 8}, // 124-127
  };
  // Four 4x4 channel-permutation matrices, written as four rows each.
  //   [0] identity
  //   [1] exchanges channels 0 and 3
  //   [2] exchanges channels 1 and 3
  //   [3] exchanges channels 2 and 3
  static const uint4x4 candidateRotation[4] =
  {
      // [0]
      {1,0,0,0},
      {0,1,0,0},
      {0,0,1,0},
      {0,0,0,1},
      // [1]
      {0,0,0,1},
      {0,1,0,0},
      {0,0,1,0},
      {1,0,0,0},
      // [2]
      {1,0,0,0},
      {0,0,0,1},
      {0,0,1,0},
      {0,1,0,0},
      // [3]
      {1,0,0,0},
      {0,1,0,0},
      {0,0,0,1},
      {0,0,1,0}
  };
  // Index bit widths, eight entries.
  // NOTE(review): presumably indexed by BC7 mode 0-7 with .x = colour index bits and
  // .y = alpha (second) index bits - the table is not referenced in the visible code.
  static const uint2 candidateIndexPrec[8] =
  {
      {3,0},
      {3,0},
      {2,0},
      {2,0},
      {2,3}, // color index and alpha index can exchange
      {2,2},
      {4,4},
      {2,2}
  };
  // Interpolation weights (out of 64) per index value. The first subscript is the
  // index-precision selector used throughout the encoder:
  //   0 -> 4-bit indices (16 weights)
  //   1 -> 3-bit indices ( 8 weights, remaining slots zero-padded)
  //   2 -> 2-bit indices ( 4 weights, remaining slots zero-padded)
  // A texel is reconstructed as ((64 - w) * low + w * high + 32) >> 6.
  static const uint aWeight[3][16] =
  {
      { 0,  4,  9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 },
      { 0,  9, 18, 27, 37, 46, 55, 64,  0,  0,  0,  0,  0,  0,  0,  0 },
      { 0, 21, 43, 64,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 }
  };
  // Maps a position along the endpoint segment, quantised to 0..63, to the index
  // whose aWeight entry is used for it. The first subscript is the same
  // index-precision selector as for aWeight (0 = 4-bit, 1 = 3-bit, 2 = 2-bit).
  static const uint aStep[3][64] =
  {
      // Weights: 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
      {
           0,  0,  0,  1,  1,  1,  1,  2,
           2,  2,  2,  2,  3,  3,  3,  3,
           4,  4,  4,  4,  5,  5,  5,  5,
           6,  6,  6,  6,  6,  7,  7,  7,
           7,  8,  8,  8,  8,  9,  9,  9,
           9, 10, 10, 10, 10, 10, 11, 11,
          11, 11, 12, 12, 12, 12, 13, 13,
          13, 13, 14, 14, 14, 14, 15, 15
      },
      // Weights: 0, 9, 18, 27, 37, 46, 55, 64
      {
          0, 0, 0, 0, 0, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 2, 2,
          2, 2, 2, 2, 2, 2, 2, 3,
          3, 3, 3, 3, 3, 3, 3, 3,
          3, 4, 4, 4, 4, 4, 4, 4,
          4, 4, 5, 5, 5, 5, 5, 5,
          5, 5, 5, 6, 6, 6, 6, 6,
          6, 6, 6, 6, 7, 7, 7, 7
      },
      // Weights: 0, 21, 43, 64
      {
          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1,
          1, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2, 2, 2, 3, 3,
          3, 3, 3, 3, 3, 3, 3, 3
      }
  };
  // Per-dispatch constants (slot b0). Member order defines the buffer layout.
  cbuffer cbCS : register( b0 )
  {
      uint g_tex_width;         // NOTE(review): presumably source width in texels; unused in the visible code
      uint g_num_block_x;       // number of 4x4 blocks per row; converts a block id to (block_x, block_y)
      uint g_format;            // NOTE(review): presumably the target DXGI format; unused in the visible code
      uint g_mode_id;           // NOTE(review): presumably the BC7 mode under test; unused in the visible code
      uint g_start_block_id;    // id of the first block handled by this dispatch
      uint g_num_total_blocks;  // block ids >= this value are skipped when REF_DEVICE is not defined
  };
  // Forward declarations of helpers defined later in the file.

  // Endpoint quantisers, one per BC7 mode: each rewrites the low/high endpoint pair in place.
  void compress_endpoints0( inout uint2x4 endPoint ); // mode 0
  void compress_endpoints1( inout uint2x4 endPoint ); // mode 1
  void compress_endpoints2( inout uint2x4 endPoint ); // mode 2
  void compress_endpoints3( inout uint2x4 endPoint ); // mode 3
  void compress_endpoints4( inout uint2x4 endPoint ); // mode 4
  void compress_endpoints5( inout uint2x4 endPoint ); // mode 5
  void compress_endpoints6( inout uint2x4 endPoint ); // mode 6
  void compress_endpoints7( inout uint2x4 endPoint ); // mode 7

  // Block packers, one per BC7 mode: each produces the 128-bit output block for the
  // 16 threads starting at threadBase.
  void block_package0( out uint4 block, uint partition, uint threadBase );                    // mode 0
  void block_package1( out uint4 block, uint partition, uint threadBase );                    // mode 1
  void block_package2( out uint4 block, uint partition, uint threadBase );                    // mode 2
  void block_package3( out uint4 block, uint partition, uint threadBase );                    // mode 3
  void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase ); // mode 4
  void block_package5( out uint4 block, uint rotation, uint threadBase );                     // mode 5
  void block_package6( out uint4 block, uint threadBase );                                    // mode 6
  void block_package7( out uint4 block, uint partition, uint threadBase );                    // mode 7
  // Exchanges two uint4 values in place.
  // The temporary has the same unsigned type as the operands so no implicit
  // unsigned -> signed -> unsigned conversion takes place.
  void swap(inout uint4 lhs, inout uint4 rhs)
  {
      uint4 tmp = lhs;
      lhs = rhs;
      rhs = tmp;
  }
  // Exchanges two uint3 values in place.
  // The temporary has the same unsigned type as the operands so no implicit
  // unsigned -> signed -> unsigned conversion takes place.
  void swap(inout uint3 lhs, inout uint3 rhs)
  {
      uint3 tmp = lhs;
      lhs = rhs;
      rhs = tmp;
  }
  // Exchanges two uint values in place.
  // The temporary has the same unsigned type as the operands so no implicit
  // unsigned -> signed -> unsigned conversion takes place.
  void swap(inout uint lhs, inout uint rhs)
  {
      uint tmp = lhs;
      lhs = rhs;
      rhs = tmp;
  }
  // Resource bindings.
  Texture2D                 g_Input   : register( t0 ); // source image; texels are fetched with Load() at mip 0
  StructuredBuffer<uint4>   g_InBuff  : register( t1 ); // per-block input: .y = mode | index_selector << 31, .z = partition, .w = rotation
  RWStructuredBuffer<uint4> g_OutBuff : register( u0 ); // per-block output, indexed by block id
  // Thread-group and block geometry.
  #define THREAD_GROUP_SIZE 64                            // threads per group
  #define BLOCK_SIZE_Y      4                             // block height in texels
  #define BLOCK_SIZE_X      4                             // block width in texels
  #define BLOCK_SIZE        (BLOCK_SIZE_Y * BLOCK_SIZE_X) // texels per block
  // Per-thread scratch record kept in group-shared memory.
  // Several fields are reused for different purposes by different passes
  // (for example, the block-encoding pass stores the colour index in 'error' and
  // the alpha index in 'mode').
  struct BufferShared
  {
      uint4 pixel;          // texel handled by this thread, each channel clamped to 0..255
      uint  error;          // candidate error, reduced with "keep the smaller" across the block
      uint  mode;           // BC7 mode belonging to 'error'
      uint  partition;      // partition id belonging to 'error'
      uint  index_selector; // mode 4 index-selection flag belonging to 'error'
      uint  rotation;       // channel rotation belonging to 'error'
      uint4 endPoint_low;   // running per-channel minimum during endpoint reduction
      uint4 endPoint_high;  // running per-channel maximum during endpoint reduction
  };

  // One record per thread of the group.
  groupshared BufferShared shared_temp[THREAD_GROUP_SIZE];
  // Scores BC7 modes 4, 5 and 6 for every 4x4 block and records the cheapest candidate.
  //
  // A group processes THREAD_GROUP_SIZE / 16 blocks; 16 consecutive threads cooperate
  // on one block. After the block's texels are loaded and their per-channel min/max
  // found, each thread evaluates one candidate encoding:
  //   threads 0-7   : mode 4, rotation = threadInBlock / 2, index_selector = threadInBlock & 1
  //   threads 8-11  : mode 5, rotation = threadInBlock - 8
  //   thread  12    : mode 6
  //   threads 13-15 : no candidate (error stays 0xFFFFFFFF)
  // The candidates are then reduced to the one with the smallest error.
  //
  // Parameters:
  //   GI      - flattened thread index inside the group (SV_GroupIndex)
  //   groupID - group id (SV_GroupID); only .x is used
  //
  // Output: thread 0 of each block writes
  //   g_OutBuff[blockID] = uint4( error, (index_selector << 31) | mode, 0, rotation ).
  [numthreads( THREAD_GROUP_SIZE, 1, 1 )]
  void TryMode456CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID )
  {
      const uint MAX_USED_THREAD = 16;
      uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
      uint blockInGroup = GI / MAX_USED_THREAD;
      uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
      uint threadBase = blockInGroup * MAX_USED_THREAD;
      uint threadInBlock = GI - threadBase;

  #ifndef REF_DEVICE
      if (blockID >= g_num_total_blocks)
      {
          return;
      }
  #endif

      uint block_y = blockID / g_num_block_x;
      uint block_x = blockID - block_y * g_num_block_x;
      uint base_x = block_x * BLOCK_SIZE_X;
      uint base_y = block_y * BLOCK_SIZE_Y;

      // Load one texel per thread (scaled to 0..255) and seed the min/max reduction with it.
      if (threadInBlock < 16)
      {
          shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);

          shared_temp[GI].endPoint_low = shared_temp[GI].pixel;
          shared_temp[GI].endPoint_high = shared_temp[GI].pixel;
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif

      // Tree reduction 16 -> 8 -> 4 -> 2 -> 1: slot threadBase ends up holding the
      // per-channel minimum (endPoint_low) and maximum (endPoint_high) of the block.
      if (threadInBlock < 8)
      {
          shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
          shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 4)
      {
          shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
          shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 2)
      {
          shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
          shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 1)
      {
          shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
          shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif

      // Every thread starts its candidate from the block-wide min/max endpoints.
      uint2x4 endPoint;
      endPoint[0] = shared_temp[threadBase].endPoint_low;
      endPoint[1] = shared_temp[threadBase].endPoint_high;

      uint error = 0xFFFFFFFF;
      uint mode = 0;
      uint index_selector = 0;
      uint rotation = 0;

      // indexPrec selects the aStep/aWeight row: .x for the colour index, .y for the alpha index.
      uint2 indexPrec;
      if (threadInBlock < 8)
      {
          if (0 == (threadInBlock & 1))
          {
              //2 represents 2bit index precision; 1 represents 3bit index precision
              indexPrec = uint2( 2, 1 );
          }
          else
          {
              //2 represents 2bit index precision; 1 represents 3bit index precision
              index_selector = 1;
              indexPrec = uint2( 1, 2 );
          }
      }
      else
      {
          //2 represents 2bit index precision
          indexPrec = uint2( 2, 2 );
      }

      uint4 pixel_r;
      uint color_index;
      uint alpha_index;
      int4 span;
      int2 span_norm_sqr;
      int2 dotProduct;
      if (threadInBlock < 12)
      {
          // Apply this candidate's rotation: exchange alpha with r, g or b in the endpoints.
          if ((threadInBlock < 2) || (8 == threadInBlock)) // rotation = 0
          {
              rotation = 0;
          }
          else if ((threadInBlock < 4) || (9 == threadInBlock)) // rotation = 1
          {
              endPoint[0].ra = endPoint[0].ar;
              endPoint[1].ra = endPoint[1].ar;

              rotation = 1;
          }
          else if ((threadInBlock < 6) || (10 == threadInBlock)) // rotation = 2
          {
              endPoint[0].ga = endPoint[0].ag;
              endPoint[1].ga = endPoint[1].ag;

              rotation = 2;
          }
          else if ((threadInBlock < 8) || (11 == threadInBlock)) // rotation = 3
          {
              endPoint[0].ba = endPoint[0].ab;
              endPoint[1].ba = endPoint[1].ab;

              rotation = 3;
          }

          if (threadInBlock < 8)
          {
              mode = 4;
              compress_endpoints4( endPoint );
          }
          else
          {
              mode = 5;
              compress_endpoints5( endPoint );
          }

          // Pixel 0 decides the endpoint order: if it projects into the upper half of
          // the segment, the endpoints are exchanged (separately for colour and alpha).
          uint4 pixel = shared_temp[threadBase + 0].pixel;
          if (1 == rotation)
          {
              pixel.ra = pixel.ar;
          }
          else if (2 == rotation)
          {
              pixel.ga = pixel.ag;
          }
          else if (3 == rotation)
          {
              pixel.ba = pixel.ab;
          }

          span = endPoint[1] - endPoint[0];
          span_norm_sqr = uint2( dot( span.rgb, span.rgb ), span.a * span.a );

          dotProduct = int2( dot( span.rgb, pixel.rgb - endPoint[0].rgb ), span.a * ( pixel.a - endPoint[0].a ) );
          if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
          {
              span.rgb = -span.rgb;
              swap(endPoint[0].rgb, endPoint[1].rgb);
          }
          if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && uint( dotProduct.y * 63.49999 ) > uint( 32 * span_norm_sqr.y ) )
          {
              span.a = -span.a;
              swap(endPoint[0].a, endPoint[1].a);
          }

          // Accumulate the squared reconstruction error over all 16 texels.
          error = 0;
          for ( uint i = 0; i < 16; i ++ )
          {
              pixel = shared_temp[threadBase + i].pixel;
              if (1 == rotation)
              {
                  pixel.ra = pixel.ar;
              }
              else if (2 == rotation)
              {
                  pixel.ga = pixel.ag;
              }
              else if (3 == rotation)
              {
                  pixel.ba = pixel.ab;
              }

              dotProduct.x = dot( span.rgb, pixel.rgb - endPoint[0].rgb );
              color_index = ( span_norm_sqr.x <= 0 || dotProduct.x <= 0 ) ? 0
                  : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[indexPrec.x][ uint( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] );
              dotProduct.y = dot( span.a, pixel.a - endPoint[0].a );
              alpha_index = ( span_norm_sqr.y <= 0 || dotProduct.y <= 0 ) ? 0
                  : ( ( dotProduct.y < span_norm_sqr.y ) ? aStep[indexPrec.y][ uint( dotProduct.y * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );

              // color_index was chosen from aStep[indexPrec.x] and alpha_index from
              // aStep[indexPrec.y], so each must be looked up in the aWeight row of the
              // same precision. The indices are therefore NOT exchanged for
              // index_selector here: doing so would pair every index with the other
              // precision's weight row (and a 3-bit index could read the zero padding
              // of the 2-bit row). Only the packing of the final block needs the
              // exchanged order.
              //
              // The high endpoint is swizzled explicitly (.rgb / .a); using the whole
              // uint4 would truncate to the first component(s) and reconstruct alpha
              // from the wrong channel.
              pixel_r.rgb = ( ( 64 - aWeight[indexPrec.x][color_index] ) * endPoint[0].rgb
                  + aWeight[indexPrec.x][color_index] * endPoint[1].rgb + 32 ) >> 6;
              pixel_r.a = ( ( 64 - aWeight[indexPrec.y][alpha_index] ) * endPoint[0].a
                  + aWeight[indexPrec.y][alpha_index] * endPoint[1].a + 32 ) >> 6;

              pixel_r -= pixel;
              error += dot(pixel_r, pixel_r);
          }
      }
      else if (12 == threadInBlock)//Mode6
      {
          compress_endpoints6( endPoint );

          // Pixel 0 decides the endpoint order, as above, but over all four channels.
          uint4 pixel = shared_temp[threadBase + 0].pixel;

          span = endPoint[1] - endPoint[0];
          span_norm_sqr = dot( span, span );
          dotProduct = dot( span, pixel - endPoint[0] );
          if ( span_norm_sqr.x > 0 && dotProduct.x >= 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
          {
              span = -span;
              swap(endPoint[0], endPoint[1]);
          }

          // Mode 6 uses a single 4-bit index per texel (aStep/aWeight row 0).
          error = 0;
          for ( uint i = 0; i < 16; i ++ )
          {
              pixel = shared_temp[threadBase + i].pixel;

              dotProduct.x = dot( span, pixel - endPoint[0] );
              color_index = ( span_norm_sqr.x <= 0 || dotProduct.x <= 0 ) ? 0
                  : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[0][ uint( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[0][63] );

              pixel_r = ( ( 64 - aWeight[0][color_index] ) * endPoint[0]
                  + aWeight[0][color_index] * endPoint[1] + 32 ) >> 6;

              pixel_r -= pixel;
              error += dot(pixel_r, pixel_r);
          }

          mode = 6;
          rotation = 0;
      }

      // Publish this thread's candidate for the reduction below.
      shared_temp[GI].error = error;
      shared_temp[GI].mode = mode;
      shared_temp[GI].index_selector = index_selector;
      shared_temp[GI].rotation = rotation;
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif

      // Tree reduction 16 -> 8 -> 4 -> 2 -> 1 keeping the candidate with the smaller
      // error (ties keep the lower-numbered thread's candidate).
      if (threadInBlock < 8)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 8].error )
          {
              shared_temp[GI].error = shared_temp[GI + 8].error;
              shared_temp[GI].mode = shared_temp[GI + 8].mode;
              shared_temp[GI].index_selector = shared_temp[GI + 8].index_selector;
              shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 4)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 4].error )
          {
              shared_temp[GI].error = shared_temp[GI + 4].error;
              shared_temp[GI].mode = shared_temp[GI + 4].mode;
              shared_temp[GI].index_selector = shared_temp[GI + 4].index_selector;
              shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 2)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 2].error )
          {
              shared_temp[GI].error = shared_temp[GI + 2].error;
              shared_temp[GI].mode = shared_temp[GI + 2].mode;
              shared_temp[GI].index_selector = shared_temp[GI + 2].index_selector;
              shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 1)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 1].error )
          {
              shared_temp[GI].error = shared_temp[GI + 1].error;
              shared_temp[GI].mode = shared_temp[GI + 1].mode;
              shared_temp[GI].index_selector = shared_temp[GI + 1].index_selector;
              shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
          }

          // x = error, y = mode with index_selector in bit 31, z = 0 (no partition
          // for modes 4-6), w = rotation.
          g_OutBuff[blockID] = uint4(shared_temp[GI].error, (shared_temp[GI].index_selector << 31) | shared_temp[GI].mode,
              0, shared_temp[GI].rotation);
      }
  }
  511. [numthreads( THREAD_GROUP_SIZE, 1, 1 )]
  512. void main(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID)
  513. {
  514. const uint MAX_USED_THREAD = 16;
  515. uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
  516. uint blockInGroup = GI / MAX_USED_THREAD;
  517. uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
  518. uint threadBase = blockInGroup * MAX_USED_THREAD;
  519. uint threadInBlock = GI - threadBase;
  520. #ifndef REF_DEVICE
  521. if (blockID >= g_num_total_blocks)
  522. {
  523. return;
  524. }
  525. #endif
  526. uint block_y = blockID / g_num_block_x;
  527. uint block_x = blockID - block_y * g_num_block_x;
  528. uint base_x = block_x * BLOCK_SIZE_X;
  529. uint base_y = block_y * BLOCK_SIZE_Y;
  530. uint mode = g_InBuff[blockID].y & 0x7FFFFFFF;
  531. uint partition = g_InBuff[blockID].z;
  532. uint index_selector = (g_InBuff[blockID].y >> 31) & 1;
  533. uint rotation = g_InBuff[blockID].w;
  534. if (threadInBlock < 16)
  535. {
  536. uint4 pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);
  537. if (1 == rotation)
  538. {
  539. pixel.ra = pixel.ar;
  540. }
  541. else if (2 == rotation)
  542. {
  543. pixel.ga = pixel.ag;
  544. }
  545. else if (3 == rotation)
  546. {
  547. pixel.ba = pixel.ab;
  548. }
  549. shared_temp[GI].pixel = pixel;
  550. }
  551. #ifdef REF_DEVICE
  552. GroupMemoryBarrierWithGroupSync();
  553. #endif
  554. uint bits = candidateSectionBit[partition];
  555. uint bits2 = candidateSectionBit2[partition - 64];
  556. uint2x4 ep;
  557. [unroll]
  558. for (int ii = 2; ii >= 0; -- ii)
  559. {
  560. if (threadInBlock < 16)
  561. {
  562. uint2x4 ep;
  563. ep[0] = MAX_UINT;
  564. ep[1] = MIN_UINT;
  565. uint4 pixel = shared_temp[GI].pixel;
  566. if (0 == ii)
  567. {
  568. if ((0 == mode) || (2 == mode))
  569. {
  570. if ((((bits2 >> (threadInBlock + 15)) & 0x02) != 2)
  571. && (((bits2 >> threadInBlock) & 0x01) != 1))
  572. {
  573. ep[0] = ep[1] = pixel;
  574. }
  575. }
  576. else if ((1 == mode) || (3 == mode) || (7 == mode))
  577. {
  578. if ( (( bits >> threadInBlock ) & 0x01) != 1 )
  579. {
  580. ep[0] = ep[1] = pixel;
  581. }
  582. }
  583. else if ((4 == mode) || (5 == mode) || (6 == mode))
  584. {
  585. ep[0] = ep[1] = pixel;
  586. }
  587. }
  588. else if (1 == ii)
  589. {
  590. if ((0 == mode) || (2 == mode))
  591. {
  592. if ((((bits2 >> (threadInBlock + 15)) & 0x02) != 2)
  593. && (((bits2 >> threadInBlock) & 0x01) == 1))
  594. {
  595. ep[0] = ep[1] = pixel;
  596. }
  597. }
  598. else if ((1 == mode) || (3 == mode) || (7 == mode))
  599. {
  600. if ( (( bits >> threadInBlock ) & 0x01) == 1 )
  601. {
  602. ep[0] = ep[1] = pixel;
  603. }
  604. }
  605. }
  606. else
  607. {
  608. if ((0 == mode) || (2 == mode))
  609. {
  610. if (((bits2 >> (threadInBlock + 15)) & 0x02) == 2)
  611. {
  612. ep[0] = ep[1] = pixel;
  613. }
  614. }
  615. }
  616. shared_temp[GI].endPoint_low = ep[0];
  617. shared_temp[GI].endPoint_high = ep[1];
  618. }
  619. #ifdef REF_DEVICE
  620. GroupMemoryBarrierWithGroupSync();
  621. #endif
  622. if (threadInBlock < 8)
  623. {
  624. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
  625. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
  626. }
  627. #ifdef REF_DEVICE
  628. GroupMemoryBarrierWithGroupSync();
  629. #endif
  630. if (threadInBlock < 4)
  631. {
  632. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
  633. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
  634. }
  635. #ifdef REF_DEVICE
  636. GroupMemoryBarrierWithGroupSync();
  637. #endif
  638. if (threadInBlock < 2)
  639. {
  640. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
  641. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
  642. }
  643. #ifdef REF_DEVICE
  644. GroupMemoryBarrierWithGroupSync();
  645. #endif
  646. if (threadInBlock < 1)
  647. {
  648. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
  649. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
  650. }
  651. #ifdef REF_DEVICE
  652. GroupMemoryBarrierWithGroupSync();
  653. #endif
  654. if (ii == threadInBlock)
  655. {
  656. ep[0] = shared_temp[threadBase].endPoint_low;
  657. ep[1] = shared_temp[threadBase].endPoint_high;
  658. }
  659. }
  660. if (threadInBlock < 3)
  661. {
  662. if (0 == mode)
  663. {
  664. compress_endpoints0( ep );
  665. }
  666. else if (1 == mode)
  667. {
  668. compress_endpoints1( ep );
  669. }
  670. else if (2 == mode)
  671. {
  672. compress_endpoints2( ep );
  673. }
  674. else if (3 == mode)
  675. {
  676. compress_endpoints3( ep );
  677. }
  678. else if (4 == mode)
  679. {
  680. compress_endpoints4( ep );
  681. }
  682. else if (5 == mode)
  683. {
  684. compress_endpoints5( ep );
  685. }
  686. else if (6 == mode)
  687. {
  688. compress_endpoints6( ep );
  689. }
  690. else //if (7 == mode)
  691. {
  692. compress_endpoints7( ep );
  693. }
  694. int4 span = ep[1] - ep[0];
  695. if (mode < 4)
  696. {
  697. span.w = 0;
  698. }
  699. if ((4 == mode) || (5 == mode))
  700. {
  701. if (0 == threadInBlock)
  702. {
  703. int2 span_norm_sqr = uint2( dot( span.rgb, span.rgb ), span.a * span.a );
  704. int2 dotProduct = int2( dot( span.rgb, shared_temp[threadBase + 0].pixel.rgb - ep[0].rgb ), span.a * ( shared_temp[threadBase + 0].pixel.a - ep[0].a ) );
  705. if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
  706. {
  707. swap(ep[0].rgb, ep[1].rgb);
  708. }
  709. if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && uint( dotProduct.y * 63.49999 ) > uint( 32 * span_norm_sqr.y ) )
  710. {
  711. swap(ep[0].a, ep[1].a);
  712. }
  713. }
  714. }
  715. else //if ((0 == mode) || (2 == mode) || (1 == mode) || (3 == mode) || (7 == mode) || (6 == mode))
  716. {
  717. int p;
  718. if (0 == threadInBlock)
  719. {
  720. p = 0;
  721. }
  722. else if (1 == threadInBlock)
  723. {
  724. p = candidateFixUpIndex1D[partition].x;
  725. }
  726. else //if (2 == threadInBlock)
  727. {
  728. p = candidateFixUpIndex1D[partition].y;
  729. }
  730. int span_norm_sqr = dot( span, span );
  731. int dotProduct = dot( span, shared_temp[threadBase + p].pixel - ep[0] );
  732. if ( span_norm_sqr > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr ) )
  733. {
  734. swap(ep[0], ep[1]);
  735. }
  736. }
  737. shared_temp[GI].endPoint_low = ep[0];
  738. shared_temp[GI].endPoint_high = ep[1];
  739. }
  740. #ifdef REF_DEVICE
  741. GroupMemoryBarrierWithGroupSync();
  742. #endif
  743. if (threadInBlock < 16)
  744. {
  745. uint color_index = 0;
  746. uint alpha_index = 0;
  747. uint2x4 ep;
  748. uint2 indexPrec;
  749. if ((0 == mode) || (1 == mode))
  750. {
  751. indexPrec = 1;
  752. }
  753. else if (6 == mode)
  754. {
  755. indexPrec = 0;
  756. }
  757. else if (4 == mode)
  758. {
  759. if (0 == index_selector)
  760. {
  761. indexPrec = uint2(2, 1);
  762. }
  763. else
  764. {
  765. indexPrec = uint2(1, 2);
  766. }
  767. }
  768. else
  769. {
  770. indexPrec = 2;
  771. }
  772. int subset_index;
  773. if ((0 == mode) || (2 == mode))
  774. {
  775. if ( (( bits2 >> ( threadInBlock + 15 ) ) & 0x02) == 2 )
  776. {
  777. subset_index = 2;
  778. }
  779. else if ( (( bits2 >> threadInBlock ) & 0x01) == 1 )
  780. {
  781. subset_index = 1;
  782. }
  783. else
  784. {
  785. subset_index = 0;
  786. }
  787. }
  788. else if ((1 == mode) || (3 == mode) || (7 == mode))
  789. {
  790. if ( (( bits >> threadInBlock ) & 0x01) == 1 )
  791. {
  792. subset_index = 1;
  793. }
  794. else
  795. {
  796. subset_index = 0;
  797. }
  798. }
  799. else
  800. {
  801. subset_index = 0;
  802. }
  803. ep[0] = shared_temp[threadBase + subset_index].endPoint_low;
  804. ep[1] = shared_temp[threadBase + subset_index].endPoint_high;
  805. int4 span = ep[1] - ep[0];
  806. if (mode < 4)
  807. {
  808. span.w = 0;
  809. }
  810. if ((4 == mode) || (5 == mode))
  811. {
  812. int2 span_norm_sqr;
  813. span_norm_sqr.x = dot( span.rgb, span.rgb );
  814. span_norm_sqr.y = span.a * span.a;
  815. int dotProduct = dot( span.rgb, shared_temp[threadBase + threadInBlock].pixel.rgb - ep[0].rgb );
  816. color_index = ( span_norm_sqr.x <= 0 || dotProduct <= 0 ) ? 0
  817. : ( ( dotProduct < span_norm_sqr.x ) ? aStep[indexPrec.x][ uint( dotProduct * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] );
  818. dotProduct = dot( span.a, shared_temp[threadBase + threadInBlock].pixel.a - ep[0].a );
  819. alpha_index = ( span_norm_sqr.y <= 0 || dotProduct <= 0 ) ? 0
  820. : ( ( dotProduct < span_norm_sqr.y ) ? aStep[indexPrec.y][ uint( dotProduct * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );
  821. if (index_selector)
  822. {
  823. swap(color_index, alpha_index);
  824. }
  825. }
  826. else
  827. {
  828. int span_norm_sqr = dot( span, span );
  829. int dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel - ep[0] );
  830. color_index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
  831. : ( ( dotProduct < span_norm_sqr ) ? aStep[indexPrec.x][ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep[indexPrec.x][63] );
  832. }
  833. shared_temp[GI].error = color_index;
  834. shared_temp[GI].mode = alpha_index;
  835. }
  836. #ifdef REF_DEVICE
  837. GroupMemoryBarrierWithGroupSync();
  838. #endif
  839. if (0 == threadInBlock)
  840. {
  841. uint4 block;
  842. if (0 == mode)
  843. {
  844. block_package0( block, partition, threadBase );
  845. }
  846. else if (1 == mode)
  847. {
  848. block_package1( block, partition, threadBase );
  849. }
  850. else if (2 == mode)
  851. {
  852. block_package2( block, partition, threadBase );
  853. }
  854. else if (3 == mode)
  855. {
  856. block_package3( block, partition, threadBase );
  857. }
  858. else if (4 == mode)
  859. {
  860. block_package4( block, rotation, index_selector, threadBase );
  861. }
  862. else if (5 == mode)
  863. {
  864. block_package5( block, rotation, threadBase );
  865. }
  866. else if (6 == mode)
  867. {
  868. block_package6( block, threadBase );
  869. }
  870. else //if (7 == mode)
  871. {
  872. block_package7( block, partition, threadBase );
  873. }
  874. g_OutBuff[blockID] = block;
  875. }
  876. }
  // Quantizes both endpoints in place. Each RGB channel keeps its upper four bits
  // (mask 0xF0); bit 3 of every channel is then overwritten with a single
  // per-endpoint flag computed from the average of the three discarded low nibbles.
  // Alpha is left untouched.
  //   endPoint : row 0 / row 1 are the two endpoints, updated in place.
  void compress_endpoints0( inout uint2x4 endPoint )
  {
      for ( uint e = 0; e < 2; ++e )
      {
          const uint3 low_nibble = endPoint[e].rgb & 0x0F;
          const uint nibble_sum = low_nibble.x + low_nibble.y + low_nibble.z;
          // The average is 0..15; masking with 0x08 sets the flag when it is >= 8.
          const uint shared_bit = ( nibble_sum / 3 ) & 0x08;
          endPoint[e].rgb = ( endPoint[e].rgb & 0xF0 ) | shared_bit;
      }
  }
  // Quantizes both endpoints in place. Each RGB channel keeps its upper six bits
  // (mask 0xFC); bit 1 of every channel in BOTH endpoints is overwritten with one
  // common flag derived from the average of the six discarded 2-bit remainders.
  // Alpha is left untouched.
  //   endPoint : row 0 / row 1 are the two endpoints, updated in place.
  void compress_endpoints1( inout uint2x4 endPoint )
  {
      const uint3 low_bits = ( endPoint[0].rgb & 0x03 ) + ( endPoint[1].rgb & 0x03 );
      // Six remainders of 0..3 each: the average is 0..3, and 0x02 picks its upper bit.
      const uint shared_bit = ( ( low_bits.x + low_bits.y + low_bits.z ) / 6 ) & 0x02;

      endPoint[0].rgb = ( endPoint[0].rgb & 0xFC ) | shared_bit;
      endPoint[1].rgb = ( endPoint[1].rgb & 0xFC ) | shared_bit;
  }
  // Rounds the RGB channels of both endpoints to five significant bits in place:
  // add half a step (0x04), clamp to 255, then clear the low three bits.
  // Alpha is left untouched.
  //   endPoint : row 0 / row 1 are the two endpoints, updated in place.
  void compress_endpoints2( inout uint2x4 endPoint )
  {
      for ( uint e = 0; e < 2; ++e )
      {
          const uint3 biased = endPoint[e].rgb + 0x04;
          endPoint[e].rgb = min( 255, biased ) & 0xF8;
      }
  }
  // Quantizes both endpoints in place. Each RGB channel keeps its upper seven bits
  // (mask 0xFE); bit 0 of every channel is overwritten with a per-endpoint flag
  // that is set only when all three original low bits were set.
  // Alpha is left untouched.
  //   endPoint : row 0 / row 1 are the two endpoints, updated in place.
  void compress_endpoints3( inout uint2x4 endPoint )
  {
      for ( uint e = 0; e < 2; ++e )
      {
          const uint3 lsb = endPoint[e].rgb & 0x01;
          // The sum is 0..3, so the integer quotient is 1 only for a sum of 3.
          const uint unanimous = ( lsb.x + lsb.y + lsb.z ) / 3;
          endPoint[e].rgb = ( endPoint[e].rgb & 0xFE ) | unanimous;
      }
  }
  // Rounds both endpoints in place: RGB to five significant bits (bias 0x04,
  // mask 0xF8) and alpha to six significant bits (bias 0x02, mask 0xFC).
  // Each channel is clamped to 255 after the bias is added.
  //   endPoint : row 0 / row 1 are the two endpoints, updated in place.
  void compress_endpoints4( inout uint2x4 endPoint )
  {
      const uint4 rounding  = uint4( 0x04, 0x04, 0x04, 0x02 );
      const uint4 keep_mask = uint4( 0xF8, 0xF8, 0xF8, 0xFC );

      endPoint[0] = min( 255, endPoint[0] + rounding ) & keep_mask;
      endPoint[1] = min( 255, endPoint[1] + rounding ) & keep_mask;
  }
  // Rounds the RGB channels of both endpoints to seven significant bits in place:
  // add 1, clamp to 255, then clear bit 0. Alpha is left untouched.
  //   endPoint : row 0 / row 1 are the two endpoints, updated in place.
  void compress_endpoints5( inout uint2x4 endPoint )
  {
      for ( uint e = 0; e < 2; ++e )
      {
          const uint3 biased = endPoint[e].rgb + 0x01;
          endPoint[e].rgb = min( 255, biased ) & 0xFE;
      }
  }
  // Quantizes both endpoints in place. All four channels keep their upper seven
  // bits (mask 0xFE); bit 0 of every channel is overwritten with a per-endpoint
  // flag that is set only when all four original low bits were set.
  //   endPoint : row 0 / row 1 are the two endpoints, updated in place.
  void compress_endpoints6( inout uint2x4 endPoint )
  {
      for ( uint e = 0; e < 2; ++e )
      {
          const uint4 lsb = endPoint[e] & 0x01;
          // votes is 0..4; (votes >> 2) is 1 only when votes == 4.
          const uint votes = lsb.x + lsb.y + lsb.z + lsb.w;
          endPoint[e] = ( endPoint[e] & 0xFE ) | ( ( votes >> 2 ) & 0x01 );
      }
  }
  // Quantizes both endpoints in place. All four channels keep their upper five
  // bits (mask 0xF8); bit 2 of every channel is overwritten with a per-endpoint
  // flag taken from the average of the four discarded 3-bit remainders.
  //   endPoint : row 0 / row 1 are the two endpoints, updated in place.
  void compress_endpoints7( inout uint2x4 endPoint )
  {
      for ( uint e = 0; e < 2; ++e )
      {
          const uint4 remainder = endPoint[e] & 0x07;
          const uint remainder_sum = remainder.x + remainder.y + remainder.z + remainder.w;
          // (sum >> 2) is the average, 0..7; 0x04 picks its top bit (average >= 4).
          const uint shared_bit = ( remainder_sum >> 2 ) & 0x04;
          endPoint[e] = ( endPoint[e] & 0xF8 ) | shared_bit;
      }
  }
  // Accessors into the shared_temp scratch array, relative to `threadBase`.
  // Every macro expects a variable named threadBase to be in scope where it expands.
  // The per-pixel color index is kept in the `error` field and the alpha index in
  // the `mode` field of the same entry.
  // Arguments are parenthesized so a compound expression still indexes the intended slot.
  #define get_end_point_l(subset) shared_temp[threadBase + (subset)].endPoint_low
  #define get_end_point_h(subset) shared_temp[threadBase + (subset)].endPoint_high
  #define get_color_index(index) shared_temp[threadBase + (index)].error
  #define get_alpha_index(index) shared_temp[threadBase + (index)].mode
  // Assembles the four output words for a mode-0 block (tag 0x01 in bit 0).
  // Three subsets are read through get_end_point_l/h(0..2): the upper nibble of
  // each R, G, B endpoint channel is packed first, then bit 3 of each endpoint's
  // red channel, then sixteen color indices starting at bit 19 of word z.
  // Index 0 and the two fix-up indices of the partition are stored one bit
  // narrower than the rest, so every later index moves down by one bit.
  //   block      : receives the packed 128 bits (x = lowest word).
  //   partition  : partition id; (partition - 64) is what gets stored.
  //   threadBase : base slot in shared_temp used by the accessor macros.
  void block_package0( out uint4 block, uint partition, uint threadBase )
  {
      const uint first_fixup  = candidateFixUpIndex1DOrdered[partition][0];
      const uint second_fixup = candidateFixUpIndex1DOrdered[partition][1];

      // Word x: tag, partition, six red nibbles, low part of the first green nibble.
      uint w0 = 0x01 | ( (partition - 64) << 1 );
      w0 |= ( get_end_point_l(0).r & 0xF0 ) << 1;
      w0 |= ( get_end_point_h(0).r & 0xF0 ) << 5;
      w0 |= ( get_end_point_l(1).r & 0xF0 ) << 9;
      w0 |= ( get_end_point_h(1).r & 0xF0 ) << 13;
      w0 |= ( get_end_point_l(2).r & 0xF0 ) << 17;
      w0 |= ( get_end_point_h(2).r & 0xF0 ) << 21;
      w0 |= ( get_end_point_l(0).g & 0xF0 ) << 25;

      // Word y: spilled green bit, remaining green nibbles, first blue nibbles.
      uint w1 = ( get_end_point_l(0).g & 0xF0 ) >> 7;
      w1 |= ( get_end_point_h(0).g & 0xF0 ) >> 3;
      w1 |= ( get_end_point_l(1).g & 0xF0 ) << 1;
      w1 |= ( get_end_point_h(1).g & 0xF0 ) << 5;
      w1 |= ( get_end_point_l(2).g & 0xF0 ) << 9;
      w1 |= ( get_end_point_h(2).g & 0xF0 ) << 13;
      w1 |= ( get_end_point_l(0).b & 0xF0 ) << 17;
      w1 |= ( get_end_point_h(0).b & 0xF0 ) << 21;
      w1 |= ( get_end_point_l(1).b & 0xF0 ) << 25;

      // Word z: spilled blue bit, remaining blue nibbles, six flag bits, index 0.
      uint w2 = ( get_end_point_l(1).b & 0xF0 ) >> 7;
      w2 |= ( get_end_point_h(1).b & 0xF0 ) >> 3;
      w2 |= ( get_end_point_l(2).b & 0xF0 ) << 1;
      w2 |= ( get_end_point_h(2).b & 0xF0 ) << 5;
      w2 |= ( get_end_point_l(0).r & 0x08 ) << 10;
      w2 |= ( get_end_point_h(0).r & 0x08 ) << 11;
      w2 |= ( get_end_point_l(1).r & 0x08 ) << 12;
      w2 |= ( get_end_point_h(1).r & 0x08 ) << 13;
      w2 |= ( get_end_point_l(2).r & 0x08 ) << 14;
      w2 |= ( get_end_point_h(2).r & 0x08 ) << 15;
      w2 |= get_color_index(0) << 19;

      uint w3 = 0;

      // Indices up to the first fix-up (at most index 4) still land in word z.
      const uint last_in_z = min( first_fixup, 4 );
      uint i = 1;
      while ( i <= last_in_z )
      {
          w2 |= get_color_index(i) << ( i * 3 + 18 );
          ++i;
      }

      if ( first_fixup >= 4 )
      {
          // i == 5 here: index 4 was placed at bit 30, so its top bit spills into word w.
          w3 |= ( get_color_index(4) & 0x04 ) >> 2;
          while ( i <= first_fixup )
          {
              w3 |= get_color_index(i) << ( i * 3 - 14 );
              ++i;
          }
      }
      else
      {
          // i == 4 here: the fix-up came before index 4, so index 4 sits one bit lower.
          w2 |= get_color_index(4) << 29;
          ++i;
      }

      // Past the first fix-up: one bit lower.
      while ( i <= second_fixup )
      {
          w3 |= get_color_index(i) << ( i * 3 - 15 );
          ++i;
      }
      // Past the second fix-up: two bits lower.
      while ( i < 16 )
      {
          w3 |= get_color_index(i) << ( i * 3 - 16 );
          ++i;
      }

      block = uint4( w0, w1, w2, w3 );
  }
  // Assembles the four output words for a mode-1 block (tag 0x02).
  // Layout produced: tag, partition at bit 2, the upper six bits of each R, G, B
  // endpoint channel for two subsets, one flag bit per subset (bit 1 of the low
  // endpoint's red channel) at z bits 16 and 17, then sixteen color indices from
  // z bit 18. Index 0 and the partition's fix-up index are stored with two bits,
  // every other index with three, so indices after the fix-up sit one bit lower.
  //   block      : receives the packed 128 bits (x = lowest word).
  //   partition  : partition id, stored as-is.
  //   threadBase : base slot in shared_temp used by the accessor macros.
  // Fix: in the final branch (fix-up index 6) indices 6 and 7 were shifted by 4
  // and 6, which left w bit 3 unused and made index 7 overlap index 8 at bit 8.
  // They now use shifts 3 and 5, contiguous with index 5 (bits 0..2) and
  // index 8 (bit 8).
  void block_package1( out uint4 block, uint partition, uint threadBase )
  {
      const uint fixup = candidateFixUpIndex1DOrdered[partition][0];

      block.x = 0x02 | ( partition << 2 )
              | ( ( get_end_point_l(0).r & 0xFC ) << 6 ) | ( ( get_end_point_h(0).r & 0xFC ) << 12 )
              | ( ( get_end_point_l(1).r & 0xFC ) << 18 ) | ( ( get_end_point_h(1).r & 0xFC ) << 24 );
      block.y = ( ( get_end_point_l(0).g & 0xFC ) >> 2 ) | ( ( get_end_point_h(0).g & 0xFC ) << 4 )
              | ( ( get_end_point_l(1).g & 0xFC ) << 10 ) | ( ( get_end_point_h(1).g & 0xFC ) << 16 )
              | ( ( get_end_point_l(0).b & 0xFC ) << 22 ) | ( ( get_end_point_h(0).b & 0xFC ) << 28 );
      block.z = ( ( get_end_point_h(0).b & 0xFC ) >> 4 ) | ( ( get_end_point_l(1).b & 0xFC ) << 2 )
              | ( ( get_end_point_h(1).b & 0xFC ) << 8 )
              | ( ( get_end_point_l(0).r & 0x02 ) << 15 ) | ( ( get_end_point_l(1).r & 0x02 ) << 16 )
              | ( get_color_index(0) << 18 );

      if ( fixup == 15 )
      {
          // No narrow index before 15: indices 5..15 are three bits apart from w bit 0.
          block.w = (get_color_index(15) << 30) | (get_color_index(14) << 27) | (get_color_index(13) << 24) | (get_color_index(12) << 21) | (get_color_index(11) << 18) | (get_color_index(10) << 15)
              | (get_color_index(9) << 12) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
          block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20);
      }
      else if ( fixup == 2 )
      {
          // Index 2 is narrow, so index 5 starts at z bit 31 and spills two bits into w.
          block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
              | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 2) | (get_color_index(5) >> 1);
          block.z |= (get_color_index(5) << 31) | (get_color_index(4) << 28) | (get_color_index(3) << 25) | (get_color_index(2) << 23) | (get_color_index(1) << 20);
      }
      else if ( fixup == 8 )
      {
          // Index 8 is narrow (w bits 9..10); indices 9..15 sit one bit lower.
          block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
              | (get_color_index(9) << 11) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
          block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20);
      }
      else // fix-up index 6
      {
          // Index 5 fills w bits 0..2, narrow index 6 bits 3..4, index 7 bits 5..7,
          // index 8 bits 8..10, and so on up to index 15 at bits 29..31.
          block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
              | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 3) | get_color_index(5);
          block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20);
      }
  }
  // Assembles the four output words for a mode-2 block (tag 0x04).
  // Layout produced: tag, (partition - 64) at bit 3, the upper five bits of each
  // R, G, B endpoint channel for three subsets, then sixteen color indices
  // starting at bit 3 of word w. Index 0 and the two fix-up indices are stored
  // one bit narrower, so every later index moves down by one bit.
  //   block      : receives the packed 128 bits (x = lowest word).
  //   partition  : partition id; (partition - 64) is what gets stored.
  //   threadBase : base slot in shared_temp used by the accessor macros.
  void block_package2( out uint4 block, uint partition, uint threadBase )
  {
      const uint first_fixup  = candidateFixUpIndex1DOrdered[partition][0];
      const uint second_fixup = candidateFixUpIndex1DOrdered[partition][1];

      // Word x: tag, partition, reds of subsets 0 and 1, low part of l(2).r.
      uint w0 = 0x04 | ( (partition - 64) << 3 );
      w0 |= ( get_end_point_l(0).r & 0xF8 ) << 6;
      w0 |= ( get_end_point_h(0).r & 0xF8 ) << 11;
      w0 |= ( get_end_point_l(1).r & 0xF8 ) << 16;
      w0 |= ( get_end_point_h(1).r & 0xF8 ) << 21;
      w0 |= ( get_end_point_l(2).r & 0xF8 ) << 26;

      // Word y: spilled bits of l(2).r, h(2).r, then five green fields.
      uint w1 = ( get_end_point_l(2).r & 0xF8 ) >> 6;
      w1 |= ( get_end_point_h(2).r & 0xF8 ) >> 1;
      w1 |= ( get_end_point_l(0).g & 0xF8 ) << 4;
      w1 |= ( get_end_point_h(0).g & 0xF8 ) << 9;
      w1 |= ( get_end_point_l(1).g & 0xF8 ) << 14;
      w1 |= ( get_end_point_h(1).g & 0xF8 ) << 19;
      w1 |= ( get_end_point_l(2).g & 0xF8 ) << 24;

      // Word z: last green field, then the blue fields.
      uint w2 = ( get_end_point_h(2).g & 0xF8 ) >> 3;
      w2 |= ( get_end_point_l(0).b & 0xF8 ) << 2;
      w2 |= ( get_end_point_h(0).b & 0xF8 ) << 7;
      w2 |= ( get_end_point_l(1).b & 0xF8 ) << 12;
      w2 |= ( get_end_point_h(1).b & 0xF8 ) << 17;
      w2 |= ( get_end_point_l(2).b & 0xF8 ) << 22;
      w2 |= ( get_end_point_h(2).b & 0xF8 ) << 27;

      // Word w: spilled bits of h(2).b, then the indices.
      uint w3 = ( ( get_end_point_h(2).b & 0xF8 ) >> 5 ) | ( get_color_index(0) << 3 );

      uint i = 1;
      while ( i <= first_fixup )
      {
          w3 |= get_color_index(i) << ( i * 2 + 2 );
          ++i;
      }
      while ( i <= second_fixup )
      {
          w3 |= get_color_index(i) << ( i * 2 + 1 );
          ++i;
      }
      while ( i < 16 )
      {
          w3 |= get_color_index(i) << ( i * 2 );
          ++i;
      }

      block = uint4( w0, w1, w2, w3 );
  }
  // Assembles the four output words for a mode-3 block (tag 0x08).
  // Layout produced: tag, partition at bit 4, the upper seven bits of each
  // R, G, B endpoint channel for two subsets, four flag bits (bit 0 of each
  // endpoint's red channel), then sixteen color indices from bit 2 of word w.
  // Index 0 and the fix-up index are stored one bit narrower than the rest.
  //   block      : receives the packed 128 bits (x = lowest word).
  //   partition  : partition id, stored as-is.
  //   threadBase : base slot in shared_temp used by the accessor macros.
  void block_package3( out uint4 block, uint partition, uint threadBase )
  {
      const uint fixup = candidateFixUpIndex1DOrdered[partition][0];

      // Word x: tag, partition, red fields (h(1).r only contributes its lowest kept bit).
      uint w0 = 0x08 | ( partition << 4 );
      w0 |= ( get_end_point_l(0).r & 0xFE ) << 9;
      w0 |= ( get_end_point_h(0).r & 0xFE ) << 16;
      w0 |= ( get_end_point_l(1).r & 0xFE ) << 23;
      w0 |= ( get_end_point_h(1).r & 0xFE ) << 30;

      // Word y: rest of h(1).r, then the green fields.
      uint w1 = ( get_end_point_h(1).r & 0xFE ) >> 2;
      w1 |= ( get_end_point_l(0).g & 0xFE ) << 5;
      w1 |= ( get_end_point_h(0).g & 0xFE ) << 12;
      w1 |= ( get_end_point_l(1).g & 0xFE ) << 19;
      w1 |= ( get_end_point_h(1).g & 0xFE ) << 26;

      // Word z: rest of h(1).g, the blue fields, first two flag bits.
      uint w2 = ( get_end_point_h(1).g & 0xFE ) >> 6;
      w2 |= ( get_end_point_l(0).b & 0xFE ) << 1;
      w2 |= ( get_end_point_h(0).b & 0xFE ) << 8;
      w2 |= ( get_end_point_l(1).b & 0xFE ) << 15;
      w2 |= ( get_end_point_h(1).b & 0xFE ) << 22;
      w2 |= ( get_end_point_l(0).r & 0x01 ) << 30;
      w2 |= ( get_end_point_h(0).r & 0x01 ) << 31;

      // Word w: remaining two flag bits, then the indices.
      uint w3 = ( get_end_point_l(1).r & 0x01 ) << 0;
      w3 |= ( get_end_point_h(1).r & 0x01 ) << 1;
      w3 |= get_color_index(0) << 2;

      uint i = 1;
      while ( i <= fixup )
      {
          w3 |= get_color_index(i) << ( i * 2 + 1 );
          ++i;
      }
      while ( i < 16 )
      {
          w3 |= get_color_index(i) << ( i * 2 );
          ++i;
      }

      block = uint4( w0, w1, w2, w3 );
  }
  // Assembles the four output words for a mode-4 block (tag 0x10).
  // Layout produced: tag, rotation at bit 5, index_selector at bit 7, the upper
  // five bits of the R, G, B endpoints and upper six bits of the alpha endpoints
  // of subset 0, then sixteen values from get_color_index spaced two bits apart
  // and sixteen values from get_alpha_index spaced three bits apart. Entry 0 of
  // each list is one bit narrower than the others.
  //   block          : receives the packed 128 bits (x = lowest word).
  //   rotation       : stored at bits 5..6 of word x.
  //   index_selector : stored at bit 7 of word x.
  //   threadBase     : base slot in shared_temp used by the accessor macros.
  void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase )
  {
      uint i;

      // Word x: header and the first color endpoint fields.
      uint w0 = 0x10 | ( rotation << 5 ) | ( index_selector << 7 );
      w0 |= ( get_end_point_l(0).r & 0xF8 ) << 5;
      w0 |= ( get_end_point_h(0).r & 0xF8 ) << 10;
      w0 |= ( get_end_point_l(0).g & 0xF8 ) << 15;
      w0 |= ( get_end_point_h(0).g & 0xF8 ) << 20;
      w0 |= ( get_end_point_l(0).b & 0xF8 ) << 25;

      // Word y: rest of blue, alpha endpoints, first list entries 0..7.
      uint w1 = ( get_end_point_l(0).b & 0xF8 ) >> 7;
      w1 |= ( get_end_point_h(0).b & 0xF8 ) >> 2;
      w1 |= ( get_end_point_l(0).a & 0xFC ) << 4;
      w1 |= ( get_end_point_h(0).a & 0xFC ) << 10;
      w1 |= get_color_index(0) << 18;
      for ( i = 1; i < 8; ++i )
      {
          // Entry 7 lands at bit 31; its upper bit is carried into word z below.
          w1 |= get_color_index(i) << ( i * 2 + 17 );
      }

      // Word z: carry of entry 7, entries 8..15, second list entries 0..5.
      uint w2 = get_color_index(7) >> 1;
      for ( i = 8; i < 16; ++i )
      {
          w2 |= get_color_index(i) << ( i * 2 - 15 );
      }
      w2 |= get_alpha_index(0) << 17;
      for ( i = 1; i < 6; ++i )
      {
          // Entry 5 lands at bit 31; its upper bits are carried into word w below.
          w2 |= get_alpha_index(i) << ( i * 3 + 16 );
      }

      // Word w: carry of entry 5, then second list entries 6..15.
      uint w3 = get_alpha_index(5) >> 1;
      for ( i = 6; i < 16; ++i )
      {
          w3 |= get_alpha_index(i) << ( i * 3 - 16 );
      }

      block = uint4( w0, w1, w2, w3 );
  }
  // Assembles the four output words for a mode-5 block (tag 0x20).
  // Layout produced: tag, rotation at bit 6, the upper seven bits of the R, G, B
  // endpoints and the full eight bits of the alpha endpoints of subset 0, then
  // sixteen color indices followed by sixteen alpha indices, each two bits wide
  // except entry 0 of each list, which is a single bit.
  //   block      : receives the packed 128 bits (x = lowest word).
  //   rotation   : stored at bits 6..7 of word x.
  //   threadBase : base slot in shared_temp used by the accessor macros.
  // Fix: alpha index 0 was OR-ed in at w bit 0, on top of the spilled high bit
  // of color index 15, leaving w bit 1 permanently zero. It now goes to bit 1,
  // directly below alpha index 1 at bits 2..3.
  void block_package5( out uint4 block, uint rotation, uint threadBase )
  {
      block.x = 0x20 | ( rotation << 6 )
              | ( ( get_end_point_l(0).r & 0xFE ) << 7 ) | ( ( get_end_point_h(0).r & 0xFE ) << 14 )
              | ( ( get_end_point_l(0).g & 0xFE ) << 21 ) | ( ( get_end_point_h(0).g & 0xFE ) << 28 );
      block.y = ( ( get_end_point_h(0).g & 0xFE ) >> 4 ) | ( ( get_end_point_l(0).b & 0xFE ) << 3 )
              | ( ( get_end_point_h(0).b & 0xFE ) << 10 ) | ( get_end_point_l(0).a << 18 ) | ( get_end_point_h(0).a << 26 );
      // Color index 0 is one bit at z bit 2; indices 1..15 follow two bits apart,
      // so index 15 starts at bit 31 and its high bit continues in w bit 0.
      block.z = ( get_end_point_h(0).a >> 6 )
              | ( get_color_index(0) << 2 ) | ( get_color_index(1) << 3 ) | ( get_color_index(2) << 5 ) | ( get_color_index(3) << 7 )
              | ( get_color_index(4) << 9 ) | ( get_color_index(5) << 11 ) | ( get_color_index(6) << 13 ) | ( get_color_index(7) << 15 )
              | ( get_color_index(8) << 17 ) | ( get_color_index(9) << 19 ) | ( get_color_index(10)<< 21 ) | ( get_color_index(11)<< 23 )
              | ( get_color_index(12)<< 25 ) | ( get_color_index(13)<< 27 ) | ( get_color_index(14)<< 29 ) | ( get_color_index(15)<< 31 );
      // w bit 0: high bit of color index 15; bit 1: alpha index 0; bits 2..31: alpha indices 1..15.
      block.w = ( get_color_index(15)>> 1 ) | ( get_alpha_index(0) << 1 ) | ( get_alpha_index(1) << 2 ) | ( get_alpha_index(2) << 4 )
              | ( get_alpha_index(3) << 6 ) | ( get_alpha_index(4) << 8 ) | ( get_alpha_index(5) << 10 ) | ( get_alpha_index(6) << 12 )
              | ( get_alpha_index(7) << 14 ) | ( get_alpha_index(8) << 16 ) | ( get_alpha_index(9) << 18 ) | ( get_alpha_index(10)<< 20 )
              | ( get_alpha_index(11)<< 22 ) | ( get_alpha_index(12)<< 24 ) | ( get_alpha_index(13)<< 26 ) | ( get_alpha_index(14)<< 28 )
              | ( get_alpha_index(15)<< 30 );
  }
  // Assembles the four output words for a mode-6 block (tag 0x40).
  // Layout produced: tag, the upper seven bits of the R, G, B, A endpoints of
  // subset 0, two flag bits (bit 0 of each endpoint's red channel), then sixteen
  // color indices four bits apart; index 0 is one bit narrower than the others.
  //   block      : receives the packed 128 bits (x = lowest word).
  //   threadBase : base slot in shared_temp used by the accessor macros.
  void block_package6( out uint4 block, uint threadBase )
  {
      uint i;

      // Word x: tag, red fields, green low field, first bits of green high field.
      uint w0 = 0x40;
      w0 |= ( get_end_point_l(0).r & 0xFE ) << 6;
      w0 |= ( get_end_point_h(0).r & 0xFE ) << 13;
      w0 |= ( get_end_point_l(0).g & 0xFE ) << 20;
      w0 |= ( get_end_point_h(0).g & 0xFE ) << 27;

      // Word y: rest of green high, blue, alpha, and the low endpoint's flag in bit 31.
      uint w1 = ( get_end_point_h(0).g & 0xFE ) >> 5;
      w1 |= ( get_end_point_l(0).b & 0xFE ) << 2;
      w1 |= ( get_end_point_h(0).b & 0xFE ) << 9;
      w1 |= ( get_end_point_l(0).a & 0xFE ) << 16;
      w1 |= ( get_end_point_h(0).a & 0xFE ) << 23;
      w1 |= ( get_end_point_l(0).r & 0x01 ) << 31;

      // Word z: high endpoint's flag, index 0 at bit 1, indices 1..7 at bits 4, 8, ..., 28.
      uint w2 = ( get_end_point_h(0).r & 0x01 ) | ( get_color_index(0) << 1 );
      for ( i = 1; i < 8; ++i )
      {
          w2 |= get_color_index(i) << ( i * 4 );
      }

      // Word w: indices 8..15 at bits 0, 4, ..., 28.
      uint w3 = 0;
      for ( i = 8; i < 16; ++i )
      {
          w3 |= get_color_index(i) << ( i * 4 - 32 );
      }

      block = uint4( w0, w1, w2, w3 );
  }
  // Assembles the four output words for a mode-7 block (tag 0x80).
  // Layout produced: tag, partition at bit 8, the upper five bits of each
  // R, G, B, A endpoint channel for two subsets, four flag bits (bit 2 of each
  // endpoint's red channel), then sixteen color indices from bit 2 of word w.
  // Index 0 and the fix-up index are stored one bit narrower than the rest.
  //   block      : receives the packed 128 bits (x = lowest word).
  //   partition  : partition id, stored as-is.
  //   threadBase : base slot in shared_temp used by the accessor macros.
  // Fix: the two subset-0 flag bits were shifted by 27 and 28, which put them on
  // z bits 29 and 30. Bit 29 is already the top bit of h(1).a (<< 22), and
  // bit 31 stayed empty. They now use shifts 28 and 29, landing on z bits 30
  // and 31, immediately before the subset-1 flags at w bits 0 and 1.
  void block_package7( out uint4 block, uint partition, uint threadBase )
  {
      const uint fixup = candidateFixUpIndex1DOrdered[partition][0];

      block.x = 0x80 | ( partition << 8 )
              | ( ( get_end_point_l(0).r & 0xF8 ) << 11 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 16 )
              | ( ( get_end_point_l(1).r & 0xF8 ) << 21 ) | ( ( get_end_point_h(1).r & 0xF8 ) << 26 );
      block.y = ( ( get_end_point_h(1).r & 0xF8 ) >> 6 ) | ( ( get_end_point_l(0).g & 0xF8 ) >> 1 )
              | ( ( get_end_point_h(0).g & 0xF8 ) << 4 ) | ( ( get_end_point_l(1).g & 0xF8 ) << 9 )
              | ( ( get_end_point_h(1).g & 0xF8 ) << 14 ) | ( ( get_end_point_l(0).b & 0xF8 ) << 19 )
              | ( ( get_end_point_h(0).b & 0xF8 ) << 24 );
      // Alpha of h(1) ends at z bit 29; the flag bits (source bit 2) go to bits 30 and 31.
      block.z = ( ( get_end_point_l(1).b & 0xF8 ) >> 3 ) | ( ( get_end_point_h(1).b & 0xF8 ) << 2 )
              | ( ( get_end_point_l(0).a & 0xF8 ) << 7 ) | ( ( get_end_point_h(0).a & 0xF8 ) << 12 )
              | ( ( get_end_point_l(1).a & 0xF8 ) << 17 ) | ( ( get_end_point_h(1).a & 0xF8 ) << 22 )
              | ( ( get_end_point_l(0).r & 0x04 ) << 28 ) | ( ( get_end_point_h(0).r & 0x04 ) << 29 );
      block.w = ( ( get_end_point_l(1).r & 0x04 ) >> 2 ) | ( ( get_end_point_h(1).r & 0x04 ) >> 1 )
              | ( get_color_index(0) << 2 );

      uint i = 1;
      // Up to and including the fix-up index, entries are offset by index 0's single bit.
      for ( ; i <= fixup; i ++ )
      {
          block.w |= get_color_index(i) << ( i * 2 + 1 );
      }
      // After the fix-up index, entries sit one bit lower.
      for ( ; i < 16; i ++ )
      {
          block.w |= get_color_index(i) << ( i * 2 );
      }
  }