  // BC7Encode_TryMode456CS.hlsl (63 KB)
  // RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
  // CHECK: flattenedThreadIdInGroup
  // CHECK: groupId
  // CHECK: bufferLoad
  // CHECK: textureLoad
  // CHECK: UMax
  // CHECK: UMin
  // CHECK: barrier
  // CHECK: IMad
  // CHECK: barrier
  // CHECK: bufferStore
  //--------------------------------------------------------------------------------------
  // File: BC7Encode.hlsl
  //
  // The Compute Shader for BC7 Encoder
  //
  // Copyright (c) Microsoft Corporation. All rights reserved.
  //--------------------------------------------------------------------------------------
  // Build-time switches and numeric limits used by the encoder.

  // When defined, the explicit GroupMemoryBarrierWithGroupSync() calls that are
  // wrapped in `#ifdef REF_DEVICE` are compiled in, and the early-out guarded by
  // `#ifndef REF_DEVICE` in the mode 4/5/6 entry point is compiled out.
  #define REF_DEVICE

  #define CHAR_LENGTH 8
  #define NCHANNELS   4
  #define BC7_UNORM   98

  // Seeds for running min / max reductions over unsigned values.
  #define MAX_UINT 0xFFFFFFFF
  #define MIN_UINT 0
  // Two-subset partition masks, one entry per partition id 0..63.
  // Bit i of an entry is tested per pixel i (0..15): a set bit places the pixel
  // in subset 1, a clear bit in subset 0.
  static const uint candidateSectionBit[64] =
  {
      0xCCCC, 0x8888, 0xEEEE, 0xECC8,   //  0 ..  3
      0xC880, 0xFEEC, 0xFEC8, 0xEC80,   //  4 ..  7
      0xC800, 0xFFEC, 0xFE80, 0xE800,   //  8 .. 11
      0xFFE8, 0xFF00, 0xFFF0, 0xF000,   // 12 .. 15
      0xF710, 0x008E, 0x7100, 0x08CE,   // 16 .. 19
      0x008C, 0x7310, 0x3100, 0x8CCE,   // 20 .. 23
      0x088C, 0x3110, 0x6666, 0x366C,   // 24 .. 27
      0x17E8, 0x0FF0, 0x718E, 0x399C,   // 28 .. 31
      0xAAAA, 0xF0F0, 0x5A5A, 0x33CC,   // 32 .. 35
      0x3C3C, 0x55AA, 0x9696, 0xA55A,   // 36 .. 39
      0x73CE, 0x13C8, 0x324C, 0x3BDC,   // 40 .. 43
      0x6996, 0xC33C, 0x9966, 0x0660,   // 44 .. 47
      0x0272, 0x04E4, 0x4E40, 0x2720,   // 48 .. 51
      0xC936, 0x936C, 0x39C6, 0x639C,   // 52 .. 55
      0x9336, 0x9CC6, 0x817E, 0xE718,   // 56 .. 59
      0xCCF0, 0x0FCC, 0x7744, 0xEE22    // 60 .. 63
  };
  // Partition masks for partition ids 64..127 (array index = partition - 64).
  // NOTE(review): each entry looks like two packed 16-bit masks (low half = bit 0
  // of the subset index, high half = bit 1) - confirm against the packing code.
  static const uint candidateSectionBit2[64] =
  {
      0xF60008CC, 0x73008CC8, 0x3310CC80, 0x00CEEC00,   //  64 ..  67
      0xCC003300, 0xCC0000CC, 0x00CCFF00, 0x3300CCCC,   //  68 ..  71
      0xF0000F00, 0xF0000FF0, 0xFF0000F0, 0x88884444,   //  72 ..  75
      0x88886666, 0xCCCC2222, 0xEC80136C, 0x7310008C,   //  76 ..  79
      0xC80036C8, 0x310008CE, 0xCCC03330, 0x0CCCF000,   //  80 ..  83
      0xEE0000EE, 0x77008888, 0xCC0022C0, 0x33004430,   //  84 ..  87
      0x00CC0C22, 0xFC880344, 0x06606996, 0x66009960,   //  88 ..  91
      0xC88C0330, 0xF9000066, 0x0CC0C22C, 0x73108C00,   //  92 ..  95
      0xEC801300, 0x08CEC400, 0xEC80004C, 0x44442222,   //  96 ..  99
      0x0F0000F0, 0x49242492, 0x42942942, 0x0C30C30C,   // 100 .. 103
      0x03C0C03C, 0xFF0000AA, 0x5500AA00, 0xCCCC3030,   // 104 .. 107
      0x0C0CC0C0, 0x66669090, 0x0FF0A00A, 0x5550AAA0,   // 108 .. 111
      0xF0000AAA, 0x0E0EE0E0, 0x88887070, 0x99906660,   // 112 .. 115
      0xE00E0EE0, 0x88880770, 0xF0000666, 0x99006600,   // 116 .. 119
      0xFF000066, 0xC00C0CC0, 0xCCCC0330, 0x90006000,   // 120 .. 123
      0x08088080, 0xEEEE1010, 0xFFF0000A, 0x731008CE    // 124 .. 127
  };
  // Fix-up pixel indices per partition id (0..127).
  //   .x - fix-up index used for every partition.
  //   .y - second fix-up index; only meaningful for partitions 64..127 and must
  //        not be used for ids below 64 (stored as 0 there).
  static const uint2 candidateFixUpIndex1D[128] =
  {
      // ---- partitions 0 .. 63 (.y unused) ----
      {15, 0}, {15, 0}, {15, 0}, {15, 0},   //   0 ..   3
      {15, 0}, {15, 0}, {15, 0}, {15, 0},   //   4 ..   7
      {15, 0}, {15, 0}, {15, 0}, {15, 0},   //   8 ..  11
      {15, 0}, {15, 0}, {15, 0}, {15, 0},   //  12 ..  15
      {15, 0}, { 2, 0}, { 8, 0}, { 2, 0},   //  16 ..  19
      { 2, 0}, { 8, 0}, { 8, 0}, {15, 0},   //  20 ..  23
      { 2, 0}, { 8, 0}, { 2, 0}, { 2, 0},   //  24 ..  27
      { 8, 0}, { 8, 0}, { 2, 0}, { 2, 0},   //  28 ..  31
      {15, 0}, {15, 0}, { 6, 0}, { 8, 0},   //  32 ..  35
      { 2, 0}, { 8, 0}, {15, 0}, {15, 0},   //  36 ..  39
      { 2, 0}, { 8, 0}, { 2, 0}, { 2, 0},   //  40 ..  43
      { 2, 0}, {15, 0}, {15, 0}, { 6, 0},   //  44 ..  47
      { 6, 0}, { 2, 0}, { 6, 0}, { 8, 0},   //  48 ..  51
      {15, 0}, {15, 0}, { 2, 0}, { 2, 0},   //  52 ..  55
      {15, 0}, {15, 0}, {15, 0}, {15, 0},   //  56 ..  59
      {15, 0}, { 2, 0}, { 2, 0}, {15, 0},   //  60 ..  63

      // ---- partitions 64 .. 127 (both components used) ----
      { 3,15}, { 3, 8}, {15, 8}, {15, 3},   //  64 ..  67
      { 8,15}, { 3,15}, {15, 3}, {15, 8},   //  68 ..  71
      { 8,15}, { 8,15}, { 6,15}, { 6,15},   //  72 ..  75
      { 6,15}, { 5,15}, { 3,15}, { 3, 8},   //  76 ..  79
      { 3,15}, { 3, 8}, { 8,15}, {15, 3},   //  80 ..  83
      { 3,15}, { 3, 8}, { 6,15}, {10, 8},   //  84 ..  87
      { 5, 3}, { 8,15}, { 8, 6}, { 6,10},   //  88 ..  91
      { 8,15}, { 5,15}, {15,10}, {15, 8},   //  92 ..  95
      { 8,15}, {15, 3}, { 3,15}, { 5,10},   //  96 ..  99
      { 6,10}, {10, 8}, { 8, 9}, {15,10},   // 100 .. 103
      {15, 6}, { 3,15}, {15, 8}, { 5,15},   // 104 .. 107
      // The spec does not mark the first fix-up index for the next row; 15 is
      // used for those entries and appears to be correct.
      {15, 3}, {15, 6}, {15, 6}, {15, 8},   // 108 .. 111
      { 3,15}, {15, 3}, { 5,15}, { 5,15},   // 112 .. 115
      { 5,15}, { 8,15}, { 5,15}, {10,15},   // 116 .. 119
      { 5,15}, {10,15}, { 8,15}, {13,15},   // 120 .. 123
      {15, 3}, {12,15}, { 3,15}, { 3, 8}    // 124 .. 127
  };
  // Same data as candidateFixUpIndex1D, except that for partitions 64..127 the
  // two indices of each entry are stored in ascending order (.x < .y).
  // As in the unordered table, .y must not be used for partition ids below 64.
  static const uint2 candidateFixUpIndex1DOrdered[128] =
  {
      // ---- partitions 0 .. 63 (.y unused) ----
      {15, 0}, {15, 0}, {15, 0}, {15, 0},   //   0 ..   3
      {15, 0}, {15, 0}, {15, 0}, {15, 0},   //   4 ..   7
      {15, 0}, {15, 0}, {15, 0}, {15, 0},   //   8 ..  11
      {15, 0}, {15, 0}, {15, 0}, {15, 0},   //  12 ..  15
      {15, 0}, { 2, 0}, { 8, 0}, { 2, 0},   //  16 ..  19
      { 2, 0}, { 8, 0}, { 8, 0}, {15, 0},   //  20 ..  23
      { 2, 0}, { 8, 0}, { 2, 0}, { 2, 0},   //  24 ..  27
      { 8, 0}, { 8, 0}, { 2, 0}, { 2, 0},   //  28 ..  31
      {15, 0}, {15, 0}, { 6, 0}, { 8, 0},   //  32 ..  35
      { 2, 0}, { 8, 0}, {15, 0}, {15, 0},   //  36 ..  39
      { 2, 0}, { 8, 0}, { 2, 0}, { 2, 0},   //  40 ..  43
      { 2, 0}, {15, 0}, {15, 0}, { 6, 0},   //  44 ..  47
      { 6, 0}, { 2, 0}, { 6, 0}, { 8, 0},   //  48 ..  51
      {15, 0}, {15, 0}, { 2, 0}, { 2, 0},   //  52 ..  55
      {15, 0}, {15, 0}, {15, 0}, {15, 0},   //  56 ..  59
      {15, 0}, { 2, 0}, { 2, 0}, {15, 0},   //  60 ..  63

      // ---- partitions 64 .. 127 (both components used, ascending) ----
      { 3,15}, { 3, 8}, { 8,15}, { 3,15},   //  64 ..  67
      { 8,15}, { 3,15}, { 3,15}, { 8,15},   //  68 ..  71
      { 8,15}, { 8,15}, { 6,15}, { 6,15},   //  72 ..  75
      { 6,15}, { 5,15}, { 3,15}, { 3, 8},   //  76 ..  79
      { 3,15}, { 3, 8}, { 8,15}, { 3,15},   //  80 ..  83
      { 3,15}, { 3, 8}, { 6,15}, { 8,10},   //  84 ..  87
      { 3, 5}, { 8,15}, { 6, 8}, { 6,10},   //  88 ..  91
      { 8,15}, { 5,15}, {10,15}, { 8,15},   //  92 ..  95
      { 8,15}, { 3,15}, { 3,15}, { 5,10},   //  96 ..  99
      { 6,10}, { 8,10}, { 8, 9}, {10,15},   // 100 .. 103
      { 6,15}, { 3,15}, { 8,15}, { 5,15},   // 104 .. 107
      // The spec does not mark the first fix-up index for the next row; 15 is
      // used for those entries and appears to be correct.
      { 3,15}, { 6,15}, { 6,15}, { 8,15},   // 108 .. 111
      { 3,15}, { 3,15}, { 5,15}, { 5,15},   // 112 .. 115
      { 5,15}, { 8,15}, { 5,15}, {10,15},   // 116 .. 119
      { 5,15}, {10,15}, { 8,15}, {13,15},   // 120 .. 123
      { 3,15}, {12,15}, { 3,15}, { 3, 8}    // 124 .. 127
  };
  // Four 4x4 permutation matrices, one per rotation id 0..3. Each matrix is
  // listed as its four rows.
  static const uint4x4 candidateRotation[4] =
  {
      // rotation 0: identity
      {1,0,0,0}, {0,1,0,0}, {0,0,1,0}, {0,0,0,1},
      // rotation 1: components 0 and 3 exchanged
      {0,0,0,1}, {0,1,0,0}, {0,0,1,0}, {1,0,0,0},
      // rotation 2: components 1 and 3 exchanged
      {1,0,0,0}, {0,0,0,1}, {0,0,1,0}, {0,1,0,0},
      // rotation 3: components 2 and 3 exchanged
      {1,0,0,0}, {0,1,0,0}, {0,0,0,1}, {0,0,1,0}
  };
  // Index precision pairs, eight entries.
  // NOTE(review): presumably indexed by BC7 mode 0..7 with (.x, .y) = (color bits,
  // alpha bits) - confirm against the block_package* functions.
  static const uint2 candidateIndexPrec[8] =
  {
      {3,0},
      {3,0},
      {2,0},
      {2,0},
      {2,3},   // color index and alpha index can exchange
      {2,2},
      {4,4},
      {2,2}
  };
  // Interpolation weights (out of 64) for each index value.
  // The first subscript selects the index precision used by the shader:
  //   row 0 - 16 weights, row 1 - 8 weights, row 2 - 4 weights.
  // Unused trailing slots of the shorter rows are zero.
  static const uint aWeight[3][16] =
  {
      { 0,  4,  9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 },
      { 0,  9, 18, 27, 37, 46, 55, 64,  0,  0,  0,  0,  0,  0,  0,  0 },
      { 0, 21, 43, 64,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 }
  };
  // Lookup from a position 0..63 along the endpoint span to an index value.
  // The shader addresses it as aStep[precision][uint(dot * 63.49999 / norm)];
  // row k pairs with the weights in aWeight[k].
  static const uint aStep[3][64] =
  {
      // Row 0 - weights 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
      {
           0,  0,  0,  1,  1,  1,  1,  2,
           2,  2,  2,  2,  3,  3,  3,  3,
           4,  4,  4,  4,  5,  5,  5,  5,
           6,  6,  6,  6,  6,  7,  7,  7,
           7,  8,  8,  8,  8,  9,  9,  9,
           9, 10, 10, 10, 10, 10, 11, 11,
          11, 11, 12, 12, 12, 12, 13, 13,
          13, 13, 14, 14, 14, 14, 15, 15
      },
      // Row 1 - weights 0, 9, 18, 27, 37, 46, 55, 64
      {
          0, 0, 0, 0, 0, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 2, 2,
          2, 2, 2, 2, 2, 2, 2, 3,
          3, 3, 3, 3, 3, 3, 3, 3,
          3, 4, 4, 4, 4, 4, 4, 4,
          4, 4, 5, 5, 5, 5, 5, 5,
          5, 5, 5, 6, 6, 6, 6, 6,
          6, 6, 6, 6, 7, 7, 7, 7
      },
      // Row 2 - weights 0, 21, 43, 64
      {
          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1,
          1, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2, 2, 2, 3, 3,
          3, 3, 3, 3, 3, 3, 3, 3
      }
  };
  // Per-dispatch constants bound at b0. Field order defines the buffer layout.
  cbuffer cbCS : register( b0 )
  {
      uint g_tex_width;
      uint g_num_block_x;        // divisor used to turn a linear block id into (block_x, block_y)
      uint g_format;
      uint g_mode_id;            // BC7 mode evaluated by the TryMode137CS / TryMode02CS passes
      uint g_start_block_id;     // added to the group-derived block id
      uint g_num_total_blocks;   // upper bound checked when REF_DEVICE is not defined
  };
  // Forward declarations, grouped by purpose and listed in mode order.

  // Endpoint quantisation, one routine per BC7 mode; the endpoint pair is
  // modified in place.
  void compress_endpoints0( inout uint2x4 endPoint ); // Mode 0
  void compress_endpoints1( inout uint2x4 endPoint ); // Mode 1
  void compress_endpoints2( inout uint2x4 endPoint ); // Mode 2
  void compress_endpoints3( inout uint2x4 endPoint ); // Mode 3
  void compress_endpoints4( inout uint2x4 endPoint ); // Mode 4
  void compress_endpoints5( inout uint2x4 endPoint ); // Mode 5
  void compress_endpoints6( inout uint2x4 endPoint ); // Mode 6
  void compress_endpoints7( inout uint2x4 endPoint ); // Mode 7

  // Block packing, one routine per BC7 mode; each writes the 128-bit block.
  void block_package0( out uint4 block, uint partition, uint threadBase );                    // Mode 0
  void block_package1( out uint4 block, uint partition, uint threadBase );                    // Mode 1
  void block_package2( out uint4 block, uint partition, uint threadBase );                    // Mode 2
  void block_package3( out uint4 block, uint partition, uint threadBase );                    // Mode 3
  void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase ); // Mode 4
  void block_package5( out uint4 block, uint rotation, uint threadBase );                     // Mode 5
  void block_package6( out uint4 block, uint threadBase );                                    // Mode 6
  void block_package7( out uint4 block, uint partition, uint threadBase );                    // Mode 7
  // Exchange helpers for unsigned scalars and vectors.
  // Each overload swaps the contents of `lhs` and `rhs` in place. The temporary
  // has the same unsigned type as the operands, so no signed/unsigned
  // conversion happens on the way through.

  // Swap two uint4 values.
  void swap(inout uint4 lhs, inout uint4 rhs)
  {
      uint4 tmp = lhs;
      lhs = rhs;
      rhs = tmp;
  }

  // Swap two uint3 values.
  void swap(inout uint3 lhs, inout uint3 rhs)
  {
      uint3 tmp = lhs;
      lhs = rhs;
      rhs = tmp;
  }

  // Swap two uint scalars.
  void swap(inout uint lhs, inout uint rhs)
  {
      uint tmp = lhs;
      lhs = rhs;
      rhs = tmp;
  }
  // Resource bindings.
  Texture2D                 g_Input   : register( t0 );   // source texels, loaded per pixel with Load()
  StructuredBuffer<uint4>   g_InBuff  : register( t1 );   // per-block result read by later passes
  RWStructuredBuffer<uint4> g_OutBuff : register( u0 );   // per-block result written by each pass

  // Thread-group and block geometry.
  #define THREAD_GROUP_SIZE 64
  #define BLOCK_SIZE_Y      4
  #define BLOCK_SIZE_X      4
  #define BLOCK_SIZE        (BLOCK_SIZE_Y * BLOCK_SIZE_X)
  // Per-thread scratch record kept in group-shared memory.
  struct BufferShared
  {
      uint4 pixel;            // texel loaded by this thread, each channel clamped to 0..255
      uint  error;            // error of this thread's candidate; reduced to the minimum
      uint  mode;             // mode of the candidate that produced `error`
      uint  partition;        // partition id of the candidate
      uint  index_selector;   // index selector of the candidate
      uint  rotation;         // rotation of the candidate
      uint4 endPoint_low;     // running per-channel minimum
      uint4 endPoint_high;    // running per-channel maximum
  };

  // One record per thread of the group.
  groupshared BufferShared shared_temp[THREAD_GROUP_SIZE];
  // Mode 4 / 5 / 6 trial pass.
  //
  // Every 16 consecutive threads of a group handle one 4x4 block, so a group of
  // THREAD_GROUP_SIZE threads processes THREAD_GROUP_SIZE / 16 blocks.
  //
  // Steps:
  //   1. Each of the 16 threads loads one texel into shared memory.
  //   2. A tree reduction computes the per-channel min / max of the block; the
  //      result lands in shared_temp[threadBase].endPoint_low / endPoint_high.
  //   3. Threads try different encodings and measure the squared error:
  //        threads 0..7   - mode 4, rotation = thread / 2, index_selector = thread & 1
  //        threads 8..11  - mode 5, rotation = thread - 8
  //        thread  12     - mode 6
  //        threads 13..15 - idle (error stays 0xFFFFFFFF)
  //   4. A second tree reduction keeps the candidate with the smallest error and
  //      thread 0 of the block writes
  //        (error, (index_selector << 31) | mode, 0, rotation)
  //      to g_OutBuff[blockID].
  //
  // GI      - flattened thread index inside the group.
  // groupID - group index; only .x is used.
  [numthreads( THREAD_GROUP_SIZE, 1, 1 )]
  void main( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID )
  {
      const uint MAX_USED_THREAD = 16;
      uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
      uint blockInGroup = GI / MAX_USED_THREAD;
      uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
      uint threadBase = blockInGroup * MAX_USED_THREAD;
      uint threadInBlock = GI - threadBase;

  #ifndef REF_DEVICE
      if (blockID >= g_num_total_blocks)
      {
          return;
      }
  #endif

      // Linear block id -> top-left texel of the block.
      uint block_y = blockID / g_num_block_x;
      uint block_x = blockID - block_y * g_num_block_x;
      uint base_x = block_x * BLOCK_SIZE_X;
      uint base_y = block_y * BLOCK_SIZE_Y;

      // Step 1: load one texel per thread and seed the min / max reduction with it.
      if (threadInBlock < 16)
      {
          shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);

          shared_temp[GI].endPoint_low = shared_temp[GI].pixel;
          shared_temp[GI].endPoint_high = shared_temp[GI].pixel;
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif

      // Step 2: min / max tree reduction over the 16 texels (strides 8, 4, 2, 1).
      if (threadInBlock < 8)
      {
          shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
          shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 4)
      {
          shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
          shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 2)
      {
          shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
          shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 1)
      {
          shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
          shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif

      // Every thread starts from the block-wide bounding box.
      uint2x4 endPoint;
      endPoint[0] = shared_temp[threadBase].endPoint_low;
      endPoint[1] = shared_temp[threadBase].endPoint_high;

      uint error = 0xFFFFFFFF;
      uint mode = 0;
      uint index_selector = 0;
      uint rotation = 0;

      // indexPrec selects the aStep / aWeight row for color (.x) and alpha (.y):
      // 2 represents 2bit index precision; 1 represents 3bit index precision.
      uint2 indexPrec;
      if (threadInBlock < 8)
      {
          if (0 == (threadInBlock & 1))
          {
              indexPrec = uint2( 2, 1 );
          }
          else
          {
              index_selector = 1;
              indexPrec = uint2( 1, 2 );
          }
      }
      else
      {
          // 2bit index precision for both color and alpha.
          indexPrec = uint2( 2, 2 );
      }

      uint4 pixel_r;
      uint color_index;
      uint alpha_index;
      int4 span;
      int2 span_norm_sqr;
      int2 dotProduct;

      // Step 3a: modes 4 and 5 (separate color and alpha index sets).
      if (threadInBlock < 12)
      {
          // Pick the rotation for this thread and apply it to the endpoints.
          if ((threadInBlock < 2) || (8 == threadInBlock))        // rotation = 0
          {
              rotation = 0;
          }
          else if ((threadInBlock < 4) || (9 == threadInBlock))   // rotation = 1
          {
              endPoint[0].ra = endPoint[0].ar;
              endPoint[1].ra = endPoint[1].ar;

              rotation = 1;
          }
          else if ((threadInBlock < 6) || (10 == threadInBlock))  // rotation = 2
          {
              endPoint[0].ga = endPoint[0].ag;
              endPoint[1].ga = endPoint[1].ag;

              rotation = 2;
          }
          else if ((threadInBlock < 8) || (11 == threadInBlock))  // rotation = 3
          {
              endPoint[0].ba = endPoint[0].ab;
              endPoint[1].ba = endPoint[1].ab;

              rotation = 3;
          }

          if (threadInBlock < 8)
          {
              mode = 4;
              compress_endpoints4( endPoint );
          }
          else
          {
              mode = 5;
              compress_endpoints5( endPoint );
          }

          // Look at pixel 0 (rotated like the endpoints) to orient the endpoints.
          uint4 pixel = shared_temp[threadBase + 0].pixel;
          if (1 == rotation)
          {
              pixel.ra = pixel.ar;
          }
          else if (2 == rotation)
          {
              pixel.ga = pixel.ag;
          }
          else if (3 == rotation)
          {
              pixel.ba = pixel.ab;
          }

          span = endPoint[1] - endPoint[0];
          span_norm_sqr = uint2( dot( span.rgb, span.rgb ), span.a * span.a );

          // If pixel 0 projects past the midpoint of the span, swap the endpoints,
          // independently for color and for alpha.
          dotProduct = int2( dot( span.rgb, pixel.rgb - endPoint[0].rgb ), span.a * ( pixel.a - endPoint[0].a ) );
          if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
          {
              span.rgb = -span.rgb;
              swap(endPoint[0].rgb, endPoint[1].rgb);
          }
          if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && uint( dotProduct.y * 63.49999 ) > uint( 32 * span_norm_sqr.y ) )
          {
              span.a = -span.a;
              swap(endPoint[0].a, endPoint[1].a);
          }

          // Accumulate the squared reconstruction error over the 16 pixels.
          error = 0;
          for ( uint i = 0; i < 16; i ++ )
          {
              pixel = shared_temp[threadBase + i].pixel;
              if (1 == rotation)
              {
                  pixel.ra = pixel.ar;
              }
              else if (2 == rotation)
              {
                  pixel.ga = pixel.ag;
              }
              else if (3 == rotation)
              {
                  pixel.ba = pixel.ab;
              }

              dotProduct.x = dot( span.rgb, pixel.rgb - endPoint[0].rgb );
              color_index = ( span_norm_sqr.x <= 0 || dotProduct.x <= 0 ) ? 0
                  : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[indexPrec.x][ uint( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] );
              dotProduct.y = dot( span.a, pixel.a - endPoint[0].a );
              alpha_index = ( span_norm_sqr.y <= 0 || dotProduct.y <= 0 ) ? 0
                  : ( ( dotProduct.y < span_norm_sqr.y ) ? aStep[indexPrec.y][ uint( dotProduct.y * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );

              // color_index was quantised with indexPrec.x and alpha_index with
              // indexPrec.y, and each is looked up in the matching aWeight row
              // below. They must therefore NOT be exchanged here, even when
              // index_selector is set; index_selector only matters when the
              // block is packed.

              // Reconstruct with explicit swizzles: using the whole uint4
              // endpoint would make the alpha expression truncate to .x and
              // interpolate towards the wrong channel.
              pixel_r.rgb = ( ( 64 - aWeight[indexPrec.x][color_index] ) * endPoint[0].rgb
                  + aWeight[indexPrec.x][color_index] * endPoint[1].rgb + 32 ) >> 6;
              pixel_r.a = ( ( 64 - aWeight[indexPrec.y][alpha_index] ) * endPoint[0].a
                  + aWeight[indexPrec.y][alpha_index] * endPoint[1].a + 32 ) >> 6;

              pixel_r -= pixel;
              error += dot(pixel_r, pixel_r);
          }
      }
      // Step 3b: mode 6 (single index set over all four channels).
      else if (12 == threadInBlock)//Mode6
      {
          compress_endpoints6( endPoint );

          uint4 pixel = shared_temp[threadBase + 0].pixel;

          span = endPoint[1] - endPoint[0];
          span_norm_sqr = dot( span, span );
          dotProduct = dot( span, pixel - endPoint[0] );
          if ( span_norm_sqr.x > 0 && dotProduct.x >= 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
          {
              span = -span;
              swap(endPoint[0], endPoint[1]);
          }

          error = 0;
          for ( uint i = 0; i < 16; i ++ )
          {
              pixel = shared_temp[threadBase + i].pixel;

              dotProduct.x = dot( span, pixel - endPoint[0] );
              color_index = ( span_norm_sqr.x <= 0 || dotProduct.x <= 0 ) ? 0
                  : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[0][ uint( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[0][63] );

              pixel_r = ( ( 64 - aWeight[0][color_index] ) * endPoint[0]
                  + aWeight[0][color_index] * endPoint[1] + 32 ) >> 6;

              pixel_r -= pixel;
              error += dot(pixel_r, pixel_r);
          }

          mode = 6;
          rotation = 0;
      }

      // Publish this thread's candidate for the reduction.
      shared_temp[GI].error = error;
      shared_temp[GI].mode = mode;
      shared_temp[GI].index_selector = index_selector;
      shared_temp[GI].rotation = rotation;
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif

      // Step 4: keep the candidate with the smallest error (strides 8, 4, 2, 1).
      if (threadInBlock < 8)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 8].error )
          {
              shared_temp[GI].error = shared_temp[GI + 8].error;
              shared_temp[GI].mode = shared_temp[GI + 8].mode;
              shared_temp[GI].index_selector = shared_temp[GI + 8].index_selector;
              shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 4)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 4].error )
          {
              shared_temp[GI].error = shared_temp[GI + 4].error;
              shared_temp[GI].mode = shared_temp[GI + 4].mode;
              shared_temp[GI].index_selector = shared_temp[GI + 4].index_selector;
              shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 2)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 2].error )
          {
              shared_temp[GI].error = shared_temp[GI + 2].error;
              shared_temp[GI].mode = shared_temp[GI + 2].mode;
              shared_temp[GI].index_selector = shared_temp[GI + 2].index_selector;
              shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 1)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 1].error )
          {
              shared_temp[GI].error = shared_temp[GI + 1].error;
              shared_temp[GI].mode = shared_temp[GI + 1].mode;
              shared_temp[GI].index_selector = shared_temp[GI + 1].index_selector;
              shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
          }

          // x: best error, y: index_selector in bit 31 | mode, z: unused, w: rotation.
          g_OutBuff[blockID] = uint4(shared_temp[GI].error, (shared_temp[GI].index_selector << 31) | shared_temp[GI].mode,
              0, shared_temp[GI].rotation);
      }
  }
  // Mode 1 / 3 / 7 trial pass (two-subset modes).
  //
  // One thread group handles one 4x4 block; each of the 64 threads evaluates one
  // of the 64 two-subset partitions from candidateSectionBit. g_mode_id picks
  // the mode: 1, 3, or (any other value) 7.
  //
  // Steps:
  //   1. Threads 0..15 load the block's texels into shared memory.
  //   2. Each thread fits a min / max endpoint pair per subset, quantises it
  //      with the mode's compress_endpoints routine and sums the squared
  //      reconstruction error (alpha is ignored unless the mode is 7).
  //   3. A tree reduction keeps the partition with the smallest error.
  //   4. Thread 0 writes (error, mode, partition, 0) to g_OutBuff[blockID] if it
  //      beats the error already stored in g_InBuff[blockID].x; otherwise the
  //      previous result is copied through.
  //
  // GI      - flattened thread index inside the group.
  // groupID - group index; only .x is used.
  [numthreads( THREAD_GROUP_SIZE, 1, 1 )]
  void TryMode137CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID )
  {
      const uint MAX_USED_THREAD = 64;
      uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
      uint blockInGroup = GI / MAX_USED_THREAD;
      uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
      uint threadBase = blockInGroup * MAX_USED_THREAD;
      uint threadInBlock = GI - threadBase;

      // Linear block id -> top-left texel of the block.
      uint block_y = blockID / g_num_block_x;
      uint block_x = blockID - block_y * g_num_block_x;
      uint base_x = block_x * BLOCK_SIZE_X;
      uint base_y = block_y * BLOCK_SIZE_Y;

      // Step 1: load the 16 texels of the block.
      if (threadInBlock < 16)
      {
          shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);
      }
      GroupMemoryBarrierWithGroupSync();

      shared_temp[GI].error = 0xFFFFFFFF;

      uint4 pixel_r;
      uint2x4 endPoint[2];    // endPoint[subset][0 = low, 1 = high]
      uint color_index;

      // Step 2: evaluate one partition per thread.
      if (threadInBlock < 64)
      {
          uint partition = threadInBlock;
          uint error = 0;

          // Bounding box of each subset.
          endPoint[0][0] = MAX_UINT;
          endPoint[0][1] = MIN_UINT;
          endPoint[1][0] = MAX_UINT;
          endPoint[1][1] = MIN_UINT;
          uint bits = candidateSectionBit[partition];
          for ( uint i = 0; i < 16; i ++ )
          {
              uint4 pixel = shared_temp[threadBase + i].pixel;
              if ( (( bits >> i ) & 0x01) == 1 )
              {
                  endPoint[1][0] = min( endPoint[1][0], pixel );
                  endPoint[1][1] = max( endPoint[1][1], pixel );
              }
              else
              {
                  endPoint[0][0] = min( endPoint[0][0], pixel );
                  endPoint[0][1] = max( endPoint[0][1], pixel );
              }
          }

          // Quantise both endpoint pairs for the selected mode.
          for ( uint i = 0; i < 2; i ++ )
          {
              if (g_mode_id == 1)
              {
                  compress_endpoints1( endPoint[i] );
              }
              else if (g_mode_id == 3)
              {
                  compress_endpoints3( endPoint[i] );
              }
              else //if (g_mode_id == 7)
              {
                  compress_endpoints7( endPoint[i] );
              }
          }

          int4 span[2];
          span[0] = endPoint[0][1] - endPoint[0][0];
          span[1] = endPoint[1][1] - endPoint[1][0];

          // Only mode 7 carries alpha; drop it from the projection otherwise.
          if (g_mode_id != 7)
          {
              span[0].w = span[1].w = 0;
          }

          int span_norm_sqr[2];
          span_norm_sqr[0] = dot( span[0], span[0] );
          span_norm_sqr[1] = dot( span[1], span[1] );

          // Orient each subset's endpoints using its anchor pixel (pixel 0 for
          // subset 0, the fix-up pixel for subset 1): swap if the anchor projects
          // past the midpoint of the span.
          int dotProduct = dot( span[0], shared_temp[threadBase + 0].pixel - endPoint[0][0] );
          if ( span_norm_sqr[0] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[0] ) )
          {
              span[0] = -span[0];
              swap(endPoint[0][0], endPoint[0][1]);
          }
          dotProduct = dot( span[1], shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel - endPoint[1][0] );
          if ( span_norm_sqr[1] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[1] ) )
          {
              span[1] = -span[1];
              swap(endPoint[1][0], endPoint[1][1]);
          }

          // Row of aStep / aWeight to use: row 1 for mode 1, row 2 for modes 3 and 7.
          uint step_selector;
          if (g_mode_id != 1)
          {
              step_selector = 2;
          }
          else
          {
              step_selector = 1;
          }

          // Accumulate the squared reconstruction error over the 16 pixels.
          for ( uint i = 0; i < 16; i ++ )
          {
              // The subset of pixel i comes from the same two-subset mask that
              // was used to fit the endpoints. It is always 0 or 1, so it stays
              // inside endPoint[2].
              uint subset_index = (bits >> i) & 0x01;

              if (subset_index == 1)
              {
                  dotProduct = dot( span[1], shared_temp[threadBase + i].pixel - endPoint[1][0] );
                  color_index = (span_norm_sqr[1] <= 0 || dotProduct <= 0) ? 0
                      : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[1])] : aStep[step_selector][63]);
              }
              else
              {
                  dotProduct = dot( span[0], shared_temp[threadBase + i].pixel - endPoint[0][0] );
                  color_index = (span_norm_sqr[0] <= 0 || dotProduct <= 0) ? 0
                      : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[0])] : aStep[step_selector][63]);
              }

              pixel_r = ((64 - aWeight[step_selector][color_index]) * endPoint[subset_index][0]
                  + aWeight[step_selector][color_index] * endPoint[subset_index][1] + 32) >> 6;

              pixel_r -= shared_temp[threadBase + i].pixel;
              if (g_mode_id != 7)
              {
                  pixel_r.a = 0;
              }
              error += dot(pixel_r, pixel_r);
          }

          shared_temp[GI].error = error;
          shared_temp[GI].mode = g_mode_id;
          shared_temp[GI].partition = partition;
      }
      GroupMemoryBarrierWithGroupSync();

      // Step 3: keep the candidate with the smallest error (strides 32 .. 1).
      if (threadInBlock < 32)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 32].error )
          {
              shared_temp[GI].error = shared_temp[GI + 32].error;
              shared_temp[GI].mode = shared_temp[GI + 32].mode;
              shared_temp[GI].partition = shared_temp[GI + 32].partition;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 16)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 16].error )
          {
              shared_temp[GI].error = shared_temp[GI + 16].error;
              shared_temp[GI].mode = shared_temp[GI + 16].mode;
              shared_temp[GI].partition = shared_temp[GI + 16].partition;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 8)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 8].error )
          {
              shared_temp[GI].error = shared_temp[GI + 8].error;
              shared_temp[GI].mode = shared_temp[GI + 8].mode;
              shared_temp[GI].partition = shared_temp[GI + 8].partition;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 4)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 4].error )
          {
              shared_temp[GI].error = shared_temp[GI + 4].error;
              shared_temp[GI].mode = shared_temp[GI + 4].mode;
              shared_temp[GI].partition = shared_temp[GI + 4].partition;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 2)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 2].error )
          {
              shared_temp[GI].error = shared_temp[GI + 2].error;
              shared_temp[GI].mode = shared_temp[GI + 2].mode;
              shared_temp[GI].partition = shared_temp[GI + 2].partition;
          }
      }
  #ifdef REF_DEVICE
      GroupMemoryBarrierWithGroupSync();
  #endif
      if (threadInBlock < 1)
      {
          if ( shared_temp[GI].error > shared_temp[GI + 1].error )
          {
              shared_temp[GI].error = shared_temp[GI + 1].error;
              shared_temp[GI].mode = shared_temp[GI + 1].mode;
              shared_temp[GI].partition = shared_temp[GI + 1].partition;
          }

          // Step 4: only replace the stored result when this pass did better.
          if (g_InBuff[blockID].x > shared_temp[GI].error)
          {
              g_OutBuff[blockID] = uint4(shared_temp[GI].error, shared_temp[GI].mode, shared_temp[GI].partition, 0);
          }
          else
          {
              g_OutBuff[blockID] = g_InBuff[blockID];
          }
      }
  }
  718. [numthreads( THREAD_GROUP_SIZE, 1, 1 )]
  719. void TryMode02CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID )
  720. {
  721. const uint MAX_USED_THREAD = 64;
  722. uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
  723. uint blockInGroup = GI / MAX_USED_THREAD;
  724. uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
  725. uint threadBase = blockInGroup * MAX_USED_THREAD;
  726. uint threadInBlock = GI - threadBase;
  727. uint block_y = blockID / g_num_block_x;
  728. uint block_x = blockID - block_y * g_num_block_x;
  729. uint base_x = block_x * BLOCK_SIZE_X;
  730. uint base_y = block_y * BLOCK_SIZE_Y;
  731. if (threadInBlock < 16)
  732. {
  733. shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);
  734. }
  735. GroupMemoryBarrierWithGroupSync();
  736. shared_temp[GI].error = 0xFFFFFFFF;
  737. uint num_partitions;
  738. if (0 == g_mode_id)
  739. {
  740. num_partitions = 16;
  741. }
  742. else
  743. {
  744. num_partitions = 64;
  745. }
  746. uint4 pixel_r;
  747. uint2x4 endPoint[3];
  748. uint color_index[16];
  749. if (threadInBlock < num_partitions)
  750. {
  751. uint partition = threadInBlock + 64;
  752. endPoint[0][0] = MAX_UINT;
  753. endPoint[0][1] = MIN_UINT;
  754. endPoint[1][0] = MAX_UINT;
  755. endPoint[1][1] = MIN_UINT;
  756. endPoint[2][0] = MAX_UINT;
  757. endPoint[2][1] = MIN_UINT;
  758. uint bits2 = candidateSectionBit2[partition - 64];
  759. for ( uint i = 0; i < 16; i ++ )
  760. {
  761. uint4 pixel = shared_temp[threadBase + i].pixel;
  762. if ( (( bits2 >> ( i + 15 ) ) & 0x02) == 2 ) //It gets error when using "candidateSectionCompressed" as "endPoint" index
  763. {
  764. endPoint[2][0] = min( endPoint[2][0], pixel );
  765. endPoint[2][1] = max( endPoint[2][1], pixel );
  766. }
  767. else if ( (( bits2 >> i ) & 0x01) == 1 )
  768. {
  769. endPoint[1][0] = min( endPoint[1][0], pixel );
  770. endPoint[1][1] = max( endPoint[1][1], pixel );
  771. }
  772. else
  773. {
  774. endPoint[0][0] = min( endPoint[0][0], pixel );
  775. endPoint[0][1] = max( endPoint[0][1], pixel );
  776. }
  777. }
  778. for ( uint i = 0; i < 3; i ++ )
  779. {
  780. if (0 == g_mode_id)
  781. {
  782. compress_endpoints0( endPoint[i] );
  783. }
  784. else
  785. {
  786. compress_endpoints2( endPoint[i] );
  787. }
  788. }
  789. uint step_selector = 1 + (2 == g_mode_id);
  790. int4 span[3];
  791. span[0] = endPoint[0][1] - endPoint[0][0];
  792. span[1] = endPoint[1][1] - endPoint[1][0];
  793. span[2] = endPoint[2][1] - endPoint[2][0];
  794. span[0].w = span[1].w = span[2].w = 0;
  795. int span_norm_sqr[3];
  796. span_norm_sqr[0] = dot( span[0], span[0] );
  797. span_norm_sqr[1] = dot( span[1], span[1] );
  798. span_norm_sqr[2] = dot( span[2], span[2] );
  799. int dotProduct = dot( span[0], shared_temp[threadBase + 0].pixel - endPoint[0][0] );
  800. if ( span_norm_sqr[0] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[0] ) )
  801. {
  802. span[0] = -span[0];
  803. swap(endPoint[0][0], endPoint[0][1]);
  804. }
  805. dotProduct = dot( span[1], shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel - endPoint[1][0] );
  806. if ( span_norm_sqr[1] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[1] ) )
  807. {
  808. span[1] = -span[1];
  809. swap(endPoint[1][0], endPoint[1][1]);
  810. }
  811. dotProduct = dot( span[2], shared_temp[threadBase + candidateFixUpIndex1D[partition].y].pixel - endPoint[2][0] );
  812. if ( span_norm_sqr[2] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[2] ) )
  813. {
  814. span[2] = -span[2];
  815. swap(endPoint[2][0], endPoint[2][1]);
  816. }
  817. uint error = 0;
  818. for ( uint i = 0; i < 16; i ++ )
  819. {
  820. if ( (( bits2 >> ( i + 15 ) ) & 0x02) == 2 )
  821. {
  822. dotProduct = dot( span[2], shared_temp[threadBase + i].pixel - endPoint[2][0] );
  823. color_index[i] = ( span_norm_sqr[1] <= 0 || dotProduct <= 0 ) ? 0
  824. : ( ( dotProduct < span_norm_sqr[2] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[2] ) ] : aStep[step_selector][63] );
  825. }
  826. else if ( (( bits2 >> i ) & 0x01) == 1 )
  827. {
  828. dotProduct = dot( span[1], shared_temp[threadBase + i].pixel - endPoint[1][0] );
  829. color_index[i] = ( span_norm_sqr[1] <= 0 || dotProduct <= 0 ) ? 0
  830. : ( ( dotProduct < span_norm_sqr[1] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[1] ) ] : aStep[step_selector][63] );
  831. }
  832. else
  833. {
  834. dotProduct = dot( span[0], shared_temp[threadBase + i].pixel - endPoint[0][0] );
  835. color_index[i] = ( span_norm_sqr[0] <= 0 || dotProduct <= 0 ) ? 0
  836. : ( ( dotProduct < span_norm_sqr[0] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[0] ) ] : aStep[step_selector][63] );
  837. }
  838. uint subset_index = ((bits2 >> (i + 15)) & 0x02) | ((bits2 >> i) & 0x01);
  839. pixel_r = ( ( 64 - aWeight[step_selector][color_index[i]] ) * endPoint[subset_index][0]
  840. + aWeight[step_selector][color_index[i]] * endPoint[subset_index][1] + 32 ) >> 6;
  841. pixel_r -= shared_temp[threadBase + i].pixel;
  842. pixel_r.w = 0;
  843. error += dot(pixel_r, pixel_r);
  844. }
  845. shared_temp[GI].error = error;
  846. shared_temp[GI].partition = partition;
  847. }
  848. GroupMemoryBarrierWithGroupSync();
  849. if (threadInBlock < 32)
  850. {
  851. if ( shared_temp[GI].error > shared_temp[GI + 32].error )
  852. {
  853. shared_temp[GI].error = shared_temp[GI + 32].error;
  854. shared_temp[GI].partition = shared_temp[GI + 32].partition;
  855. }
  856. }
  857. #ifdef REF_DEVICE
  858. GroupMemoryBarrierWithGroupSync();
  859. #endif
  860. if (threadInBlock < 16)
  861. {
  862. if ( shared_temp[GI].error > shared_temp[GI + 16].error )
  863. {
  864. shared_temp[GI].error = shared_temp[GI + 16].error;
  865. shared_temp[GI].partition = shared_temp[GI + 16].partition;
  866. }
  867. }
  868. #ifdef REF_DEVICE
  869. GroupMemoryBarrierWithGroupSync();
  870. #endif
  871. if (threadInBlock < 8)
  872. {
  873. if ( shared_temp[GI].error > shared_temp[GI + 8].error )
  874. {
  875. shared_temp[GI].error = shared_temp[GI + 8].error;
  876. shared_temp[GI].partition = shared_temp[GI + 8].partition;
  877. }
  878. }
  879. #ifdef REF_DEVICE
  880. GroupMemoryBarrierWithGroupSync();
  881. #endif
  882. if (threadInBlock < 4)
  883. {
  884. if ( shared_temp[GI].error > shared_temp[GI + 4].error )
  885. {
  886. shared_temp[GI].error = shared_temp[GI + 4].error;
  887. shared_temp[GI].partition = shared_temp[GI + 4].partition;
  888. }
  889. }
  890. #ifdef REF_DEVICE
  891. GroupMemoryBarrierWithGroupSync();
  892. #endif
  893. if (threadInBlock < 2)
  894. {
  895. if ( shared_temp[GI].error > shared_temp[GI + 2].error )
  896. {
  897. shared_temp[GI].error = shared_temp[GI + 2].error;
  898. shared_temp[GI].partition = shared_temp[GI + 2].partition;
  899. }
  900. }
  901. #ifdef REF_DEVICE
  902. GroupMemoryBarrierWithGroupSync();
  903. #endif
  904. if (threadInBlock < 1)
  905. {
  906. if ( shared_temp[GI].error > shared_temp[GI + 1].error )
  907. {
  908. shared_temp[GI].error = shared_temp[GI + 1].error;
  909. shared_temp[GI].partition = shared_temp[GI + 1].partition;
  910. }
  911. if (g_InBuff[blockID].x > shared_temp[GI].error)
  912. {
  913. g_OutBuff[blockID] = uint4(shared_temp[GI].error, g_mode_id, shared_temp[GI].partition, 0);
  914. }
  915. else
  916. {
  917. g_OutBuff[blockID] = g_InBuff[blockID];
  918. }
  919. }
  920. }
  921. [numthreads( THREAD_GROUP_SIZE, 1, 1 )]
  922. void EncodeBlockCS(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID)
  923. {
  924. const uint MAX_USED_THREAD = 16;
  925. uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
  926. uint blockInGroup = GI / MAX_USED_THREAD;
  927. uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
  928. uint threadBase = blockInGroup * MAX_USED_THREAD;
  929. uint threadInBlock = GI - threadBase;
  930. #ifndef REF_DEVICE
  931. if (blockID >= g_num_total_blocks)
  932. {
  933. return;
  934. }
  935. #endif
  936. uint block_y = blockID / g_num_block_x;
  937. uint block_x = blockID - block_y * g_num_block_x;
  938. uint base_x = block_x * BLOCK_SIZE_X;
  939. uint base_y = block_y * BLOCK_SIZE_Y;
  940. uint mode = g_InBuff[blockID].y & 0x7FFFFFFF;
  941. uint partition = g_InBuff[blockID].z;
  942. uint index_selector = (g_InBuff[blockID].y >> 31) & 1;
  943. uint rotation = g_InBuff[blockID].w;
  944. if (threadInBlock < 16)
  945. {
  946. uint4 pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);
  947. if (1 == rotation)
  948. {
  949. pixel.ra = pixel.ar;
  950. }
  951. else if (2 == rotation)
  952. {
  953. pixel.ga = pixel.ag;
  954. }
  955. else if (3 == rotation)
  956. {
  957. pixel.ba = pixel.ab;
  958. }
  959. shared_temp[GI].pixel = pixel;
  960. }
  961. #ifdef REF_DEVICE
  962. GroupMemoryBarrierWithGroupSync();
  963. #endif
  964. uint bits = candidateSectionBit[partition];
  965. uint bits2 = candidateSectionBit2[partition - 64];
  966. uint2x4 ep;
  967. [unroll]
  968. for (int ii = 2; ii >= 0; -- ii)
  969. {
  970. if (threadInBlock < 16)
  971. {
  972. uint2x4 ep;
  973. ep[0] = MAX_UINT;
  974. ep[1] = MIN_UINT;
  975. uint4 pixel = shared_temp[GI].pixel;
  976. if (0 == ii)
  977. {
  978. if ((0 == mode) || (2 == mode))
  979. {
  980. if ((((bits2 >> (threadInBlock + 15)) & 0x02) != 2)
  981. && (((bits2 >> threadInBlock) & 0x01) != 1))
  982. {
  983. ep[0] = ep[1] = pixel;
  984. }
  985. }
  986. else if ((1 == mode) || (3 == mode) || (7 == mode))
  987. {
  988. if ( (( bits >> threadInBlock ) & 0x01) != 1 )
  989. {
  990. ep[0] = ep[1] = pixel;
  991. }
  992. }
  993. else if ((4 == mode) || (5 == mode) || (6 == mode))
  994. {
  995. ep[0] = ep[1] = pixel;
  996. }
  997. }
  998. else if (1 == ii)
  999. {
  1000. if ((0 == mode) || (2 == mode))
  1001. {
  1002. if ((((bits2 >> (threadInBlock + 15)) & 0x02) != 2)
  1003. && (((bits2 >> threadInBlock) & 0x01) == 1))
  1004. {
  1005. ep[0] = ep[1] = pixel;
  1006. }
  1007. }
  1008. else if ((1 == mode) || (3 == mode) || (7 == mode))
  1009. {
  1010. if ( (( bits >> threadInBlock ) & 0x01) == 1 )
  1011. {
  1012. ep[0] = ep[1] = pixel;
  1013. }
  1014. }
  1015. }
  1016. else
  1017. {
  1018. if ((0 == mode) || (2 == mode))
  1019. {
  1020. if (((bits2 >> (threadInBlock + 15)) & 0x02) == 2)
  1021. {
  1022. ep[0] = ep[1] = pixel;
  1023. }
  1024. }
  1025. }
  1026. shared_temp[GI].endPoint_low = ep[0];
  1027. shared_temp[GI].endPoint_high = ep[1];
  1028. }
  1029. #ifdef REF_DEVICE
  1030. GroupMemoryBarrierWithGroupSync();
  1031. #endif
  1032. if (threadInBlock < 8)
  1033. {
  1034. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
  1035. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
  1036. }
  1037. #ifdef REF_DEVICE
  1038. GroupMemoryBarrierWithGroupSync();
  1039. #endif
  1040. if (threadInBlock < 4)
  1041. {
  1042. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
  1043. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
  1044. }
  1045. #ifdef REF_DEVICE
  1046. GroupMemoryBarrierWithGroupSync();
  1047. #endif
  1048. if (threadInBlock < 2)
  1049. {
  1050. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
  1051. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
  1052. }
  1053. #ifdef REF_DEVICE
  1054. GroupMemoryBarrierWithGroupSync();
  1055. #endif
  1056. if (threadInBlock < 1)
  1057. {
  1058. shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
  1059. shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
  1060. }
  1061. #ifdef REF_DEVICE
  1062. GroupMemoryBarrierWithGroupSync();
  1063. #endif
  1064. if (ii == threadInBlock)
  1065. {
  1066. ep[0] = shared_temp[threadBase].endPoint_low;
  1067. ep[1] = shared_temp[threadBase].endPoint_high;
  1068. }
  1069. }
  1070. if (threadInBlock < 3)
  1071. {
  1072. if (0 == mode)
  1073. {
  1074. compress_endpoints0( ep );
  1075. }
  1076. else if (1 == mode)
  1077. {
  1078. compress_endpoints1( ep );
  1079. }
  1080. else if (2 == mode)
  1081. {
  1082. compress_endpoints2( ep );
  1083. }
  1084. else if (3 == mode)
  1085. {
  1086. compress_endpoints3( ep );
  1087. }
  1088. else if (4 == mode)
  1089. {
  1090. compress_endpoints4( ep );
  1091. }
  1092. else if (5 == mode)
  1093. {
  1094. compress_endpoints5( ep );
  1095. }
  1096. else if (6 == mode)
  1097. {
  1098. compress_endpoints6( ep );
  1099. }
  1100. else //if (7 == mode)
  1101. {
  1102. compress_endpoints7( ep );
  1103. }
  1104. int4 span = ep[1] - ep[0];
  1105. if (mode < 4)
  1106. {
  1107. span.w = 0;
  1108. }
  1109. if ((4 == mode) || (5 == mode))
  1110. {
  1111. if (0 == threadInBlock)
  1112. {
  1113. int2 span_norm_sqr = uint2( dot( span.rgb, span.rgb ), span.a * span.a );
  1114. int2 dotProduct = int2( dot( span.rgb, shared_temp[threadBase + 0].pixel.rgb - ep[0].rgb ), span.a * ( shared_temp[threadBase + 0].pixel.a - ep[0].a ) );
  1115. if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
  1116. {
  1117. swap(ep[0].rgb, ep[1].rgb);
  1118. }
  1119. if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && uint( dotProduct.y * 63.49999 ) > uint( 32 * span_norm_sqr.y ) )
  1120. {
  1121. swap(ep[0].a, ep[1].a);
  1122. }
  1123. }
  1124. }
  1125. else //if ((0 == mode) || (2 == mode) || (1 == mode) || (3 == mode) || (7 == mode) || (6 == mode))
  1126. {
  1127. int p;
  1128. if (0 == threadInBlock)
  1129. {
  1130. p = 0;
  1131. }
  1132. else if (1 == threadInBlock)
  1133. {
  1134. p = candidateFixUpIndex1D[partition].x;
  1135. }
  1136. else //if (2 == threadInBlock)
  1137. {
  1138. p = candidateFixUpIndex1D[partition].y;
  1139. }
  1140. int span_norm_sqr = dot( span, span );
  1141. int dotProduct = dot( span, shared_temp[threadBase + p].pixel - ep[0] );
  1142. if ( span_norm_sqr > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr ) )
  1143. {
  1144. swap(ep[0], ep[1]);
  1145. }
  1146. }
  1147. shared_temp[GI].endPoint_low = ep[0];
  1148. shared_temp[GI].endPoint_high = ep[1];
  1149. }
  1150. #ifdef REF_DEVICE
  1151. GroupMemoryBarrierWithGroupSync();
  1152. #endif
  1153. if (threadInBlock < 16)
  1154. {
  1155. uint color_index = 0;
  1156. uint alpha_index = 0;
  1157. uint2x4 ep;
  1158. uint2 indexPrec;
  1159. if ((0 == mode) || (1 == mode))
  1160. {
  1161. indexPrec = 1;
  1162. }
  1163. else if (6 == mode)
  1164. {
  1165. indexPrec = 0;
  1166. }
  1167. else if (4 == mode)
  1168. {
  1169. if (0 == index_selector)
  1170. {
  1171. indexPrec = uint2(2, 1);
  1172. }
  1173. else
  1174. {
  1175. indexPrec = uint2(1, 2);
  1176. }
  1177. }
  1178. else
  1179. {
  1180. indexPrec = 2;
  1181. }
  1182. int subset_index;
  1183. if ((0 == mode) || (2 == mode))
  1184. {
  1185. if ( (( bits2 >> ( threadInBlock + 15 ) ) & 0x02) == 2 )
  1186. {
  1187. subset_index = 2;
  1188. }
  1189. else if ( (( bits2 >> threadInBlock ) & 0x01) == 1 )
  1190. {
  1191. subset_index = 1;
  1192. }
  1193. else
  1194. {
  1195. subset_index = 0;
  1196. }
  1197. }
  1198. else if ((1 == mode) || (3 == mode) || (7 == mode))
  1199. {
  1200. if ( (( bits >> threadInBlock ) & 0x01) == 1 )
  1201. {
  1202. subset_index = 1;
  1203. }
  1204. else
  1205. {
  1206. subset_index = 0;
  1207. }
  1208. }
  1209. else
  1210. {
  1211. subset_index = 0;
  1212. }
  1213. ep[0] = shared_temp[threadBase + subset_index].endPoint_low;
  1214. ep[1] = shared_temp[threadBase + subset_index].endPoint_high;
  1215. int4 span = ep[1] - ep[0];
  1216. if (mode < 4)
  1217. {
  1218. span.w = 0;
  1219. }
  1220. if ((4 == mode) || (5 == mode))
  1221. {
  1222. int2 span_norm_sqr;
  1223. span_norm_sqr.x = dot( span.rgb, span.rgb );
  1224. span_norm_sqr.y = span.a * span.a;
  1225. int dotProduct = dot( span.rgb, shared_temp[threadBase + threadInBlock].pixel.rgb - ep[0].rgb );
  1226. color_index = ( span_norm_sqr.x <= 0 || dotProduct <= 0 ) ? 0
  1227. : ( ( dotProduct < span_norm_sqr.x ) ? aStep[indexPrec.x][ uint( dotProduct * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] );
  1228. dotProduct = dot( span.a, shared_temp[threadBase + threadInBlock].pixel.a - ep[0].a );
  1229. alpha_index = ( span_norm_sqr.y <= 0 || dotProduct <= 0 ) ? 0
  1230. : ( ( dotProduct < span_norm_sqr.y ) ? aStep[indexPrec.y][ uint( dotProduct * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );
  1231. if (index_selector)
  1232. {
  1233. swap(color_index, alpha_index);
  1234. }
  1235. }
  1236. else
  1237. {
  1238. int span_norm_sqr = dot( span, span );
  1239. int dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel - ep[0] );
  1240. color_index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
  1241. : ( ( dotProduct < span_norm_sqr ) ? aStep[indexPrec.x][ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep[indexPrec.x][63] );
  1242. }
  1243. shared_temp[GI].error = color_index;
  1244. shared_temp[GI].mode = alpha_index;
  1245. }
  1246. #ifdef REF_DEVICE
  1247. GroupMemoryBarrierWithGroupSync();
  1248. #endif
  1249. if (0 == threadInBlock)
  1250. {
  1251. uint4 block;
  1252. if (0 == mode)
  1253. {
  1254. block_package0( block, partition, threadBase );
  1255. }
  1256. else if (1 == mode)
  1257. {
  1258. block_package1( block, partition, threadBase );
  1259. }
  1260. else if (2 == mode)
  1261. {
  1262. block_package2( block, partition, threadBase );
  1263. }
  1264. else if (3 == mode)
  1265. {
  1266. block_package3( block, partition, threadBase );
  1267. }
  1268. else if (4 == mode)
  1269. {
  1270. block_package4( block, rotation, index_selector, threadBase );
  1271. }
  1272. else if (5 == mode)
  1273. {
  1274. block_package5( block, rotation, threadBase );
  1275. }
  1276. else if (6 == mode)
  1277. {
  1278. block_package6( block, threadBase );
  1279. }
  1280. else //if (7 == mode)
  1281. {
  1282. block_package7( block, partition, threadBase );
  1283. }
  1284. g_OutBuff[blockID] = block;
  1285. }
  1286. }
  1287. void compress_endpoints0( inout uint2x4 endPoint )
  1288. {
  1289. uint3 tmp;
  1290. for ( uint j = 0; j < 2; j ++ )
  1291. {
  1292. tmp = endPoint[j].rgb & 0x0F;
  1293. tmp.x += tmp.y + tmp.z;
  1294. endPoint[j].rgb = ( endPoint[j].rgb & 0xF0 ) | ( ( tmp.x / 3 ) & 0x08 );
  1295. }
  1296. }
  1297. void compress_endpoints1( inout uint2x4 endPoint )
  1298. {
  1299. uint3 tmp;
  1300. tmp = ( endPoint[0].rgb & 0x03 ) + ( endPoint[1].rgb & 0x03 );
  1301. tmp.x += tmp.y + tmp.z;
  1302. tmp.x = ( tmp.x / 6 ) & 0x02;
  1303. for ( uint j = 0; j < 2; j ++ )
  1304. {
  1305. endPoint[j].rgb = ( endPoint[j].rgb & 0xFC ) | tmp.x;
  1306. }
  1307. }
  1308. void compress_endpoints2( inout uint2x4 endPoint )
  1309. {
  1310. for ( uint j = 0; j < 2; j ++ )
  1311. {
  1312. endPoint[j].rgb = min(255, ( endPoint[j].rgb + 0x04 ) ) & 0xF8;
  1313. }
  1314. }
  1315. void compress_endpoints3( inout uint2x4 endPoint )
  1316. {
  1317. uint3 tmp;
  1318. for ( uint j = 0; j < 2; j ++ )
  1319. {
  1320. tmp = endPoint[j].rgb & 0x01;
  1321. tmp.x += tmp.y + tmp.z;
  1322. endPoint[j].rgb = ( endPoint[j].rgb & 0xFE ) | ( tmp.x / 3 );
  1323. }
  1324. }
  1325. void compress_endpoints4( inout uint2x4 endPoint )
  1326. {
  1327. for ( uint j = 0; j < 2; j ++ )
  1328. {
  1329. endPoint[j] = min(255, ( endPoint[j] + uint4(0x04.xxx, 0x02) ) ) & uint4(0xF8.xxx, 0xFC);
  1330. }
  1331. }
  1332. void compress_endpoints5( inout uint2x4 endPoint )
  1333. {
  1334. for ( uint j = 0; j < 2; j ++ )
  1335. {
  1336. endPoint[j].rgb = min(255, ( endPoint[j].rgb + 0x01 ) ) & 0xFE;
  1337. }
  1338. }
  1339. void compress_endpoints6( inout uint2x4 endPoint )
  1340. {
  1341. uint4 tmp;
  1342. for ( uint j = 0; j < 2; j ++ )
  1343. {
  1344. tmp = endPoint[j] & 0x01;
  1345. tmp.x += tmp.y + tmp.z + tmp.w;
  1346. endPoint[j] = ( endPoint[j] & 0xFE ) | ( ( tmp.x >> 2 ) & 0x01 );
  1347. }
  1348. }
  1349. void compress_endpoints7( inout uint2x4 endPoint )
  1350. {
  1351. uint4 tmp;
  1352. for ( uint j = 0; j < 2; j ++ )
  1353. {
  1354. tmp = endPoint[j] & 0x07;
  1355. tmp.x += tmp.y + tmp.z + tmp.w;
  1356. endPoint[j] = ( endPoint[j] & 0xF8 ) | ( ( tmp.x >> 2 ) & 0x04 );
  1357. }
  1358. }
  1359. #define get_end_point_l(subset) shared_temp[threadBase + subset].endPoint_low
  1360. #define get_end_point_h(subset) shared_temp[threadBase + subset].endPoint_high
  1361. #define get_color_index(index) shared_temp[threadBase + index].error
  1362. #define get_alpha_index(index) shared_temp[threadBase + index].mode
  1363. void block_package0( out uint4 block, uint partition, uint threadBase )
  1364. {
  1365. block.x = 0x01 | ( (partition - 64) << 1 )
  1366. | ( ( get_end_point_l(0).r & 0xF0 ) << 1 ) | ( ( get_end_point_h(0).r & 0xF0 ) << 5 )
  1367. | ( ( get_end_point_l(1).r & 0xF0 ) << 9 ) | ( ( get_end_point_h(1).r & 0xF0 ) << 13 )
  1368. | ( ( get_end_point_l(2).r & 0xF0 ) << 17 ) | ( ( get_end_point_h(2).r & 0xF0 ) << 21 )
  1369. | ( ( get_end_point_l(0).g & 0xF0 ) << 25 );
  1370. block.y = ( ( get_end_point_l(0).g & 0xF0 ) >> 7 ) | ( ( get_end_point_h(0).g & 0xF0 ) >> 3 )
  1371. | ( ( get_end_point_l(1).g & 0xF0 ) << 1 ) | ( ( get_end_point_h(1).g & 0xF0 ) << 5 )
  1372. | ( ( get_end_point_l(2).g & 0xF0 ) << 9 ) | ( ( get_end_point_h(2).g & 0xF0 ) << 13 )
  1373. | ( ( get_end_point_l(0).b & 0xF0 ) << 17 ) | ( ( get_end_point_h(0).b & 0xF0 ) << 21 )
  1374. | ( ( get_end_point_l(1).b & 0xF0 ) << 25 );
  1375. block.z = ( ( get_end_point_l(1).b & 0xF0 ) >> 7 ) | ( ( get_end_point_h(1).b & 0xF0 ) >> 3 )
  1376. | ( ( get_end_point_l(2).b & 0xF0 ) << 1 ) | ( ( get_end_point_h(2).b & 0xF0 ) << 5 )
  1377. | ( ( get_end_point_l(0).r & 0x08 ) << 10 ) | ( ( get_end_point_h(0).r & 0x08 ) << 11 )
  1378. | ( ( get_end_point_l(1).r & 0x08 ) << 12 ) | ( ( get_end_point_h(1).r & 0x08 ) << 13 )
  1379. | ( ( get_end_point_l(2).r & 0x08 ) << 14 ) | ( ( get_end_point_h(2).r & 0x08 ) << 15 )
  1380. | ( get_color_index(0) << 19 );
  1381. block.w = 0;
  1382. uint i = 1;
  1383. for ( ; i <= min( candidateFixUpIndex1DOrdered[partition][0], 4 ); i ++ )
  1384. {
  1385. block.z |= get_color_index(i) << ( i * 3 + 18 );
  1386. }
  1387. if ( candidateFixUpIndex1DOrdered[partition][0] < 4 ) //i = 4
  1388. {
  1389. block.z |= get_color_index(4) << 29;
  1390. i += 1;
  1391. }
  1392. else //i = 5
  1393. {
  1394. block.w |= ( get_color_index(4) & 0x04 ) >> 2;
  1395. for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ )
  1396. block.w |= get_color_index(i) << ( i * 3 - 14 );
  1397. }
  1398. for ( ; i <= candidateFixUpIndex1DOrdered[partition][1]; i ++ )
  1399. {
  1400. block.w |= get_color_index(i) << ( i * 3 - 15 );
  1401. }
  1402. for ( ; i < 16; i ++ )
  1403. {
  1404. block.w |= get_color_index(i) << ( i * 3 - 16 );
  1405. }
  1406. }
  1407. void block_package1( out uint4 block, uint partition, uint threadBase )
  1408. {
  1409. block.x = 0x02 | ( partition << 2 )
  1410. | ( ( get_end_point_l(0).r & 0xFC ) << 6 ) | ( ( get_end_point_h(0).r & 0xFC ) << 12 )
  1411. | ( ( get_end_point_l(1).r & 0xFC ) << 18 ) | ( ( get_end_point_h(1).r & 0xFC ) << 24 );
  1412. block.y = ( ( get_end_point_l(0).g & 0xFC ) >> 2 ) | ( ( get_end_point_h(0).g & 0xFC ) << 4 )
  1413. | ( ( get_end_point_l(1).g & 0xFC ) << 10 ) | ( ( get_end_point_h(1).g & 0xFC ) << 16 )
  1414. | ( ( get_end_point_l(0).b & 0xFC ) << 22 ) | ( ( get_end_point_h(0).b & 0xFC ) << 28 );
  1415. block.z = ( ( get_end_point_h(0).b & 0xFC ) >> 4 ) | ( ( get_end_point_l(1).b & 0xFC ) << 2 )
  1416. | ( ( get_end_point_h(1).b & 0xFC ) << 8 )
  1417. | ( ( get_end_point_l(0).r & 0x02 ) << 15 ) | ( ( get_end_point_l(1).r & 0x02 ) << 16 )
  1418. | ( get_color_index(0) << 18 );
  1419. if ( candidateFixUpIndex1DOrdered[partition][0] == 15 )
  1420. {
  1421. block.w = (get_color_index(15) << 30) | (get_color_index(14) << 27) | (get_color_index(13) << 24) | (get_color_index(12) << 21) | (get_color_index(11) << 18) | (get_color_index(10) << 15)
  1422. | (get_color_index(9) << 12) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
  1423. block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
  1424. }
  1425. else if ( candidateFixUpIndex1DOrdered[partition][0] == 2 )
  1426. {
  1427. block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
  1428. | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 2) | (get_color_index(5) >> 1);
  1429. block.z |= (get_color_index(5) << 31) | (get_color_index(4) << 28) | (get_color_index(3) << 25) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
  1430. }
  1431. else if ( candidateFixUpIndex1DOrdered[partition][0] == 8 )
  1432. {
  1433. block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
  1434. | (get_color_index(9) << 11) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
  1435. block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
  1436. }
  1437. else //candidateFixUpIndex1DOrdered[partition] == 6
  1438. {
  1439. block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
  1440. | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 6) | (get_color_index(6) << 4) | get_color_index(5);
  1441. block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
  1442. }
  1443. }
  1444. void block_package2( out uint4 block, uint partition, uint threadBase )
  1445. {
  1446. block.x = 0x04 | ( (partition - 64) << 3 )
  1447. | ( ( get_end_point_l(0).r & 0xF8 ) << 6 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 11 )
  1448. | ( ( get_end_point_l(1).r & 0xF8 ) << 16 ) | ( ( get_end_point_h(1).r & 0xF8 ) << 21 )
  1449. | ( ( get_end_point_l(2).r & 0xF8 ) << 26 );
  1450. block.y = ( ( get_end_point_l(2).r & 0xF8 ) >> 6 ) | ( ( get_end_point_h(2).r & 0xF8 ) >> 1 )
  1451. | ( ( get_end_point_l(0).g & 0xF8 ) << 4 ) | ( ( get_end_point_h(0).g & 0xF8 ) << 9 )
  1452. | ( ( get_end_point_l(1).g & 0xF8 ) << 14 ) | ( ( get_end_point_h(1).g & 0xF8 ) << 19 )
  1453. | ( ( get_end_point_l(2).g & 0xF8 ) << 24 );
  1454. block.z = ( ( get_end_point_h(2).g & 0xF8 ) >> 3 ) | ( ( get_end_point_l(0).b & 0xF8 ) << 2 )
  1455. | ( ( get_end_point_h(0).b & 0xF8 ) << 7 ) | ( ( get_end_point_l(1).b & 0xF8 ) << 12 )
  1456. | ( ( get_end_point_h(1).b & 0xF8 ) << 17 ) | ( ( get_end_point_l(2).b & 0xF8 ) << 22 )
  1457. | ( ( get_end_point_h(2).b & 0xF8 ) << 27 );
  1458. block.w = ( ( get_end_point_h(2).b & 0xF8 ) >> 5 )
  1459. | ( get_color_index(0) << 3 );
  1460. uint i = 1;
  1461. for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ )
  1462. {
  1463. block.w |= get_color_index(i) << ( i * 2 + 2 );
  1464. }
  1465. for ( ; i <= candidateFixUpIndex1DOrdered[partition][1]; i ++ )
  1466. {
  1467. block.w |= get_color_index(i) << ( i * 2 + 1 );
  1468. }
  1469. for ( ; i < 16; i ++ )
  1470. {
  1471. block.w |= get_color_index(i) << ( i * 2 );
  1472. }
  1473. }
  1474. void block_package3( out uint4 block, uint partition, uint threadBase )
  1475. {
  1476. block.x = 0x08 | ( partition << 4 )
  1477. | ( ( get_end_point_l(0).r & 0xFE ) << 9 ) | ( ( get_end_point_h(0).r & 0xFE ) << 16 )
  1478. | ( ( get_end_point_l(1).r & 0xFE ) << 23 ) | ( ( get_end_point_h(1).r & 0xFE ) << 30 );
  1479. block.y = ( ( get_end_point_h(1).r & 0xFE ) >> 2 ) | ( ( get_end_point_l(0).g & 0xFE ) << 5 )
  1480. | ( ( get_end_point_h(0).g & 0xFE ) << 12 ) | ( ( get_end_point_l(1).g & 0xFE ) << 19 )
  1481. | ( ( get_end_point_h(1).g & 0xFE ) << 26 );
  1482. block.z = ( ( get_end_point_h(1).g & 0xFE ) >> 6 ) | ( ( get_end_point_l(0).b & 0xFE ) << 1 )
  1483. | ( ( get_end_point_h(0).b & 0xFE ) << 8 ) | ( ( get_end_point_l(1).b & 0xFE ) << 15 )
  1484. | ( ( get_end_point_h(1).b & 0xFE ) << 22 )
  1485. | ( ( get_end_point_l(0).r & 0x01 ) << 30 ) | ( ( get_end_point_h(0).r & 0x01 ) << 31 );
  1486. block.w = ( ( get_end_point_l(1).r & 0x01 ) << 0 ) | ( ( get_end_point_h(1).r & 0x01 ) << 1 )
  1487. | ( get_color_index(0) << 2 );
  1488. uint i = 1;
  1489. for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ )
  1490. {
  1491. block.w |= get_color_index(i) << ( i * 2 + 1 );
  1492. }
  1493. for ( ; i < 16; i ++ )
  1494. {
  1495. block.w |= get_color_index(i) << ( i * 2 );
  1496. }
  1497. }
  // Assembles the 128-bit output block for the encoding tagged with marker
  // bit 0x10 (BC7 mode 4, going by the function name and the file name).
  //
  //   block          - receives the four packed 32-bit words; fields that do
  //                    not fit in one word continue in the low bits of the next.
  //   rotation       - written starting at bit 5 of the first word.
  //   index_selector - written starting at bit 7 of the first word.
  //   threadBase     - not referenced directly in this body; presumably consumed
  //                    by the get_* accessors (confirm against their definitions).
  //
  // Endpoint colour channels contribute their top 5 bits (mask 0xF8), endpoint
  // alpha its top 6 bits (mask 0xFC). Colour indices advance 2 bits per entry
  // and alpha indices 3 bits per entry, except that entry 0 of each list is
  // followed one bit sooner than the rest.
  void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase )
  {
      uint4 ep_lo = get_end_point_l(0);
      uint4 ep_hi = get_end_point_h(0);
      uint n;

      // Word 0: marker, rotation, selector, R low/high, G low/high, start of B low.
      uint word0 = 0x10;
      word0 |= rotation << 5;
      word0 |= index_selector << 7;
      word0 |= ( ep_lo.r & 0xF8 ) << 5;
      word0 |= ( ep_hi.r & 0xF8 ) << 10;
      word0 |= ( ep_lo.g & 0xF8 ) << 15;
      word0 |= ( ep_hi.g & 0xF8 ) << 20;
      word0 |= ( ep_lo.b & 0xF8 ) << 25;

      // Word 1: last bit of B low, B high, both alpha endpoints, colour indices 0..7.
      uint word1 = ( ep_lo.b & 0xF8 ) >> 7;
      word1 |= ( ep_hi.b & 0xF8 ) >> 2;
      word1 |= ( ep_lo.a & 0xFC ) << 4;
      word1 |= ( ep_hi.a & 0xFC ) << 10;
      word1 |= get_color_index(0) << 18;
      for ( n = 1; n <= 7; n ++ )
      {
          // Shifts 19, 21, ..., 31; entry 7 keeps only its low bit here.
          word1 |= get_color_index(n) << ( 17 + 2 * n );
      }

      // Word 2: rest of colour index 7, colour indices 8..15, alpha indices 0..5.
      uint word2 = get_color_index(7) >> 1;
      for ( n = 8; n <= 15; n ++ )
      {
          // Shifts 1, 3, ..., 15.
          word2 |= get_color_index(n) << ( 2 * n - 15 );
      }
      word2 |= get_alpha_index(0) << 17;
      for ( n = 1; n <= 5; n ++ )
      {
          // Shifts 19, 22, ..., 31; entry 5 keeps only its low bit here.
          word2 |= get_alpha_index(n) << ( 16 + 3 * n );
      }

      // Word 3: rest of alpha index 5, alpha indices 6..15.
      uint word3 = get_alpha_index(5) >> 1;
      for ( n = 6; n <= 15; n ++ )
      {
          // Shifts 2, 5, ..., 29.
          word3 |= get_alpha_index(n) << ( 3 * n - 16 );
      }

      block = uint4( word0, word1, word2, word3 );
  }
  // Assembles the 128-bit output block for the encoding tagged with marker
  // bit 0x20 (BC7 mode 5, going by the function name and the file name).
  //
  //   block      - receives the four packed 32-bit words; fields that do not
  //                fit in one word continue in the low bits of the next.
  //   rotation   - written starting at bit 6 of the first word.
  //   threadBase - not referenced directly in this body; presumably consumed
  //                by the get_* accessors (confirm against their definitions).
  //
  // Endpoint colour channels contribute their top 7 bits (mask 0xFE); endpoint
  // alpha is written unmasked. Colour and alpha indices both advance 2 bits
  // per entry, except that entry 0 of each list occupies a single bit.
  void block_package5( out uint4 block, uint rotation, uint threadBase )
  {
      block.x = 0x20 | ( rotation << 6 )
              | ( ( get_end_point_l(0).r & 0xFE ) << 7 ) | ( ( get_end_point_h(0).r & 0xFE ) << 14 )
              | ( ( get_end_point_l(0).g & 0xFE ) << 21 ) | ( ( get_end_point_h(0).g & 0xFE ) << 28 );
      // G high spills over: its upper bits land in the low 4 bits of block.y.
      block.y = ( ( get_end_point_h(0).g & 0xFE ) >> 4 ) | ( ( get_end_point_l(0).b & 0xFE ) << 3 )
              | ( ( get_end_point_h(0).b & 0xFE ) << 10 ) | ( get_end_point_l(0).a << 18 ) | ( get_end_point_h(0).a << 26 );
      // Alpha high spills over: its top 2 bits land in the low 2 bits of block.z.
      block.z = ( get_end_point_h(0).a >> 6 )
              | ( get_color_index(0) << 2 ) | ( get_color_index(1) << 3 ) | ( get_color_index(2) << 5 ) | ( get_color_index(3) << 7 )
              | ( get_color_index(4) << 9 ) | ( get_color_index(5) << 11 ) | ( get_color_index(6) << 13 ) | ( get_color_index(7) << 15 )
              | ( get_color_index(8) << 17 ) | ( get_color_index(9) << 19 ) | ( get_color_index(10) << 21 ) | ( get_color_index(11) << 23 )
              | ( get_color_index(12) << 25 ) | ( get_color_index(13) << 27 ) | ( get_color_index(14) << 29 ) | ( get_color_index(15) << 31 );
      // Bit 0 of block.w holds the high bit of colour index 15, so the one-bit
      // alpha index 0 must start at bit 1. Shifting it by 0 (as before) would
      // overwrite that colour bit and leave bit 1 permanently clear.
      block.w = ( get_color_index(15) >> 1 ) | ( get_alpha_index(0) << 1 ) | ( get_alpha_index(1) << 2 ) | ( get_alpha_index(2) << 4 )
              | ( get_alpha_index(3) << 6 ) | ( get_alpha_index(4) << 8 ) | ( get_alpha_index(5) << 10 ) | ( get_alpha_index(6) << 12 )
              | ( get_alpha_index(7) << 14 ) | ( get_alpha_index(8) << 16 ) | ( get_alpha_index(9) << 18 ) | ( get_alpha_index(10) << 20 )
              | ( get_alpha_index(11) << 22 ) | ( get_alpha_index(12) << 24 ) | ( get_alpha_index(13) << 26 ) | ( get_alpha_index(14) << 28 )
              | ( get_alpha_index(15) << 30 );
  }
  // Assembles the 128-bit output block for the encoding tagged with marker
  // bit 0x40 (BC7 mode 6, going by the function name and the file name).
  //
  //   block      - receives the four packed 32-bit words; fields that do not
  //                fit in one word continue in the low bits of the next.
  //   threadBase - not referenced directly in this body; presumably consumed
  //                by the get_* accessors (confirm against their definitions).
  //
  // All four endpoint channels contribute their top 7 bits (mask 0xFE). Bit 0
  // of each endpoint's red channel is stored separately, after the channels.
  // Colour indices advance 4 bits per entry, except that entry 0 occupies 3.
  void block_package6( out uint4 block, uint threadBase )
  {
      uint4 ep_lo = get_end_point_l(0);
      uint4 ep_hi = get_end_point_h(0);
      uint n;

      // Word 0: marker, R low/high, G low, start of G high.
      uint word0 = 0x40;
      word0 |= ( ep_lo.r & 0xFE ) << 6;
      word0 |= ( ep_hi.r & 0xFE ) << 13;
      word0 |= ( ep_lo.g & 0xFE ) << 20;
      word0 |= ( ep_hi.g & 0xFE ) << 27;

      // Word 1: rest of G high, B low/high, A low/high, low-endpoint bit 0 of red.
      uint word1 = ( ep_hi.g & 0xFE ) >> 5;
      word1 |= ( ep_lo.b & 0xFE ) << 2;
      word1 |= ( ep_hi.b & 0xFE ) << 9;
      word1 |= ( ep_lo.a & 0xFE ) << 16;
      word1 |= ( ep_hi.a & 0xFE ) << 23;
      word1 |= ( ep_lo.r & 0x01 ) << 31;

      // Word 2: high-endpoint bit 0 of red, then colour indices 0..7.
      uint word2 = ep_hi.r & 0x01;
      word2 |= get_color_index(0) << 1;
      for ( n = 1; n < 8; n ++ )
      {
          // Shifts 4, 8, ..., 28.
          word2 |= get_color_index(n) << ( n * 4 );
      }

      // Word 3: colour indices 8..15 at shifts 0, 4, ..., 28.
      uint word3 = 0;
      for ( n = 8; n < 16; n ++ )
      {
          word3 |= get_color_index(n) << ( ( n - 8 ) * 4 );
      }

      block = uint4( word0, word1, word2, word3 );
  }
  // Assembles the 128-bit output block for the encoding tagged with marker
  // bit 0x80 (BC7 mode 7, going by the function name and the file name).
  //
  //   block      - receives the four packed 32-bit words; fields that do not
  //                fit in one word continue in the low bits of the next.
  //   partition  - written starting at bit 8 of the first word; also selects
  //                the row of candidateFixUpIndex1DOrdered used below.
  //   threadBase - not referenced directly in this body; presumably consumed
  //                by the get_* accessors (confirm against their definitions).
  //
  // Two subsets (0 and 1) each supply a low and a high endpoint. Every channel
  // contributes its top 5 bits (mask 0xF8); bit 2 of each endpoint's red
  // channel (mask 0x04) is stored separately after all channel fields.
  void block_package7( out uint4 block, uint partition, uint threadBase )
  {
      block.x = 0x80 | ( partition << 8 )
              | ( ( get_end_point_l(0).r & 0xF8 ) << 11 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 16 )
              | ( ( get_end_point_l(1).r & 0xF8 ) << 21 ) | ( ( get_end_point_h(1).r & 0xF8 ) << 26 );
      // Subset-1 high red spills over: its top 2 bits land in bits 0..1 of block.y.
      block.y = ( ( get_end_point_h(1).r & 0xF8 ) >> 6 ) | ( ( get_end_point_l(0).g & 0xF8 ) >> 1 )
              | ( ( get_end_point_h(0).g & 0xF8 ) << 4 ) | ( ( get_end_point_l(1).g & 0xF8 ) << 9 )
              | ( ( get_end_point_h(1).g & 0xF8 ) << 14 ) | ( ( get_end_point_l(0).b & 0xF8 ) << 19 )
              | ( ( get_end_point_h(0).b & 0xF8 ) << 24 );
      // The six 5-bit fields below fill bits 0..29 of block.z (the last one,
      // subset-1 high alpha, ends at bit 29). The two subset-0 0x04 bits must
      // therefore go to bits 30 and 31, i.e. shifts of 28 and 29 from bit 2.
      // The previous shifts of 27 and 28 overlapped the alpha field and never
      // wrote bit 31.
      block.z = ( ( get_end_point_l(1).b & 0xF8 ) >> 3 ) | ( ( get_end_point_h(1).b & 0xF8 ) << 2 )
              | ( ( get_end_point_l(0).a & 0xF8 ) << 7 ) | ( ( get_end_point_h(0).a & 0xF8 ) << 12 )
              | ( ( get_end_point_l(1).a & 0xF8 ) << 17 ) | ( ( get_end_point_h(1).a & 0xF8 ) << 22 )
              | ( ( get_end_point_l(0).r & 0x04 ) << 28 ) | ( ( get_end_point_h(0).r & 0x04 ) << 29 );
      // Subset-1 0x04 bits go to bits 0 and 1 of block.w, followed by colour index 0.
      block.w = ( ( get_end_point_l(1).r & 0x04 ) >> 2 ) | ( ( get_end_point_h(1).r & 0x04 ) >> 1 )
              | ( get_color_index(0) << 2 );
      uint i = 1;
      // Indices up to and including the table entry for this partition start
      // at bit i*2+1.
      for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ )
      {
          block.w |= get_color_index(i) << ( i * 2 + 1 );
      }
      // Indices after that entry start one bit lower, at i*2; presumably the
      // table entry is a second index stored with one bit fewer (confirm
      // against the table's definition).
      for ( ; i < 16; i ++ )
      {
          block.w |= get_color_index(i) << ( i * 2 );
      }
  }