// DecodeRGB.cpp
  1. #include "DecodeRGB.hpp"
  2. #include "Tables.hpp"
  3. #include "Math.hpp"
  4. #include <string.h>
  5. #ifdef __ARM_NEON
  6. # include <arm_neon.h>
  7. #endif
  8. #if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER
  9. # ifdef _MSC_VER
  10. # include <intrin.h>
  11. # include <Windows.h>
  12. # define _bswap(x) _byteswap_ulong(x)
  13. # define _bswap64(x) _byteswap_uint64(x)
  14. # else
  15. # include <x86intrin.h>
  16. # endif
  17. #endif
  18. #ifndef _bswap
  19. # define _bswap(x) __builtin_bswap32(x)
  20. # define _bswap64(x) __builtin_bswap64(x)
  21. #endif
  // Distance table shared by the T-mode and H-mode decoders in this file.
  // It is indexed by the 3-bit codeword those decoders assemble from the
  // block bits. The decoders only ever read it, so it is declared const.
  static const uint8_t table59T58H[8] = { 3,6,11,16,23,32,41,64 };
  23. namespace
  24. {
  25. static etcpak_force_inline int32_t expand6(uint32_t value)
  26. {
  27. return (value << 2) | (value >> 4);
  28. }
  29. static etcpak_force_inline int32_t expand7(uint32_t value)
  30. {
  31. return (value << 1) | (value >> 6);
  32. }
  33. static etcpak_force_inline void DecodeT( uint64_t block, uint32_t* dst, uint32_t w )
  34. {
  35. const auto r0 = ( block >> 24 ) & 0x1B;
  36. const auto rh0 = ( r0 >> 3 ) & 0x3;
  37. const auto rl0 = r0 & 0x3;
  38. const auto g0 = ( block >> 20 ) & 0xF;
  39. const auto b0 = ( block >> 16 ) & 0xF;
  40. const auto r1 = ( block >> 12 ) & 0xF;
  41. const auto g1 = ( block >> 8 ) & 0xF;
  42. const auto b1 = ( block >> 4 ) & 0xF;
  43. const auto cr0 = ( ( rh0 << 6 ) | ( rl0 << 4 ) | ( rh0 << 2 ) | rl0);
  44. const auto cg0 = ( g0 << 4 ) | g0;
  45. const auto cb0 = ( b0 << 4 ) | b0;
  46. const auto cr1 = ( r1 << 4 ) | r1;
  47. const auto cg1 = ( g1 << 4 ) | g1;
  48. const auto cb1 = ( b1 << 4 ) | b1;
  49. const auto codeword_hi = ( block >> 2 ) & 0x3;
  50. const auto codeword_lo = block & 0x1;
  51. const auto codeword = ( codeword_hi << 1 ) | codeword_lo;
  52. const auto c2r = clampu8( cr1 + table59T58H[codeword] );
  53. const auto c2g = clampu8( cg1 + table59T58H[codeword] );
  54. const auto c2b = clampu8( cb1 + table59T58H[codeword] );
  55. const auto c3r = clampu8( cr1 - table59T58H[codeword] );
  56. const auto c3g = clampu8( cg1 - table59T58H[codeword] );
  57. const auto c3b = clampu8( cb1 - table59T58H[codeword] );
  58. const uint32_t col_tab[4] = {
  59. uint32_t( cr0 | ( cg0 << 8 ) | ( cb0 << 16 ) | 0xFF000000 ),
  60. uint32_t( c2r | ( c2g << 8 ) | ( c2b << 16 ) | 0xFF000000 ),
  61. uint32_t( cr1 | ( cg1 << 8 ) | ( cb1 << 16 ) | 0xFF000000 ),
  62. uint32_t( c3r | ( c3g << 8 ) | ( c3b << 16 ) | 0xFF000000 )
  63. };
  64. const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
  65. for( uint8_t j = 0; j < 4; j++ )
  66. {
  67. for( uint8_t i = 0; i < 4; i++ )
  68. {
  69. //2bit indices distributed on two lane 16bit numbers
  70. const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1) | ( ( indexes >> ( j + i * 4 ) ) & 0x1);
  71. dst[j * w + i] = col_tab[index];
  72. }
  73. }
  74. }
  75. static etcpak_force_inline void DecodeTAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
  76. {
  77. const auto r0 = ( block >> 24 ) & 0x1B;
  78. const auto rh0 = ( r0 >> 3 ) & 0x3;
  79. const auto rl0 = r0 & 0x3;
  80. const auto g0 = ( block >> 20 ) & 0xF;
  81. const auto b0 = ( block >> 16 ) & 0xF;
  82. const auto r1 = ( block >> 12 ) & 0xF;
  83. const auto g1 = ( block >> 8 ) & 0xF;
  84. const auto b1 = ( block >> 4 ) & 0xF;
  85. const auto cr0 = ( ( rh0 << 6 ) | ( rl0 << 4 ) | ( rh0 << 2 ) | rl0);
  86. const auto cg0 = ( g0 << 4 ) | g0;
  87. const auto cb0 = ( b0 << 4 ) | b0;
  88. const auto cr1 = ( r1 << 4 ) | r1;
  89. const auto cg1 = ( g1 << 4 ) | g1;
  90. const auto cb1 = ( b1 << 4 ) | b1;
  91. const auto codeword_hi = ( block >> 2 ) & 0x3;
  92. const auto codeword_lo = block & 0x1;
  93. const auto codeword = (codeword_hi << 1) | codeword_lo;
  94. const int32_t base = alpha >> 56;
  95. const int32_t mul = ( alpha >> 52 ) & 0xF;
  96. const auto tbl = g_alpha[( alpha >> 48 ) & 0xF];
  97. const auto c2r = clampu8( cr1 + table59T58H[codeword] );
  98. const auto c2g = clampu8( cg1 + table59T58H[codeword] );
  99. const auto c2b = clampu8( cb1 + table59T58H[codeword] );
  100. const auto c3r = clampu8( cr1 - table59T58H[codeword] );
  101. const auto c3g = clampu8( cg1 - table59T58H[codeword] );
  102. const auto c3b = clampu8( cb1 - table59T58H[codeword] );
  103. const uint32_t col_tab[4] = {
  104. uint32_t( cr0 | ( cg0 << 8 ) | ( cb0 << 16 ) ),
  105. uint32_t( c2r | ( c2g << 8 ) | ( c2b << 16 ) ),
  106. uint32_t( cr1 | ( cg1 << 8 ) | ( cb1 << 16 ) ),
  107. uint32_t( c3r | ( c3g << 8 ) | ( c3b << 16 ) )
  108. };
  109. const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
  110. for( uint8_t j = 0; j < 4; j++ )
  111. {
  112. for( uint8_t i = 0; i < 4; i++ )
  113. {
  114. //2bit indices distributed on two lane 16bit numbers
  115. const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1 ) | ( ( indexes >> ( j + i * 4 ) ) & 0x1 );
  116. const auto amod = tbl[( alpha >> ( 45 - j * 3 - i * 12 ) ) & 0x7];
  117. const uint32_t a = clampu8( base + amod * mul );
  118. dst[j * w + i] = col_tab[index] | ( a << 24 );
  119. }
  120. }
  121. }
  122. static etcpak_force_inline void DecodeH( uint64_t block, uint32_t* dst, uint32_t w )
  123. {
  124. const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
  125. const auto r0444 = ( block >> 27 ) & 0xF;
  126. const auto g0444 = ( ( block >> 20 ) & 0x1 ) | ( ( ( block >> 24 ) & 0x7 ) << 1 );
  127. const auto b0444 = ( ( block >> 15 ) & 0x7 ) | ( ( ( block >> 19 ) & 0x1 ) << 3 );
  128. const auto r1444 = ( block >> 11 ) & 0xF;
  129. const auto g1444 = ( block >> 7 ) & 0xF;
  130. const auto b1444 = ( block >> 3 ) & 0xF;
  131. const auto r0 = ( r0444 << 4 ) | r0444;
  132. const auto g0 = ( g0444 << 4 ) | g0444;
  133. const auto b0 = ( b0444 << 4 ) | b0444;
  134. const auto r1 = ( r1444 << 4 ) | r1444;
  135. const auto g1 = ( g1444 << 4 ) | g1444;
  136. const auto b1 = ( b1444 << 4 ) | b1444;
  137. const auto codeword_hi = ( ( block & 0x1 ) << 1 ) | ( ( block & 0x4 ) );
  138. const auto c0 = ( r0444 << 8 ) | ( g0444 << 4 ) | ( b0444 << 0 );
  139. const auto c1 = ( block >> 3 ) & ( ( 1 << 12 ) - 1 );
  140. const auto codeword_lo = ( c0 >= c1 ) ? 1 : 0;
  141. const auto codeword = codeword_hi | codeword_lo;
  142. const uint32_t col_tab[] = {
  143. uint32_t( clampu8( r0 + table59T58H[codeword] ) | ( clampu8( g0 + table59T58H[codeword] ) << 8 ) | ( clampu8( b0 + table59T58H[codeword] ) << 16 ) ),
  144. uint32_t( clampu8( r0 - table59T58H[codeword] ) | ( clampu8( g0 - table59T58H[codeword] ) << 8 ) | ( clampu8( b0 - table59T58H[codeword] ) << 16 ) ),
  145. uint32_t( clampu8( r1 + table59T58H[codeword] ) | ( clampu8( g1 + table59T58H[codeword] ) << 8 ) | ( clampu8( b1 + table59T58H[codeword] ) << 16 ) ),
  146. uint32_t( clampu8( r1 - table59T58H[codeword] ) | ( clampu8( g1 - table59T58H[codeword] ) << 8 ) | ( clampu8( b1 - table59T58H[codeword] ) << 16 ) )
  147. };
  148. for( uint8_t j = 0; j < 4; j++ )
  149. {
  150. for( uint8_t i = 0; i < 4; i++ )
  151. {
  152. const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1 ) | ( ( indexes >> ( j + i * 4 ) ) & 0x1 );
  153. dst[j * w + i] = col_tab[index] | 0xFF000000;
  154. }
  155. }
  156. }
  157. static etcpak_force_inline void DecodeHAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
  158. {
  159. const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
  160. const auto r0444 = ( block >> 27 ) & 0xF;
  161. const auto g0444 = ( ( block >> 20 ) & 0x1 ) | ( ( ( block >> 24 ) & 0x7 ) << 1 );
  162. const auto b0444 = ( ( block >> 15 ) & 0x7 ) | ( ( ( block >> 19 ) & 0x1 ) << 3 );
  163. const auto r1444 = ( block >> 11 ) & 0xF;
  164. const auto g1444 = ( block >> 7 ) & 0xF;
  165. const auto b1444 = ( block >> 3 ) & 0xF;
  166. const auto r0 = ( r0444 << 4 ) | r0444;
  167. const auto g0 = ( g0444 << 4 ) | g0444;
  168. const auto b0 = ( b0444 << 4 ) | b0444;
  169. const auto r1 = ( r1444 << 4 ) | r1444;
  170. const auto g1 = ( g1444 << 4 ) | g1444;
  171. const auto b1 = ( b1444 << 4 ) | b1444;
  172. const auto codeword_hi = ( ( block & 0x1 ) << 1 ) | ( ( block & 0x4 ) );
  173. const auto c0 = ( r0444 << 8 ) | ( g0444 << 4 ) | ( b0444 << 0 );
  174. const auto c1 = ( block >> 3 ) & ( ( 1 << 12 ) - 1 );
  175. const auto codeword_lo = ( c0 >= c1 ) ? 1 : 0;
  176. const auto codeword = codeword_hi | codeword_lo;
  177. const int32_t base = alpha >> 56;
  178. const int32_t mul = ( alpha >> 52 ) & 0xF;
  179. const auto tbl = g_alpha[(alpha >> 48) & 0xF];
  180. const uint32_t col_tab[] = {
  181. uint32_t( clampu8( r0 + table59T58H[codeword] ) | ( clampu8( g0 + table59T58H[codeword] ) << 8 ) | ( clampu8( b0 + table59T58H[codeword] ) << 16 ) ),
  182. uint32_t( clampu8( r0 - table59T58H[codeword] ) | ( clampu8( g0 - table59T58H[codeword] ) << 8 ) | ( clampu8( b0 - table59T58H[codeword] ) << 16 ) ),
  183. uint32_t( clampu8( r1 + table59T58H[codeword] ) | ( clampu8( g1 + table59T58H[codeword] ) << 8 ) | ( clampu8( b1 + table59T58H[codeword] ) << 16 ) ),
  184. uint32_t( clampu8( r1 - table59T58H[codeword] ) | ( clampu8( g1 - table59T58H[codeword] ) << 8 ) | ( clampu8( b1 - table59T58H[codeword] ) << 16 ) )
  185. };
  186. for( uint8_t j = 0; j < 4; j++ )
  187. {
  188. for( uint8_t i = 0; i < 4; i++ )
  189. {
  190. const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1 ) | ( ( indexes >> ( j + i * 4 ) ) & 0x1 );
  191. const auto amod = tbl[( alpha >> ( 45 - j * 3 - i * 12) ) & 0x7];
  192. const uint32_t a = clampu8( base + amod * mul );
  193. dst[j * w + i] = col_tab[index] | ( a << 24 );
  194. }
  195. }
  196. }
  197. static etcpak_force_inline void DecodePlanar( uint64_t block, uint32_t* dst, uint32_t w )
  198. {
  199. const auto bv = expand6((block >> ( 0 + 32)) & 0x3F);
  200. const auto gv = expand7((block >> ( 6 + 32)) & 0x7F);
  201. const auto rv = expand6((block >> (13 + 32)) & 0x3F);
  202. const auto bh = expand6((block >> (19 + 32)) & 0x3F);
  203. const auto gh = expand7((block >> (25 + 32)) & 0x7F);
  204. const auto rh0 = (block >> (32 - 32)) & 0x01;
  205. const auto rh1 = ((block >> (34 - 32)) & 0x1F) << 1;
  206. const auto rh = expand6(rh0 | rh1);
  207. const auto bo0 = (block >> (39 - 32)) & 0x07;
  208. const auto bo1 = ((block >> (43 - 32)) & 0x3) << 3;
  209. const auto bo2 = ((block >> (48 - 32)) & 0x1) << 5;
  210. const auto bo = expand6(bo0 | bo1 | bo2);
  211. const auto go0 = (block >> (49 - 32)) & 0x3F;
  212. const auto go1 = ((block >> (56 - 32)) & 0x01) << 6;
  213. const auto go = expand7(go0 | go1);
  214. const auto ro = expand6((block >> (57 - 32)) & 0x3F);
  215. #ifdef __ARM_NEON
  216. uint64_t init = uint64_t(uint16_t(rh-ro)) | ( uint64_t(uint16_t(gh-go)) << 16 ) | ( uint64_t(uint16_t(bh-bo)) << 32 );
  217. int16x8_t chco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
  218. init = uint64_t(uint16_t( (rv-ro) - 4 * (rh-ro) )) | ( uint64_t(uint16_t( (gv-go) - 4 * (gh-go) )) << 16 ) | ( uint64_t(uint16_t( (bv-bo) - 4 * (bh-bo) )) << 32 );
  219. int16x8_t cvco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
  220. init = uint64_t(4*ro+2) | ( uint64_t(4*go+2) << 16 ) | ( uint64_t(4*bo+2) << 32 ) | ( uint64_t(0xFFF) << 48 );
  221. int16x8_t col = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
  222. for( int j=0; j<4; j++ )
  223. {
  224. for( int i=0; i<4; i++ )
  225. {
  226. uint8x8_t c = vqshrun_n_s16( col, 2 );
  227. vst1_lane_u32( dst+j*w+i, vreinterpret_u32_u8( c ), 0 );
  228. col = vaddq_s16( col, chco );
  229. }
  230. col = vaddq_s16( col, cvco );
  231. }
  232. #elif defined __AVX2__
  233. const auto R0 = 4*ro+2;
  234. const auto G0 = 4*go+2;
  235. const auto B0 = 4*bo+2;
  236. const auto RHO = rh-ro;
  237. const auto GHO = gh-go;
  238. const auto BHO = bh-bo;
  239. __m256i cvco = _mm256_setr_epi16( rv - ro, gv - go, bv - bo, 0, rv - ro, gv - go, bv - bo, 0, rv - ro, gv - go, bv - bo, 0, rv - ro, gv - go, bv - bo, 0 );
  240. __m256i col = _mm256_setr_epi16( R0, G0, B0, 0xFFF, R0+RHO, G0+GHO, B0+BHO, 0xFFF, R0+2*RHO, G0+2*GHO, B0+2*BHO, 0xFFF, R0+3*RHO, G0+3*GHO, B0+3*BHO, 0xFFF );
  241. for( int j=0; j<4; j++ )
  242. {
  243. __m256i c = _mm256_srai_epi16( col, 2 );
  244. __m128i s = _mm_packus_epi16( _mm256_castsi256_si128( c ), _mm256_extracti128_si256( c, 1 ) );
  245. _mm_storeu_si128( (__m128i*)(dst+j*w), s );
  246. col = _mm256_add_epi16( col, cvco );
  247. }
  248. #elif defined __SSE4_1__
  249. __m128i chco = _mm_setr_epi16( rh - ro, gh - go, bh - bo, 0, 0, 0, 0, 0 );
  250. __m128i cvco = _mm_setr_epi16( (rv - ro) - 4 * (rh - ro), (gv - go) - 4 * (gh - go), (bv - bo) - 4 * (bh - bo), 0, 0, 0, 0, 0 );
  251. __m128i col = _mm_setr_epi16( 4*ro+2, 4*go+2, 4*bo+2, 0xFFF, 0, 0, 0, 0 );
  252. for( int j=0; j<4; j++ )
  253. {
  254. for( int i=0; i<4; i++ )
  255. {
  256. __m128i c = _mm_srai_epi16( col, 2 );
  257. __m128i s = _mm_packus_epi16( c, c );
  258. dst[j*w+i] = _mm_cvtsi128_si32( s );
  259. col = _mm_add_epi16( col, chco );
  260. }
  261. col = _mm_add_epi16( col, cvco );
  262. }
  263. #else
  264. for( int j=0; j<4; j++ )
  265. {
  266. for( int i=0; i<4; i++ )
  267. {
  268. const uint32_t r = (i * (rh - ro) + j * (rv - ro) + 4 * ro + 2) >> 2;
  269. const uint32_t g = (i * (gh - go) + j * (gv - go) + 4 * go + 2) >> 2;
  270. const uint32_t b = (i * (bh - bo) + j * (bv - bo) + 4 * bo + 2) >> 2;
  271. if( ( ( r | g | b ) & ~0xFF ) == 0 )
  272. {
  273. dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
  274. }
  275. else
  276. {
  277. const auto rc = clampu8( r );
  278. const auto gc = clampu8( g );
  279. const auto bc = clampu8( b );
  280. dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
  281. }
  282. }
  283. }
  284. #endif
  285. }
  286. static etcpak_force_inline void DecodePlanarAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
  287. {
  288. const auto bv = expand6((block >> ( 0 + 32)) & 0x3F);
  289. const auto gv = expand7((block >> ( 6 + 32)) & 0x7F);
  290. const auto rv = expand6((block >> (13 + 32)) & 0x3F);
  291. const auto bh = expand6((block >> (19 + 32)) & 0x3F);
  292. const auto gh = expand7((block >> (25 + 32)) & 0x7F);
  293. const auto rh0 = (block >> (32 - 32)) & 0x01;
  294. const auto rh1 = ((block >> (34 - 32)) & 0x1F) << 1;
  295. const auto rh = expand6(rh0 | rh1);
  296. const auto bo0 = (block >> (39 - 32)) & 0x07;
  297. const auto bo1 = ((block >> (43 - 32)) & 0x3) << 3;
  298. const auto bo2 = ((block >> (48 - 32)) & 0x1) << 5;
  299. const auto bo = expand6(bo0 | bo1 | bo2);
  300. const auto go0 = (block >> (49 - 32)) & 0x3F;
  301. const auto go1 = ((block >> (56 - 32)) & 0x01) << 6;
  302. const auto go = expand7(go0 | go1);
  303. const auto ro = expand6((block >> (57 - 32)) & 0x3F);
  304. const int32_t base = alpha >> 56;
  305. const int32_t mul = ( alpha >> 52 ) & 0xF;
  306. const auto tbl = g_alpha[( alpha >> 48 ) & 0xF];
  307. #ifdef __ARM_NEON
  308. uint64_t init = uint64_t(uint16_t(rh-ro)) | ( uint64_t(uint16_t(gh-go)) << 16 ) | ( uint64_t(uint16_t(bh-bo)) << 32 );
  309. int16x8_t chco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
  310. init = uint64_t(uint16_t( (rv-ro) - 4 * (rh-ro) )) | ( uint64_t(uint16_t( (gv-go) - 4 * (gh-go) )) << 16 ) | ( uint64_t(uint16_t( (bv-bo) - 4 * (bh-bo) )) << 32 );
  311. int16x8_t cvco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
  312. init = uint64_t(4*ro+2) | ( uint64_t(4*go+2) << 16 ) | ( uint64_t(4*bo+2) << 32 );
  313. int16x8_t col = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
  314. for( int j=0; j<4; j++ )
  315. {
  316. for( int i=0; i<4; i++ )
  317. {
  318. const auto amod = tbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
  319. const uint32_t a = clampu8( base + amod * mul );
  320. uint8x8_t c = vqshrun_n_s16( col, 2 );
  321. dst[j*w+i] = vget_lane_u32( vreinterpret_u32_u8( c ), 0 ) | ( a << 24 );
  322. col = vaddq_s16( col, chco );
  323. }
  324. col = vaddq_s16( col, cvco );
  325. }
  326. #elif defined __SSE4_1__
  327. __m128i chco = _mm_setr_epi16( rh - ro, gh - go, bh - bo, 0, 0, 0, 0, 0 );
  328. __m128i cvco = _mm_setr_epi16( (rv - ro) - 4 * (rh - ro), (gv - go) - 4 * (gh - go), (bv - bo) - 4 * (bh - bo), 0, 0, 0, 0, 0 );
  329. __m128i col = _mm_setr_epi16( 4*ro+2, 4*go+2, 4*bo+2, 0, 0, 0, 0, 0 );
  330. for( int j=0; j<4; j++ )
  331. {
  332. for( int i=0; i<4; i++ )
  333. {
  334. const auto amod = tbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
  335. const uint32_t a = clampu8( base + amod * mul );
  336. __m128i c = _mm_srai_epi16( col, 2 );
  337. __m128i s = _mm_packus_epi16( c, c );
  338. dst[j*w+i] = _mm_cvtsi128_si32( s ) | ( a << 24 );
  339. col = _mm_add_epi16( col, chco );
  340. }
  341. col = _mm_add_epi16( col, cvco );
  342. }
  343. #else
  344. for (auto j = 0; j < 4; j++)
  345. {
  346. for (auto i = 0; i < 4; i++)
  347. {
  348. const uint32_t r = (i * (rh - ro) + j * (rv - ro) + 4 * ro + 2) >> 2;
  349. const uint32_t g = (i * (gh - go) + j * (gv - go) + 4 * go + 2) >> 2;
  350. const uint32_t b = (i * (bh - bo) + j * (bv - bo) + 4 * bo + 2) >> 2;
  351. const auto amod = tbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
  352. const uint32_t a = clampu8( base + amod * mul );
  353. if( ( ( r | g | b ) & ~0xFF ) == 0 )
  354. {
  355. dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
  356. }
  357. else
  358. {
  359. const auto rc = clampu8( r );
  360. const auto gc = clampu8( g );
  361. const auto bc = clampu8( b );
  362. dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
  363. }
  364. }
  365. }
  366. #endif
  367. }
  368. }
  369. static etcpak_force_inline uint64_t ConvertByteOrder( uint64_t d )
  370. {
  371. uint32_t word[2];
  372. memcpy( word, &d, 8 );
  373. word[0] = _bswap( word[0] );
  374. word[1] = _bswap( word[1] );
  375. memcpy( &d, word, 8 );
  376. return d;
  377. }
  378. static etcpak_force_inline void DecodeRGBPart( uint64_t d, uint32_t* dst, uint32_t w )
  379. {
  380. d = ConvertByteOrder( d );
  381. uint32_t br[2], bg[2], bb[2];
  382. if( d & 0x2 )
  383. {
  384. int32_t dr, dg, db;
  385. uint32_t r0 = ( d & 0xF8000000 ) >> 27;
  386. uint32_t g0 = ( d & 0x00F80000 ) >> 19;
  387. uint32_t b0 = ( d & 0x0000F800 ) >> 11;
  388. dr = ( int32_t(d) << 5 ) >> 29;
  389. dg = ( int32_t(d) << 13 ) >> 29;
  390. db = ( int32_t(d) << 21 ) >> 29;
  391. int32_t r1 = int32_t(r0) + dr;
  392. int32_t g1 = int32_t(g0) + dg;
  393. int32_t b1 = int32_t(b0) + db;
  394. // T mode
  395. if ( (r1 < 0) || (r1 > 31) )
  396. {
  397. DecodeT( d, dst, w );
  398. return;
  399. }
  400. // H mode
  401. if ((g1 < 0) || (g1 > 31))
  402. {
  403. DecodeH( d, dst, w );
  404. return;
  405. }
  406. // P mode
  407. if( (b1 < 0) || (b1 > 31) )
  408. {
  409. DecodePlanar( d, dst, w );
  410. return;
  411. }
  412. br[0] = ( r0 << 3 ) | ( r0 >> 2 );
  413. br[1] = ( r1 << 3 ) | ( r1 >> 2 );
  414. bg[0] = ( g0 << 3 ) | ( g0 >> 2 );
  415. bg[1] = ( g1 << 3 ) | ( g1 >> 2 );
  416. bb[0] = ( b0 << 3 ) | ( b0 >> 2 );
  417. bb[1] = ( b1 << 3 ) | ( b1 >> 2 );
  418. }
  419. else
  420. {
  421. br[0] = ( ( d & 0xF0000000 ) >> 24 ) | ( ( d & 0xF0000000 ) >> 28 );
  422. br[1] = ( ( d & 0x0F000000 ) >> 20 ) | ( ( d & 0x0F000000 ) >> 24 );
  423. bg[0] = ( ( d & 0x00F00000 ) >> 16 ) | ( ( d & 0x00F00000 ) >> 20 );
  424. bg[1] = ( ( d & 0x000F0000 ) >> 12 ) | ( ( d & 0x000F0000 ) >> 16 );
  425. bb[0] = ( ( d & 0x0000F000 ) >> 8 ) | ( ( d & 0x0000F000 ) >> 12 );
  426. bb[1] = ( ( d & 0x00000F00 ) >> 4 ) | ( ( d & 0x00000F00 ) >> 8 );
  427. }
  428. unsigned int tcw[2];
  429. tcw[0] = ( d & 0xE0 ) >> 5;
  430. tcw[1] = ( d & 0x1C ) >> 2;
  431. uint32_t b1 = ( d >> 32 ) & 0xFFFF;
  432. uint32_t b2 = ( d >> 48 );
  433. b1 = ( b1 | ( b1 << 8 ) ) & 0x00FF00FF;
  434. b1 = ( b1 | ( b1 << 4 ) ) & 0x0F0F0F0F;
  435. b1 = ( b1 | ( b1 << 2 ) ) & 0x33333333;
  436. b1 = ( b1 | ( b1 << 1 ) ) & 0x55555555;
  437. b2 = ( b2 | ( b2 << 8 ) ) & 0x00FF00FF;
  438. b2 = ( b2 | ( b2 << 4 ) ) & 0x0F0F0F0F;
  439. b2 = ( b2 | ( b2 << 2 ) ) & 0x33333333;
  440. b2 = ( b2 | ( b2 << 1 ) ) & 0x55555555;
  441. uint32_t idx = b1 | ( b2 << 1 );
  442. if( d & 0x1 )
  443. {
  444. for( int i=0; i<4; i++ )
  445. {
  446. for( int j=0; j<4; j++ )
  447. {
  448. const auto mod = g_table[tcw[j/2]][idx & 0x3];
  449. const auto r = br[j/2] + mod;
  450. const auto g = bg[j/2] + mod;
  451. const auto b = bb[j/2] + mod;
  452. if( ( ( r | g | b ) & ~0xFF ) == 0 )
  453. {
  454. dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
  455. }
  456. else
  457. {
  458. const auto rc = clampu8( r );
  459. const auto gc = clampu8( g );
  460. const auto bc = clampu8( b );
  461. dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
  462. }
  463. idx >>= 2;
  464. }
  465. }
  466. }
  467. else
  468. {
  469. for( int i=0; i<4; i++ )
  470. {
  471. const auto tbl = g_table[tcw[i/2]];
  472. const auto cr = br[i/2];
  473. const auto cg = bg[i/2];
  474. const auto cb = bb[i/2];
  475. for( int j=0; j<4; j++ )
  476. {
  477. const auto mod = tbl[idx & 0x3];
  478. const auto r = cr + mod;
  479. const auto g = cg + mod;
  480. const auto b = cb + mod;
  481. if( ( ( r | g | b ) & ~0xFF ) == 0 )
  482. {
  483. dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
  484. }
  485. else
  486. {
  487. const auto rc = clampu8( r );
  488. const auto gc = clampu8( g );
  489. const auto bc = clampu8( b );
  490. dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
  491. }
  492. idx >>= 2;
  493. }
  494. }
  495. }
  496. }
  497. static etcpak_force_inline void DecodeRGBAPart( uint64_t d, uint64_t alpha, uint32_t* dst, uint32_t w )
  498. {
  499. d = ConvertByteOrder( d );
  500. alpha = _bswap64( alpha );
  501. uint32_t br[2], bg[2], bb[2];
  502. if( d & 0x2 )
  503. {
  504. int32_t dr, dg, db;
  505. uint32_t r0 = ( d & 0xF8000000 ) >> 27;
  506. uint32_t g0 = ( d & 0x00F80000 ) >> 19;
  507. uint32_t b0 = ( d & 0x0000F800 ) >> 11;
  508. dr = ( int32_t(d) << 5 ) >> 29;
  509. dg = ( int32_t(d) << 13 ) >> 29;
  510. db = ( int32_t(d) << 21 ) >> 29;
  511. int32_t r1 = int32_t(r0) + dr;
  512. int32_t g1 = int32_t(g0) + dg;
  513. int32_t b1 = int32_t(b0) + db;
  514. // T mode
  515. if ( (r1 < 0) || (r1 > 31) )
  516. {
  517. DecodeTAlpha( d, alpha, dst, w );
  518. return;
  519. }
  520. // H mode
  521. if ( (g1 < 0) || (g1 > 31) )
  522. {
  523. DecodeHAlpha( d, alpha, dst, w );
  524. return;
  525. }
  526. // P mode
  527. if ( (b1 < 0) || (b1 > 31) )
  528. {
  529. DecodePlanarAlpha( d, alpha, dst, w );
  530. return;
  531. }
  532. br[0] = ( r0 << 3 ) | ( r0 >> 2 );
  533. br[1] = ( r1 << 3 ) | ( r1 >> 2 );
  534. bg[0] = ( g0 << 3 ) | ( g0 >> 2 );
  535. bg[1] = ( g1 << 3 ) | ( g1 >> 2 );
  536. bb[0] = ( b0 << 3 ) | ( b0 >> 2 );
  537. bb[1] = ( b1 << 3 ) | ( b1 >> 2 );
  538. }
  539. else
  540. {
  541. br[0] = ( ( d & 0xF0000000 ) >> 24 ) | ( ( d & 0xF0000000 ) >> 28 );
  542. br[1] = ( ( d & 0x0F000000 ) >> 20 ) | ( ( d & 0x0F000000 ) >> 24 );
  543. bg[0] = ( ( d & 0x00F00000 ) >> 16 ) | ( ( d & 0x00F00000 ) >> 20 );
  544. bg[1] = ( ( d & 0x000F0000 ) >> 12 ) | ( ( d & 0x000F0000 ) >> 16 );
  545. bb[0] = ( ( d & 0x0000F000 ) >> 8 ) | ( ( d & 0x0000F000 ) >> 12 );
  546. bb[1] = ( ( d & 0x00000F00 ) >> 4 ) | ( ( d & 0x00000F00 ) >> 8 );
  547. }
  548. unsigned int tcw[2];
  549. tcw[0] = ( d & 0xE0 ) >> 5;
  550. tcw[1] = ( d & 0x1C ) >> 2;
  551. uint32_t b1 = ( d >> 32 ) & 0xFFFF;
  552. uint32_t b2 = ( d >> 48 );
  553. b1 = ( b1 | ( b1 << 8 ) ) & 0x00FF00FF;
  554. b1 = ( b1 | ( b1 << 4 ) ) & 0x0F0F0F0F;
  555. b1 = ( b1 | ( b1 << 2 ) ) & 0x33333333;
  556. b1 = ( b1 | ( b1 << 1 ) ) & 0x55555555;
  557. b2 = ( b2 | ( b2 << 8 ) ) & 0x00FF00FF;
  558. b2 = ( b2 | ( b2 << 4 ) ) & 0x0F0F0F0F;
  559. b2 = ( b2 | ( b2 << 2 ) ) & 0x33333333;
  560. b2 = ( b2 | ( b2 << 1 ) ) & 0x55555555;
  561. uint32_t idx = b1 | ( b2 << 1 );
  562. const int32_t base = alpha >> 56;
  563. const int32_t mul = ( alpha >> 52 ) & 0xF;
  564. const auto atbl = g_alpha[( alpha >> 48 ) & 0xF];
  565. if( d & 0x1 )
  566. {
  567. for( int i=0; i<4; i++ )
  568. {
  569. for( int j=0; j<4; j++ )
  570. {
  571. const auto mod = g_table[tcw[j/2]][idx & 0x3];
  572. const auto r = br[j/2] + mod;
  573. const auto g = bg[j/2] + mod;
  574. const auto b = bb[j/2] + mod;
  575. const auto amod = atbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
  576. const uint32_t a = clampu8( base + amod * mul );
  577. if( ( ( r | g | b ) & ~0xFF ) == 0 )
  578. {
  579. dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
  580. }
  581. else
  582. {
  583. const auto rc = clampu8( r );
  584. const auto gc = clampu8( g );
  585. const auto bc = clampu8( b );
  586. dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
  587. }
  588. idx >>= 2;
  589. }
  590. }
  591. }
  592. else
  593. {
  594. for( int i=0; i<4; i++ )
  595. {
  596. const auto tbl = g_table[tcw[i/2]];
  597. const auto cr = br[i/2];
  598. const auto cg = bg[i/2];
  599. const auto cb = bb[i/2];
  600. for( int j=0; j<4; j++ )
  601. {
  602. const auto mod = tbl[idx & 0x3];
  603. const auto r = cr + mod;
  604. const auto g = cg + mod;
  605. const auto b = cb + mod;
  606. const auto amod = atbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
  607. const uint32_t a = clampu8( base + amod * mul );
  608. if( ( ( r | g | b ) & ~0xFF ) == 0 )
  609. {
  610. dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
  611. }
  612. else
  613. {
  614. const auto rc = clampu8( r );
  615. const auto gc = clampu8( g );
  616. const auto bc = clampu8( b );
  617. dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
  618. }
  619. idx >>= 2;
  620. }
  621. }
  622. }
  623. }
  624. static etcpak_force_inline void DecodeRPart( uint64_t r, uint32_t* dst, uint32_t w )
  625. {
  626. r = _bswap64( r );
  627. const int32_t base = ( r >> 56 )*8+4;
  628. const int32_t mul = ( r >> 52 ) & 0xF;
  629. const auto atbl = g_alpha[( r >> 48 ) & 0xF];
  630. for( int i=0; i<4; i++ )
  631. {
  632. for ( int j=0; j<4; j++ )
  633. {
  634. const auto amod = atbl[(r >> ( 45 - j*3 - i*12 )) & 0x7];
  635. const uint32_t rc = clampu8( ( base + amod * g_alpha11Mul[mul] )/8 );
  636. dst[j*w+i] = rc | 0xFF000000;
  637. }
  638. }
  639. }
  640. static etcpak_force_inline void DecodeRGPart( uint64_t r, uint64_t g, uint32_t* dst, uint32_t w )
  641. {
  642. r = _bswap64( r );
  643. g = _bswap64( g );
  644. const int32_t rbase = ( r >> 56 )*8+4;
  645. const int32_t rmul = ( r >> 52 ) & 0xF;
  646. const auto rtbl = g_alpha[( r >> 48 ) & 0xF];
  647. const int32_t gbase = ( g >> 56 )*8+4;
  648. const int32_t gmul = ( g >> 52 ) & 0xF;
  649. const auto gtbl = g_alpha[( g >> 48 ) & 0xF];
  650. for( int i=0; i<4; i++ )
  651. {
  652. for( int j=0; j<4; j++ )
  653. {
  654. const auto rmod = rtbl[(r >> ( 45 - j*3 - i*12 )) & 0x7];
  655. const uint32_t rc = clampu8( ( rbase + rmod * g_alpha11Mul[rmul] )/8 );
  656. const auto gmod = gtbl[(g >> ( 45 - j*3 - i*12 )) & 0x7];
  657. const uint32_t gc = clampu8( ( gbase + gmod * g_alpha11Mul[gmul] )/8 );
  658. dst[j*w+i] = rc | (gc << 8) | 0xFF000000;
  659. }
  660. }
  661. }
  662. void DecodeRBlock( const void* src, void* dst, size_t width )
  663. {
  664. uint64_t* srcPtr = (uint64_t*)src;
  665. uint64_t r = *srcPtr++;
  666. DecodeRPart( r, (uint32_t*)dst, width );
  667. }
  668. void DecodeRGBlock( const void* src, void* dst, size_t width )
  669. {
  670. uint64_t* srcPtr = (uint64_t*)src;
  671. uint64_t r = *srcPtr++;
  672. uint64_t g = *srcPtr++;
  673. DecodeRGPart( r, g, (uint32_t*)dst, width );
  674. }
  675. void DecodeRGBBlock( const void* src, void* dst, size_t width )
  676. {
  677. uint64_t* srcPtr = (uint64_t*)src;
  678. uint64_t d = *srcPtr++;
  679. DecodeRGBPart( d, (uint32_t*)dst, width );
  680. }
  681. void DecodeRGBABlock( const void* src, void* dst, size_t width )
  682. {
  683. uint64_t* srcPtr = (uint64_t*)src;
  684. uint64_t a = *srcPtr++;
  685. uint64_t d = *srcPtr++;
  686. DecodeRGBAPart( d, a, (uint32_t*)dst, width );
  687. }