/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdmerge-altivec.c */

void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
                                         JSAMPIMAGE input_buf,
                                         JDIMENSION in_row_group_ctr,
                                         JSAMPARRAY output_buf)
{
  JSAMPROW outptr, inptr0, inptr1, inptr2;
  int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
#if __BIG_ENDIAN__
  int offset;
#endif
  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];

  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
    y, cb, cr;
#if __BIG_ENDIAN__
  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
#if RGB_PIXELSIZE == 4
  __vector unsigned char out4;
#endif
#endif
#if RGB_PIXELSIZE == 4
  __vector unsigned char rgb3;
#endif
  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
    crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
    rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
  __vector int g_y0, g_y1, g_y2, g_y3;

  /* Constants
   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
   * high-order bits, not 16.
   */
  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
    pw_mf0228 = { __8X(-F_0_228 >> 1) },
    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
    pw_cj = { __8X(CENTERJSAMPLE) };
  __vector int pd_onehalf = { __4X(ONE_HALF) };
  __vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
    even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
    odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
#else
    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
    even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
    odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
#endif
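
  /* even_index and odd_index are permute patterns that, combined with
   * pb_zero, pull the even- and odd-numbered Y samples out of a 16-byte luma
   * vector and zero-extend them to 16 bits; the byte positions differ between
   * the big- and little-endian variants. shift_pack_index is described where
   * it is used below.
   */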

  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];

  for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) {

    cb = vec_ld(0, inptr1);
    /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
     * support unsigned vectors.
     */
    cbl = (__vector signed short)VEC_UNPACKHU(cb);
    cbh = (__vector signed short)VEC_UNPACKLU(cb);
    cbl = vec_sub(cbl, pw_cj);
    cbh = vec_sub(cbh, pw_cj);

    cr = vec_ld(0, inptr2);
    crl = (__vector signed short)VEC_UNPACKHU(cr);
    crh = (__vector signed short)VEC_UNPACKLU(cr);
    crl = vec_sub(crl, pw_cj);
    crh = vec_sub(crh, pw_cj);

    /* (Original)
     * R = Y + 1.40200 * Cr
     * G = Y - 0.34414 * Cb - 0.71414 * Cr
     * B = Y + 1.77200 * Cb
     *
     * (This implementation)
     * R = Y + 0.40200 * Cr + Cr
     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
     * B = Y - 0.22800 * Cb + Cb + Cb
     */
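
    /* The fixed-point constants are scaled by 2^16 and held in signed 16-bit
     * elements, so a vec_madds() multiplier must be less than 1.0. That is
     * presumably why the 1.40200 and 1.77200 factors above are split into a
     * fractional multiply plus one or two full-precision additions of Cr or
     * Cb.
     */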
    b_yl = vec_add(cbl, cbl);
    b_yh = vec_add(cbh, cbh);
    b_yl = vec_madds(b_yl, pw_mf0228, pw_one);
    b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
    b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one);
    b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
    b_yl = vec_add(b_yl, cbl);
    b_yh = vec_add(b_yh, cbh);
    b_yl = vec_add(b_yl, cbl);
    b_yh = vec_add(b_yh, cbh);

    r_yl = vec_add(crl, crl);
    r_yh = vec_add(crh, crh);
    r_yl = vec_madds(r_yl, pw_f0402, pw_one);
    r_yh = vec_madds(r_yh, pw_f0402, pw_one);
    r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
    r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
    r_yl = vec_add(r_yl, crl);
    r_yh = vec_add(r_yh, crh);

    g_y0w = vec_mergeh(cbl, crl);
    g_y1w = vec_mergel(cbl, crl);
    g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
    g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
    g_y2w = vec_mergeh(cbh, crh);
    g_y3w = vec_mergel(cbh, crh);
    g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
    g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);

    /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
     * each dword into a new 16-bit vector, which is the equivalent of
     * descaling the 32-bit results (right-shifting by 16 bits) and then
     * packing them.
     */
    g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
                    shift_pack_index);
    g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
                    shift_pack_index);
    g_yl = vec_sub(g_yl, crl);
    g_yh = vec_sub(g_yh, crh);
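
    /* In h2v1 mode each chroma sample is shared by two horizontally adjacent
     * luma samples, so the 16 Cb/Cr values loaded above cover 32 output
     * pixels. The inner loop therefore runs at most twice: the first pass
     * applies the low 8 chroma results to the first 16 Y samples, and the
     * second pass applies the high 8 chroma results to the next 16.
     */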
    for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
         num_cols -= RGB_PIXELSIZE * 16,
         outptr += RGB_PIXELSIZE * 16, inptr0 += 16) {

      y = vec_ld(0, inptr0);
      ye = (__vector signed short)vec_perm(pb_zero, y, even_index);
      yo = (__vector signed short)vec_perm(pb_zero, y, odd_index);

      if (yloop == 0) {
        be = vec_add(b_yl, ye);
        bo = vec_add(b_yl, yo);
        re = vec_add(r_yl, ye);
        ro = vec_add(r_yl, yo);
        ge = vec_add(g_yl, ye);
        go = vec_add(g_yl, yo);
      } else {
        be = vec_add(b_yh, ye);
        bo = vec_add(b_yh, yo);
        re = vec_add(r_yh, ye);
        ro = vec_add(r_yh, yo);
        ge = vec_add(g_yh, ye);
        go = vec_add(g_yh, yo);
      }
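
      /* Merging the even/odd results restores pixel order (R0 R1 R2 ...);
       * merging R with G and B with the 255 constant then arranges the words
       * so that vec_packsu() produces the R/G and B/X byte groups shown in
       * the layout comments below.
       */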
      rl = vec_mergeh(re, ro);
      rh = vec_mergel(re, ro);
      gl = vec_mergeh(ge, go);
      gh = vec_mergel(ge, go);
      bl = vec_mergeh(be, bo);
      bh = vec_mergel(be, bo);

      rg0 = vec_mergeh(rl, gl);
      bx0 = vec_mergeh(bl, pw_255);
      rg1 = vec_mergel(rl, gl);
      bx1 = vec_mergel(bl, pw_255);
      rg2 = vec_mergeh(rh, gh);
      bx2 = vec_mergeh(bh, pw_255);
      rg3 = vec_mergel(rh, gh);
      bx3 = vec_mergel(bh, pw_255);

      rgbx0 = vec_packsu(rg0, bx0);
      rgbx1 = vec_packsu(rg1, bx1);
      rgbx2 = vec_packsu(rg2, bx2);
      rgbx3 = vec_packsu(rg3, bx3);

#if RGB_PIXELSIZE == 3
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
       */
      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
#else
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
       */
      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
#endif

#if __BIG_ENDIAN__
      offset = (size_t)outptr & 15;
      if (offset) {
        __vector unsigned char unaligned_shift_index;
        int bytes = num_cols + offset;

        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
          /* Slow path to prevent buffer overwrite. Since there is no way to
           * write a partial AltiVec register, overwrite would occur on the
           * last chunk of the last image row if the right edge is not on a
           * 16-byte boundary. It could also occur on other rows if the bytes
           * per row is low enough. Since we can't determine whether we're on
           * the last image row, we have to assume every row is the last.
           */
          vec_st(rgb0, 0, tmpbuf);
          vec_st(rgb1, 16, tmpbuf);
          vec_st(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          vec_st(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
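          /* Classic AltiVec has no unaligned store instruction, so this
           * unaligned fast path merges the data with the bytes already at the
           * destination: vec_lvsl()/vec_lvsr() build the permute patterns,
           * the existing leading/trailing edge bytes are loaded and combined
           * with the RGB vectors, and whole 16-byte blocks are stored.
           */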
          unaligned_shift_index = vec_lvsl(0, outptr);
          edgel = vec_ld(0, outptr);
          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
          unaligned_shift_index = vec_lvsr(0, outptr);
          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
#else
          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
#endif
          vec_st(out0, 0, outptr);
          if (bytes > 16)
            vec_st(out1, 16, outptr);
          if (bytes > 32)
            vec_st(out2, 32, outptr);
          if (bytes > 48)
            vec_st(out3, 48, outptr);
#if RGB_PIXELSIZE == 4
          if (bytes > 64)
            vec_st(out4, 64, outptr);
#endif
        }
      } else {
#endif /* __BIG_ENDIAN__ */

        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
          /* Slow path */
          VEC_ST(rgb0, 0, tmpbuf);
          VEC_ST(rgb1, 16, tmpbuf);
          VEC_ST(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          VEC_ST(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
          VEC_ST(rgb0, 0, outptr);
          if (num_cols > 16)
            VEC_ST(rgb1, 16, outptr);
          if (num_cols > 32)
            VEC_ST(rgb2, 32, outptr);
#if RGB_PIXELSIZE == 4
          if (num_cols > 48)
            VEC_ST(rgb3, 48, outptr);
#endif
        }
#if __BIG_ENDIAN__
      }
#endif
    }
  }
}
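

/* This h2v2 routine produces the two output rows of a row group by
 * temporarily redirecting the row-0 luma pointer and the output pointer,
 * invoking the h2v1 routine once per output row, and then restoring both
 * pointers.
 */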
void jsimd_h2v2_merged_upsample_altivec (JDIMENSION output_width,
                                         JSAMPIMAGE input_buf,
                                         JDIMENSION in_row_group_ctr,
                                         JSAMPARRAY output_buf)
{
  JSAMPROW inptr, outptr;

  inptr = input_buf[0][in_row_group_ctr];
  outptr = output_buf[0];

  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
                                     output_buf);

  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
  output_buf[0] = output_buf[1];
  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
                                     output_buf);

  input_buf[0][in_row_group_ctr] = inptr;
  output_buf[0] = outptr;
}