transform-sse1.c 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. // qcms
  2. // Copyright (C) 2009 Mozilla Foundation
  3. // Copyright (C) 2010 Steve Snyder
  4. //
  5. // Permission is hereby granted, free of charge, to any person obtaining
  6. // a copy of this software and associated documentation files (the "Software"),
  7. // to deal in the Software without restriction, including without limitation
  8. // the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9. // and/or sell copies of the Software, and to permit persons to whom the Software
  10. // is furnished to do so, subject to the following conditions:
  11. //
  12. // The above copyright notice and this permission notice shall be included in
  13. // all copies or substantial portions of the Software.
  14. //
  15. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
  17. // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  19. // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  20. // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  21. // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  22. #include <xmmintrin.h>
  23. #include "qcmsint.h"
  24. /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
  25. #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
  26. #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
  27. static const ALIGN float floatScaleX4[4] =
  28. { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
  29. static const ALIGN float clampMaxValueX4[4] =
  30. { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
  31. void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
  32. unsigned char *src,
  33. unsigned char *dest,
  34. size_t length)
  35. {
  36. unsigned int i;
  37. float (*mat)[4] = transform->matrix;
  38. char input_back[32];
  39. /* Ensure we have a buffer that's 16 byte aligned regardless of the original
  40. * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
  41. * because they don't work on stack variables. gcc 4.4 does do the right thing
  42. * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
  43. float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
  44. /* share input and output locations to save having to keep the
  45. * locations in separate registers */
  46. uint32_t const * output = (uint32_t*)input;
  47. /* deref *transform now to avoid it in loop */
  48. const float *igtbl_r = transform->input_gamma_table_r;
  49. const float *igtbl_g = transform->input_gamma_table_g;
  50. const float *igtbl_b = transform->input_gamma_table_b;
  51. /* deref *transform now to avoid it in loop */
  52. const uint8_t *otdata_r = &transform->output_table_r->data[0];
  53. const uint8_t *otdata_g = &transform->output_table_g->data[0];
  54. const uint8_t *otdata_b = &transform->output_table_b->data[0];
  55. /* input matrix values never change */
  56. const __m128 mat0 = _mm_load_ps(mat[0]);
  57. const __m128 mat1 = _mm_load_ps(mat[1]);
  58. const __m128 mat2 = _mm_load_ps(mat[2]);
  59. /* these values don't change, either */
  60. const __m128 max = _mm_load_ps(clampMaxValueX4);
  61. const __m128 min = _mm_setzero_ps();
  62. const __m128 scale = _mm_load_ps(floatScaleX4);
  63. /* working variables */
  64. __m128 vec_r, vec_g, vec_b, result;
  65. /* CYA */
  66. if (!length)
  67. return;
  68. /* one pixel is handled outside of the loop */
  69. length--;
  70. /* setup for transforming 1st pixel */
  71. vec_r = _mm_load_ss(&igtbl_r[src[0]]);
  72. vec_g = _mm_load_ss(&igtbl_g[src[1]]);
  73. vec_b = _mm_load_ss(&igtbl_b[src[2]]);
  74. src += 3;
  75. /* transform all but final pixel */
  76. for (i=0; i<length; i++)
  77. {
  78. /* position values from gamma tables */
  79. vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
  80. vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
  81. vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
  82. /* gamma * matrix */
  83. vec_r = _mm_mul_ps(vec_r, mat0);
  84. vec_g = _mm_mul_ps(vec_g, mat1);
  85. vec_b = _mm_mul_ps(vec_b, mat2);
  86. /* crunch, crunch, crunch */
  87. vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
  88. vec_r = _mm_max_ps(min, vec_r);
  89. vec_r = _mm_min_ps(max, vec_r);
  90. result = _mm_mul_ps(vec_r, scale);
  91. /* store calc'd output tables indices */
  92. *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
  93. result = _mm_movehl_ps(result, result);
  94. *((__m64 *)&output[2]) = _mm_cvtps_pi32(result) ;
  95. /* load for next loop while store completes */
  96. vec_r = _mm_load_ss(&igtbl_r[src[0]]);
  97. vec_g = _mm_load_ss(&igtbl_g[src[1]]);
  98. vec_b = _mm_load_ss(&igtbl_b[src[2]]);
  99. src += 3;
  100. /* use calc'd indices to output RGB values */
  101. dest[0] = otdata_r[output[0]];
  102. dest[1] = otdata_g[output[1]];
  103. dest[2] = otdata_b[output[2]];
  104. dest += 3;
  105. }
  106. /* handle final (maybe only) pixel */
  107. vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
  108. vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
  109. vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
  110. vec_r = _mm_mul_ps(vec_r, mat0);
  111. vec_g = _mm_mul_ps(vec_g, mat1);
  112. vec_b = _mm_mul_ps(vec_b, mat2);
  113. vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
  114. vec_r = _mm_max_ps(min, vec_r);
  115. vec_r = _mm_min_ps(max, vec_r);
  116. result = _mm_mul_ps(vec_r, scale);
  117. *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
  118. result = _mm_movehl_ps(result, result);
  119. *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
  120. dest[0] = otdata_r[output[0]];
  121. dest[1] = otdata_g[output[1]];
  122. dest[2] = otdata_b[output[2]];
  123. _mm_empty();
  124. }
  125. void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
  126. unsigned char *src,
  127. unsigned char *dest,
  128. size_t length)
  129. {
  130. unsigned int i;
  131. float (*mat)[4] = transform->matrix;
  132. char input_back[32];
  133. /* Ensure we have a buffer that's 16 byte aligned regardless of the original
  134. * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
  135. * because they don't work on stack variables. gcc 4.4 does do the right thing
  136. * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
  137. float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
  138. /* share input and output locations to save having to keep the
  139. * locations in separate registers */
  140. uint32_t const * output = (uint32_t*)input;
  141. /* deref *transform now to avoid it in loop */
  142. const float *igtbl_r = transform->input_gamma_table_r;
  143. const float *igtbl_g = transform->input_gamma_table_g;
  144. const float *igtbl_b = transform->input_gamma_table_b;
  145. /* deref *transform now to avoid it in loop */
  146. const uint8_t *otdata_r = &transform->output_table_r->data[0];
  147. const uint8_t *otdata_g = &transform->output_table_g->data[0];
  148. const uint8_t *otdata_b = &transform->output_table_b->data[0];
  149. /* input matrix values never change */
  150. const __m128 mat0 = _mm_load_ps(mat[0]);
  151. const __m128 mat1 = _mm_load_ps(mat[1]);
  152. const __m128 mat2 = _mm_load_ps(mat[2]);
  153. /* these values don't change, either */
  154. const __m128 max = _mm_load_ps(clampMaxValueX4);
  155. const __m128 min = _mm_setzero_ps();
  156. const __m128 scale = _mm_load_ps(floatScaleX4);
  157. /* working variables */
  158. __m128 vec_r, vec_g, vec_b, result;
  159. unsigned char alpha;
  160. /* CYA */
  161. if (!length)
  162. return;
  163. /* one pixel is handled outside of the loop */
  164. length--;
  165. /* setup for transforming 1st pixel */
  166. vec_r = _mm_load_ss(&igtbl_r[src[0]]);
  167. vec_g = _mm_load_ss(&igtbl_g[src[1]]);
  168. vec_b = _mm_load_ss(&igtbl_b[src[2]]);
  169. alpha = src[3];
  170. src += 4;
  171. /* transform all but final pixel */
  172. for (i=0; i<length; i++)
  173. {
  174. /* position values from gamma tables */
  175. vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
  176. vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
  177. vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
  178. /* gamma * matrix */
  179. vec_r = _mm_mul_ps(vec_r, mat0);
  180. vec_g = _mm_mul_ps(vec_g, mat1);
  181. vec_b = _mm_mul_ps(vec_b, mat2);
  182. /* store alpha for this pixel; load alpha for next */
  183. dest[3] = alpha;
  184. alpha = src[3];
  185. /* crunch, crunch, crunch */
  186. vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
  187. vec_r = _mm_max_ps(min, vec_r);
  188. vec_r = _mm_min_ps(max, vec_r);
  189. result = _mm_mul_ps(vec_r, scale);
  190. /* store calc'd output tables indices */
  191. *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
  192. result = _mm_movehl_ps(result, result);
  193. *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
  194. /* load gamma values for next loop while store completes */
  195. vec_r = _mm_load_ss(&igtbl_r[src[0]]);
  196. vec_g = _mm_load_ss(&igtbl_g[src[1]]);
  197. vec_b = _mm_load_ss(&igtbl_b[src[2]]);
  198. src += 4;
  199. /* use calc'd indices to output RGB values */
  200. dest[0] = otdata_r[output[0]];
  201. dest[1] = otdata_g[output[1]];
  202. dest[2] = otdata_b[output[2]];
  203. dest += 4;
  204. }
  205. /* handle final (maybe only) pixel */
  206. vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
  207. vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
  208. vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
  209. vec_r = _mm_mul_ps(vec_r, mat0);
  210. vec_g = _mm_mul_ps(vec_g, mat1);
  211. vec_b = _mm_mul_ps(vec_b, mat2);
  212. dest[3] = alpha;
  213. vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
  214. vec_r = _mm_max_ps(min, vec_r);
  215. vec_r = _mm_min_ps(max, vec_r);
  216. result = _mm_mul_ps(vec_r, scale);
  217. *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
  218. result = _mm_movehl_ps(result, result);
  219. *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
  220. dest[0] = otdata_r[output[0]];
  221. dest[1] = otdata_g[output[1]];
  222. dest[2] = otdata_b[output[2]];
  223. _mm_empty();
  224. }