// transform-sse2.c
  1. // qcms
  2. // Copyright (C) 2009 Mozilla Foundation
  3. // Copyright (C) 2010 Steve Snyder
  4. //
  5. // Permission is hereby granted, free of charge, to any person obtaining
  6. // a copy of this software and associated documentation files (the "Software"),
  7. // to deal in the Software without restriction, including without limitation
  8. // the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9. // and/or sell copies of the Software, and to permit persons to whom the Software
  10. // is furnished to do so, subject to the following conditions:
  11. //
  12. // The above copyright notice and this permission notice shall be included in
  13. // all copies or substantial portions of the Software.
  14. //
  15. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
  17. // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  19. // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  20. // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  21. // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  22. #include <emmintrin.h>
  23. #include "qcmsint.h"
  24. /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
  25. #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
  26. #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
  27. static const ALIGN float floatScaleX4[4] =
  28. { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
  29. static const ALIGN float clampMaxValueX4[4] =
  30. { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
  31. void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
  32. unsigned char *src,
  33. unsigned char *dest,
  34. size_t length)
  35. {
  36. unsigned int i;
  37. float (*mat)[4] = transform->matrix;
  38. char input_back[32];
  39. /* Ensure we have a buffer that's 16 byte aligned regardless of the original
  40. * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
  41. * because they don't work on stack variables. gcc 4.4 does do the right thing
  42. * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
  43. float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
  44. /* share input and output locations to save having to keep the
  45. * locations in separate registers */
  46. uint32_t const * output = (uint32_t*)input;
  47. /* deref *transform now to avoid it in loop */
  48. const float *igtbl_r = transform->input_gamma_table_r;
  49. const float *igtbl_g = transform->input_gamma_table_g;
  50. const float *igtbl_b = transform->input_gamma_table_b;
  51. /* deref *transform now to avoid it in loop */
  52. const uint8_t *otdata_r = &transform->output_table_r->data[0];
  53. const uint8_t *otdata_g = &transform->output_table_g->data[0];
  54. const uint8_t *otdata_b = &transform->output_table_b->data[0];
  55. /* input matrix values never change */
  56. const __m128 mat0 = _mm_load_ps(mat[0]);
  57. const __m128 mat1 = _mm_load_ps(mat[1]);
  58. const __m128 mat2 = _mm_load_ps(mat[2]);
  59. /* these values don't change, either */
  60. const __m128 max = _mm_load_ps(clampMaxValueX4);
  61. const __m128 min = _mm_setzero_ps();
  62. const __m128 scale = _mm_load_ps(floatScaleX4);
  63. /* working variables */
  64. __m128 vec_r, vec_g, vec_b, result;
  65. /* CYA */
  66. if (!length)
  67. return;
  68. /* one pixel is handled outside of the loop */
  69. length--;
  70. /* setup for transforming 1st pixel */
  71. vec_r = _mm_load_ss(&igtbl_r[src[0]]);
  72. vec_g = _mm_load_ss(&igtbl_g[src[1]]);
  73. vec_b = _mm_load_ss(&igtbl_b[src[2]]);
  74. src += 3;
  75. /* transform all but final pixel */
  76. for (i=0; i<length; i++)
  77. {
  78. /* position values from gamma tables */
  79. vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
  80. vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
  81. vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
  82. /* gamma * matrix */
  83. vec_r = _mm_mul_ps(vec_r, mat0);
  84. vec_g = _mm_mul_ps(vec_g, mat1);
  85. vec_b = _mm_mul_ps(vec_b, mat2);
  86. /* crunch, crunch, crunch */
  87. vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
  88. vec_r = _mm_max_ps(min, vec_r);
  89. vec_r = _mm_min_ps(max, vec_r);
  90. result = _mm_mul_ps(vec_r, scale);
  91. /* store calc'd output tables indices */
  92. _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
  93. /* load for next loop while store completes */
  94. vec_r = _mm_load_ss(&igtbl_r[src[0]]);
  95. vec_g = _mm_load_ss(&igtbl_g[src[1]]);
  96. vec_b = _mm_load_ss(&igtbl_b[src[2]]);
  97. src += 3;
  98. /* use calc'd indices to output RGB values */
  99. dest[0] = otdata_r[output[0]];
  100. dest[1] = otdata_g[output[1]];
  101. dest[2] = otdata_b[output[2]];
  102. dest += 3;
  103. }
  104. /* handle final (maybe only) pixel */
  105. vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
  106. vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
  107. vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
  108. vec_r = _mm_mul_ps(vec_r, mat0);
  109. vec_g = _mm_mul_ps(vec_g, mat1);
  110. vec_b = _mm_mul_ps(vec_b, mat2);
  111. vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
  112. vec_r = _mm_max_ps(min, vec_r);
  113. vec_r = _mm_min_ps(max, vec_r);
  114. result = _mm_mul_ps(vec_r, scale);
  115. _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
  116. dest[0] = otdata_r[output[0]];
  117. dest[1] = otdata_g[output[1]];
  118. dest[2] = otdata_b[output[2]];
  119. }
  120. void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
  121. unsigned char *src,
  122. unsigned char *dest,
  123. size_t length)
  124. {
  125. unsigned int i;
  126. float (*mat)[4] = transform->matrix;
  127. char input_back[32];
  128. /* Ensure we have a buffer that's 16 byte aligned regardless of the original
  129. * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
  130. * because they don't work on stack variables. gcc 4.4 does do the right thing
  131. * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
  132. float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
  133. /* share input and output locations to save having to keep the
  134. * locations in separate registers */
  135. uint32_t const * output = (uint32_t*)input;
  136. /* deref *transform now to avoid it in loop */
  137. const float *igtbl_r = transform->input_gamma_table_r;
  138. const float *igtbl_g = transform->input_gamma_table_g;
  139. const float *igtbl_b = transform->input_gamma_table_b;
  140. /* deref *transform now to avoid it in loop */
  141. const uint8_t *otdata_r = &transform->output_table_r->data[0];
  142. const uint8_t *otdata_g = &transform->output_table_g->data[0];
  143. const uint8_t *otdata_b = &transform->output_table_b->data[0];
  144. /* input matrix values never change */
  145. const __m128 mat0 = _mm_load_ps(mat[0]);
  146. const __m128 mat1 = _mm_load_ps(mat[1]);
  147. const __m128 mat2 = _mm_load_ps(mat[2]);
  148. /* these values don't change, either */
  149. const __m128 max = _mm_load_ps(clampMaxValueX4);
  150. const __m128 min = _mm_setzero_ps();
  151. const __m128 scale = _mm_load_ps(floatScaleX4);
  152. /* working variables */
  153. __m128 vec_r, vec_g, vec_b, result;
  154. unsigned char alpha;
  155. /* CYA */
  156. if (!length)
  157. return;
  158. /* one pixel is handled outside of the loop */
  159. length--;
  160. /* setup for transforming 1st pixel */
  161. vec_r = _mm_load_ss(&igtbl_r[src[0]]);
  162. vec_g = _mm_load_ss(&igtbl_g[src[1]]);
  163. vec_b = _mm_load_ss(&igtbl_b[src[2]]);
  164. alpha = src[3];
  165. src += 4;
  166. /* transform all but final pixel */
  167. for (i=0; i<length; i++)
  168. {
  169. /* position values from gamma tables */
  170. vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
  171. vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
  172. vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
  173. /* gamma * matrix */
  174. vec_r = _mm_mul_ps(vec_r, mat0);
  175. vec_g = _mm_mul_ps(vec_g, mat1);
  176. vec_b = _mm_mul_ps(vec_b, mat2);
  177. /* store alpha for this pixel; load alpha for next */
  178. dest[3] = alpha;
  179. alpha = src[3];
  180. /* crunch, crunch, crunch */
  181. vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
  182. vec_r = _mm_max_ps(min, vec_r);
  183. vec_r = _mm_min_ps(max, vec_r);
  184. result = _mm_mul_ps(vec_r, scale);
  185. /* store calc'd output tables indices */
  186. _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
  187. /* load gamma values for next loop while store completes */
  188. vec_r = _mm_load_ss(&igtbl_r[src[0]]);
  189. vec_g = _mm_load_ss(&igtbl_g[src[1]]);
  190. vec_b = _mm_load_ss(&igtbl_b[src[2]]);
  191. src += 4;
  192. /* use calc'd indices to output RGB values */
  193. dest[0] = otdata_r[output[0]];
  194. dest[1] = otdata_g[output[1]];
  195. dest[2] = otdata_b[output[2]];
  196. dest += 4;
  197. }
  198. /* handle final (maybe only) pixel */
  199. vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
  200. vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
  201. vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
  202. vec_r = _mm_mul_ps(vec_r, mat0);
  203. vec_g = _mm_mul_ps(vec_g, mat1);
  204. vec_b = _mm_mul_ps(vec_b, mat2);
  205. dest[3] = alpha;
  206. vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
  207. vec_r = _mm_max_ps(min, vec_r);
  208. vec_r = _mm_min_ps(max, vec_r);
  209. result = _mm_mul_ps(vec_r, scale);
  210. _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
  211. dest[0] = otdata_r[output[0]];
  212. dest[1] = otdata_g[output[1]];
  213. dest[2] = otdata_b[output[2]];
  214. }