/* mixer_sse.c */

  1. #include "config.h"
  2. #include <xmmintrin.h>
  3. #include "AL/al.h"
  4. #include "AL/alc.h"
  5. #include "alMain.h"
  6. #include "alu.h"
  7. #include "alSource.h"
  8. #include "alAuxEffectSlot.h"
  9. #include "mixer_defs.h"
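

/* Resamples 32-bit float samples with a band-limited sinc (bsinc) filter.
 * For each output sample, the filter coefficients are interpolated from the
 * precomputed per-phase tables in the BsincState, and applied four taps at
 * a time.
 */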
const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *restrict src,
                                    ALuint frac, ALuint increment, ALfloat *restrict dst,
                                    ALuint dstlen)
{
    const __m128 sf4 = _mm_set1_ps(state->sf);
    const ALuint m = state->m;
    const ALint l = state->l;
    const ALfloat *fil, *scd, *phd, *spd;
    ALuint pi, j_f, i;
    ALfloat pf;
    ALint j_s;
    __m128 r4;

    for(i = 0;i < dstlen;i++)
    {
        // Calculate the phase index and factor.
#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
        pi = frac >> FRAC_PHASE_BITDIFF;
        pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
#undef FRAC_PHASE_BITDIFF

        fil = state->coeffs[pi].filter;
        scd = state->coeffs[pi].scDelta;
        phd = state->coeffs[pi].phDelta;
        spd = state->coeffs[pi].spDelta;
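
        /* The effective filter for this phase index is built by linear
         * interpolation: f = filter + sf*scDelta + pf*(phDelta + sf*spDelta),
         * with sf the scale factor and pf the phase factor computed above.
         */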
        // Apply the scale and phase interpolated filter.
        r4 = _mm_setzero_ps();
        {
            const __m128 pf4 = _mm_set1_ps(pf);
            for(j_f = 0,j_s = l;j_f < m;j_f+=4,j_s+=4)
            {
                const __m128 f4 = _mm_add_ps(
                    _mm_add_ps(
                        _mm_load_ps(&fil[j_f]),
                        _mm_mul_ps(sf4, _mm_load_ps(&scd[j_f]))
                    ),
                    _mm_mul_ps(
                        pf4,
                        _mm_add_ps(
                            _mm_load_ps(&phd[j_f]),
                            _mm_mul_ps(sf4, _mm_load_ps(&spd[j_f]))
                        )
                    )
                );
                r4 = _mm_add_ps(r4, _mm_mul_ps(f4, _mm_loadu_ps(&src[j_s])));
            }
        }
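        /* r4 holds four partial sums; reduce them horizontally to a single
         * result with a reversing shuffle and a high-to-low add.
         */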
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        dst[i] = _mm_cvtss_f32(r4);

        frac += increment;
        src += frac>>FRACTIONBITS;
        frac &= FRACTIONMASK;
    }
    return dst;
}
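

/* Accumulates the stereo HRIR contribution for one input sample into the
 * Values ring buffer, advancing the interleaved left/right coefficient
 * pairs by CoeffStep after each use so the filter fades toward a new HRIR.
 */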
static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
                                   const ALuint IrSize,
                                   ALfloat (*restrict Coeffs)[2],
                                   const ALfloat (*restrict CoeffStep)[2],
                                   ALfloat left, ALfloat right)
{
    const __m128 lrlr = _mm_setr_ps(left, right, left, right);
    __m128 coeffs, deltas, imp0, imp1;
    __m128 vals = _mm_setzero_ps();
    ALuint i;
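
    /* With an odd Offset, the target value pairs are not 16-byte aligned
     * with the coefficient pairs. Load the coefficients aligned, then
     * shuffle each pair of products over by one stereo pair; the first and
     * last pairs are handled with 64-bit half-register loads and stores.
     */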
    if((Offset&1))
    {
        const ALuint o0 = Offset&HRIR_MASK;
        const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;

        coeffs = _mm_load_ps(&Coeffs[0][0]);
        deltas = _mm_load_ps(&CoeffStep[0][0]);
        vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
        imp0 = _mm_mul_ps(lrlr, coeffs);
        coeffs = _mm_add_ps(coeffs, deltas);
        vals = _mm_add_ps(imp0, vals);
        _mm_store_ps(&Coeffs[0][0], coeffs);
        _mm_storel_pi((__m64*)&Values[o0][0], vals);
        for(i = 1;i < IrSize-1;i += 2)
        {
            const ALuint o2 = (Offset+i)&HRIR_MASK;

            coeffs = _mm_load_ps(&Coeffs[i+1][0]);
            deltas = _mm_load_ps(&CoeffStep[i+1][0]);
            vals = _mm_load_ps(&Values[o2][0]);
            imp1 = _mm_mul_ps(lrlr, coeffs);
            coeffs = _mm_add_ps(coeffs, deltas);
            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
            vals = _mm_add_ps(imp0, vals);
            _mm_store_ps(&Coeffs[i+1][0], coeffs);
            _mm_store_ps(&Values[o2][0], vals);
            imp0 = imp1;
        }
        vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
        imp0 = _mm_movehl_ps(imp0, imp0);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi((__m64*)&Values[o1][0], vals);
    }
    else
    {
        for(i = 0;i < IrSize;i += 2)
        {
            const ALuint o = (Offset + i)&HRIR_MASK;

            coeffs = _mm_load_ps(&Coeffs[i][0]);
            deltas = _mm_load_ps(&CoeffStep[i][0]);
            vals = _mm_load_ps(&Values[o][0]);
            imp0 = _mm_mul_ps(lrlr, coeffs);
            coeffs = _mm_add_ps(coeffs, deltas);
            vals = _mm_add_ps(imp0, vals);
            _mm_store_ps(&Coeffs[i][0], coeffs);
            _mm_store_ps(&Values[o][0], vals);
        }
    }
}
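
/* Like ApplyCoeffsStep, but the coefficients are fixed for the duration of
 * the call; no CoeffStep is applied.
 */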
static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
                               const ALuint IrSize,
                               ALfloat (*restrict Coeffs)[2],
                               ALfloat left, ALfloat right)
{
    const __m128 lrlr = _mm_setr_ps(left, right, left, right);
    __m128 vals = _mm_setzero_ps();
    __m128 coeffs;
    ALuint i;
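
    /* The same odd-offset realignment as in ApplyCoeffsStep applies here. */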
    if((Offset&1))
    {
        const ALuint o0 = Offset&HRIR_MASK;
        const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
        __m128 imp0, imp1;

        coeffs = _mm_load_ps(&Coeffs[0][0]);
        vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
        imp0 = _mm_mul_ps(lrlr, coeffs);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi((__m64*)&Values[o0][0], vals);
        for(i = 1;i < IrSize-1;i += 2)
        {
            const ALuint o2 = (Offset+i)&HRIR_MASK;

            coeffs = _mm_load_ps(&Coeffs[i+1][0]);
            vals = _mm_load_ps(&Values[o2][0]);
            imp1 = _mm_mul_ps(lrlr, coeffs);
            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
            vals = _mm_add_ps(imp0, vals);
            _mm_store_ps(&Values[o2][0], vals);
            imp0 = imp1;
        }
        vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
        imp0 = _mm_movehl_ps(imp0, imp0);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi((__m64*)&Values[o1][0], vals);
    }
    else
    {
        for(i = 0;i < IrSize;i += 2)
        {
            const ALuint o = (Offset + i)&HRIR_MASK;

            coeffs = _mm_load_ps(&Coeffs[i][0]);
            vals = _mm_load_ps(&Values[o][0]);
            vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
            _mm_store_ps(&Values[o][0], vals);
        }
    }
}
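

/* mixer_inc.c provides the generic MixHrtf/MixDirectHrtf implementations;
 * including it with these macros defined instantiates them as the SSE
 * variants, built on the ApplyCoeffsStep/ApplyCoeffs helpers above.
 */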
#define MixHrtf MixHrtf_SSE
#define MixDirectHrtf MixDirectHrtf_SSE
#include "mixer_inc.c"
#undef MixHrtf
#undef MixDirectHrtf
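
/* Mixes the mono input in data into OutChans output channels. For the first
 * Counter output samples, each channel's gain is stepped linearly from
 * CurrentGains[c] toward TargetGains[c]; the remainder is mixed at the final
 * constant gain, skipping channels below the silence threshold.
 */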
void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
             ALfloat *CurrentGains, const ALfloat *TargetGains, ALuint Counter, ALuint OutPos,
             ALuint BufferSize)
{
    ALfloat gain, delta, step;
    __m128 gain4;
    ALuint c;

    delta = (Counter > 0) ? 1.0f/(ALfloat)Counter : 0.0f;

    for(c = 0;c < OutChans;c++)
    {
        ALuint pos = 0;
        gain = CurrentGains[c];
        step = (TargetGains[c] - gain) * delta;
        if(fabsf(step) > FLT_EPSILON)
        {
            ALuint minsize = minu(BufferSize, Counter);
            /* Mix in aligned multiples of 4, stepping the gain per sample. */
            if(minsize-pos > 3)
            {
                __m128 step4;
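                /* Seed the four lanes with the next four gains; each
                 * iteration then advances all lanes by four steps at once.
                 */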
                gain4 = _mm_setr_ps(
                    gain,
                    gain + step,
                    gain + step + step,
                    gain + step + step + step
                );
                step4 = _mm_set1_ps(step + step + step + step);
                do {
                    const __m128 val4 = _mm_load_ps(&data[pos]);
                    __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
                    dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
                    gain4 = _mm_add_ps(gain4, step4);
                    _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
                    pos += 4;
                } while(minsize-pos > 3);
                /* NOTE: gain4 now represents the next four gains after the
                 * last four mixed samples, so the lowest element represents
                 * the next gain to apply.
                 */
                gain = _mm_cvtss_f32(gain4);
            }
            /* Mix the remaining stepped-gain samples, which don't form an
             * aligned multiple of 4.
             */
            for(;pos < minsize;pos++)
            {
                OutBuffer[c][OutPos+pos] += data[pos]*gain;
                gain += step;
            }
            if(pos == Counter)
                gain = TargetGains[c];
            CurrentGains[c] = gain;

            /* Mix until pos is a multiple of 4, or the mix is done. */
            minsize = minu(BufferSize, (pos+3)&~3);
            for(;pos < minsize;pos++)
                OutBuffer[c][OutPos+pos] += data[pos]*gain;
        }
        if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
            continue;
        gain4 = _mm_set1_ps(gain);
        for(;BufferSize-pos > 3;pos += 4)
        {
            const __m128 val4 = _mm_load_ps(&data[pos]);
            __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
            dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
            _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
        }
        for(;pos < BufferSize;pos++)
            OutBuffer[c][OutPos+pos] += data[pos]*gain;
    }
}
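
/* Accumulates InChans rows of input into a single output row, applying one
 * constant gain per input channel and skipping channels below the silence
 * threshold.
 */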
void MixRow_SSE(ALfloat *OutBuffer, const ALfloat *Gains, const ALfloat (*restrict data)[BUFFERSIZE],
                ALuint InChans, ALuint InPos, ALuint BufferSize)
{
    __m128 gain4;
    ALuint c;

    for(c = 0;c < InChans;c++)
    {
        ALuint pos = 0;
        ALfloat gain = Gains[c];
        if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
            continue;

        gain4 = _mm_set1_ps(gain);
        for(;BufferSize-pos > 3;pos += 4)
        {
            const __m128 val4 = _mm_load_ps(&data[c][InPos+pos]);
            __m128 dry4 = _mm_load_ps(&OutBuffer[pos]);
            dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
            _mm_store_ps(&OutBuffer[pos], dry4);
        }
        for(;pos < BufferSize;pos++)
            OutBuffer[pos] += data[c][InPos+pos]*gain;
    }
}