mixer_neon.c 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. #include "config.h"
  2. #include <arm_neon.h>
  3. #include "AL/al.h"
  4. #include "AL/alc.h"
  5. #include "alMain.h"
  6. #include "alu.h"
  7. #include "hrtf.h"
  8. static inline void SetupCoeffs(ALfloat (*restrict OutCoeffs)[2],
  9. const HrtfParams *hrtfparams,
  10. ALuint IrSize, ALuint Counter)
  11. {
  12. ALuint c;
  13. float32x4_t counter4;
  14. {
  15. float32x2_t counter2 = vdup_n_f32(-(float)Counter);
  16. counter4 = vcombine_f32(counter2, counter2);
  17. }
  18. for(c = 0;c < IrSize;c += 2)
  19. {
  20. float32x4_t step4 = vld1q_f32((float32_t*)hrtfparams->CoeffStep[c]);
  21. float32x4_t coeffs = vld1q_f32((float32_t*)hrtfparams->Coeffs[c]);
  22. coeffs = vmlaq_f32(coeffs, step4, counter4);
  23. vst1q_f32((float32_t*)OutCoeffs[c], coeffs);
  24. }
  25. }
  26. static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
  27. const ALuint IrSize,
  28. ALfloat (*restrict Coeffs)[2],
  29. const ALfloat (*restrict CoeffStep)[2],
  30. ALfloat left, ALfloat right)
  31. {
  32. ALuint c;
  33. float32x4_t leftright4;
  34. {
  35. float32x2_t leftright2 = vdup_n_f32(0.0);
  36. leftright2 = vset_lane_f32(left, leftright2, 0);
  37. leftright2 = vset_lane_f32(right, leftright2, 1);
  38. leftright4 = vcombine_f32(leftright2, leftright2);
  39. }
  40. for(c = 0;c < IrSize;c += 2)
  41. {
  42. const ALuint o0 = (Offset+c)&HRIR_MASK;
  43. const ALuint o1 = (o0+1)&HRIR_MASK;
  44. float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
  45. vld1_f32((float32_t*)&Values[o1][0]));
  46. float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);
  47. float32x4_t deltas = vld1q_f32(&CoeffStep[c][0]);
  48. vals = vmlaq_f32(vals, coefs, leftright4);
  49. coefs = vaddq_f32(coefs, deltas);
  50. vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
  51. vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
  52. vst1q_f32(&Coeffs[c][0], coefs);
  53. }
  54. }
  55. static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
  56. const ALuint IrSize,
  57. ALfloat (*restrict Coeffs)[2],
  58. ALfloat left, ALfloat right)
  59. {
  60. ALuint c;
  61. float32x4_t leftright4;
  62. {
  63. float32x2_t leftright2 = vdup_n_f32(0.0);
  64. leftright2 = vset_lane_f32(left, leftright2, 0);
  65. leftright2 = vset_lane_f32(right, leftright2, 1);
  66. leftright4 = vcombine_f32(leftright2, leftright2);
  67. }
  68. for(c = 0;c < IrSize;c += 2)
  69. {
  70. const ALuint o0 = (Offset+c)&HRIR_MASK;
  71. const ALuint o1 = (o0+1)&HRIR_MASK;
  72. float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
  73. vld1_f32((float32_t*)&Values[o1][0]));
  74. float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);
  75. vals = vmlaq_f32(vals, coefs, leftright4);
  76. vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
  77. vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
  78. }
  79. }
  /* mixer_inc.c is textually included with MixHrtf temporarily defined as
   * MixHrtf_Neon, so every use of the identifier MixHrtf inside that file
   * expands to MixHrtf_Neon in this translation unit; the macro is removed
   * again right after the include.
   * NOTE(review): mixer_inc.c presumably relies on the static SetupCoeffs,
   * ApplyCoeffsStep and ApplyCoeffs helpers defined above -- confirm there. */
  80. #define MixHrtf MixHrtf_Neon
  81. #include "mixer_inc.c"
  82. #undef MixHrtf
  83. void Mix_Neon(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
  84. MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
  85. {
  86. ALfloat gain, step;
  87. float32x4_t gain4;
  88. ALuint c;
  89. for(c = 0;c < OutChans;c++)
  90. {
  91. ALuint pos = 0;
  92. gain = Gains[c].Current;
  93. step = Gains[c].Step;
  94. if(step != 0.0f && Counter > 0)
  95. {
  96. ALuint minsize = minu(BufferSize, Counter);
  97. for(;pos < minsize;pos++)
  98. {
  99. OutBuffer[c][OutPos+pos] += data[pos]*gain;
  100. gain += step;
  101. }
  102. if(pos == Counter)
  103. gain = Gains[c].Target;
  104. Gains[c].Current = gain;
  105. /* Mix until pos is aligned with 4 or the mix is done. */
  106. minsize = minu(BufferSize, (pos+3)&~3);
  107. for(;pos < minsize;pos++)
  108. OutBuffer[c][OutPos+pos] += data[pos]*gain;
  109. }
  110. if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
  111. continue;
  112. gain4 = vdupq_n_f32(gain);
  113. for(;BufferSize-pos > 3;pos += 4)
  114. {
  115. const float32x4_t val4 = vld1q_f32(&data[pos]);
  116. float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
  117. dry4 = vmlaq_f32(dry4, val4, gain4);
  118. vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
  119. }
  120. for(;pos < BufferSize;pos++)
  121. OutBuffer[c][OutPos+pos] += data[pos]*gain;
  122. }
  123. }