/* phase_shifter.h */
  1. #ifndef PHASE_SHIFTER_H
  2. #define PHASE_SHIFTER_H
#ifdef HAVE_SSE_INTRINSICS
#include <xmmintrin.h>
#elif defined(HAVE_NEON)
#include <arm_neon.h>
#endif

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>

#include "alnumbers.h"
#include "alspan.h"
#include "opthelpers.h"
  14. /* Implements a wide-band +90 degree phase-shift. Note that this should be
  15. * given one sample less of a delay (FilterSize/2 - 1) compared to the direct
  16. * signal delay (FilterSize/2) to properly align.
  17. */
  18. template<std::size_t FilterSize>
  19. struct PhaseShifterT {
  20. static_assert(FilterSize >= 16, "FilterSize needs to be at least 16");
  21. static_assert((FilterSize&(FilterSize-1)) == 0, "FilterSize needs to be power-of-two");
  22. alignas(16) std::array<float,FilterSize/2> mCoeffs{};
  23. PhaseShifterT()
  24. {
  25. /* Every other coefficient is 0, so we only need to calculate and store
  26. * the non-0 terms and double-step over the input to apply it. The
  27. * calculated coefficients are in reverse to make applying in the time-
  28. * domain more efficient.
  29. */
  30. for(std::size_t i{0};i < FilterSize/2;++i)
  31. {
  32. const int k{static_cast<int>(i*2 + 1) - int{FilterSize/2}};
  33. /* Calculate the Blackman window value for this coefficient. */
  34. const double w{2.0*al::numbers::pi * static_cast<double>(i*2 + 1)
  35. / double{FilterSize}};
  36. const double window{0.3635819 - 0.4891775*std::cos(w) + 0.1365995*std::cos(2.0*w)
  37. - 0.0106411*std::cos(3.0*w)};
  38. const double pk{al::numbers::pi * static_cast<double>(k)};
  39. mCoeffs[i] = static_cast<float>(window * (1.0-std::cos(pk)) / pk);
  40. }
  41. }
  42. void process(const al::span<float> dst, const al::span<const float> src) const;
  43. private:
  44. #if defined(HAVE_NEON)
  45. static auto unpacklo(float32x4_t a, float32x4_t b)
  46. {
  47. float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
  48. return vcombine_f32(result.val[0], result.val[1]);
  49. }
  50. static auto unpackhi(float32x4_t a, float32x4_t b)
  51. {
  52. float32x2x2_t result{vzip_f32(vget_high_f32(a), vget_high_f32(b))};
  53. return vcombine_f32(result.val[0], result.val[1]);
  54. }
  55. static auto load4(float32_t a, float32_t b, float32_t c, float32_t d)
  56. {
  57. float32x4_t ret{vmovq_n_f32(a)};
  58. ret = vsetq_lane_f32(b, ret, 1);
  59. ret = vsetq_lane_f32(c, ret, 2);
  60. ret = vsetq_lane_f32(d, ret, 3);
  61. return ret;
  62. }
  63. static void vtranspose4(float32x4_t &x0, float32x4_t &x1, float32x4_t &x2, float32x4_t &x3)
  64. {
  65. float32x4x2_t t0_{vzipq_f32(x0, x2)};
  66. float32x4x2_t t1_{vzipq_f32(x1, x3)};
  67. float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
  68. float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
  69. x0 = u0_.val[0];
  70. x1 = u0_.val[1];
  71. x2 = u1_.val[0];
  72. x3 = u1_.val[1];
  73. }
  74. #endif
  75. };
/* Applies the +90 degree phase-shift filter, producing dst.size() output
 * samples. The input is double-stepped over (only every other coefficient is
 * non-zero), so `src` must extend past dst.size() far enough for the offset
 * reads below — NOTE(review): roughly dst.size() + FilterSize samples by the
 * indexing here; confirm the exact requirement against callers.
 */
template<std::size_t S>
NOINLINE inline
void PhaseShifterT<S>::process(const al::span<float> dst, const al::span<const float> src) const
{
    auto in = src.begin();
#ifdef HAVE_SSE_INTRINSICS
    /* Main loop: produce four output samples per iteration. */
    if(const std::size_t todo{dst.size()>>2})
    {
        auto out = al::span{reinterpret_cast<__m128*>(dst.data()), todo};
        std::generate(out.begin(), out.end(), [&in,this]
        {
            /* r0..r3 accumulate the dot products for four consecutive output
             * samples; all share the same coefficients, with the input
             * shifted one sample further for each.
             */
            __m128 r0{_mm_setzero_ps()};
            __m128 r1{_mm_setzero_ps()};
            __m128 r2{_mm_setzero_ps()};
            __m128 r3{_mm_setzero_ps()};
            for(std::size_t j{0};j < mCoeffs.size();j+=4)
            {
                const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
                /* s0 = in[j*2 + 0..3], s1 = in[j*2 + 4..7]. */
                const __m128 s0{_mm_loadu_ps(&in[j*2])};
                const __m128 s1{_mm_loadu_ps(&in[j*2 + 4])};
                /* s2 = in[j*2 + 2..5] (s0/s1 shifted by two samples). */
                const __m128 s2{_mm_movehl_ps(_mm_movelh_ps(s1, s1), s0)};
                /* s3 = in[j*2 + 6..9] (pulls in two lookahead samples). */
                const __m128 s3{_mm_loadh_pi(_mm_movehl_ps(s1, s1),
                    reinterpret_cast<const __m64*>(&in[j*2 + 8]))};
                /* Even-offset samples of s0/s1, for output sample 0. */
                __m128 s{_mm_shuffle_ps(s0, s1, _MM_SHUFFLE(2, 0, 2, 0))};
                r0 = _mm_add_ps(r0, _mm_mul_ps(s, coeffs));
                /* Odd-offset samples, for output sample 1. */
                s = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1));
                r1 = _mm_add_ps(r1, _mm_mul_ps(s, coeffs));
                /* Same again from s2/s3 (shifted by two), for outputs 2/3. */
                s = _mm_shuffle_ps(s2, s3, _MM_SHUFFLE(2, 0, 2, 0));
                r2 = _mm_add_ps(r2, _mm_mul_ps(s, coeffs));
                s = _mm_shuffle_ps(s2, s3, _MM_SHUFFLE(3, 1, 3, 1));
                r3 = _mm_add_ps(r3, _mm_mul_ps(s, coeffs));
            }
            in += 4;

            /* Transpose so each lane gathers one accumulator's partial sums,
             * then add them to get the four final output samples.
             */
            _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
            return _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
        });
    }
    /* Tail: handle the remaining 1-3 output samples one at a time. */
    if(const std::size_t todo{dst.size()&3})
    {
        auto out = dst.last(todo);
        std::generate(out.begin(), out.end(), [&in,this]
        {
            __m128 r4{_mm_setzero_ps()};
            for(std::size_t j{0};j < mCoeffs.size();j+=4)
            {
                const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
                /* Gather every other input sample, matching the stored
                 * non-zero coefficients. */
                const __m128 s{_mm_setr_ps(in[j*2], in[j*2 + 2], in[j*2 + 4], in[j*2 + 6])};
                r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs));
            }
            ++in;
            /* Horizontal sum of r4's four lanes. */
            r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
            r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
            return _mm_cvtss_f32(r4);
        });
    }
#elif defined(HAVE_NEON)
    /* Main loop: produce four output samples per iteration. */
    if(const std::size_t todo{dst.size()>>2})
    {
        auto out = al::span{reinterpret_cast<float32x4_t*>(dst.data()), todo};
        std::generate(out.begin(), out.end(), [&in,this]
        {
            /* Four accumulators, one per output sample; same coefficients,
             * input shifted by one sample for each. */
            float32x4_t r0{vdupq_n_f32(0.0f)};
            float32x4_t r1{vdupq_n_f32(0.0f)};
            float32x4_t r2{vdupq_n_f32(0.0f)};
            float32x4_t r3{vdupq_n_f32(0.0f)};
            for(std::size_t j{0};j < mCoeffs.size();j+=4)
            {
                const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
                /* s0 = in[j*2 + 0..3], s1 = in[j*2 + 4..7]. */
                const float32x4_t s0{vld1q_f32(&in[j*2])};
                const float32x4_t s1{vld1q_f32(&in[j*2 + 4])};
                /* s2 = in[j*2 + 2..5], s3 = in[j*2 + 6..9]. */
                const float32x4_t s2{vcombine_f32(vget_high_f32(s0), vget_low_f32(s1))};
                const float32x4_t s3{vcombine_f32(vget_high_f32(s1), vld1_f32(&in[j*2 + 8]))};
                /* De-interleave into even-offset (val[0]) and odd-offset
                 * (val[1]) samples: inputs for outputs 0/1 and 2/3. */
                const float32x4x2_t values0{vuzpq_f32(s0, s1)};
                const float32x4x2_t values1{vuzpq_f32(s2, s3)};
                r0 = vmlaq_f32(r0, values0.val[0], coeffs);
                r1 = vmlaq_f32(r1, values0.val[1], coeffs);
                r2 = vmlaq_f32(r2, values1.val[0], coeffs);
                r3 = vmlaq_f32(r3, values1.val[1], coeffs);
            }
            in += 4;

            /* Transpose so each lane gathers one accumulator's partial sums,
             * then add them for the four final output samples. */
            vtranspose4(r0, r1, r2, r3);
            return vaddq_f32(vaddq_f32(r0, r1), vaddq_f32(r2, r3));
        });
    }
    /* Tail: handle the remaining 1-3 output samples one at a time. */
    if(const std::size_t todo{dst.size()&3})
    {
        auto out = dst.last(todo);
        std::generate(out.begin(), out.end(), [&in,this]
        {
            float32x4_t r4{vdupq_n_f32(0.0f)};
            for(std::size_t j{0};j < mCoeffs.size();j+=4)
            {
                const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
                /* Gather every other input sample, matching the stored
                 * non-zero coefficients. */
                const float32x4_t s{load4(in[j*2], in[j*2 + 2], in[j*2 + 4], in[j*2 + 6])};
                r4 = vmlaq_f32(r4, s, coeffs);
            }
            ++in;
            /* Horizontal sum of r4's four lanes. */
            r4 = vaddq_f32(r4, vrev64q_f32(r4));
            return vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
        });
    }
#else
    /* Reference C path: direct FIR convolution over every other sample. */
    std::generate(dst.begin(), dst.end(), [&in,this]
    {
        float ret{0.0f};
        for(std::size_t j{0};j < mCoeffs.size();++j)
            ret += in[j*2] * mCoeffs[j];
        ++in;
        return ret;
    });
#endif
}
  188. #endif /* PHASE_SHIFTER_H */