/* mixer_sse.cpp */
  1. #include "config.h"
  2. #include <mmintrin.h>
  3. #include <xmmintrin.h>
  4. #include <algorithm>
  5. #include <array>
  6. #include <cstddef>
  7. #include <cstdint>
  8. #include <limits>
  9. #include <variant>
  10. #include "alnumeric.h"
  11. #include "alspan.h"
  12. #include "core/bsinc_defs.h"
  13. #include "core/bufferline.h"
  14. #include "core/cubic_defs.h"
  15. #include "core/mixer/hrtfdefs.h"
  16. #include "core/resampler_limits.h"
  17. #include "defs.h"
  18. #include "hrtfbase.h"
  19. #include "opthelpers.h"
/* Tag types used as template arguments to select mixer/resampler variants.
 * They are only ever used as types, so forward declarations suffice.
 */
struct CTag;
struct SSETag;
struct CubicTag;
struct BSincTag;
struct FastBSincTag;

/* For GCC proper (not Clang), force SSE code generation for this file when
 * the build target doesn't already enable it.
 */
#if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE__)
#pragma GCC target("sse")
#endif
  28. namespace {
/* The low MixerFracBits-BSincPhaseBits bits of a mixing fraction fall below
 * the bsinc phase index; they provide the factor for interpolating between
 * adjacent filter phases.
 */
constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};

/* Same layout for the cubic resampler's phase table. */
constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
  35. force_inline __m128 vmadd(const __m128 x, const __m128 y, const __m128 z) noexcept
  36. { return _mm_add_ps(x, _mm_mul_ps(y, z)); }
/* Accumulates IrSize HRIR coefficient pairs, scaled by the left and right
 * input samples, into the stereo accumulation buffer Values. Handles Values
 * being either 16-byte or only 8-byte aligned.
 */
inline void ApplyCoeffs(const al::span<float2> Values, const size_t IrSize,
    const ConstHrirSpan Coeffs, const float left, const float right)
{
    ASSUME(IrSize >= MinIrLength);
    ASSUME(IrSize <= HrirLength);
    /* Two stereo sample pairs per vector: [left, right, left, right]. */
    const auto lrlr = _mm_setr_ps(left, right, left, right);

    /* Round up the IR size to a multiple of 2 for SIMD (2 IRs for 2 channels
     * is 4 floats), to avoid cutting the last sample for odd IR counts. The
     * underlying HRIR is a fixed-size multiple of 2, any extra samples are
     * either 0 (silence) or more IR samples that get applied for "free".
     */
    const auto count4 = size_t{(IrSize+1) >> 1};

    /* This isn't technically correct to test alignment, but it's true for
     * systems that support SSE, which is the only one that needs to know the
     * alignment of Values (which alternates between 8- and 16-byte aligned).
     */
    if(!(reinterpret_cast<uintptr_t>(Values.data())&15))
    {
        /* 16-byte aligned: process two float2 pairs per aligned vector op. */
        const auto vals4 = al::span{reinterpret_cast<__m128*>(Values[0].data()), count4};
        const auto coeffs4 = al::span{reinterpret_cast<const __m128*>(Coeffs[0].data()), count4};
        std::transform(vals4.cbegin(), vals4.cend(), coeffs4.cbegin(), vals4.begin(),
            [lrlr](const __m128 &val, const __m128 &coeff) -> __m128
            { return vmadd(val, coeff, lrlr); });
    }
    else
    {
        /* Values is offset 8 bytes from a 16-byte boundary. Handle the first
         * float2 pair with a 64-bit load/store, then keep the loop's 16-byte
         * accesses aligned by pairing the high half of the previous
         * coefficient product with the low half of the current one.
         */
        auto coeffs = _mm_load_ps(Coeffs[0].data());
        auto vals = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64*>(Values[0].data()));
        auto imp0 = _mm_mul_ps(lrlr, coeffs);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi(reinterpret_cast<__m64*>(Values[0].data()), vals);
        size_t td{count4 - 1};
        size_t i{1};
        do {
            coeffs = _mm_load_ps(Coeffs[i+1].data());
            vals = _mm_load_ps(Values[i].data());
            const auto imp1 = _mm_mul_ps(lrlr, coeffs);
            /* imp0 = [imp0[2], imp0[3], imp1[0], imp1[1]], matching the
             * 8-byte offset of Values.
             */
            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
            vals = _mm_add_ps(imp0, vals);
            _mm_store_ps(Values[i].data(), vals);
            imp0 = imp1;
            i += 2;
        } while(--td);
        /* Apply the remaining high half of the last product to the final
         * float2 pair.
         */
        vals = _mm_loadl_pi(vals, reinterpret_cast<__m64*>(Values[i].data()));
        imp0 = _mm_movehl_ps(imp0, imp0);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi(reinterpret_cast<__m64*>(Values[i].data()), vals);
    }
}
/* Mixes InSamples into dst, stepping the gain from CurrentGain toward
 * TargetGain by (TargetGain-CurrentGain)*delta per sample over the first
 * fade_len samples, then applying a constant TargetGain to the rest.
 * realign_len is the number of post-fade samples needed to bring the write
 * position back to a multiple of 4 for the vectorized constant-gain loop.
 * CurrentGain is updated: to the gain reached so far if the full ramp
 * (Counter samples) doesn't complete within this call, else to TargetGain.
 */
force_inline void MixLine(const al::span<const float> InSamples, const al::span<float> dst,
    float &CurrentGain, const float TargetGain, const float delta, const size_t fade_len,
    const size_t realign_len, size_t Counter)
{
    const auto step = float{(TargetGain-CurrentGain) * delta};

    size_t pos{0};
    if(std::abs(step) > std::numeric_limits<float>::epsilon())
    {
        const auto gain = CurrentGain;
        auto step_count = 0.0f;
        /* Mix with applying gain steps in aligned multiples of 4. */
        if(const size_t todo{fade_len >> 2})
        {
            const auto four4 = _mm_set1_ps(4.0f);
            const auto step4 = _mm_set1_ps(step);
            const auto gain4 = _mm_set1_ps(gain);
            /* Per-lane step counts for the first four samples: 0,1,2,3. */
            auto step_count4 = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
            const auto in4 = al::span{reinterpret_cast<const __m128*>(InSamples.data()),
                InSamples.size()/4}.first(todo);
            const auto out4 = al::span{reinterpret_cast<__m128*>(dst.data()), dst.size()/4};
            std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
                [gain4,step4,four4,&step_count4](const __m128 val4, __m128 dry4) -> __m128
                {
                    /* dry += val * (gain + step*step_count) */
                    dry4 = vmadd(dry4, val4, vmadd(gain4, step4, step_count4));
                    step_count4 = _mm_add_ps(step_count4, four4);
                    return dry4;
                });
            pos += in4.size()*4;

            /* NOTE: step_count4 now represents the next four counts after the
             * last four mixed samples, so the lowest element represents the
             * next step count to apply.
             */
            step_count = _mm_cvtss_f32(step_count4);
        }
        /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
        if(const size_t leftover{fade_len&3})
        {
            const auto in = InSamples.subspan(pos, leftover);
            const auto out = dst.subspan(pos);
            std::transform(in.begin(), in.end(), out.begin(), out.begin(),
                [gain,step,&step_count](const float val, float dry) noexcept -> float
                {
                    dry += val * (gain + step*step_count);
                    step_count += 1.0f;
                    return dry;
                });
            pos += leftover;
        }
        if(pos < Counter)
        {
            /* The ramp didn't complete this call; store the gain reached so
             * the next call continues from it.
             */
            CurrentGain = gain + step*step_count;
            return;
        }

        /* Mix until pos is aligned with 4 or the mix is done. */
        if(const size_t leftover{realign_len&3})
        {
            const auto in = InSamples.subspan(pos, leftover);
            const auto out = dst.subspan(pos);
            std::transform(in.begin(), in.end(), out.begin(), out.begin(),
                [TargetGain](const float val, const float dry) noexcept -> float
                { return dry + val*TargetGain; });
            pos += leftover;
        }
    }
    CurrentGain = TargetGain;

    /* Nothing else to mix if the target gain is effectively silence. */
    if(!(std::abs(TargetGain) > GainSilenceThreshold))
        return;

    /* Vectorized constant-gain mix of the remaining aligned samples. */
    if(size_t todo{(InSamples.size()-pos) >> 2})
    {
        const auto in4 = al::span{reinterpret_cast<const __m128*>(InSamples.data()),
            InSamples.size()/4}.last(todo);
        const auto out = dst.subspan(pos);
        const auto out4 = al::span{reinterpret_cast<__m128*>(out.data()), out.size()/4};
        const auto gain4 = _mm_set1_ps(TargetGain);
        std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
            [gain4](const __m128 val4, const __m128 dry4) -> __m128
            { return vmadd(dry4, val4, gain4); });
        pos += in4.size()*4;
    }
    /* Scalar tail for the last 1-3 samples. */
    if(const size_t leftover{(InSamples.size()-pos)&3})
    {
        const auto in = InSamples.last(leftover);
        const auto out = dst.subspan(pos);
        std::transform(in.begin(), in.end(), out.begin(), out.begin(),
            [TargetGain](const float val, const float dry) noexcept -> float
            { return dry + val*TargetGain; });
    }
}
  175. } // namespace
/* 4-tap cubic resampler, SSE variant. For each output sample, interpolates
 * between two precomputed filter phases and applies the 4-tap filter with one
 * vector multiply plus a horizontal sum.
 */
template<>
void Resample_<CubicTag,SSETag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    ASSUME(frac < MixerFracOne);

    const auto filter = std::get<CubicState>(*state).filter;

    /* The 4-sample window starts one sample before the resampler edge. */
    size_t pos{MaxResamplerEdge-1};
    std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,filter]() -> float
    {
        /* Split the fraction into a phase table index and the factor for
         * interpolating to the next phase.
         */
        const uint pi{frac >> CubicPhaseDiffBits}; ASSUME(pi < CubicPhaseCount);
        const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
        const __m128 pf4{_mm_set1_ps(pf)};

        /* Apply the phase interpolated filter. */

        /* f = fil + pf*phd */
        const __m128 f4 = vmadd(_mm_load_ps(filter[pi].mCoeffs.data()), pf4,
            _mm_load_ps(filter[pi].mDeltas.data()));
        /* r = f*src */
        __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(&src[pos]))};

        /* Horizontal sum of the four products. */
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        const float output{_mm_cvtss_f32(r4)};

        /* Advance by the integer carry of the updated fraction. */
        frac += increment;
        pos += frac>>MixerFracBits;
        frac &= MixerFracMask;
        return output;
    });
}
/* Band-limited sinc resampler, SSE variant. Interpolates the m-tap filter
 * both between phases (pf) and along the band-limit scale (sf) before
 * applying it.
 */
template<>
void Resample_<BSincTag,SSETag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    const auto &bsinc = std::get<BsincState>(*state);
    const auto sf4 = _mm_set1_ps(bsinc.sf);
    const auto m = size_t{bsinc.m};
    ASSUME(m > 0);
    ASSUME(m <= MaxResamplerPadding);
    ASSUME(frac < MixerFracOne);

    /* Four coefficient tables (filter, phase deltas, scale deltas, and
     * scale-phase deltas), each with BSincPhaseCount phases of m taps.
     */
    const auto filter = bsinc.filter.first(4_uz*BSincPhaseCount*m);

    ASSUME(bsinc.l <= MaxResamplerEdge);
    auto pos = size_t{MaxResamplerEdge-bsinc.l};
    std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,sf4,m,filter]() -> float
    {
        // Calculate the phase index and factor.
        const size_t pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};

        // Apply the scale and phase interpolated filter.
        auto r4 = _mm_setzero_ps();
        {
            const auto pf4 = _mm_set1_ps(pf);
            /* The filter and phase-delta rows for this phase sit in the first
             * half of the table; the scale and scale-phase deltas follow in
             * the second half at the same offsets.
             */
            const auto fil = filter.subspan(2_uz*pi*m);
            const auto phd = fil.subspan(m);
            const auto scd = fil.subspan(2_uz*BSincPhaseCount*m);
            const auto spd = scd.subspan(m);
            auto td = size_t{m >> 2};
            auto j = size_t{0};
            do {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const __m128 f4 = vmadd(
                    vmadd(_mm_load_ps(&fil[j]), sf4, _mm_load_ps(&scd[j])),
                    pf4, vmadd(_mm_load_ps(&phd[j]), sf4, _mm_load_ps(&spd[j])));
                /* r += f*src */
                r4 = vmadd(r4, f4, _mm_loadu_ps(&src[pos+j]));
                j += 4;
            } while(--td);
        }
        /* Horizontal sum of the four accumulator lanes. */
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        const auto output = _mm_cvtss_f32(r4);

        frac += increment;
        pos += frac>>MixerFracBits;
        frac &= MixerFracMask;
        return output;
    });
}
/* Band-limited sinc resampler without scale interpolation, SSE variant. Only
 * the phase interpolation (pf) is applied, using two coefficient tables
 * instead of four.
 */
template<>
void Resample_<FastBSincTag,SSETag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    const auto &bsinc = std::get<BsincState>(*state);
    const auto m = size_t{bsinc.m};
    ASSUME(m > 0);
    ASSUME(m <= MaxResamplerPadding);
    ASSUME(frac < MixerFracOne);

    /* Two coefficient tables (filter and phase deltas), each with
     * BSincPhaseCount phases of m taps.
     */
    const auto filter = bsinc.filter.first(2_uz*m*BSincPhaseCount);

    ASSUME(bsinc.l <= MaxResamplerEdge);
    size_t pos{MaxResamplerEdge-bsinc.l};
    std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,filter,m]() -> float
    {
        // Calculate the phase index and factor.
        const size_t pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};

        // Apply the phase interpolated filter.
        auto r4 = _mm_setzero_ps();
        {
            const auto pf4 = _mm_set1_ps(pf);
            const auto fil = filter.subspan(2_uz*m*pi);
            const auto phd = fil.subspan(m);
            auto td = size_t{m >> 2};
            auto j = size_t{0};
            do {
                /* f = fil + pf*phd */
                const auto f4 = vmadd(_mm_load_ps(&fil[j]), pf4, _mm_load_ps(&phd[j]));
                /* r += f*src */
                r4 = vmadd(r4, f4, _mm_loadu_ps(&src[pos+j]));
                j += 4;
            } while(--td);
        }
        /* Horizontal sum of the four accumulator lanes. */
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        const auto output = _mm_cvtss_f32(r4);

        frac += increment;
        pos += frac>>MixerFracBits;
        frac &= MixerFracMask;
        return output;
    });
}
  292. template<>
  293. void MixHrtf_<SSETag>(const al::span<const float> InSamples, const al::span<float2> AccumSamples,
  294. const uint IrSize, const MixHrtfFilter *hrtfparams, const size_t SamplesToDo)
  295. { MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, SamplesToDo); }
  296. template<>
  297. void MixHrtfBlend_<SSETag>(const al::span<const float> InSamples,
  298. const al::span<float2> AccumSamples, const uint IrSize, const HrtfFilter *oldparams,
  299. const MixHrtfFilter *newparams, const size_t SamplesToDo)
  300. {
  301. MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
  302. SamplesToDo);
  303. }
  304. template<>
  305. void MixDirectHrtf_<SSETag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
  306. const al::span<const FloatBufferLine> InSamples, const al::span<float2> AccumSamples,
  307. const al::span<float,BufferLineSize> TempBuf, const al::span<HrtfChannelState> ChanState,
  308. const size_t IrSize, const size_t SamplesToDo)
  309. {
  310. MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
  311. IrSize, SamplesToDo);
  312. }
  313. template<>
  314. void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
  315. const al::span<float> CurrentGains, const al::span<const float> TargetGains,
  316. const size_t Counter, const size_t OutPos)
  317. {
  318. if((OutPos&3) != 0) UNLIKELY
  319. return Mix_<CTag>(InSamples, OutBuffer, CurrentGains, TargetGains, Counter, OutPos);
  320. const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
  321. const auto fade_len = std::min(Counter, InSamples.size());
  322. const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;
  323. auto curgains = CurrentGains.begin();
  324. auto targetgains = TargetGains.cbegin();
  325. for(FloatBufferLine &output : OutBuffer)
  326. MixLine(InSamples, al::span{output}.subspan(OutPos), *curgains++, *targetgains++, delta,
  327. fade_len, realign_len, Counter);
  328. }
  329. template<>
  330. void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<float> OutBuffer,
  331. float &CurrentGain, const float TargetGain, const size_t Counter)
  332. {
  333. if((reinterpret_cast<uintptr_t>(OutBuffer.data())&15) != 0) UNLIKELY
  334. return Mix_<CTag>(InSamples, OutBuffer, CurrentGain, TargetGain, Counter);
  335. const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
  336. const auto fade_len = std::min(Counter, InSamples.size());
  337. const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;
  338. MixLine(InSamples, OutBuffer, CurrentGain, TargetGain, delta, fade_len, realign_len, Counter);
  339. }