// mixer_neon.cpp -- NEON-accelerated mixer and resampler implementations.
#include "config.h"

#include <arm_neon.h>

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <variant>

#include "alnumeric.h"
#include "alspan.h"
#include "core/bsinc_defs.h"
#include "core/bufferline.h"
#include "core/cubic_defs.h"
#include "core/mixer/hrtfdefs.h"
#include "core/resampler_limits.h"
#include "defs.h"
#include "hrtfbase.h"
#include "opthelpers.h"
  18. struct CTag;
  19. struct NEONTag;
  20. struct LerpTag;
  21. struct CubicTag;
  22. struct BSincTag;
  23. struct FastBSincTag;
  24. #if defined(__GNUC__) && !defined(__clang__) && !defined(__ARM_NEON)
  25. #pragma GCC target("fpu=neon")
  26. #endif
  27. using uint = unsigned int;
  28. namespace {
  29. constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
  30. constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
  31. constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};
  32. constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
  33. constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
  34. constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
  35. force_inline
  36. void vtranspose4(float32x4_t &x0, float32x4_t &x1, float32x4_t &x2, float32x4_t &x3) noexcept
  37. {
  38. float32x4x2_t t0_{vzipq_f32(x0, x2)};
  39. float32x4x2_t t1_{vzipq_f32(x1, x3)};
  40. float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
  41. float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
  42. x0 = u0_.val[0];
  43. x1 = u0_.val[1];
  44. x2 = u1_.val[0];
  45. x3 = u1_.val[1];
  46. }
  47. inline float32x4_t set_f4(float l0, float l1, float l2, float l3)
  48. {
  49. float32x4_t ret{vmovq_n_f32(l0)};
  50. ret = vsetq_lane_f32(l1, ret, 1);
  51. ret = vsetq_lane_f32(l2, ret, 2);
  52. ret = vsetq_lane_f32(l3, ret, 3);
  53. return ret;
  54. }
  55. inline void ApplyCoeffs(const al::span<float2> Values, const size_t IrSize,
  56. const ConstHrirSpan Coeffs, const float left, const float right)
  57. {
  58. ASSUME(IrSize >= MinIrLength);
  59. ASSUME(IrSize <= HrirLength);
  60. auto dup_samples = [left,right]() -> float32x4_t
  61. {
  62. float32x2_t leftright2{vset_lane_f32(right, vmov_n_f32(left), 1)};
  63. return vcombine_f32(leftright2, leftright2);
  64. };
  65. const auto leftright4 = dup_samples();
  66. /* Using a loop here instead of std::transform since some builds seem to
  67. * have an issue with accessing an array/span of float32x4_t.
  68. */
  69. for(size_t c{0};c < IrSize;c += 2)
  70. {
  71. auto vals = vld1q_f32(&Values[c][0]);
  72. vals = vmlaq_f32(vals, vld1q_f32(&Coeffs[c][0]), leftright4);
  73. vst1q_f32(&Values[c][0], vals);
  74. }
  75. }
  76. force_inline void MixLine(const al::span<const float> InSamples, const al::span<float> dst,
  77. float &CurrentGain, const float TargetGain, const float delta, const size_t fade_len,
  78. const size_t realign_len, size_t Counter)
  79. {
  80. const auto step = float{(TargetGain-CurrentGain) * delta};
  81. auto pos = size_t{0};
  82. if(std::abs(step) > std::numeric_limits<float>::epsilon())
  83. {
  84. const auto gain = float{CurrentGain};
  85. auto step_count = float{0.0f};
  86. /* Mix with applying gain steps in aligned multiples of 4. */
  87. if(const size_t todo{fade_len >> 2})
  88. {
  89. const auto four4 = vdupq_n_f32(4.0f);
  90. const auto step4 = vdupq_n_f32(step);
  91. const auto gain4 = vdupq_n_f32(gain);
  92. auto step_count4 = set_f4(0.0f, 1.0f, 2.0f, 3.0f);
  93. const auto in4 = al::span{reinterpret_cast<const float32x4_t*>(InSamples.data()),
  94. InSamples.size()/4}.first(todo);
  95. const auto out4 = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
  96. std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
  97. [gain4,step4,four4,&step_count4](const float32x4_t val4, float32x4_t dry4)
  98. {
  99. /* dry += val * (gain + step*step_count) */
  100. dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
  101. step_count4 = vaddq_f32(step_count4, four4);
  102. return dry4;
  103. });
  104. pos += in4.size()*4;
  105. /* NOTE: step_count4 now represents the next four counts after the
  106. * last four mixed samples, so the lowest element represents the
  107. * next step count to apply.
  108. */
  109. step_count = vgetq_lane_f32(step_count4, 0);
  110. }
  111. /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
  112. if(const size_t leftover{fade_len&3})
  113. {
  114. const auto in = InSamples.subspan(pos, leftover);
  115. const auto out = dst.subspan(pos);
  116. std::transform(in.begin(), in.end(), out.begin(), out.begin(),
  117. [gain,step,&step_count](const float val, float dry) noexcept -> float
  118. {
  119. dry += val * (gain + step*step_count);
  120. step_count += 1.0f;
  121. return dry;
  122. });
  123. pos += leftover;
  124. }
  125. if(pos < Counter)
  126. {
  127. CurrentGain = gain + step*step_count;
  128. return;
  129. }
  130. /* Mix until pos is aligned with 4 or the mix is done. */
  131. if(const size_t leftover{realign_len&3})
  132. {
  133. const auto in = InSamples.subspan(pos, leftover);
  134. const auto out = dst.subspan(pos);
  135. std::transform(in.begin(), in.end(), out.begin(), out.begin(),
  136. [TargetGain](const float val, const float dry) noexcept -> float
  137. { return dry + val*TargetGain; });
  138. pos += leftover;
  139. }
  140. }
  141. CurrentGain = TargetGain;
  142. if(!(std::abs(TargetGain) > GainSilenceThreshold))
  143. return;
  144. if(const size_t todo{(InSamples.size()-pos) >> 2})
  145. {
  146. const auto in4 = al::span{reinterpret_cast<const float32x4_t*>(InSamples.data()),
  147. InSamples.size()/4}.last(todo);
  148. const auto out = dst.subspan(pos);
  149. const auto out4 = al::span{reinterpret_cast<float32x4_t*>(out.data()), out.size()/4};
  150. const auto gain4 = vdupq_n_f32(TargetGain);
  151. std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
  152. [gain4](const float32x4_t val4, const float32x4_t dry4) -> float32x4_t
  153. { return vmlaq_f32(dry4, val4, gain4); });
  154. pos += in4.size()*4;
  155. }
  156. if(const size_t leftover{(InSamples.size()-pos)&3})
  157. {
  158. const auto in = InSamples.last(leftover);
  159. const auto out = dst.subspan(pos);
  160. std::transform(in.begin(), in.end(), out.begin(), out.begin(),
  161. [TargetGain](const float val, const float dry) noexcept -> float
  162. { return dry + val*TargetGain; });
  163. }
  164. }
  165. } // namespace
  166. template<>
  167. void Resample_<LerpTag,NEONTag>(const InterpState*, const al::span<const float> src, uint frac,
  168. const uint increment, const al::span<float> dst)
  169. {
  170. ASSUME(frac < MixerFracOne);
  171. const uint32x4_t increment4 = vdupq_n_u32(increment*4u);
  172. const float32x4_t fracOne4 = vdupq_n_f32(1.0f/MixerFracOne);
  173. const uint32x4_t fracMask4 = vdupq_n_u32(MixerFracMask);
  174. alignas(16) std::array<uint,4> pos_{}, frac_{};
  175. InitPosArrays(MaxResamplerEdge, frac, increment, al::span{frac_}, al::span{pos_});
  176. uint32x4_t frac4 = vld1q_u32(frac_.data());
  177. uint32x4_t pos4 = vld1q_u32(pos_.data());
  178. auto vecout = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
  179. std::generate(vecout.begin(), vecout.end(), [=,&pos4,&frac4]() -> float32x4_t
  180. {
  181. const uint pos0{vgetq_lane_u32(pos4, 0)};
  182. const uint pos1{vgetq_lane_u32(pos4, 1)};
  183. const uint pos2{vgetq_lane_u32(pos4, 2)};
  184. const uint pos3{vgetq_lane_u32(pos4, 3)};
  185. ASSUME(pos0 <= pos1); ASSUME(pos1 <= pos2); ASSUME(pos2 <= pos3);
  186. const float32x4_t val1{set_f4(src[pos0], src[pos1], src[pos2], src[pos3])};
  187. const float32x4_t val2{set_f4(src[pos0+1_uz], src[pos1+1_uz], src[pos2+1_uz], src[pos3+1_uz])};
  188. /* val1 + (val2-val1)*mu */
  189. const float32x4_t r0{vsubq_f32(val2, val1)};
  190. const float32x4_t mu{vmulq_f32(vcvtq_f32_u32(frac4), fracOne4)};
  191. const float32x4_t out{vmlaq_f32(val1, mu, r0)};
  192. frac4 = vaddq_u32(frac4, increment4);
  193. pos4 = vaddq_u32(pos4, vshrq_n_u32(frac4, MixerFracBits));
  194. frac4 = vandq_u32(frac4, fracMask4);
  195. return out;
  196. });
  197. if(size_t todo{dst.size()&3})
  198. {
  199. auto pos = size_t{vgetq_lane_u32(pos4, 0)};
  200. frac = vgetq_lane_u32(frac4, 0);
  201. const auto out = dst.last(todo);
  202. std::generate(out.begin(), out.end(), [&pos,&frac,src,increment]
  203. {
  204. const float output{lerpf(src[pos+0], src[pos+1],
  205. static_cast<float>(frac) * (1.0f/MixerFracOne))};
  206. frac += increment;
  207. pos += frac>>MixerFracBits;
  208. frac &= MixerFracMask;
  209. return output;
  210. });
  211. }
  212. }
  213. template<>
  214. void Resample_<CubicTag,NEONTag>(const InterpState *state, const al::span<const float> src,
  215. uint frac, const uint increment, const al::span<float> dst)
  216. {
  217. ASSUME(frac < MixerFracOne);
  218. const auto filter = std::get<CubicState>(*state).filter;
  219. const uint32x4_t increment4{vdupq_n_u32(increment*4u)};
  220. const uint32x4_t fracMask4{vdupq_n_u32(MixerFracMask)};
  221. const float32x4_t fracDiffOne4{vdupq_n_f32(1.0f/CubicPhaseDiffOne)};
  222. const uint32x4_t fracDiffMask4{vdupq_n_u32(CubicPhaseDiffMask)};
  223. alignas(16) std::array<uint,4> pos_{}, frac_{};
  224. InitPosArrays(MaxResamplerEdge-1, frac, increment, al::span{frac_}, al::span{pos_});
  225. uint32x4_t frac4{vld1q_u32(frac_.data())};
  226. uint32x4_t pos4{vld1q_u32(pos_.data())};
  227. auto vecout = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
  228. std::generate(vecout.begin(), vecout.end(), [=,&pos4,&frac4]
  229. {
  230. const uint pos0{vgetq_lane_u32(pos4, 0)};
  231. const uint pos1{vgetq_lane_u32(pos4, 1)};
  232. const uint pos2{vgetq_lane_u32(pos4, 2)};
  233. const uint pos3{vgetq_lane_u32(pos4, 3)};
  234. ASSUME(pos0 <= pos1); ASSUME(pos1 <= pos2); ASSUME(pos2 <= pos3);
  235. const float32x4_t val0{vld1q_f32(&src[pos0])};
  236. const float32x4_t val1{vld1q_f32(&src[pos1])};
  237. const float32x4_t val2{vld1q_f32(&src[pos2])};
  238. const float32x4_t val3{vld1q_f32(&src[pos3])};
  239. const uint32x4_t pi4{vshrq_n_u32(frac4, CubicPhaseDiffBits)};
  240. const uint pi0{vgetq_lane_u32(pi4, 0)}; ASSUME(pi0 < CubicPhaseCount);
  241. const uint pi1{vgetq_lane_u32(pi4, 1)}; ASSUME(pi1 < CubicPhaseCount);
  242. const uint pi2{vgetq_lane_u32(pi4, 2)}; ASSUME(pi2 < CubicPhaseCount);
  243. const uint pi3{vgetq_lane_u32(pi4, 3)}; ASSUME(pi3 < CubicPhaseCount);
  244. const float32x4_t pf4{vmulq_f32(vcvtq_f32_u32(vandq_u32(frac4, fracDiffMask4)),
  245. fracDiffOne4)};
  246. float32x4_t r0{vmulq_f32(val0,
  247. vmlaq_f32(vld1q_f32(filter[pi0].mCoeffs.data()), vdupq_lane_f32(vget_low_f32(pf4), 0),
  248. vld1q_f32(filter[pi0].mDeltas.data())))};
  249. float32x4_t r1{vmulq_f32(val1,
  250. vmlaq_f32(vld1q_f32(filter[pi1].mCoeffs.data()), vdupq_lane_f32(vget_low_f32(pf4), 1),
  251. vld1q_f32(filter[pi1].mDeltas.data())))};
  252. float32x4_t r2{vmulq_f32(val2,
  253. vmlaq_f32(vld1q_f32(filter[pi2].mCoeffs.data()), vdupq_lane_f32(vget_high_f32(pf4), 0),
  254. vld1q_f32(filter[pi2].mDeltas.data())))};
  255. float32x4_t r3{vmulq_f32(val3,
  256. vmlaq_f32(vld1q_f32(filter[pi3].mCoeffs.data()), vdupq_lane_f32(vget_high_f32(pf4), 1),
  257. vld1q_f32(filter[pi3].mDeltas.data())))};
  258. vtranspose4(r0, r1, r2, r3);
  259. r0 = vaddq_f32(vaddq_f32(r0, r1), vaddq_f32(r2, r3));
  260. frac4 = vaddq_u32(frac4, increment4);
  261. pos4 = vaddq_u32(pos4, vshrq_n_u32(frac4, MixerFracBits));
  262. frac4 = vandq_u32(frac4, fracMask4);
  263. return r0;
  264. });
  265. if(const size_t todo{dst.size()&3})
  266. {
  267. auto pos = size_t{vgetq_lane_u32(pos4, 0)};
  268. frac = vgetq_lane_u32(frac4, 0);
  269. auto out = dst.last(todo);
  270. std::generate(out.begin(), out.end(), [&pos,&frac,src,increment,filter]
  271. {
  272. const uint pi{frac >> CubicPhaseDiffBits}; ASSUME(pi < CubicPhaseCount);
  273. const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
  274. const float32x4_t pf4{vdupq_n_f32(pf)};
  275. const float32x4_t f4{vmlaq_f32(vld1q_f32(filter[pi].mCoeffs.data()), pf4,
  276. vld1q_f32(filter[pi].mDeltas.data()))};
  277. float32x4_t r4{vmulq_f32(f4, vld1q_f32(&src[pos]))};
  278. r4 = vaddq_f32(r4, vrev64q_f32(r4));
  279. const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};
  280. frac += increment;
  281. pos += frac>>MixerFracBits;
  282. frac &= MixerFracMask;
  283. return output;
  284. });
  285. }
  286. }
  287. template<>
  288. void Resample_<BSincTag,NEONTag>(const InterpState *state, const al::span<const float> src,
  289. uint frac, const uint increment, const al::span<float> dst)
  290. {
  291. const auto &bsinc = std::get<BsincState>(*state);
  292. const auto sf4 = vdupq_n_f32(bsinc.sf);
  293. const auto m = size_t{bsinc.m};
  294. ASSUME(m > 0);
  295. ASSUME(m <= MaxResamplerPadding);
  296. ASSUME(frac < MixerFracOne);
  297. const auto filter = bsinc.filter.first(4_uz*BSincPhaseCount*m);
  298. ASSUME(bsinc.l <= MaxResamplerEdge);
  299. auto pos = size_t{MaxResamplerEdge-bsinc.l};
  300. std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,sf4,m,filter]() -> float
  301. {
  302. // Calculate the phase index and factor.
  303. const uint pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
  304. const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
  305. // Apply the scale and phase interpolated filter.
  306. float32x4_t r4{vdupq_n_f32(0.0f)};
  307. {
  308. const float32x4_t pf4{vdupq_n_f32(pf)};
  309. const auto fil = filter.subspan(2_uz*pi*m);
  310. const auto phd = fil.subspan(m);
  311. const auto scd = fil.subspan(2_uz*BSincPhaseCount*m);
  312. const auto spd = scd.subspan(m);
  313. size_t td{m >> 2};
  314. size_t j{0u};
  315. do {
  316. /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
  317. const float32x4_t f4 = vmlaq_f32(
  318. vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j])),
  319. pf4, vmlaq_f32(vld1q_f32(&phd[j]), sf4, vld1q_f32(&spd[j])));
  320. /* r += f*src */
  321. r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[pos+j]));
  322. j += 4;
  323. } while(--td);
  324. }
  325. r4 = vaddq_f32(r4, vrev64q_f32(r4));
  326. const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};
  327. frac += increment;
  328. pos += frac>>MixerFracBits;
  329. frac &= MixerFracMask;
  330. return output;
  331. });
  332. }
  333. template<>
  334. void Resample_<FastBSincTag,NEONTag>(const InterpState *state, const al::span<const float> src,
  335. uint frac, const uint increment, const al::span<float> dst)
  336. {
  337. const auto &bsinc = std::get<BsincState>(*state);
  338. const auto m = size_t{bsinc.m};
  339. ASSUME(m > 0);
  340. ASSUME(m <= MaxResamplerPadding);
  341. ASSUME(frac < MixerFracOne);
  342. const auto filter = bsinc.filter.first(2_uz*BSincPhaseCount*m);
  343. ASSUME(bsinc.l <= MaxResamplerEdge);
  344. auto pos = size_t{MaxResamplerEdge-bsinc.l};
  345. std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,m,filter]() -> float
  346. {
  347. // Calculate the phase index and factor.
  348. const uint pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
  349. const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
  350. // Apply the phase interpolated filter.
  351. float32x4_t r4{vdupq_n_f32(0.0f)};
  352. {
  353. const float32x4_t pf4{vdupq_n_f32(pf)};
  354. const auto fil = filter.subspan(2_uz*pi*m);
  355. const auto phd = fil.subspan(m);
  356. size_t td{m >> 2};
  357. size_t j{0u};
  358. do {
  359. /* f = fil + pf*phd */
  360. const float32x4_t f4 = vmlaq_f32(vld1q_f32(&fil[j]), pf4, vld1q_f32(&phd[j]));
  361. /* r += f*src */
  362. r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[pos+j]));
  363. j += 4;
  364. } while(--td);
  365. }
  366. r4 = vaddq_f32(r4, vrev64q_f32(r4));
  367. const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};
  368. frac += increment;
  369. pos += frac>>MixerFracBits;
  370. frac &= MixerFracMask;
  371. return output;
  372. });
  373. }
  374. template<>
  375. void MixHrtf_<NEONTag>(const al::span<const float> InSamples, const al::span<float2> AccumSamples,
  376. const uint IrSize, const MixHrtfFilter *hrtfparams, const size_t SamplesToDo)
  377. { MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, SamplesToDo); }
  378. template<>
  379. void MixHrtfBlend_<NEONTag>(const al::span<const float> InSamples,
  380. const al::span<float2> AccumSamples, const uint IrSize, const HrtfFilter *oldparams,
  381. const MixHrtfFilter *newparams, const size_t SamplesToDo)
  382. {
  383. MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
  384. SamplesToDo);
  385. }
  386. template<>
  387. void MixDirectHrtf_<NEONTag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
  388. const al::span<const FloatBufferLine> InSamples, const al::span<float2> AccumSamples,
  389. const al::span<float,BufferLineSize> TempBuf, const al::span<HrtfChannelState> ChanState,
  390. const size_t IrSize, const size_t SamplesToDo)
  391. {
  392. MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
  393. IrSize, SamplesToDo);
  394. }
  395. template<>
  396. void Mix_<NEONTag>(const al::span<const float> InSamples,const al::span<FloatBufferLine> OutBuffer,
  397. const al::span<float> CurrentGains, const al::span<const float> TargetGains,
  398. const size_t Counter, const size_t OutPos)
  399. {
  400. if((OutPos&3) != 0) UNLIKELY
  401. return Mix_<CTag>(InSamples, OutBuffer, CurrentGains, TargetGains, Counter, OutPos);
  402. const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
  403. const auto fade_len = std::min(Counter, InSamples.size());
  404. const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;
  405. auto curgains = CurrentGains.begin();
  406. auto targetgains = TargetGains.cbegin();
  407. for(FloatBufferLine &output : OutBuffer)
  408. MixLine(InSamples, al::span{output}.subspan(OutPos), *curgains++, *targetgains++, delta,
  409. fade_len, realign_len, Counter);
  410. }
  411. template<>
  412. void Mix_<NEONTag>(const al::span<const float> InSamples, const al::span<float> OutBuffer,
  413. float &CurrentGain, const float TargetGain, const size_t Counter)
  414. {
  415. if((reinterpret_cast<uintptr_t>(OutBuffer.data())&15) != 0) UNLIKELY
  416. return Mix_<CTag>(InSamples, OutBuffer, CurrentGain, TargetGain, Counter);
  417. const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
  418. const auto fade_len = std::min(Counter, InSamples.size());
  419. const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;
  420. MixLine(InSamples, OutBuffer, CurrentGain, TargetGain, delta, fade_len, realign_len, Counter);
  421. }