mixer_sse2.cpp 8.8 KB


  1. /**
  2. * OpenAL cross platform audio library
  3. * Copyright (C) 2014 by Timothy Arceri <[email protected]>.
  4. * This library is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU Library General Public
  6. * License as published by the Free Software Foundation; either
  7. * version 2 of the License, or (at your option) any later version.
  8. *
  9. * This library is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * Library General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Library General Public
  15. * License along with this library; if not, write to the
  16. * Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18. * Or go to http://www.gnu.org/copyleft/lgpl.html
  19. */
  20. #include "config.h"
  21. #include <xmmintrin.h>
  22. #include <emmintrin.h>
  23. #include <algorithm>
  24. #include <array>
  25. #include <cstddef>
  26. #include <variant>
  27. #include "alnumeric.h"
  28. #include "alspan.h"
  29. #include "core/cubic_defs.h"
  30. #include "core/resampler_limits.h"
  31. #include "defs.h"
  32. #include "opthelpers.h"
  /* Tag types. They are only declared here; within this file they are used
   * solely as template arguments that select the Resample_ specializations.
   */
  struct LerpTag;
  struct CubicTag;
  struct SSE2Tag;

  /* With GCC proper (not Clang), when SSE2 code generation isn't already
   * enabled for this translation unit, switch the target on for everything
   * that follows so the SSE2 intrinsics used below are usable.
   */
  #if defined(__GNUC__) && !(defined(__clang__) || defined(__SSE2__))
  #pragma GCC target("sse2")
  #endif

  /* Shorthand used throughout this file. */
  using uint = unsigned int;
  40. namespace {
  41. constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
  42. constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
  43. constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
  44. force_inline __m128 vmadd(const __m128 x, const __m128 y, const __m128 z) noexcept
  45. { return _mm_add_ps(x, _mm_mul_ps(y, z)); }
  46. } // namespace
  47. template<>
  48. void Resample_<LerpTag,SSE2Tag>(const InterpState*, const al::span<const float> src, uint frac,
  49. const uint increment, const al::span<float> dst)
  50. {
  51. ASSUME(frac < MixerFracOne);
  52. const __m128i increment4{_mm_set1_epi32(static_cast<int>(increment*4))};
  53. const __m128 fracOne4{_mm_set1_ps(1.0f/MixerFracOne)};
  54. const __m128i fracMask4{_mm_set1_epi32(MixerFracMask)};
  55. std::array<uint,4> pos_{}, frac_{};
  56. InitPosArrays(MaxResamplerEdge, frac, increment, al::span{frac_}, al::span{pos_});
  57. __m128i frac4{_mm_setr_epi32(static_cast<int>(frac_[0]), static_cast<int>(frac_[1]),
  58. static_cast<int>(frac_[2]), static_cast<int>(frac_[3]))};
  59. __m128i pos4{_mm_setr_epi32(static_cast<int>(pos_[0]), static_cast<int>(pos_[1]),
  60. static_cast<int>(pos_[2]), static_cast<int>(pos_[3]))};
  61. auto vecout = al::span{reinterpret_cast<__m128*>(dst.data()), dst.size()/4};
  62. std::generate(vecout.begin(), vecout.end(), [=,&pos4,&frac4]() -> __m128
  63. {
  64. const auto pos0 = static_cast<uint>(_mm_cvtsi128_si32(pos4));
  65. const auto pos1 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 4)));
  66. const auto pos2 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 8)));
  67. const auto pos3 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 12)));
  68. ASSUME(pos0 <= pos1); ASSUME(pos1 <= pos2); ASSUME(pos2 <= pos3);
  69. const __m128 val1{_mm_setr_ps(src[pos0], src[pos1], src[pos2], src[pos3])};
  70. const __m128 val2{_mm_setr_ps(src[pos0+1_uz], src[pos1+1_uz], src[pos2+1_uz], src[pos3+1_uz])};
  71. /* val1 + (val2-val1)*mu */
  72. const __m128 r0{_mm_sub_ps(val2, val1)};
  73. const __m128 mu{_mm_mul_ps(_mm_cvtepi32_ps(frac4), fracOne4)};
  74. const __m128 out{_mm_add_ps(val1, _mm_mul_ps(mu, r0))};
  75. frac4 = _mm_add_epi32(frac4, increment4);
  76. pos4 = _mm_add_epi32(pos4, _mm_srli_epi32(frac4, MixerFracBits));
  77. frac4 = _mm_and_si128(frac4, fracMask4);
  78. return out;
  79. });
  80. if(size_t todo{dst.size()&3})
  81. {
  82. auto pos = size_t{static_cast<uint>(_mm_cvtsi128_si32(pos4))};
  83. frac = static_cast<uint>(_mm_cvtsi128_si32(frac4));
  84. const auto out = dst.last(todo);
  85. std::generate(out.begin(), out.end(), [&pos,&frac,src,increment]()
  86. {
  87. const float smp{lerpf(src[pos+0], src[pos+1],
  88. static_cast<float>(frac) * (1.0f/MixerFracOne))};
  89. frac += increment;
  90. pos += frac>>MixerFracBits;
  91. frac &= MixerFracMask;
  92. return smp;
  93. });
  94. }
  95. }
  96. template<>
  97. void Resample_<CubicTag,SSE2Tag>(const InterpState *state, const al::span<const float> src,
  98. uint frac, const uint increment, const al::span<float> dst)
  99. {
  100. ASSUME(frac < MixerFracOne);
  101. const auto filter = std::get<CubicState>(*state).filter;
  102. const __m128i increment4{_mm_set1_epi32(static_cast<int>(increment*4))};
  103. const __m128i fracMask4{_mm_set1_epi32(MixerFracMask)};
  104. const __m128 fracDiffOne4{_mm_set1_ps(1.0f/CubicPhaseDiffOne)};
  105. const __m128i fracDiffMask4{_mm_set1_epi32(CubicPhaseDiffMask)};
  106. std::array<uint,4> pos_{}, frac_{};
  107. InitPosArrays(MaxResamplerEdge-1, frac, increment, al::span{frac_}, al::span{pos_});
  108. __m128i frac4{_mm_setr_epi32(static_cast<int>(frac_[0]), static_cast<int>(frac_[1]),
  109. static_cast<int>(frac_[2]), static_cast<int>(frac_[3]))};
  110. __m128i pos4{_mm_setr_epi32(static_cast<int>(pos_[0]), static_cast<int>(pos_[1]),
  111. static_cast<int>(pos_[2]), static_cast<int>(pos_[3]))};
  112. auto vecout = al::span{reinterpret_cast<__m128*>(dst.data()), dst.size()/4};
  113. std::generate(vecout.begin(), vecout.end(), [=,&pos4,&frac4]
  114. {
  115. const auto pos0 = static_cast<uint>(_mm_cvtsi128_si32(pos4));
  116. const auto pos1 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 4)));
  117. const auto pos2 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 8)));
  118. const auto pos3 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 12)));
  119. ASSUME(pos0 <= pos1); ASSUME(pos1 <= pos2); ASSUME(pos2 <= pos3);
  120. const __m128 val0{_mm_loadu_ps(&src[pos0])};
  121. const __m128 val1{_mm_loadu_ps(&src[pos1])};
  122. const __m128 val2{_mm_loadu_ps(&src[pos2])};
  123. const __m128 val3{_mm_loadu_ps(&src[pos3])};
  124. const __m128i pi4{_mm_srli_epi32(frac4, CubicPhaseDiffBits)};
  125. const auto pi0 = static_cast<uint>(_mm_cvtsi128_si32(pi4));
  126. const auto pi1 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pi4, 4)));
  127. const auto pi2 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pi4, 8)));
  128. const auto pi3 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pi4, 12)));
  129. ASSUME(pi0 < CubicPhaseCount); ASSUME(pi1 < CubicPhaseCount);
  130. ASSUME(pi2 < CubicPhaseCount); ASSUME(pi3 < CubicPhaseCount);
  131. const __m128 pf4{_mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(frac4, fracDiffMask4)),
  132. fracDiffOne4)};
  133. __m128 r0{_mm_mul_ps(val0,
  134. vmadd(_mm_load_ps(filter[pi0].mCoeffs.data()),
  135. _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(0, 0, 0, 0)),
  136. _mm_load_ps(filter[pi0].mDeltas.data())))};
  137. __m128 r1{_mm_mul_ps(val1,
  138. vmadd(_mm_load_ps(filter[pi1].mCoeffs.data()),
  139. _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(1, 1, 1, 1)),
  140. _mm_load_ps(filter[pi1].mDeltas.data())))};
  141. __m128 r2{_mm_mul_ps(val2,
  142. vmadd(_mm_load_ps(filter[pi2].mCoeffs.data()),
  143. _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(2, 2, 2, 2)),
  144. _mm_load_ps(filter[pi2].mDeltas.data())))};
  145. __m128 r3{_mm_mul_ps(val3,
  146. vmadd(_mm_load_ps(filter[pi3].mCoeffs.data()),
  147. _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(3, 3, 3, 3)),
  148. _mm_load_ps(filter[pi3].mDeltas.data())))};
  149. _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
  150. r0 = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
  151. frac4 = _mm_add_epi32(frac4, increment4);
  152. pos4 = _mm_add_epi32(pos4, _mm_srli_epi32(frac4, MixerFracBits));
  153. frac4 = _mm_and_si128(frac4, fracMask4);
  154. return r0;
  155. });
  156. if(const size_t todo{dst.size()&3})
  157. {
  158. auto pos = size_t{static_cast<uint>(_mm_cvtsi128_si32(pos4))};
  159. frac = static_cast<uint>(_mm_cvtsi128_si32(frac4));
  160. auto out = dst.last(todo);
  161. std::generate(out.begin(), out.end(), [&pos,&frac,src,increment,filter]
  162. {
  163. const uint pi{frac >> CubicPhaseDiffBits}; ASSUME(pi < CubicPhaseCount);
  164. const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
  165. const __m128 pf4{_mm_set1_ps(pf)};
  166. const __m128 f4 = vmadd(_mm_load_ps(filter[pi].mCoeffs.data()), pf4,
  167. _mm_load_ps(filter[pi].mDeltas.data()));
  168. __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(&src[pos]))};
  169. r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
  170. r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
  171. const float output{_mm_cvtss_f32(r4)};
  172. frac += increment;
  173. pos += frac>>MixerFracBits;
  174. frac &= MixerFracMask;
  175. return output;
  176. });
  177. }
  178. }