SDL_audiotypecvt.c 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009
  1. /*
  2. Simple DirectMedia Layer
  3. Copyright (C) 1997-2024 Sam Lantinga <[email protected]>
  4. This software is provided 'as-is', without any express or implied
  5. warranty. In no event will the authors be held liable for any damages
  6. arising from the use of this software.
  7. Permission is granted to anyone to use this software for any purpose,
  8. including commercial applications, and to alter it and redistribute it
  9. freely, subject to the following restrictions:
  10. 1. The origin of this software must not be misrepresented; you must not
  11. claim that you wrote the original software. If you use this software
  12. in a product, an acknowledgment in the product documentation would be
  13. appreciated but is not required.
  14. 2. Altered source versions must be plainly marked as such, and must not be
  15. misrepresented as being the original software.
  16. 3. This notice may not be removed or altered from any source distribution.
  17. */
  18. #include "SDL_internal.h"
  19. #include "SDL_sysaudio.h"
// TODO: NEON is disabled until https://github.com/libsdl-org/SDL/issues/8352 can be fixed
// Because of this #undef, every `defined(SDL_NEON_INTRINSICS)` test below evaluates to false.
#undef SDL_NEON_INTRINSICS

// Decide whether the plain-C converters must be compiled.
// NEED_SCALAR_CONVERTER_FALLBACKS is set to 0 only for targets where one of the
// SIMD paths is known to be usable. Emscripten is excluded from these checks
// entirely, so it always ends up with the default of 1 assigned further down.
#ifndef SDL_PLATFORM_EMSCRIPTEN
#if defined(__x86_64__) && defined(SDL_SSE2_INTRINSICS)
#define NEED_SCALAR_CONVERTER_FALLBACKS 0 // x86_64 guarantees SSE2.
#elif defined(SDL_PLATFORM_MACOS) && defined(SDL_SSE2_INTRINSICS)
#define NEED_SCALAR_CONVERTER_FALLBACKS 0 // macOS/Intel guarantees SSE2.
#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && defined(SDL_NEON_INTRINSICS)
#define NEED_SCALAR_CONVERTER_FALLBACKS 0 // ARMv8+ promise NEON.
#elif defined(SDL_PLATFORM_APPLE) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && defined(SDL_NEON_INTRINSICS)
#define NEED_SCALAR_CONVERTER_FALLBACKS 0 // All Apple ARMv7 chips promise NEON support.
#endif
#endif /* SDL_PLATFORM_EMSCRIPTEN */

// Set to zero if platform is guaranteed to use a SIMD codepath here.
// No branch above matched, so default to building the scalar converters.
#if !defined(NEED_SCALAR_CONVERTER_FALLBACKS)
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif

// Multiplier used by the S32 -> F32 converters: 1 / 2147483648 (2^-31).
#define DIVBY2147483648 0.0000000004656612873077392578125f // 0x1p-31f
  38. #if NEED_SCALAR_CONVERTER_FALLBACKS
// This code requires that floats are in the IEEE-754 binary32 format
// The assert only checks the size; the bit layout itself is assumed.
SDL_COMPILE_TIME_ASSERT(float_bits, sizeof(float) == sizeof(Uint32));

// Gives the scalar converters access to the raw bit pattern of a float:
// write one member, read the other.
union float_bits {
    Uint32 u32; // the value viewed as a 32-bit pattern
    float f32;  // the same storage viewed as a float
};
  45. static void SDL_Convert_S8_to_F32_Scalar(float *dst, const Sint8 *src, int num_samples)
  46. {
  47. int i;
  48. LOG_DEBUG_AUDIO_CONVERT("S8", "F32");
  49. for (i = num_samples - 1; i >= 0; --i) {
  50. /* 1) Construct a float in the range [65536.0, 65538.0)
  51. * 2) Shift the float range to [-1.0, 1.0) */
  52. union float_bits x;
  53. x.u32 = (Uint8)src[i] ^ 0x47800080u;
  54. dst[i] = x.f32 - 65537.0f;
  55. }
  56. }
  57. static void SDL_Convert_U8_to_F32_Scalar(float *dst, const Uint8 *src, int num_samples)
  58. {
  59. int i;
  60. LOG_DEBUG_AUDIO_CONVERT("U8", "F32");
  61. for (i = num_samples - 1; i >= 0; --i) {
  62. /* 1) Construct a float in the range [65536.0, 65538.0)
  63. * 2) Shift the float range to [-1.0, 1.0) */
  64. union float_bits x;
  65. x.u32 = src[i] ^ 0x47800000u;
  66. dst[i] = x.f32 - 65537.0f;
  67. }
  68. }
  69. static void SDL_Convert_S16_to_F32_Scalar(float *dst, const Sint16 *src, int num_samples)
  70. {
  71. int i;
  72. LOG_DEBUG_AUDIO_CONVERT("S16", "F32");
  73. for (i = num_samples - 1; i >= 0; --i) {
  74. /* 1) Construct a float in the range [256.0, 258.0)
  75. * 2) Shift the float range to [-1.0, 1.0) */
  76. union float_bits x;
  77. x.u32 = (Uint16)src[i] ^ 0x43808000u;
  78. dst[i] = x.f32 - 257.0f;
  79. }
  80. }
  81. static void SDL_Convert_S32_to_F32_Scalar(float *dst, const Sint32 *src, int num_samples)
  82. {
  83. int i;
  84. LOG_DEBUG_AUDIO_CONVERT("S32", "F32");
  85. for (i = num_samples - 1; i >= 0; --i) {
  86. dst[i] = (float)src[i] * DIVBY2147483648;
  87. }
  88. }
// Create a bit-mask based on the sign-bit. Should optimize to a single arithmetic-shift-right
// Evaluates to 0xFFFFFFFF when bit 31 of x is set and to 0 otherwise.
// Done in unsigned arithmetic (0 - 1 wraps to all ones), so no signed shift is involved.
#define SIGNMASK(x) (Uint32)(0u - ((Uint32)(x) >> 31))
  91. static void SDL_Convert_F32_to_S8_Scalar(Sint8 *dst, const float *src, int num_samples)
  92. {
  93. int i;
  94. LOG_DEBUG_AUDIO_CONVERT("F32", "S8");
  95. for (i = 0; i < num_samples; ++i) {
  96. /* 1) Shift the float range from [-1.0, 1.0] to [98303.0, 98305.0]
  97. * 2) Shift the integer range from [0x47BFFF80, 0x47C00080] to [-128, 128]
  98. * 3) Clamp the value to [-128, 127] */
  99. union float_bits x;
  100. x.f32 = src[i] + 98304.0f;
  101. Uint32 y = x.u32 - 0x47C00000u;
  102. Uint32 z = 0x7Fu - (y ^ SIGNMASK(y));
  103. y = y ^ (z & SIGNMASK(z));
  104. dst[i] = (Sint8)(y & 0xFF);
  105. }
  106. }
  107. static void SDL_Convert_F32_to_U8_Scalar(Uint8 *dst, const float *src, int num_samples)
  108. {
  109. int i;
  110. LOG_DEBUG_AUDIO_CONVERT("F32", "U8");
  111. for (i = 0; i < num_samples; ++i) {
  112. /* 1) Shift the float range from [-1.0, 1.0] to [98303.0, 98305.0]
  113. * 2) Shift the integer range from [0x47BFFF80, 0x47C00080] to [-128, 128]
  114. * 3) Clamp the value to [-128, 127]
  115. * 4) Shift the integer range from [-128, 127] to [0, 255] */
  116. union float_bits x;
  117. x.f32 = src[i] + 98304.0f;
  118. Uint32 y = x.u32 - 0x47C00000u;
  119. Uint32 z = 0x7Fu - (y ^ SIGNMASK(y));
  120. y = (y ^ 0x80u) ^ (z & SIGNMASK(z));
  121. dst[i] = (Uint8)(y & 0xFF);
  122. }
  123. }
  124. static void SDL_Convert_F32_to_S16_Scalar(Sint16 *dst, const float *src, int num_samples)
  125. {
  126. int i;
  127. LOG_DEBUG_AUDIO_CONVERT("F32", "S16");
  128. for (i = 0; i < num_samples; ++i) {
  129. /* 1) Shift the float range from [-1.0, 1.0] to [383.0, 385.0]
  130. * 2) Shift the integer range from [0x43BF8000, 0x43C08000] to [-32768, 32768]
  131. * 3) Clamp values outside the [-32768, 32767] range */
  132. union float_bits x;
  133. x.f32 = src[i] + 384.0f;
  134. Uint32 y = x.u32 - 0x43C00000u;
  135. Uint32 z = 0x7FFFu - (y ^ SIGNMASK(y));
  136. y = y ^ (z & SIGNMASK(z));
  137. dst[i] = (Sint16)(y & 0xFFFF);
  138. }
  139. }
  140. static void SDL_Convert_F32_to_S32_Scalar(Sint32 *dst, const float *src, int num_samples)
  141. {
  142. int i;
  143. LOG_DEBUG_AUDIO_CONVERT("F32", "S32");
  144. for (i = 0; i < num_samples; ++i) {
  145. /* 1) Shift the float range from [-1.0, 1.0] to [-2147483648.0, 2147483648.0]
  146. * 2) Set values outside the [-2147483648.0, 2147483647.0] range to -2147483648.0
  147. * 3) Convert the float to an integer, and fixup values outside the valid range */
  148. union float_bits x;
  149. x.f32 = src[i];
  150. Uint32 y = x.u32 + 0x0F800000u;
  151. Uint32 z = y - 0xCF000000u;
  152. z &= SIGNMASK(y ^ z);
  153. x.u32 = y - z;
  154. dst[i] = (Sint32)x.f32 ^ (Sint32)SIGNMASK(z);
  155. }
  156. }
  157. #undef SIGNMASK
  158. #endif // NEED_SCALAR_CONVERTER_FALLBACKS
  159. #ifdef SDL_SSE2_INTRINSICS
  160. static void SDL_TARGETING("sse2") SDL_Convert_S8_to_F32_SSE2(float *dst, const Sint8 *src, int num_samples)
  161. {
  162. int i = num_samples;
  163. /* 1) Flip the sign bit to convert from S8 to U8 format
  164. * 2) Construct a float in the range [65536.0, 65538.0)
  165. * 3) Shift the float range to [-1.0, 1.0)
  166. * dst[i] = i2f((src[i] ^ 0x80) | 0x47800000) - 65537.0 */
  167. const __m128i zero = _mm_setzero_si128();
  168. const __m128i flipper = _mm_set1_epi8(-0x80);
  169. const __m128i caster = _mm_set1_epi16(0x4780 /* 0x47800000 = f2i(65536.0) */);
  170. const __m128 offset = _mm_set1_ps(-65537.0);
  171. LOG_DEBUG_AUDIO_CONVERT("S8", "F32 (using SSE2)");
  172. while (i >= 16) {
  173. i -= 16;
  174. const __m128i bytes = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i]), flipper);
  175. const __m128i shorts1 = _mm_unpacklo_epi8(bytes, zero);
  176. const __m128i shorts2 = _mm_unpackhi_epi8(bytes, zero);
  177. const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
  178. const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
  179. const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts2, caster)), offset);
  180. const __m128 floats4 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts2, caster)), offset);
  181. _mm_storeu_ps(&dst[i], floats1);
  182. _mm_storeu_ps(&dst[i + 4], floats2);
  183. _mm_storeu_ps(&dst[i + 8], floats3);
  184. _mm_storeu_ps(&dst[i + 12], floats4);
  185. }
  186. while (i) {
  187. --i;
  188. _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint8)src[i] ^ 0x47800080u)), offset));
  189. }
  190. }
  191. static void SDL_TARGETING("sse2") SDL_Convert_U8_to_F32_SSE2(float *dst, const Uint8 *src, int num_samples)
  192. {
  193. int i = num_samples;
  194. /* 1) Construct a float in the range [65536.0, 65538.0)
  195. * 2) Shift the float range to [-1.0, 1.0)
  196. * dst[i] = i2f(src[i] | 0x47800000) - 65537.0 */
  197. const __m128i zero = _mm_setzero_si128();
  198. const __m128i caster = _mm_set1_epi16(0x4780 /* 0x47800000 = f2i(65536.0) */);
  199. const __m128 offset = _mm_set1_ps(-65537.0);
  200. LOG_DEBUG_AUDIO_CONVERT("U8", "F32 (using SSE2)");
  201. while (i >= 16) {
  202. i -= 16;
  203. const __m128i bytes = _mm_loadu_si128((const __m128i *)&src[i]);
  204. const __m128i shorts1 = _mm_unpacklo_epi8(bytes, zero);
  205. const __m128i shorts2 = _mm_unpackhi_epi8(bytes, zero);
  206. const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
  207. const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
  208. const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts2, caster)), offset);
  209. const __m128 floats4 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts2, caster)), offset);
  210. _mm_storeu_ps(&dst[i], floats1);
  211. _mm_storeu_ps(&dst[i + 4], floats2);
  212. _mm_storeu_ps(&dst[i + 8], floats3);
  213. _mm_storeu_ps(&dst[i + 12], floats4);
  214. }
  215. while (i) {
  216. --i;
  217. _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint8)src[i] ^ 0x47800000u)), offset));
  218. }
  219. }
  220. static void SDL_TARGETING("sse2") SDL_Convert_S16_to_F32_SSE2(float *dst, const Sint16 *src, int num_samples)
  221. {
  222. int i = num_samples;
  223. /* 1) Flip the sign bit to convert from S16 to U16 format
  224. * 2) Construct a float in the range [256.0, 258.0)
  225. * 3) Shift the float range to [-1.0, 1.0)
  226. * dst[i] = i2f((src[i] ^ 0x8000) | 0x43800000) - 257.0 */
  227. const __m128i flipper = _mm_set1_epi16(-0x8000);
  228. const __m128i caster = _mm_set1_epi16(0x4380 /* 0x43800000 = f2i(256.0) */);
  229. const __m128 offset = _mm_set1_ps(-257.0f);
  230. LOG_DEBUG_AUDIO_CONVERT("S16", "F32 (using SSE2)");
  231. while (i >= 16) {
  232. i -= 16;
  233. const __m128i shorts1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i]), flipper);
  234. const __m128i shorts2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i + 8]), flipper);
  235. const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
  236. const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
  237. const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts2, caster)), offset);
  238. const __m128 floats4 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts2, caster)), offset);
  239. _mm_storeu_ps(&dst[i], floats1);
  240. _mm_storeu_ps(&dst[i + 4], floats2);
  241. _mm_storeu_ps(&dst[i + 8], floats3);
  242. _mm_storeu_ps(&dst[i + 12], floats4);
  243. }
  244. while (i) {
  245. --i;
  246. _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint16)src[i] ^ 0x43808000u)), offset));
  247. }
  248. }
  249. static void SDL_TARGETING("sse2") SDL_Convert_S32_to_F32_SSE2(float *dst, const Sint32 *src, int num_samples)
  250. {
  251. int i = num_samples;
  252. // dst[i] = f32(src[i]) / f32(0x80000000)
  253. const __m128 scaler = _mm_set1_ps(DIVBY2147483648);
  254. LOG_DEBUG_AUDIO_CONVERT("S32", "F32 (using SSE2)");
  255. while (i >= 16) {
  256. i -= 16;
  257. const __m128i ints1 = _mm_loadu_si128((const __m128i *)&src[i]);
  258. const __m128i ints2 = _mm_loadu_si128((const __m128i *)&src[i + 4]);
  259. const __m128i ints3 = _mm_loadu_si128((const __m128i *)&src[i + 8]);
  260. const __m128i ints4 = _mm_loadu_si128((const __m128i *)&src[i + 12]);
  261. const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(ints1), scaler);
  262. const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(ints2), scaler);
  263. const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(ints3), scaler);
  264. const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(ints4), scaler);
  265. _mm_storeu_ps(&dst[i], floats1);
  266. _mm_storeu_ps(&dst[i + 4], floats2);
  267. _mm_storeu_ps(&dst[i + 8], floats3);
  268. _mm_storeu_ps(&dst[i + 12], floats4);
  269. }
  270. while (i) {
  271. --i;
  272. _mm_store_ss(&dst[i], _mm_mul_ss(_mm_cvt_si2ss(_mm_setzero_ps(), src[i]), scaler));
  273. }
  274. }
  275. static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S8_SSE2(Sint8 *dst, const float *src, int num_samples)
  276. {
  277. int i = num_samples;
  278. /* 1) Shift the float range from [-1.0, 1.0] to [98303.0, 98305.0]
  279. * 2) Extract the lowest 16 bits and clamp to [-128, 127]
  280. * Overflow is correctly handled for inputs between roughly [-255.0, 255.0]
  281. * dst[i] = clamp(i16(f2i(src[i] + 98304.0) & 0xFFFF), -128, 127) */
  282. const __m128 offset = _mm_set1_ps(98304.0f);
  283. const __m128i mask = _mm_set1_epi16(0xFF);
  284. LOG_DEBUG_AUDIO_CONVERT("F32", "S8 (using SSE2)");
  285. while (i >= 16) {
  286. const __m128 floats1 = _mm_loadu_ps(&src[0]);
  287. const __m128 floats2 = _mm_loadu_ps(&src[4]);
  288. const __m128 floats3 = _mm_loadu_ps(&src[8]);
  289. const __m128 floats4 = _mm_loadu_ps(&src[12]);
  290. const __m128i ints1 = _mm_castps_si128(_mm_add_ps(floats1, offset));
  291. const __m128i ints2 = _mm_castps_si128(_mm_add_ps(floats2, offset));
  292. const __m128i ints3 = _mm_castps_si128(_mm_add_ps(floats3, offset));
  293. const __m128i ints4 = _mm_castps_si128(_mm_add_ps(floats4, offset));
  294. const __m128i shorts1 = _mm_and_si128(_mm_packs_epi16(ints1, ints2), mask);
  295. const __m128i shorts2 = _mm_and_si128(_mm_packs_epi16(ints3, ints4), mask);
  296. const __m128i bytes = _mm_packus_epi16(shorts1, shorts2);
  297. _mm_storeu_si128((__m128i*)dst, bytes);
  298. i -= 16;
  299. src += 16;
  300. dst += 16;
  301. }
  302. while (i) {
  303. const __m128i ints = _mm_castps_si128(_mm_add_ss(_mm_load_ss(src), offset));
  304. *dst = (Sint8)(_mm_cvtsi128_si32(_mm_packs_epi16(ints, ints)) & 0xFF);
  305. --i;
  306. ++src;
  307. ++dst;
  308. }
  309. }
  310. static void SDL_TARGETING("sse2") SDL_Convert_F32_to_U8_SSE2(Uint8 *dst, const float *src, int num_samples)
  311. {
  312. int i = num_samples;
  313. /* 1) Shift the float range from [-1.0, 1.0] to [98304.0, 98306.0]
  314. * 2) Extract the lowest 16 bits and clamp to [0, 255]
  315. * Overflow is correctly handled for inputs between roughly [-254.0, 254.0]
  316. * dst[i] = clamp(i16(f2i(src[i] + 98305.0) & 0xFFFF), 0, 255) */
  317. const __m128 offset = _mm_set1_ps(98305.0f);
  318. const __m128i mask = _mm_set1_epi16(0xFF);
  319. LOG_DEBUG_AUDIO_CONVERT("F32", "U8 (using SSE2)");
  320. while (i >= 16) {
  321. const __m128 floats1 = _mm_loadu_ps(&src[0]);
  322. const __m128 floats2 = _mm_loadu_ps(&src[4]);
  323. const __m128 floats3 = _mm_loadu_ps(&src[8]);
  324. const __m128 floats4 = _mm_loadu_ps(&src[12]);
  325. const __m128i ints1 = _mm_castps_si128(_mm_add_ps(floats1, offset));
  326. const __m128i ints2 = _mm_castps_si128(_mm_add_ps(floats2, offset));
  327. const __m128i ints3 = _mm_castps_si128(_mm_add_ps(floats3, offset));
  328. const __m128i ints4 = _mm_castps_si128(_mm_add_ps(floats4, offset));
  329. const __m128i shorts1 = _mm_and_si128(_mm_packus_epi16(ints1, ints2), mask);
  330. const __m128i shorts2 = _mm_and_si128(_mm_packus_epi16(ints3, ints4), mask);
  331. const __m128i bytes = _mm_packus_epi16(shorts1, shorts2);
  332. _mm_storeu_si128((__m128i*)dst, bytes);
  333. i -= 16;
  334. src += 16;
  335. dst += 16;
  336. }
  337. while (i) {
  338. const __m128i ints = _mm_castps_si128(_mm_add_ss(_mm_load_ss(src), offset));
  339. *dst = (Uint8)(_mm_cvtsi128_si32(_mm_packus_epi16(ints, ints)) & 0xFF);
  340. --i;
  341. ++src;
  342. ++dst;
  343. }
  344. }
  345. static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S16_SSE2(Sint16 *dst, const float *src, int num_samples)
  346. {
  347. int i = num_samples;
  348. /* 1) Shift the float range from [-1.0, 1.0] to [256.0, 258.0]
  349. * 2) Shift the int range from [0x43800000, 0x43810000] to [-32768,32768]
  350. * 3) Clamp to range [-32768,32767]
  351. * Overflow is correctly handled for inputs between roughly [-257.0, +inf)
  352. * dst[i] = clamp(f2i(src[i] + 257.0) - 0x43808000, -32768, 32767) */
  353. const __m128 offset = _mm_set1_ps(257.0f);
  354. LOG_DEBUG_AUDIO_CONVERT("F32", "S16 (using SSE2)");
  355. while (i >= 16) {
  356. const __m128 floats1 = _mm_loadu_ps(&src[0]);
  357. const __m128 floats2 = _mm_loadu_ps(&src[4]);
  358. const __m128 floats3 = _mm_loadu_ps(&src[8]);
  359. const __m128 floats4 = _mm_loadu_ps(&src[12]);
  360. const __m128i ints1 = _mm_sub_epi32(_mm_castps_si128(_mm_add_ps(floats1, offset)), _mm_castps_si128(offset));
  361. const __m128i ints2 = _mm_sub_epi32(_mm_castps_si128(_mm_add_ps(floats2, offset)), _mm_castps_si128(offset));
  362. const __m128i ints3 = _mm_sub_epi32(_mm_castps_si128(_mm_add_ps(floats3, offset)), _mm_castps_si128(offset));
  363. const __m128i ints4 = _mm_sub_epi32(_mm_castps_si128(_mm_add_ps(floats4, offset)), _mm_castps_si128(offset));
  364. const __m128i shorts1 = _mm_packs_epi32(ints1, ints2);
  365. const __m128i shorts2 = _mm_packs_epi32(ints3, ints4);
  366. _mm_storeu_si128((__m128i*)&dst[0], shorts1);
  367. _mm_storeu_si128((__m128i*)&dst[8], shorts2);
  368. i -= 16;
  369. src += 16;
  370. dst += 16;
  371. }
  372. while (i) {
  373. const __m128i ints = _mm_sub_epi32(_mm_castps_si128(_mm_add_ss(_mm_load_ss(src), offset)), _mm_castps_si128(offset));
  374. *dst = (Sint16)(_mm_cvtsi128_si32(_mm_packs_epi32(ints, ints)) & 0xFFFF);
  375. --i;
  376. ++src;
  377. ++dst;
  378. }
  379. }
  380. static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S32_SSE2(Sint32 *dst, const float *src, int num_samples)
  381. {
  382. int i = num_samples;
  383. /* 1) Scale the float range from [-1.0, 1.0] to [-2147483648.0, 2147483648.0]
  384. * 2) Convert to integer (values too small/large become 0x80000000 = -2147483648)
  385. * 3) Fixup values which were too large (0x80000000 ^ 0xFFFFFFFF = 2147483647)
  386. * dst[i] = i32(src[i] * 2147483648.0) ^ ((src[i] >= 2147483648.0) ? 0xFFFFFFFF : 0x00000000) */
  387. const __m128 limit = _mm_set1_ps(2147483648.0f);
  388. LOG_DEBUG_AUDIO_CONVERT("F32", "S32 (using SSE2)");
  389. while (i >= 16) {
  390. const __m128 floats1 = _mm_loadu_ps(&src[0]);
  391. const __m128 floats2 = _mm_loadu_ps(&src[4]);
  392. const __m128 floats3 = _mm_loadu_ps(&src[8]);
  393. const __m128 floats4 = _mm_loadu_ps(&src[12]);
  394. const __m128 values1 = _mm_mul_ps(floats1, limit);
  395. const __m128 values2 = _mm_mul_ps(floats2, limit);
  396. const __m128 values3 = _mm_mul_ps(floats3, limit);
  397. const __m128 values4 = _mm_mul_ps(floats4, limit);
  398. const __m128i ints1 = _mm_xor_si128(_mm_cvttps_epi32(values1), _mm_castps_si128(_mm_cmpge_ps(values1, limit)));
  399. const __m128i ints2 = _mm_xor_si128(_mm_cvttps_epi32(values2), _mm_castps_si128(_mm_cmpge_ps(values2, limit)));
  400. const __m128i ints3 = _mm_xor_si128(_mm_cvttps_epi32(values3), _mm_castps_si128(_mm_cmpge_ps(values3, limit)));
  401. const __m128i ints4 = _mm_xor_si128(_mm_cvttps_epi32(values4), _mm_castps_si128(_mm_cmpge_ps(values4, limit)));
  402. _mm_storeu_si128((__m128i*)&dst[0], ints1);
  403. _mm_storeu_si128((__m128i*)&dst[4], ints2);
  404. _mm_storeu_si128((__m128i*)&dst[8], ints3);
  405. _mm_storeu_si128((__m128i*)&dst[12], ints4);
  406. i -= 16;
  407. src += 16;
  408. dst += 16;
  409. }
  410. while (i) {
  411. const __m128 floats = _mm_load_ss(src);
  412. const __m128 values = _mm_mul_ss(floats, limit);
  413. const __m128i ints = _mm_xor_si128(_mm_cvttps_epi32(values), _mm_castps_si128(_mm_cmpge_ss(values, limit)));
  414. *dst = (Sint32)_mm_cvtsi128_si32(ints);
  415. --i;
  416. ++src;
  417. ++dst;
  418. }
  419. }
  420. #endif
  421. #ifdef SDL_NEON_INTRINSICS
// Scale factors used by the NEON converters to normalize integer samples.
#define DIVBY128 0.0078125f // 0x1p-7f
#define DIVBY32768 0.000030517578125f // 0x1p-15f
// Note: this is the reciprocal of 8388607 (2^23 - 1), not of 2^23.
#define DIVBY8388607 0.00000011920930376163766f // 0x1.000002p-23f
  425. static void SDL_Convert_S8_to_F32_NEON(float *dst, const Sint8 *src, int num_samples)
  426. {
  427. int i;
  428. LOG_DEBUG_AUDIO_CONVERT("S8", "F32 (using NEON)");
  429. src += num_samples - 1;
  430. dst += num_samples - 1;
  431. // Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src)
  432. for (i = num_samples; i && (((size_t)(dst - 15)) & 15); --i, --src, --dst) {
  433. *dst = ((float)*src) * DIVBY128;
  434. }
  435. src -= 15;
  436. dst -= 15; // adjust to read NEON blocks from the start.
  437. SDL_assert(!i || !(((size_t)dst) & 15));
  438. // Make sure src is aligned too.
  439. if (!(((size_t)src) & 15)) {
  440. // Aligned! Do NEON blocks as long as we have 16 bytes available.
  441. const int8_t *mmsrc = (const int8_t *)src;
  442. const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
  443. while (i >= 16) { // 16 * 8-bit
  444. const int8x16_t bytes = vld1q_s8(mmsrc); // get 16 sint8 into a NEON register.
  445. const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes)); // convert top 8 bytes to 8 int16
  446. const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes)); // convert bottom 8 bytes to 8 int16
  447. // split int16 to two int32, then convert to float, then multiply to normalize, store.
  448. vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128));
  449. vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128));
  450. vst1q_f32(dst + 8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128));
  451. vst1q_f32(dst + 12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128));
  452. i -= 16;
  453. mmsrc -= 16;
  454. dst -= 16;
  455. }
  456. src = (const Sint8 *)mmsrc;
  457. }
  458. src += 15;
  459. dst += 15; // adjust for any scalar finishing.
  460. // Finish off any leftovers with scalar operations.
  461. while (i) {
  462. *dst = ((float)*src) * DIVBY128;
  463. i--;
  464. src--;
  465. dst--;
  466. }
  467. }
  468. static void SDL_Convert_U8_to_F32_NEON(float *dst, const Uint8 *src, int num_samples)
  469. {
  470. int i;
  471. LOG_DEBUG_AUDIO_CONVERT("U8", "F32 (using NEON)");
  472. src += num_samples - 1;
  473. dst += num_samples - 1;
  474. // Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src)
  475. for (i = num_samples; i && (((size_t)(dst - 15)) & 15); --i, --src, --dst) {
  476. *dst = (((float)*src) * DIVBY128) - 1.0f;
  477. }
  478. src -= 15;
  479. dst -= 15; // adjust to read NEON blocks from the start.
  480. SDL_assert(!i || !(((size_t)dst) & 15));
  481. // Make sure src is aligned too.
  482. if (!(((size_t)src) & 15)) {
  483. // Aligned! Do NEON blocks as long as we have 16 bytes available.
  484. const uint8_t *mmsrc = (const uint8_t *)src;
  485. const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
  486. const float32x4_t negone = vdupq_n_f32(-1.0f);
  487. while (i >= 16) { // 16 * 8-bit
  488. const uint8x16_t bytes = vld1q_u8(mmsrc); // get 16 uint8 into a NEON register.
  489. const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); // convert top 8 bytes to 8 uint16
  490. const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); // convert bottom 8 bytes to 8 uint16
  491. // split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store.
  492. vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
  493. vst1q_f32(dst + 4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
  494. vst1q_f32(dst + 8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
  495. vst1q_f32(dst + 12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
  496. i -= 16;
  497. mmsrc -= 16;
  498. dst -= 16;
  499. }
  500. src = (const Uint8 *)mmsrc;
  501. }
  502. src += 15;
  503. dst += 15; // adjust for any scalar finishing.
  504. // Finish off any leftovers with scalar operations.
  505. while (i) {
  506. *dst = (((float)*src) * DIVBY128) - 1.0f;
  507. i--;
  508. src--;
  509. dst--;
  510. }
  511. }
  512. static void SDL_Convert_S16_to_F32_NEON(float *dst, const Sint16 *src, int num_samples)
  513. {
  514. int i;
  515. LOG_DEBUG_AUDIO_CONVERT("S16", "F32 (using NEON)");
  516. src += num_samples - 1;
  517. dst += num_samples - 1;
  518. // Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src)
  519. for (i = num_samples; i && (((size_t)(dst - 7)) & 15); --i, --src, --dst) {
  520. *dst = ((float)*src) * DIVBY32768;
  521. }
  522. src -= 7;
  523. dst -= 7; // adjust to read NEON blocks from the start.
  524. SDL_assert(!i || !(((size_t)dst) & 15));
  525. // Make sure src is aligned too.
  526. if (!(((size_t)src) & 15)) {
  527. // Aligned! Do NEON blocks as long as we have 16 bytes available.
  528. const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
  529. while (i >= 8) { // 8 * 16-bit
  530. const int16x8_t ints = vld1q_s16((int16_t const *)src); // get 8 sint16 into a NEON register.
  531. // split int16 to two int32, then convert to float, then multiply to normalize, store.
  532. vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
  533. vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
  534. i -= 8;
  535. src -= 8;
  536. dst -= 8;
  537. }
  538. }
  539. src += 7;
  540. dst += 7; // adjust for any scalar finishing.
  541. // Finish off any leftovers with scalar operations.
  542. while (i) {
  543. *dst = ((float)*src) * DIVBY32768;
  544. i--;
  545. src--;
  546. dst--;
  547. }
  548. }
  549. static void SDL_Convert_S32_to_F32_NEON(float *dst, const Sint32 *src, int num_samples)
  550. {
  551. int i;
  552. LOG_DEBUG_AUDIO_CONVERT("S32", "F32 (using NEON)");
  553. // Get dst aligned to 16 bytes
  554. for (i = num_samples; i && (((size_t)dst) & 15); --i, ++src, ++dst) {
  555. *dst = ((float)(*src >> 8)) * DIVBY8388607;
  556. }
  557. SDL_assert(!i || !(((size_t)dst) & 15));
  558. // Make sure src is aligned too.
  559. if (!(((size_t)src) & 15)) {
  560. // Aligned! Do NEON blocks as long as we have 16 bytes available.
  561. const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
  562. const int32_t *mmsrc = (const int32_t *)src;
  563. while (i >= 4) { // 4 * sint32
  564. // shift out lowest bits so int fits in a float32. Small precision loss, but much faster.
  565. vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
  566. i -= 4;
  567. mmsrc += 4;
  568. dst += 4;
  569. }
  570. src = (const Sint32 *)mmsrc;
  571. }
  572. // Finish off any leftovers with scalar operations.
  573. while (i) {
  574. *dst = ((float)(*src >> 8)) * DIVBY8388607;
  575. i--;
  576. src++;
  577. dst++;
  578. }
  579. }
  580. static void SDL_Convert_F32_to_S8_NEON(Sint8 *dst, const float *src, int num_samples)
  581. {
  582. int i;
  583. LOG_DEBUG_AUDIO_CONVERT("F32", "S8 (using NEON)");
  584. // Get dst aligned to 16 bytes
  585. for (i = num_samples; i && (((size_t)dst) & 15); --i, ++src, ++dst) {
  586. const float sample = *src;
  587. if (sample >= 1.0f) {
  588. *dst = 127;
  589. } else if (sample <= -1.0f) {
  590. *dst = -128;
  591. } else {
  592. *dst = (Sint8)(sample * 127.0f);
  593. }
  594. }
  595. SDL_assert(!i || !(((size_t)dst) & 15));
  596. // Make sure src is aligned too.
  597. if (!(((size_t)src) & 15)) {
  598. // Aligned! Do NEON blocks as long as we have 16 bytes available.
  599. const float32x4_t one = vdupq_n_f32(1.0f);
  600. const float32x4_t negone = vdupq_n_f32(-1.0f);
  601. const float32x4_t mulby127 = vdupq_n_f32(127.0f);
  602. int8_t *mmdst = (int8_t *)dst;
  603. while (i >= 16) { // 16 * float32
  604. const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127)); // load 4 floats, clamp, convert to sint32
  605. const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 4)), one), mulby127)); // load 4 floats, clamp, convert to sint32
  606. const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 8)), one), mulby127)); // load 4 floats, clamp, convert to sint32
  607. const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 12)), one), mulby127)); // load 4 floats, clamp, convert to sint32
  608. const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); // narrow to sint16, combine, narrow to sint8
  609. const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); // narrow to sint16, combine, narrow to sint8
  610. vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi)); // combine to int8x16_t, store out
  611. i -= 16;
  612. src += 16;
  613. mmdst += 16;
  614. }
  615. dst = (Sint8 *)mmdst;
  616. }
  617. // Finish off any leftovers with scalar operations.
  618. while (i) {
  619. const float sample = *src;
  620. if (sample >= 1.0f) {
  621. *dst = 127;
  622. } else if (sample <= -1.0f) {
  623. *dst = -128;
  624. } else {
  625. *dst = (Sint8)(sample * 127.0f);
  626. }
  627. i--;
  628. src++;
  629. dst++;
  630. }
  631. }
  632. static void SDL_Convert_F32_to_U8_NEON(Uint8 *dst, const float *src, int num_samples)
  633. {
  634. int i;
  635. LOG_DEBUG_AUDIO_CONVERT("F32", "U8 (using NEON)");
  636. // Get dst aligned to 16 bytes
  637. for (i = num_samples; i && (((size_t)dst) & 15); --i, ++src, ++dst) {
  638. const float sample = *src;
  639. if (sample >= 1.0f) {
  640. *dst = 255;
  641. } else if (sample <= -1.0f) {
  642. *dst = 0;
  643. } else {
  644. *dst = (Uint8)((sample + 1.0f) * 127.0f);
  645. }
  646. }
  647. SDL_assert(!i || !(((size_t)dst) & 15));
  648. // Make sure src is aligned too.
  649. if (!(((size_t)src) & 15)) {
  650. // Aligned! Do NEON blocks as long as we have 16 bytes available.
  651. const float32x4_t one = vdupq_n_f32(1.0f);
  652. const float32x4_t negone = vdupq_n_f32(-1.0f);
  653. const float32x4_t mulby127 = vdupq_n_f32(127.0f);
  654. uint8_t *mmdst = (uint8_t *)dst;
  655. while (i >= 16) { // 16 * float32
  656. const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127)); // load 4 floats, clamp, convert to uint32
  657. const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 4)), one), one), mulby127)); // load 4 floats, clamp, convert to uint32
  658. const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 8)), one), one), mulby127)); // load 4 floats, clamp, convert to uint32
  659. const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 12)), one), one), mulby127)); // load 4 floats, clamp, convert to uint32
  660. const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); // narrow to uint16, combine, narrow to uint8
  661. const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); // narrow to uint16, combine, narrow to uint8
  662. vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi)); // combine to uint8x16_t, store out
  663. i -= 16;
  664. src += 16;
  665. mmdst += 16;
  666. }
  667. dst = (Uint8 *)mmdst;
  668. }
  669. // Finish off any leftovers with scalar operations.
  670. while (i) {
  671. const float sample = *src;
  672. if (sample >= 1.0f) {
  673. *dst = 255;
  674. } else if (sample <= -1.0f) {
  675. *dst = 0;
  676. } else {
  677. *dst = (Uint8)((sample + 1.0f) * 127.0f);
  678. }
  679. i--;
  680. src++;
  681. dst++;
  682. }
  683. }
  684. static void SDL_Convert_F32_to_S16_NEON(Sint16 *dst, const float *src, int num_samples)
  685. {
  686. int i;
  687. LOG_DEBUG_AUDIO_CONVERT("F32", "S16 (using NEON)");
  688. // Get dst aligned to 16 bytes
  689. for (i = num_samples; i && (((size_t)dst) & 15); --i, ++src, ++dst) {
  690. const float sample = *src;
  691. if (sample >= 1.0f) {
  692. *dst = 32767;
  693. } else if (sample <= -1.0f) {
  694. *dst = -32768;
  695. } else {
  696. *dst = (Sint16)(sample * 32767.0f);
  697. }
  698. }
  699. SDL_assert(!i || !(((size_t)dst) & 15));
  700. // Make sure src is aligned too.
  701. if (!(((size_t)src) & 15)) {
  702. // Aligned! Do NEON blocks as long as we have 16 bytes available.
  703. const float32x4_t one = vdupq_n_f32(1.0f);
  704. const float32x4_t negone = vdupq_n_f32(-1.0f);
  705. const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
  706. int16_t *mmdst = (int16_t *)dst;
  707. while (i >= 8) { // 8 * float32
  708. const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767)); // load 4 floats, clamp, convert to sint32
  709. const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 4)), one), mulby32767)); // load 4 floats, clamp, convert to sint32
  710. vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); // narrow to sint16, combine, store out.
  711. i -= 8;
  712. src += 8;
  713. mmdst += 8;
  714. }
  715. dst = (Sint16 *)mmdst;
  716. }
  717. // Finish off any leftovers with scalar operations.
  718. while (i) {
  719. const float sample = *src;
  720. if (sample >= 1.0f) {
  721. *dst = 32767;
  722. } else if (sample <= -1.0f) {
  723. *dst = -32768;
  724. } else {
  725. *dst = (Sint16)(sample * 32767.0f);
  726. }
  727. i--;
  728. src++;
  729. dst++;
  730. }
  731. }
  732. static void SDL_Convert_F32_to_S32_NEON(Sint32 *dst, const float *src, int num_samples)
  733. {
  734. int i;
  735. LOG_DEBUG_AUDIO_CONVERT("F32", "S32 (using NEON)");
  736. // Get dst aligned to 16 bytes
  737. for (i = num_samples; i && (((size_t)dst) & 15); --i, ++src, ++dst) {
  738. const float sample = *src;
  739. if (sample >= 1.0f) {
  740. *dst = 2147483647;
  741. } else if (sample <= -1.0f) {
  742. *dst = (-2147483647) - 1;
  743. } else {
  744. *dst = ((Sint32)(sample * 8388607.0f)) << 8;
  745. }
  746. }
  747. SDL_assert(!i || !(((size_t)dst) & 15));
  748. SDL_assert(!i || !(((size_t)src) & 15));
  749. {
  750. // Aligned! Do NEON blocks as long as we have 16 bytes available.
  751. const float32x4_t one = vdupq_n_f32(1.0f);
  752. const float32x4_t negone = vdupq_n_f32(-1.0f);
  753. const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
  754. int32_t *mmdst = (int32_t *)dst;
  755. while (i >= 4) { // 4 * float32
  756. vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
  757. i -= 4;
  758. src += 4;
  759. mmdst += 4;
  760. }
  761. dst = (Sint32 *)mmdst;
  762. }
  763. // Finish off any leftovers with scalar operations.
  764. while (i) {
  765. const float sample = *src;
  766. if (sample >= 1.0f) {
  767. *dst = 2147483647;
  768. } else if (sample <= -1.0f) {
  769. *dst = (-2147483647) - 1;
  770. } else {
  771. *dst = ((Sint32)(sample * 8388607.0f)) << 8;
  772. }
  773. i--;
  774. src++;
  775. dst++;
  776. }
  777. }
  778. #endif
  779. // Function pointers set to a CPU-specific implementation.
  780. void (*SDL_Convert_S8_to_F32)(float *dst, const Sint8 *src, int num_samples) = NULL;
  781. void (*SDL_Convert_U8_to_F32)(float *dst, const Uint8 *src, int num_samples) = NULL;
  782. void (*SDL_Convert_S16_to_F32)(float *dst, const Sint16 *src, int num_samples) = NULL;
  783. void (*SDL_Convert_S32_to_F32)(float *dst, const Sint32 *src, int num_samples) = NULL;
  784. void (*SDL_Convert_F32_to_S8)(Sint8 *dst, const float *src, int num_samples) = NULL;
  785. void (*SDL_Convert_F32_to_U8)(Uint8 *dst, const float *src, int num_samples) = NULL;
  786. void (*SDL_Convert_F32_to_S16)(Sint16 *dst, const float *src, int num_samples) = NULL;
  787. void (*SDL_Convert_F32_to_S32)(Sint32 *dst, const float *src, int num_samples) = NULL;
  788. void SDL_ChooseAudioConverters(void)
  789. {
  790. static SDL_bool converters_chosen = SDL_FALSE;
  791. if (converters_chosen) {
  792. return;
  793. }
  794. #define SET_CONVERTER_FUNCS(fntype) \
  795. SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
  796. SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
  797. SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
  798. SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
  799. SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
  800. SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
  801. SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
  802. SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
  803. converters_chosen = SDL_TRUE
  804. #ifdef SDL_SSE2_INTRINSICS
  805. if (SDL_HasSSE2()) {
  806. SET_CONVERTER_FUNCS(SSE2);
  807. return;
  808. }
  809. #endif
  810. #ifdef SDL_NEON_INTRINSICS
  811. if (SDL_HasNEON()) {
  812. SET_CONVERTER_FUNCS(NEON);
  813. return;
  814. }
  815. #endif
  816. #if NEED_SCALAR_CONVERTER_FALLBACKS
  817. SET_CONVERTER_FUNCS(Scalar);
  818. #endif
  819. #undef SET_CONVERTER_FUNCS
  820. SDL_assert(converters_chosen == SDL_TRUE);
  821. }