3 years ago · 9f56c7cf02
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@@ -93,68 +93,34 @@
 
				  * 8 channels (7.1) layout: FL+FR+FC+LFE+BL+BR+SL+SR
			
 
				  */
			
 
				 
			
 
				-
			
 
				-#if 0  /* !!! FIXME: these need to be updated to match the new scalar code. */
			
 
				-#if HAVE_AVX_INTRINSICS
			
 
				-/* MSVC will always accept AVX intrinsics when compiling for x64 */
			
 
				-#if defined(__clang__) || defined(__GNUC__)
			
 
				-__attribute__((target("avx")))
			
 
				-#endif
			
 
				-/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
			
 
				+#if HAVE_SSE3_INTRINSICS
			
 
				+/* Convert from stereo to mono. Average left and right. */
			
 
				 static void SDLCALL
			
 
				-SDL_Convert51ToStereo_AVX(SDL_AudioCVT * cvt, SDL_AudioFormat format)
			
 
				+SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
			
 
				 {
			
 
				+    const __m128 divby2 = _mm_set1_ps(0.5f);
			
 
				     float *dst = (float *) cvt->buf;
			
 
				     const float *src = dst;
			
 
				-    int i = cvt->len_cvt / (sizeof (float) * 6);
			
 
				-    const float two_fifths_f = 1.0f / 2.5f;
			
 
				-    const __m256 two_fifths_v = _mm256_set1_ps(two_fifths_f);
			
 
				-    const __m256 half = _mm256_set1_ps(0.5f);
			
 
				+    int i = cvt->len_cvt / 8;
			
 
				 
			
 
				-    LOG_DEBUG_CONVERT("5.1", "stereo (using AVX)");
			
 
				+    LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
			
 
				     SDL_assert(format == AUDIO_F32SYS);
			
 
				 
			
 
				-    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
			
 
				-    while (i >= 4) {
			
 
				-        __m256 in0 = _mm256_loadu_ps(src + 0);  /* 0FL 0FR 0FC 0LF 0BL 0BR 1FL 1FR */
			
 
				-        __m256 in1 = _mm256_loadu_ps(src + 8);  /* 1FC 1LF 1BL 1BR 2FL 2FR 2FC 2LF */
			
 
				-        __m256 in2 = _mm256_loadu_ps(src + 16); /* 2BL 2BR 3FL 3FR 3FC 3LF 3BL 3BR */
			
 
				-
			
 
				-        /* 0FL 0FR 0FC 0LF 2FL 2FR 2FC 2LF */
			
 
				-        __m256 temp0 = _mm256_blend_ps(in0, in1, 0xF0);
			
 
				-        /* 1FC 1LF 1BL 1BR 3FC 3LF 3BL 3BR */
			
 
				-        __m256 temp1 = _mm256_blend_ps(in1, in2, 0xF0);
			
 
				-
			
 
				-        /* 0FC 0FC 1FC 1FC 2FC 2FC 3FC 3FC */
			
 
				-        __m256 fc_distributed = _mm256_mul_ps(half, _mm256_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 2, 2)));
			
 
				-
			
 
				-        /* 0FL 0FR 1BL 1BR 2FL 2FR 3BL 3BR */
			
 
				-        __m256 permuted0 = _mm256_blend_ps(temp0, temp1, 0xCC);
			
 
				-        /* 0BL 0BR 1FL 1FR 2BL 2BR 3FL 3FR */
			
 
				-        __m256 permuted1 = _mm256_permute2f128_ps(in0, in2, 0x21);
			
 
				-
			
 
				-        /*   0FL 0FR 1BL 1BR 2FL 2FR 3BL 3BR */
			
 
				-        /* + 0BL 0BR 1FL 1FR 2BL 2BR 3FL 3FR */
			
 
				-        /* =  0L  0R  1L  1R  2L  2R  3L  3R */
			
 
				-        __m256 out = _mm256_add_ps(permuted0, permuted1);
			
 
				-        out = _mm256_add_ps(out, fc_distributed);
			
 
				-        out = _mm256_mul_ps(out, two_fifths_v);
			
 
				-
			
 
				-        _mm256_storeu_ps(dst, out);
			
 
				-
			
 
				-        i -= 4; src += 24; dst += 8;
			
 
				+    /* Do SSE blocks as long as we have 16 bytes available.
			
 
				+       Just use unaligned load/stores, if the memory at runtime is
			
 
				+       aligned it'll be just as fast on modern processors */
			
 
				+    while (i >= 4) {   /* 4 * float32 */
			
 
				+        _mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_loadu_ps(src), _mm_loadu_ps(src+4)), divby2));
			
 
				+        i -= 4; src += 8; dst += 4;
			
 
				     }
			
 
				 
			
 
				-
			
 
				     /* Finish off any leftovers with scalar operations. */
			
 
				     while (i) {
			
 
				-        const float front_center_distributed = src[2] * 0.5f;
			
 
				-        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
			
 
				-        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
			
 
				-        i--; src += 6; dst+=2;
			
 
				+        *dst = (src[0] + src[1]) * 0.5f;
			
 
				+        dst++; i--; src += 2;
			
 
				     }
			
 
				 
			
 
				-    cvt->len_cvt /= 3;
			
 
				+    cvt->len_cvt /= 2;
			
 
				     if (cvt->filters[++cvt->filter_index]) {
			
 
				         cvt->filters[cvt->filter_index] (cvt, format);
			
 
				     }
			
@@ -162,155 +128,38 @@ SDL_Convert51ToStereo_AVX(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 
				 #endif
			
 
				 
			
 
				 #if HAVE_SSE_INTRINSICS
			
 
				-/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
			
 
				+/* Convert from mono to stereo. Duplicate to stereo left and right. */
			
 
				 static void SDLCALL
			
 
				-SDL_Convert51ToStereo_SSE(SDL_AudioCVT * cvt, SDL_AudioFormat format)
			
 
				+SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT * cvt, SDL_AudioFormat format)
			
 
				 {
			
 
				-    float *dst = (float *) cvt->buf;
			
 
				-    const float *src = dst;
			
 
				-    int i = cvt->len_cvt / (sizeof (float) * 6);
			
 
				-    const float two_fifths_f = 1.0f / 2.5f;
			
 
				-    const __m128 two_fifths_v = _mm_set1_ps(two_fifths_f);
			
 
				-    const __m128 half = _mm_set1_ps(0.5f);
			
 
				+    float *dst = ((float *) (cvt->buf + (cvt->len_cvt * 2))) - 8;
			
 
				+    const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 4;
			
 
				+    int i = cvt->len_cvt / sizeof (float);
			
 
				 
			
 
				-    LOG_DEBUG_CONVERT("5.1", "stereo (using SSE)");
			
 
				-    SDL_assert(format == AUDIO_F32SYS);
			
 
				-
			
 
				-    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
			
 
				-    /* Just use unaligned load/stores, if the memory at runtime is */
			
 
				-    /* aligned it'll be just as fast on modern processors */
			
 
				-    while (i >= 2) {
			
 
				-        /* Two 5.1 samples (12 floats) fit nicely in three 128bit */
			
 
				-        /* registers. Using shuffles they can be rearranged so that */
			
 
				-        /* the conversion math can be vectorized. */
			
 
				-        __m128 in0 = _mm_loadu_ps(src);     /* 0FL 0FR 0FC 0LF */
			
 
				-        __m128 in1 = _mm_loadu_ps(src + 4); /* 0BL 0BR 1FL 1FR */
			
 
				-        __m128 in2 = _mm_loadu_ps(src + 8); /* 1FC 1LF 1BL 1BR */
			
 
				-
			
 
				-        /* 0FC 0FC 1FC 1FC */
			
 
				-        __m128 fc_distributed = _mm_mul_ps(half, _mm_shuffle_ps(in0, in2, _MM_SHUFFLE(0, 0, 2, 2)));
			
 
				-
			
 
				-        /* 0FL 0FR 1BL 1BR */
			
 
				-        __m128 blended = _mm_shuffle_ps(in0, in2, _MM_SHUFFLE(3, 2, 1, 0));
			
 
				-
			
 
				-        /*   0FL 0FR 1BL 1BR */
			
 
				-        /* + 0BL 0BR 1FL 1FR */
			
 
				-        /* =  0L  0R  1L  1R */
			
 
				-        __m128 out = _mm_add_ps(blended, in1);
			
 
				-        out = _mm_add_ps(out, fc_distributed);
			
 
				-        out = _mm_mul_ps(out, two_fifths_v);
			
 
				-
			
 
				-        _mm_storeu_ps(dst, out);
			
 
				-
			
 
				-        i -= 2; src += 12; dst += 4;
			
 
				-    }
			
 
				-
			
 
				-
			
 
				-    /* Finish off any leftovers with scalar operations. */
			
 
				-    while (i) {
			
 
				-        const float front_center_distributed = src[2] * 0.5f;
			
 
				-        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
			
 
				-        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
			
 
				-        i--; src += 6; dst+=2;
			
 
				-    }
			
 
				-
			
 
				-    cvt->len_cvt /= 3;
			
 
				-    if (cvt->filters[++cvt->filter_index]) {
			
 
				-        cvt->filters[cvt->filter_index] (cvt, format);
			
 
				-    }
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-#if HAVE_NEON_INTRINSICS
			
 
				-/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
			
 
				-static void SDLCALL
			
 
				-SDL_Convert51ToStereo_NEON(SDL_AudioCVT * cvt, SDL_AudioFormat format)
			
 
				-{
			
 
				-    float *dst = (float *) cvt->buf;
			
 
				-    const float *src = dst;
			
 
				-    int i = cvt->len_cvt / (sizeof (float) * 6);
			
 
				-    const float two_fifths_f = 1.0f / 2.5f;
			
 
				-    const float32x4_t two_fifths_v = vdupq_n_f32(two_fifths_f);
			
 
				-    const float32x4_t half = vdupq_n_f32(0.5f);
			
 
				-
			
 
				-    LOG_DEBUG_CONVERT("5.1", "stereo (using NEON)");
			
 
				-    SDL_assert(format == AUDIO_F32SYS);
			
 
				-
			
 
				-    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
			
 
				-
			
 
				-    /* Just use unaligned load/stores, it's the same NEON instructions and
			
 
				-       hopefully even unaligned NEON is faster than the scalar fallback. */
			
 
				-    while (i >= 2) {
			
 
				-        /* Two 5.1 samples (12 floats) fit nicely in three 128bit */
			
 
				-        /* registers. Using shuffles they can be rearranged so that */
			
 
				-        /* the conversion math can be vectorized. */
			
 
				-        const float32x4_t in0 = vld1q_f32(src);     /* 0FL 0FR 0FC 0LF */
			
 
				-        const float32x4_t in1 = vld1q_f32(src + 4); /* 0BL 0BR 1FL 1FR */
			
 
				-        const float32x4_t in2 = vld1q_f32(src + 8); /* 1FC 1LF 1BL 1BR */
			
 
				-
			
 
				-        /* 0FC 0FC 1FC 1FC */
			
 
				-        const float32x4_t fc_distributed = vmulq_f32(half, vcombine_f32(vdup_lane_f32(vget_high_f32(in0), 0), vdup_lane_f32(vget_low_f32(in2), 0)));
			
 
				-
			
 
				-        /* 0FL 0FR 1BL 1BR */
			
 
				-        const float32x4_t blended = vcombine_f32(vget_low_f32(in0), vget_high_f32(in2));
			
 
				-
			
 
				-        /*   0FL 0FR 1BL 1BR */
			
 
				-        /* + 0BL 0BR 1FL 1FR */
			
 
				-        /* =  0L  0R  1L  1R */
			
 
				-        float32x4_t out = vaddq_f32(blended, in1);
			
 
				-        out = vaddq_f32(out, fc_distributed);
			
 
				-        out = vmulq_f32(out, two_fifths_v);
			
 
				-
			
 
				-        vst1q_f32(dst, out);
			
 
				-
			
 
				-        i -= 2; src += 12; dst += 4;
			
 
				-    }
			
 
				-
			
 
				-    /* Finish off any leftovers with scalar operations. */
			
 
				-    while (i) {
			
 
				-        const float front_center_distributed = src[2] * 0.5f;
			
 
				-        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
			
 
				-        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
			
 
				-        i--; src += 6; dst+=2;
			
 
				-    }
			
 
				-
			
 
				-    cvt->len_cvt /= 3;
			
 
				-    if (cvt->filters[++cvt->filter_index]) {
			
 
				-        cvt->filters[cvt->filter_index] (cvt, format);
			
 
				-    }
			
 
				-}
			
 
				-#endif
			
 
				-#endif
			
 
				-
			
 
				-
			
 
				-#if HAVE_SSE3_INTRINSICS
			
 
				-/* Convert from stereo to mono. Average left and right. */
			
 
				-static void SDLCALL
			
 
				-SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
			
 
				-{
			
 
				-    const __m128 divby2 = _mm_set1_ps(0.5f);
			
 
				-    float *dst = (float *) cvt->buf;
			
 
				-    const float *src = dst;
			
 
				-    int i = cvt->len_cvt / 8;
			
 
				-
			
 
				-    LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
			
 
				+    LOG_DEBUG_CONVERT("mono", "stereo (using SSE)");
			
 
				     SDL_assert(format == AUDIO_F32SYS);
			
 
				 
			
 
				     /* Do SSE blocks as long as we have 16 bytes available.
			
 
				        Just use unaligned load/stores, if the memory at runtime is
			
 
				        aligned it'll be just as fast on modern processors */
			
 
				+    /* convert backwards, since output is growing in-place. */
			
 
				     while (i >= 4) {   /* 4 * float32 */
			
 
				-        _mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_loadu_ps(src), _mm_loadu_ps(src+4)), divby2));
			
 
				-        i -= 4; src += 8; dst += 4;
			
 
				+        const __m128 input = _mm_loadu_ps(src);   /* A B C D */
			
 
				+        _mm_storeu_ps(dst, _mm_unpacklo_ps(input, input));  /* A A B B */
			
 
				+        _mm_storeu_ps(dst+4, _mm_unpackhi_ps(input, input));  /* C C D D */
			
 
				+        i -= 4; src -= 4; dst -= 8;
			
 
				     }
			
 
				 
			
 
				     /* Finish off any leftovers with scalar operations. */
			
 
				-    while (i) {
			
 
				-        *dst = (src[0] + src[1]) * 0.5f;
			
 
				-        dst++; i--; src += 2;
			
 
				+    src += 3; dst += 6;  /* adjust for smaller buffers. */
			
 
				+    while (i) {     /* convert backwards, since output is growing in-place. */
			
 
				+        const float srcFC = src[0];
			
 
				+        dst[1] /* FR */ = srcFC;
			
 
				+        dst[0] /* FL */ = srcFC;
			
 
				+        i--; src--; dst -= 2;
			
 
				     }
			
 
				 
			
 
				-    cvt->len_cvt /= 2;
			
 
				+    cvt->len_cvt *= 2;
			
 
				     if (cvt->filters[++cvt->filter_index]) {
			
 
				         cvt->filters[cvt->filter_index] (cvt, format);
			
 
				     }
			
@@ -833,24 +682,16 @@ SDL_BuildAudioCVT(SDL_AudioCVT * cvt,
 
				         return SDL_SetError("Invalid channel combination");
			
 
				     } else if (channel_converter != NULL) {
			
 
				         /* swap in some SIMD versions for a few of these. */
			
 
				-        if (channel_converter == SDL_Convert51ToStereo) {
			
 
				+        if (channel_converter == SDL_ConvertStereoToMono) {
			
 
				             SDL_AudioFilter filter = NULL;
			
 
				-#if 0 /* !!! FIXME: these have not been updated for the new formulas */
			
 
				-            #if HAVE_AVX_INTRINSICS
			
 
				-            if (!filter && SDL_HasAVX()) { filter = SDL_Convert51ToStereo_AVX; }
			
 
				-            #endif
			
 
				-            #if HAVE_SSE_INTRINSICS
			
 
				-            if (!filter && SDL_HasSSE()) { filter = SDL_Convert51ToStereo_SSE; }
			
 
				-            #endif
			
 
				-            #if HAVE_NEON_INTRINSICS
			
 
				-            if (!filter && SDL_HasNEON()) { filter = SDL_Convert51ToStereo_NEON; }
			
 
				+            #if HAVE_SSE3_INTRINSICS
			
 
				+            if (!filter && SDL_HasSSE3()) { filter = SDL_ConvertStereoToMono_SSE3; }
			
 
				             #endif
			
 
				-#endif
			
 
				             if (filter) { channel_converter = filter; }
			
 
				-        } else if (channel_converter == SDL_ConvertStereoToMono) {
			
 
				+        } else if (channel_converter == SDL_ConvertMonoToStereo) {
			
 
				             SDL_AudioFilter filter = NULL;
			
 
				-            #if HAVE_SSE3_INTRINSICS
			
 
				-            if (!filter && SDL_HasSSE3()) { filter = SDL_ConvertStereoToMono_SSE3; }
			
 
				+            #if HAVE_SSE_INTRINSICS
			
 
				+            if (!filter && SDL_HasSSE()) { filter = SDL_ConvertMonoToStereo_SSE; }
			
 
				             #endif
			
 
				             if (filter) { channel_converter = filter; }
			
 
				         }