Index: webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc |
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft_sse2.cc b/webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc |
similarity index 85% |
rename from webrtc/modules/audio_processing/aec/aec_rdft_sse2.cc |
rename to webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc |
index b4e453ff53ea2efdeb6888c5c1b22e3a92df7d16..03f6b31f0ff91799593a13e9ebcc7bf085a0ac1f 100644 |
--- a/webrtc/modules/audio_processing/aec/aec_rdft_sse2.cc |
+++ b/webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc |
@@ -8,14 +8,32 @@ |
* be found in the AUTHORS file in the root of the source tree. |
*/ |
-#include "webrtc/modules/audio_processing/aec/aec_rdft.h" |
+#include "webrtc/modules/audio_processing/utility/ooura_fft.h" |
#include <emmintrin.h> |
-static const ALIGN16_BEG float ALIGN16_END |
- k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f}; |
+#include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h" |
+#include "webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h" |
-static void cft1st_128_SSE2(float* a) { |
+namespace webrtc { |
+ |
+#if defined(WEBRTC_ARCH_X86_FAMILY) |
+ |
+namespace { |
+// Fallbacks for cast intrinsics missing before VS 2008 (_MSC_VER < 1500). |
+// TODO(andrew): move to a common file. |
+#if defined(_MSC_VER) && _MSC_VER < 1500 |
+static __inline __m128 _mm_castsi128_ps(__m128i a) { |
+ return *(__m128*)&a; |
+} |
+static __inline __m128i _mm_castps_si128(__m128 a) { |
+ return *(__m128i*)&a; |
+} |
+#endif |
+ |
+} // namespace |
+ |
+void cft1st_128_SSE2(float* a) { |
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); |
int j, k2; |
@@ -78,7 +96,7 @@ static void cft1st_128_SSE2(float* a) { |
} |
} |
-static void cftmdl_128_SSE2(float* a) { |
+void cftmdl_128_SSE2(float* a) { |
const int l = 8; |
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); |
int j0; |
@@ -89,12 +107,12 @@ static void cftmdl_128_SSE2(float* a) { |
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); |
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); |
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); |
- const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), |
- _mm_castsi128_ps(a_32), |
- _MM_SHUFFLE(1, 0, 1, 0)); |
- const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), |
- _mm_castsi128_ps(a_40), |
- _MM_SHUFFLE(1, 0, 1, 0)); |
+ const __m128 a_00_32 = |
+ _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), |
+ _MM_SHUFFLE(1, 0, 1, 0)); |
+ const __m128 a_08_40 = |
+ _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), |
+ _MM_SHUFFLE(1, 0, 1, 0)); |
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); |
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); |
@@ -102,12 +120,12 @@ static void cftmdl_128_SSE2(float* a) { |
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); |
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); |
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); |
- const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), |
- _mm_castsi128_ps(a_48), |
- _MM_SHUFFLE(1, 0, 1, 0)); |
- const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), |
- _mm_castsi128_ps(a_56), |
- _MM_SHUFFLE(1, 0, 1, 0)); |
+ const __m128 a_16_48 = |
+ _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), |
+ _MM_SHUFFLE(1, 0, 1, 0)); |
+ const __m128 a_24_56 = |
+ _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), |
+ _MM_SHUFFLE(1, 0, 1, 0)); |
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); |
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); |
@@ -163,12 +181,12 @@ static void cftmdl_128_SSE2(float* a) { |
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); |
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); |
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); |
- const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), |
- _mm_castsi128_ps(a_32), |
- _MM_SHUFFLE(1, 0, 1, 0)); |
- const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), |
- _mm_castsi128_ps(a_40), |
- _MM_SHUFFLE(1, 0, 1, 0)); |
+ const __m128 a_00_32 = |
+ _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), |
+ _MM_SHUFFLE(1, 0, 1, 0)); |
+ const __m128 a_08_40 = |
+ _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), |
+ _MM_SHUFFLE(1, 0, 1, 0)); |
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); |
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); |
@@ -176,22 +194,21 @@ static void cftmdl_128_SSE2(float* a) { |
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); |
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); |
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); |
- const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), |
- _mm_castsi128_ps(a_48), |
- _MM_SHUFFLE(1, 0, 1, 0)); |
- const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), |
- _mm_castsi128_ps(a_56), |
- _MM_SHUFFLE(1, 0, 1, 0)); |
+ const __m128 a_16_48 = |
+ _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), |
+ _MM_SHUFFLE(1, 0, 1, 0)); |
+ const __m128 a_24_56 = |
+ _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), |
+ _MM_SHUFFLE(1, 0, 1, 0)); |
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); |
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); |
const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); |
- const __m128 xx3 = |
- _mm_mul_ps(wk2iv, |
- _mm_castsi128_ps(_mm_shuffle_epi32( |
- _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1)))); |
+ const __m128 xx3 = _mm_mul_ps( |
+ wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1), |
+ _MM_SHUFFLE(2, 3, 0, 1)))); |
const __m128 xx4 = _mm_add_ps(xx2, xx3); |
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( |
@@ -202,16 +219,14 @@ static void cftmdl_128_SSE2(float* a) { |
const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); |
const __m128 xx11 = _mm_mul_ps( |
- wk1iv, |
- _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), |
- _MM_SHUFFLE(2, 3, 0, 1)))); |
+ wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), |
+ _MM_SHUFFLE(2, 3, 0, 1)))); |
const __m128 xx12 = _mm_add_ps(xx10, xx11); |
const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); |
const __m128 xx21 = _mm_mul_ps( |
- wk3iv, |
- _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), |
- _MM_SHUFFLE(2, 3, 0, 1)))); |
+ wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), |
+ _MM_SHUFFLE(2, 3, 0, 1)))); |
const __m128 xx22 = _mm_add_ps(xx20, xx21); |
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); |
@@ -237,13 +252,13 @@ static void cftmdl_128_SSE2(float* a) { |
} |
} |
-static void rftfsub_128_SSE2(float* a) { |
+void rftfsub_128_SSE2(float* a) { |
const float* c = rdft_w + 32; |
int j1, j2, k1, k2; |
float wkr, wki, xr, xi, yr, yi; |
- static const ALIGN16_BEG float ALIGN16_END |
- k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; |
+ static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, |
+ 0.5f}; |
const __m128 mm_half = _mm_load_ps(k_half); |
// Vectorized code (four at once). |
@@ -327,13 +342,13 @@ static void rftfsub_128_SSE2(float* a) { |
} |
} |
-static void rftbsub_128_SSE2(float* a) { |
+void rftbsub_128_SSE2(float* a) { |
const float* c = rdft_w + 32; |
int j1, j2, k1, k2; |
float wkr, wki, xr, xi, yr, yi; |
- static const ALIGN16_BEG float ALIGN16_END |
- k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; |
+ static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, |
+ 0.5f}; |
const __m128 mm_half = _mm_load_ps(k_half); |
a[1] = -a[1]; |
@@ -418,10 +433,6 @@ static void rftbsub_128_SSE2(float* a) { |
} |
a[65] = -a[65]; |
} |
+#endif |
-void aec_rdft_init_sse2(void) { |
- cft1st_128 = cft1st_128_SSE2; |
- cftmdl_128 = cftmdl_128_SSE2; |
- rftfsub_128 = rftfsub_128_SSE2; |
- rftbsub_128 = rftbsub_128_SSE2; |
-} |
+} // namespace webrtc |