Index: webrtc/modules/audio_processing/utility/ooura_fft.cc |
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft.cc b/webrtc/modules/audio_processing/utility/ooura_fft.cc |
similarity index 65% |
rename from webrtc/modules/audio_processing/aec/aec_rdft.cc |
rename to webrtc/modules/audio_processing/utility/ooura_fft.cc |
index 690fe9f34f9c51ebf3f2c91e5553f2b2d9d46f1f..4ba88d7f17f0905426f86fd3a37421b7bdbe706b 100644 |
--- a/webrtc/modules/audio_processing/aec/aec_rdft.cc |
+++ b/webrtc/modules/audio_processing/utility/ooura_fft.cc |
@@ -10,6 +10,8 @@ |
* - Trivial type modifications. |
* - Minimal code subset to do rdft of length 128. |
* - Optimizations because of known length. |
+ * - Removed the global variables by moving the code into a class in order |
+ * to make it thread safe. |
* |
* All changes are covered by the WebRTC license and IP grant: |
* Use of this source code is governed by a BSD-style license |
@@ -19,184 +21,19 @@ |
* be found in the AUTHORS file in the root of the source tree. |
*/ |
-#include "webrtc/modules/audio_processing/aec/aec_rdft.h" |
+#include "webrtc/modules/audio_processing/utility/ooura_fft.h" |
#include <math.h> |
+#include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h" |
#include "webrtc/system_wrappers/include/cpu_features_wrapper.h" |
#include "webrtc/typedefs.h" |
-// These tables used to be computed at run-time. For example, refer to: |
-// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564 |
-// to see the initialization code. |
-const float rdft_w[64] = { |
- 1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f, |
- 0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f, |
- 0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f, |
- 0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f, |
- 0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f, |
- 0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f, |
- 0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f, |
- 0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f, |
- 0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f, |
- 0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f, |
- 0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f, |
- 0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f, |
- 0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f, |
- 0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f, |
- 0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f, |
- 0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f, |
-}; |
-const float rdft_wk3ri_first[16] = { |
- 1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f, |
- 0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f, |
- 0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f, |
- 0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f, |
-}; |
-const float rdft_wk3ri_second[16] = { |
- -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f, |
- -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f, |
- -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f, |
- -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f, |
-}; |
-ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = { |
- 1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, |
- 0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f, |
- 0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f, |
- 0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f, |
- 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f, |
- 0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, |
- 0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f, |
- 0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f, |
-}; |
-ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = { |
- 1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f, |
- 0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f, |
- 0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f, |
- 0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f, |
- 0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f, |
- 0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f, |
- 0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f, |
- 0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f, |
-}; |
-ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = { |
- 1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f, |
- 0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f, |
- 0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f, |
- -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f, |
- 0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f, |
- 0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f, |
- 0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f, |
- -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f, |
-}; |
-ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = { |
- -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, |
- -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f, |
- -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f, |
- -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f, |
- -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f, |
- -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f, |
- -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f, |
- -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f, |
-}; |
-ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = { |
- -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f, |
- -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f, |
- -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f, |
- -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f, |
- -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f, |
- -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, |
- -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f, |
- -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f, |
-}; |
-ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = { |
- -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, |
- -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f, |
- -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f, |
- -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f, |
- -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f, |
- -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f, |
- -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f, |
- -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f, |
-}; |
-ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = { |
- 0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f, |
-}; |
+namespace webrtc { |
-static void bitrv2_128_C(float* a) { |
- /* |
- Following things have been attempted but are no faster: |
- (a) Storing the swap indexes in a LUT (index calculations are done |
- for 'free' while waiting on memory/L1). |
- (b) Consolidate the load/store of two consecutive floats by a 64 bit |
- integer (execution is memory/L1 bound). |
- (c) Do a mix of floats and 64 bit integer to maximize register |
- utilization (execution is memory/L1 bound). |
- (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5). |
- (e) Hard-coding of the offsets to completely eliminates index |
- calculations. |
- */ |
- |
- unsigned int j, j1, k, k1; |
- float xr, xi, yr, yi; |
- |
- static const int ip[4] = {0, 64, 32, 96}; |
- for (k = 0; k < 4; k++) { |
- for (j = 0; j < k; j++) { |
- j1 = 2 * j + ip[k]; |
- k1 = 2 * k + ip[j]; |
- xr = a[j1 + 0]; |
- xi = a[j1 + 1]; |
- yr = a[k1 + 0]; |
- yi = a[k1 + 1]; |
- a[j1 + 0] = yr; |
- a[j1 + 1] = yi; |
- a[k1 + 0] = xr; |
- a[k1 + 1] = xi; |
- j1 += 8; |
- k1 += 16; |
- xr = a[j1 + 0]; |
- xi = a[j1 + 1]; |
- yr = a[k1 + 0]; |
- yi = a[k1 + 1]; |
- a[j1 + 0] = yr; |
- a[j1 + 1] = yi; |
- a[k1 + 0] = xr; |
- a[k1 + 1] = xi; |
- j1 += 8; |
- k1 -= 8; |
- xr = a[j1 + 0]; |
- xi = a[j1 + 1]; |
- yr = a[k1 + 0]; |
- yi = a[k1 + 1]; |
- a[j1 + 0] = yr; |
- a[j1 + 1] = yi; |
- a[k1 + 0] = xr; |
- a[k1 + 1] = xi; |
- j1 += 8; |
- k1 += 16; |
- xr = a[j1 + 0]; |
- xi = a[j1 + 1]; |
- yr = a[k1 + 0]; |
- yi = a[k1 + 1]; |
- a[j1 + 0] = yr; |
- a[j1 + 1] = yi; |
- a[k1 + 0] = xr; |
- a[k1 + 1] = xi; |
- } |
- j1 = 2 * k + 8 + ip[k]; |
- k1 = j1 + 8; |
- xr = a[j1 + 0]; |
- xi = a[j1 + 1]; |
- yr = a[k1 + 0]; |
- yi = a[k1 + 1]; |
- a[j1 + 0] = yr; |
- a[j1 + 1] = yi; |
- a[k1 + 0] = xr; |
- a[k1 + 1] = xi; |
- } |
-} |
+namespace { |
+#if !(defined(MIPS_FPU_LE) || defined(WEBRTC_HAS_NEON)) |
static void cft1st_128_C(float* a) { |
const int n = 128; |
int j, k1, k2; |
@@ -431,67 +268,6 @@ static void cftmdl_128_C(float* a) { |
} |
} |
-static void cftfsub_128_C(float* a) { |
- int j, j1, j2, j3, l; |
- float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
- |
- cft1st_128(a); |
- cftmdl_128(a); |
- l = 32; |
- for (j = 0; j < l; j += 2) { |
- j1 = j + l; |
- j2 = j1 + l; |
- j3 = j2 + l; |
- x0r = a[j] + a[j1]; |
- x0i = a[j + 1] + a[j1 + 1]; |
- x1r = a[j] - a[j1]; |
- x1i = a[j + 1] - a[j1 + 1]; |
- x2r = a[j2] + a[j3]; |
- x2i = a[j2 + 1] + a[j3 + 1]; |
- x3r = a[j2] - a[j3]; |
- x3i = a[j2 + 1] - a[j3 + 1]; |
- a[j] = x0r + x2r; |
- a[j + 1] = x0i + x2i; |
- a[j2] = x0r - x2r; |
- a[j2 + 1] = x0i - x2i; |
- a[j1] = x1r - x3i; |
- a[j1 + 1] = x1i + x3r; |
- a[j3] = x1r + x3i; |
- a[j3 + 1] = x1i - x3r; |
- } |
-} |
- |
-static void cftbsub_128_C(float* a) { |
- int j, j1, j2, j3, l; |
- float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
- |
- cft1st_128(a); |
- cftmdl_128(a); |
- l = 32; |
- |
- for (j = 0; j < l; j += 2) { |
- j1 = j + l; |
- j2 = j1 + l; |
- j3 = j2 + l; |
- x0r = a[j] + a[j1]; |
- x0i = -a[j + 1] - a[j1 + 1]; |
- x1r = a[j] - a[j1]; |
- x1i = -a[j + 1] + a[j1 + 1]; |
- x2r = a[j2] + a[j3]; |
- x2i = a[j2 + 1] + a[j3 + 1]; |
- x3r = a[j2] - a[j3]; |
- x3i = a[j2 + 1] - a[j3 + 1]; |
- a[j] = x0r + x2r; |
- a[j + 1] = x0i - x2i; |
- a[j2] = x0r - x2r; |
- a[j2 + 1] = x0i + x2i; |
- a[j1] = x1r - x3i; |
- a[j1 + 1] = x1i - x3r; |
- a[j3] = x1r + x3i; |
- a[j3 + 1] = x1i + x3r; |
- } |
-} |
- |
static void rftfsub_128_C(float* a) { |
const float* c = rdft_w + 32; |
int j1, j2, k1, k2; |
@@ -535,8 +311,22 @@ static void rftbsub_128_C(float* a) { |
} |
a[65] = -a[65]; |
} |
+#endif |
+ |
-void aec_rdft_forward_128(float* a) { |
+} // namespace |
+ |
+OouraFft::OouraFft() { |
+#if defined(WEBRTC_ARCH_X86_FAMILY) |
+ use_sse2_ = (WebRtc_GetCPUInfo(kSSE2) != 0); |
+#else |
+ use_sse2_ = false; |
+#endif |
+} |
+ |
+OouraFft::~OouraFft() = default; |
+ |
+void OouraFft::Fft(float* a) const { |
float xi; |
bitrv2_128(a); |
cftfsub_128(a); |
@@ -545,8 +335,7 @@ void aec_rdft_forward_128(float* a) { |
a[0] += a[1]; |
a[1] = xi; |
} |
- |
-void aec_rdft_inverse_128(float* a) { |
+void OouraFft::InverseFft(float* a) const { |
a[1] = 0.5f * (a[0] - a[1]); |
a[0] -= a[1]; |
rftbsub_128(a); |
@@ -554,32 +343,193 @@ void aec_rdft_inverse_128(float* a) { |
cftbsub_128(a); |
} |
-// code path selection |
-RftSub128 cft1st_128; |
-RftSub128 cftmdl_128; |
-RftSub128 rftfsub_128; |
-RftSub128 rftbsub_128; |
-RftSub128 cftfsub_128; |
-RftSub128 cftbsub_128; |
-RftSub128 bitrv2_128; |
- |
-void aec_rdft_init(void) { |
- cft1st_128 = cft1st_128_C; |
- cftmdl_128 = cftmdl_128_C; |
- rftfsub_128 = rftfsub_128_C; |
- rftbsub_128 = rftbsub_128_C; |
- cftfsub_128 = cftfsub_128_C; |
- cftbsub_128 = cftbsub_128_C; |
- bitrv2_128 = bitrv2_128_C; |
-#if defined(WEBRTC_ARCH_X86_FAMILY) |
- if (WebRtc_GetCPUInfo(kSSE2)) { |
- aec_rdft_init_sse2(); |
+void OouraFft::cft1st_128(float* a) const { |
+#if defined(MIPS_FPU_LE) |
+ cft1st_128_mips(a); |
+#elif defined(WEBRTC_HAS_NEON) |
+ cft1st_128_neon(a); |
+#else |
+ if (use_sse2_) { |
+ cft1st_128_SSE2(a); |
+ } else { |
+ cft1st_128_C(a); |
} |
#endif |
+} |
+void OouraFft::cftmdl_128(float* a) const { |
#if defined(MIPS_FPU_LE) |
- aec_rdft_init_mips(); |
+ cftmdl_128_mips(a); |
+#elif defined(WEBRTC_HAS_NEON) |
+ cftmdl_128_neon(a); |
+#else |
+ if (use_sse2_) { |
+ cftmdl_128_SSE2(a); |
+ } else { |
+ cftmdl_128_C(a); |
+ } |
#endif |
-#if defined(WEBRTC_HAS_NEON) |
- aec_rdft_init_neon(); |
+} |
+void OouraFft::rftfsub_128(float* a) const { |
+#if defined(MIPS_FPU_LE) |
+ rftfsub_128_mips(a); |
+#elif defined(WEBRTC_HAS_NEON) |
+ rftfsub_128_neon(a); |
+#else |
+ if (use_sse2_) { |
+ rftfsub_128_SSE2(a); |
+ } else { |
+ rftfsub_128_C(a); |
+ } |
+#endif |
+} |
+ |
+void OouraFft::rftbsub_128(float* a) const { |
+#if defined(MIPS_FPU_LE) |
+ rftbsub_128_mips(a); |
+#elif defined(WEBRTC_HAS_NEON) |
+ rftbsub_128_neon(a); |
+#else |
+ if (use_sse2_) { |
+ rftbsub_128_SSE2(a); |
+ } else { |
+ rftbsub_128_C(a); |
+ } |
#endif |
} |
+ |
+void OouraFft::cftbsub_128(float* a) const { |
+ int j, j1, j2, j3, l; |
+ float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
+ |
+ cft1st_128(a); |
+ cftmdl_128(a); |
+ l = 32; |
+ |
+ for (j = 0; j < l; j += 2) { |
+ j1 = j + l; |
+ j2 = j1 + l; |
+ j3 = j2 + l; |
+ x0r = a[j] + a[j1]; |
+ x0i = -a[j + 1] - a[j1 + 1]; |
+ x1r = a[j] - a[j1]; |
+ x1i = -a[j + 1] + a[j1 + 1]; |
+ x2r = a[j2] + a[j3]; |
+ x2i = a[j2 + 1] + a[j3 + 1]; |
+ x3r = a[j2] - a[j3]; |
+ x3i = a[j2 + 1] - a[j3 + 1]; |
+ a[j] = x0r + x2r; |
+ a[j + 1] = x0i - x2i; |
+ a[j2] = x0r - x2r; |
+ a[j2 + 1] = x0i + x2i; |
+ a[j1] = x1r - x3i; |
+ a[j1 + 1] = x1i - x3r; |
+ a[j3] = x1r + x3i; |
+ a[j3 + 1] = x1i + x3r; |
+ } |
+} |
+ |
+void OouraFft::cftfsub_128(float* a) const { |
+ int j, j1, j2, j3, l; |
+ float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
+ |
+ cft1st_128(a); |
+ cftmdl_128(a); |
+ l = 32; |
+ for (j = 0; j < l; j += 2) { |
+ j1 = j + l; |
+ j2 = j1 + l; |
+ j3 = j2 + l; |
+ x0r = a[j] + a[j1]; |
+ x0i = a[j + 1] + a[j1 + 1]; |
+ x1r = a[j] - a[j1]; |
+ x1i = a[j + 1] - a[j1 + 1]; |
+ x2r = a[j2] + a[j3]; |
+ x2i = a[j2 + 1] + a[j3 + 1]; |
+ x3r = a[j2] - a[j3]; |
+ x3i = a[j2 + 1] - a[j3 + 1]; |
+ a[j] = x0r + x2r; |
+ a[j + 1] = x0i + x2i; |
+ a[j2] = x0r - x2r; |
+ a[j2 + 1] = x0i - x2i; |
+ a[j1] = x1r - x3i; |
+ a[j1 + 1] = x1i + x3r; |
+ a[j3] = x1r + x3i; |
+ a[j3 + 1] = x1i - x3r; |
+ } |
+} |
+ |
+void OouraFft::bitrv2_128(float* a) const { |
+ /* |
+ Following things have been attempted but are no faster: |
+ (a) Storing the swap indexes in a LUT (index calculations are done |
+ for 'free' while waiting on memory/L1). |
+ (b) Consolidate the load/store of two consecutive floats by a 64 bit |
+ integer (execution is memory/L1 bound). |
+ (c) Do a mix of floats and 64 bit integer to maximize register |
+ utilization (execution is memory/L1 bound). |
+ (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5). |
+ (e) Hard-coding of the offsets to completely eliminate index |
+ calculations. |
+ */ |
+ |
+ unsigned int j, j1, k, k1; |
+ float xr, xi, yr, yi; |
+ |
+ const int ip[4] = {0, 64, 32, 96}; |
+ for (k = 0; k < 4; k++) { |
+ for (j = 0; j < k; j++) { |
+ j1 = 2 * j + ip[k]; |
+ k1 = 2 * k + ip[j]; |
+ xr = a[j1 + 0]; |
+ xi = a[j1 + 1]; |
+ yr = a[k1 + 0]; |
+ yi = a[k1 + 1]; |
+ a[j1 + 0] = yr; |
+ a[j1 + 1] = yi; |
+ a[k1 + 0] = xr; |
+ a[k1 + 1] = xi; |
+ j1 += 8; |
+ k1 += 16; |
+ xr = a[j1 + 0]; |
+ xi = a[j1 + 1]; |
+ yr = a[k1 + 0]; |
+ yi = a[k1 + 1]; |
+ a[j1 + 0] = yr; |
+ a[j1 + 1] = yi; |
+ a[k1 + 0] = xr; |
+ a[k1 + 1] = xi; |
+ j1 += 8; |
+ k1 -= 8; |
+ xr = a[j1 + 0]; |
+ xi = a[j1 + 1]; |
+ yr = a[k1 + 0]; |
+ yi = a[k1 + 1]; |
+ a[j1 + 0] = yr; |
+ a[j1 + 1] = yi; |
+ a[k1 + 0] = xr; |
+ a[k1 + 1] = xi; |
+ j1 += 8; |
+ k1 += 16; |
+ xr = a[j1 + 0]; |
+ xi = a[j1 + 1]; |
+ yr = a[k1 + 0]; |
+ yi = a[k1 + 1]; |
+ a[j1 + 0] = yr; |
+ a[j1 + 1] = yi; |
+ a[k1 + 0] = xr; |
+ a[k1 + 1] = xi; |
+ } |
+ j1 = 2 * k + 8 + ip[k]; |
+ k1 = j1 + 8; |
+ xr = a[j1 + 0]; |
+ xi = a[j1 + 1]; |
+ yr = a[k1 + 0]; |
+ yi = a[k1 + 1]; |
+ a[j1 + 0] = yr; |
+ a[j1 + 1] = yi; |
+ a[k1 + 0] = xr; |
+ a[k1 + 1] = xi; |
+ } |
+} |
+ |
+} // namespace webrtc |