OLD | NEW |
1 /* | 1 /* |
2 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html | 2 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html |
3 * Copyright Takuya OOURA, 1996-2001 | 3 * Copyright Takuya OOURA, 1996-2001 |
4 * | 4 * |
5 * You may use, copy, modify and distribute this code for any purpose (include | 5 * You may use, copy, modify and distribute this code for any purpose (include |
6 * commercial use) and without fee. Please refer to this package when you modify | 6 * commercial use) and without fee. Please refer to this package when you modify |
7 * this code. | 7 * this code. |
8 * | 8 * |
9 * Changes by the WebRTC authors: | 9 * Changes by the WebRTC authors: |
10 * - Trivial type modifications. | 10 * - Trivial type modifications. |
11 * - Minimal code subset to do rdft of length 128. | 11 * - Minimal code subset to do rdft of length 128. |
12 * - Optimizations because of known length. | 12 * - Optimizations because of known length. |
| 13 * - Removed the global variables by moving the code into a class in order |
| 14 * to make it thread-safe. |
13 * | 15 * |
14 * All changes are covered by the WebRTC license and IP grant: | 16 * All changes are covered by the WebRTC license and IP grant: |
15 * Use of this source code is governed by a BSD-style license | 17 * Use of this source code is governed by a BSD-style license |
16 * that can be found in the LICENSE file in the root of the source | 18 * that can be found in the LICENSE file in the root of the source |
17 * tree. An additional intellectual property rights grant can be found | 19 * tree. An additional intellectual property rights grant can be found |
18 * in the file PATENTS. All contributing project authors may | 20 * in the file PATENTS. All contributing project authors may |
19 * be found in the AUTHORS file in the root of the source tree. | 21 * be found in the AUTHORS file in the root of the source tree. |
20 */ | 22 */ |
21 | 23 |
22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 24 #include "webrtc/modules/audio_processing//utility/ooura_fft.h" |
23 | 25 |
24 #include <math.h> | 26 #include <math.h> |
25 | 27 |
| 28 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h" |
26 #include "webrtc/system_wrappers/include/cpu_features_wrapper.h" | 29 #include "webrtc/system_wrappers/include/cpu_features_wrapper.h" |
27 #include "webrtc/typedefs.h" | 30 #include "webrtc/typedefs.h" |
28 | 31 |
29 // These tables used to be computed at run-time. For example, refer to: | 32 namespace webrtc { |
30 // https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564 | |
31 // to see the initialization code. | |
32 const float rdft_w[64] = { | |
33 1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f, | |
34 0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f, | |
35 0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f, | |
36 0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f, | |
37 0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f, | |
38 0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f, | |
39 0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f, | |
40 0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f, | |
41 0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f, | |
42 0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f, | |
43 0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f, | |
44 0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f, | |
45 0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f, | |
46 0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f, | |
47 0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f, | |
48 0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f, | |
49 }; | |
50 const float rdft_wk3ri_first[16] = { | |
51 1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f, | |
52 0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f, | |
53 0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f, | |
54 0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f, | |
55 }; | |
56 const float rdft_wk3ri_second[16] = { | |
57 -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f, | |
58 -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f, | |
59 -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f, | |
60 -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f, | |
61 }; | |
62 ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = { | |
63 1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, | |
64 0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f, | |
65 0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f, | |
66 0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f, | |
67 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f, | |
68 0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, | |
69 0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f, | |
70 0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f, | |
71 }; | |
72 ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = { | |
73 1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f, | |
74 0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f, | |
75 0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f, | |
76 0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f, | |
77 0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f, | |
78 0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f, | |
79 0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f, | |
80 0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f, | |
81 }; | |
82 ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = { | |
83 1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f, | |
84 0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f, | |
85 0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f, | |
86 -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f, | |
87 0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f, | |
88 0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f, | |
89 0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f, | |
90 -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f, | |
91 }; | |
92 ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = { | |
93 -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, | |
94 -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f, | |
95 -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f, | |
96 -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f, | |
97 -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f, | |
98 -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f, | |
99 -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f, | |
100 -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f, | |
101 }; | |
102 ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = { | |
103 -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f, | |
104 -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f, | |
105 -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f, | |
106 -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f, | |
107 -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f, | |
108 -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, | |
109 -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f, | |
110 -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f, | |
111 }; | |
112 ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = { | |
113 -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, | |
114 -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f, | |
115 -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f, | |
116 -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f, | |
117 -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f, | |
118 -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f, | |
119 -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f, | |
120 -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f, | |
121 }; | |
122 ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = { | |
123 0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f, | |
124 }; | |
125 | 33 |
126 static void bitrv2_128_C(float* a) { | 34 namespace { |
127 /* | |
128 Following things have been attempted but are no faster: | |
129 (a) Storing the swap indexes in a LUT (index calculations are done | |
130 for 'free' while waiting on memory/L1). | |
131 (b) Consolidate the load/store of two consecutive floats by a 64 bit | |
132 integer (execution is memory/L1 bound). | |
133 (c) Do a mix of floats and 64 bit integer to maximize register | |
134 utilization (execution is memory/L1 bound). | |
135 (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5). | |
136 (e) Hard-coding of the offsets to completely eliminate index | |
137 calculations. | |
138 */ | |
139 | 35 |
140 unsigned int j, j1, k, k1; | 36 #if !(defined(MIPS_FPU_LE) || defined(WEBRTC_HAS_NEON)) |
141 float xr, xi, yr, yi; | |
142 | |
143 static const int ip[4] = {0, 64, 32, 96}; | |
144 for (k = 0; k < 4; k++) { | |
145 for (j = 0; j < k; j++) { | |
146 j1 = 2 * j + ip[k]; | |
147 k1 = 2 * k + ip[j]; | |
148 xr = a[j1 + 0]; | |
149 xi = a[j1 + 1]; | |
150 yr = a[k1 + 0]; | |
151 yi = a[k1 + 1]; | |
152 a[j1 + 0] = yr; | |
153 a[j1 + 1] = yi; | |
154 a[k1 + 0] = xr; | |
155 a[k1 + 1] = xi; | |
156 j1 += 8; | |
157 k1 += 16; | |
158 xr = a[j1 + 0]; | |
159 xi = a[j1 + 1]; | |
160 yr = a[k1 + 0]; | |
161 yi = a[k1 + 1]; | |
162 a[j1 + 0] = yr; | |
163 a[j1 + 1] = yi; | |
164 a[k1 + 0] = xr; | |
165 a[k1 + 1] = xi; | |
166 j1 += 8; | |
167 k1 -= 8; | |
168 xr = a[j1 + 0]; | |
169 xi = a[j1 + 1]; | |
170 yr = a[k1 + 0]; | |
171 yi = a[k1 + 1]; | |
172 a[j1 + 0] = yr; | |
173 a[j1 + 1] = yi; | |
174 a[k1 + 0] = xr; | |
175 a[k1 + 1] = xi; | |
176 j1 += 8; | |
177 k1 += 16; | |
178 xr = a[j1 + 0]; | |
179 xi = a[j1 + 1]; | |
180 yr = a[k1 + 0]; | |
181 yi = a[k1 + 1]; | |
182 a[j1 + 0] = yr; | |
183 a[j1 + 1] = yi; | |
184 a[k1 + 0] = xr; | |
185 a[k1 + 1] = xi; | |
186 } | |
187 j1 = 2 * k + 8 + ip[k]; | |
188 k1 = j1 + 8; | |
189 xr = a[j1 + 0]; | |
190 xi = a[j1 + 1]; | |
191 yr = a[k1 + 0]; | |
192 yi = a[k1 + 1]; | |
193 a[j1 + 0] = yr; | |
194 a[j1 + 1] = yi; | |
195 a[k1 + 0] = xr; | |
196 a[k1 + 1] = xi; | |
197 } | |
198 } | |
199 | |
200 static void cft1st_128_C(float* a) { | 37 static void cft1st_128_C(float* a) { |
201 const int n = 128; | 38 const int n = 128; |
202 int j, k1, k2; | 39 int j, k1, k2; |
203 float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; | 40 float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; |
204 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; | 41 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
205 | 42 |
206 // The processing of the first set of elements was simplified in C to avoid | 43 // The processing of the first set of elements was simplified in C to avoid |
207 // some operations (multiplication by zero or one, addition of two elements | 44 // some operations (multiplication by zero or one, addition of two elements |
208 // multiplied by the same weight, ...). | 45 // multiplied by the same weight, ...). |
209 x0r = a[0] + a[2]; | 46 x0r = a[0] + a[2]; |
(...skipping 214 matching lines...) |
424 a[j1 + 0] = wk1r * x0r - wk1i * x0i; | 261 a[j1 + 0] = wk1r * x0r - wk1i * x0i; |
425 a[j1 + 1] = wk1r * x0i + wk1i * x0r; | 262 a[j1 + 1] = wk1r * x0i + wk1i * x0r; |
426 x0r = x1r + x3i; | 263 x0r = x1r + x3i; |
427 x0i = x1i - x3r; | 264 x0i = x1i - x3r; |
428 a[j3 + 0] = wk3r * x0r - wk3i * x0i; | 265 a[j3 + 0] = wk3r * x0r - wk3i * x0i; |
429 a[j3 + 1] = wk3r * x0i + wk3i * x0r; | 266 a[j3 + 1] = wk3r * x0i + wk3i * x0r; |
430 } | 267 } |
431 } | 268 } |
432 } | 269 } |
433 | 270 |
434 static void cftfsub_128_C(float* a) { | |
435 int j, j1, j2, j3, l; | |
436 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; | |
437 | |
438 cft1st_128(a); | |
439 cftmdl_128(a); | |
440 l = 32; | |
441 for (j = 0; j < l; j += 2) { | |
442 j1 = j + l; | |
443 j2 = j1 + l; | |
444 j3 = j2 + l; | |
445 x0r = a[j] + a[j1]; | |
446 x0i = a[j + 1] + a[j1 + 1]; | |
447 x1r = a[j] - a[j1]; | |
448 x1i = a[j + 1] - a[j1 + 1]; | |
449 x2r = a[j2] + a[j3]; | |
450 x2i = a[j2 + 1] + a[j3 + 1]; | |
451 x3r = a[j2] - a[j3]; | |
452 x3i = a[j2 + 1] - a[j3 + 1]; | |
453 a[j] = x0r + x2r; | |
454 a[j + 1] = x0i + x2i; | |
455 a[j2] = x0r - x2r; | |
456 a[j2 + 1] = x0i - x2i; | |
457 a[j1] = x1r - x3i; | |
458 a[j1 + 1] = x1i + x3r; | |
459 a[j3] = x1r + x3i; | |
460 a[j3 + 1] = x1i - x3r; | |
461 } | |
462 } | |
463 | |
464 static void cftbsub_128_C(float* a) { | |
465 int j, j1, j2, j3, l; | |
466 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; | |
467 | |
468 cft1st_128(a); | |
469 cftmdl_128(a); | |
470 l = 32; | |
471 | |
472 for (j = 0; j < l; j += 2) { | |
473 j1 = j + l; | |
474 j2 = j1 + l; | |
475 j3 = j2 + l; | |
476 x0r = a[j] + a[j1]; | |
477 x0i = -a[j + 1] - a[j1 + 1]; | |
478 x1r = a[j] - a[j1]; | |
479 x1i = -a[j + 1] + a[j1 + 1]; | |
480 x2r = a[j2] + a[j3]; | |
481 x2i = a[j2 + 1] + a[j3 + 1]; | |
482 x3r = a[j2] - a[j3]; | |
483 x3i = a[j2 + 1] - a[j3 + 1]; | |
484 a[j] = x0r + x2r; | |
485 a[j + 1] = x0i - x2i; | |
486 a[j2] = x0r - x2r; | |
487 a[j2 + 1] = x0i + x2i; | |
488 a[j1] = x1r - x3i; | |
489 a[j1 + 1] = x1i - x3r; | |
490 a[j3] = x1r + x3i; | |
491 a[j3 + 1] = x1i + x3r; | |
492 } | |
493 } | |
494 | |
495 static void rftfsub_128_C(float* a) { | 271 static void rftfsub_128_C(float* a) { |
496 const float* c = rdft_w + 32; | 272 const float* c = rdft_w + 32; |
497 int j1, j2, k1, k2; | 273 int j1, j2, k1, k2; |
498 float wkr, wki, xr, xi, yr, yi; | 274 float wkr, wki, xr, xi, yr, yi; |
499 | 275 |
500 for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) { | 276 for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) { |
501 k2 = 128 - j2; | 277 k2 = 128 - j2; |
502 k1 = 32 - j1; | 278 k1 = 32 - j1; |
503 wkr = 0.5f - c[k1]; | 279 wkr = 0.5f - c[k1]; |
504 wki = c[j1]; | 280 wki = c[j1]; |
(...skipping 23 matching lines...) |
528 xi = a[j2 + 1] + a[k2 + 1]; | 304 xi = a[j2 + 1] + a[k2 + 1]; |
529 yr = wkr * xr + wki * xi; | 305 yr = wkr * xr + wki * xi; |
530 yi = wkr * xi - wki * xr; | 306 yi = wkr * xi - wki * xr; |
531 a[j2 + 0] = a[j2 + 0] - yr; | 307 a[j2 + 0] = a[j2 + 0] - yr; |
532 a[j2 + 1] = yi - a[j2 + 1]; | 308 a[j2 + 1] = yi - a[j2 + 1]; |
533 a[k2 + 0] = yr + a[k2 + 0]; | 309 a[k2 + 0] = yr + a[k2 + 0]; |
534 a[k2 + 1] = yi - a[k2 + 1]; | 310 a[k2 + 1] = yi - a[k2 + 1]; |
535 } | 311 } |
536 a[65] = -a[65]; | 312 a[65] = -a[65]; |
537 } | 313 } |
| 314 #endif |
538 | 315 |
539 void aec_rdft_forward_128(float* a) { | 316 |
| 317 } // namespace |
| 318 |
| 319 OouraFft::OouraFft() { |
| 320 #if defined(WEBRTC_ARCH_X86_FAMILY) |
| 321 use_sse2_ = (WebRtc_GetCPUInfo(kSSE2) != 0); |
| 322 #else |
| 323 use_sse2_ = false; |
| 324 #endif |
| 325 } |
| 326 |
| 327 OouraFft::~OouraFft() = default; |
| 328 |
| 329 void OouraFft::Fft(float* a) const { |
540 float xi; | 330 float xi; |
541 bitrv2_128(a); | 331 bitrv2_128(a); |
542 cftfsub_128(a); | 332 cftfsub_128(a); |
543 rftfsub_128(a); | 333 rftfsub_128(a); |
544 xi = a[0] - a[1]; | 334 xi = a[0] - a[1]; |
545 a[0] += a[1]; | 335 a[0] += a[1]; |
546 a[1] = xi; | 336 a[1] = xi; |
547 } | 337 } |
548 | 338 void OouraFft::InverseFft(float* a) const { |
549 void aec_rdft_inverse_128(float* a) { | |
550 a[1] = 0.5f * (a[0] - a[1]); | 339 a[1] = 0.5f * (a[0] - a[1]); |
551 a[0] -= a[1]; | 340 a[0] -= a[1]; |
552 rftbsub_128(a); | 341 rftbsub_128(a); |
553 bitrv2_128(a); | 342 bitrv2_128(a); |
554 cftbsub_128(a); | 343 cftbsub_128(a); |
555 } | 344 } |
556 | 345 |
557 // code path selection | 346 void OouraFft::cft1st_128(float* a) const { |
558 RftSub128 cft1st_128; | 347 #if defined(MIPS_FPU_LE) |
559 RftSub128 cftmdl_128; | 348 cft1st_128_mips(a); |
560 RftSub128 rftfsub_128; | 349 #elif defined(WEBRTC_HAS_NEON) |
561 RftSub128 rftbsub_128; | 350 cft1st_128_neon(a); |
562 RftSub128 cftfsub_128; | 351 #else |
563 RftSub128 cftbsub_128; | 352 if (use_sse2_) { |
564 RftSub128 bitrv2_128; | 353 cft1st_128_SSE2(a); |
565 | 354 } else { |
566 void aec_rdft_init(void) { | 355 cft1st_128_C(a); |
567 cft1st_128 = cft1st_128_C; | |
568 cftmdl_128 = cftmdl_128_C; | |
569 rftfsub_128 = rftfsub_128_C; | |
570 rftbsub_128 = rftbsub_128_C; | |
571 cftfsub_128 = cftfsub_128_C; | |
572 cftbsub_128 = cftbsub_128_C; | |
573 bitrv2_128 = bitrv2_128_C; | |
574 #if defined(WEBRTC_ARCH_X86_FAMILY) | |
575 if (WebRtc_GetCPUInfo(kSSE2)) { | |
576 aec_rdft_init_sse2(); | |
577 } | 356 } |
578 #endif | 357 #endif |
| 358 } |
| 359 void OouraFft::cftmdl_128(float* a) const { |
579 #if defined(MIPS_FPU_LE) | 360 #if defined(MIPS_FPU_LE) |
580 aec_rdft_init_mips(); | 361 cftmdl_128_mips(a); |
581 #endif | 362 #elif defined(WEBRTC_HAS_NEON) |
582 #if defined(WEBRTC_HAS_NEON) | 363 cftmdl_128_neon(a); |
583 aec_rdft_init_neon(); | 364 #else |
| 365 if (use_sse2_) { |
| 366 cftmdl_128_SSE2(a); |
| 367 } else { |
| 368 cftmdl_128_C(a); |
| 369 } |
584 #endif | 370 #endif |
585 } | 371 } |
| 372 void OouraFft::rftfsub_128(float* a) const { |
| 373 #if defined(MIPS_FPU_LE) |
| 374 rftfsub_128_mips(a); |
| 375 #elif defined(WEBRTC_HAS_NEON) |
| 376 rftfsub_128_neon(a); |
| 377 #else |
| 378 if (use_sse2_) { |
| 379 rftfsub_128_SSE2(a); |
| 380 } else { |
| 381 rftfsub_128_C(a); |
| 382 } |
| 383 #endif |
| 384 } |
| 385 |
| 386 void OouraFft::rftbsub_128(float* a) const { |
| 387 #if defined(MIPS_FPU_LE) |
| 388 rftbsub_128_mips(a); |
| 389 #elif defined(WEBRTC_HAS_NEON) |
| 390 rftbsub_128_neon(a); |
| 391 #else |
| 392 if (use_sse2_) { |
| 393 rftbsub_128_SSE2(a); |
| 394 } else { |
| 395 rftbsub_128_C(a); |
| 396 } |
| 397 #endif |
| 398 } |
| 399 |
| 400 void OouraFft::cftbsub_128(float* a) const { |
| 401 int j, j1, j2, j3, l; |
| 402 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
| 403 |
| 404 cft1st_128(a); |
| 405 cftmdl_128(a); |
| 406 l = 32; |
| 407 |
| 408 for (j = 0; j < l; j += 2) { |
| 409 j1 = j + l; |
| 410 j2 = j1 + l; |
| 411 j3 = j2 + l; |
| 412 x0r = a[j] + a[j1]; |
| 413 x0i = -a[j + 1] - a[j1 + 1]; |
| 414 x1r = a[j] - a[j1]; |
| 415 x1i = -a[j + 1] + a[j1 + 1]; |
| 416 x2r = a[j2] + a[j3]; |
| 417 x2i = a[j2 + 1] + a[j3 + 1]; |
| 418 x3r = a[j2] - a[j3]; |
| 419 x3i = a[j2 + 1] - a[j3 + 1]; |
| 420 a[j] = x0r + x2r; |
| 421 a[j + 1] = x0i - x2i; |
| 422 a[j2] = x0r - x2r; |
| 423 a[j2 + 1] = x0i + x2i; |
| 424 a[j1] = x1r - x3i; |
| 425 a[j1 + 1] = x1i - x3r; |
| 426 a[j3] = x1r + x3i; |
| 427 a[j3 + 1] = x1i + x3r; |
| 428 } |
| 429 } |
| 430 |
| 431 void OouraFft::cftfsub_128(float* a) const { |
| 432 int j, j1, j2, j3, l; |
| 433 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
| 434 |
| 435 cft1st_128(a); |
| 436 cftmdl_128(a); |
| 437 l = 32; |
| 438 for (j = 0; j < l; j += 2) { |
| 439 j1 = j + l; |
| 440 j2 = j1 + l; |
| 441 j3 = j2 + l; |
| 442 x0r = a[j] + a[j1]; |
| 443 x0i = a[j + 1] + a[j1 + 1]; |
| 444 x1r = a[j] - a[j1]; |
| 445 x1i = a[j + 1] - a[j1 + 1]; |
| 446 x2r = a[j2] + a[j3]; |
| 447 x2i = a[j2 + 1] + a[j3 + 1]; |
| 448 x3r = a[j2] - a[j3]; |
| 449 x3i = a[j2 + 1] - a[j3 + 1]; |
| 450 a[j] = x0r + x2r; |
| 451 a[j + 1] = x0i + x2i; |
| 452 a[j2] = x0r - x2r; |
| 453 a[j2 + 1] = x0i - x2i; |
| 454 a[j1] = x1r - x3i; |
| 455 a[j1 + 1] = x1i + x3r; |
| 456 a[j3] = x1r + x3i; |
| 457 a[j3 + 1] = x1i - x3r; |
| 458 } |
| 459 } |
| 460 |
| 461 void OouraFft::bitrv2_128(float* a) const { |
| 462 /* |
| 463 Following things have been attempted but are no faster: |
| 464 (a) Storing the swap indexes in a LUT (index calculations are done |
| 465 for 'free' while waiting on memory/L1). |
| 466 (b) Consolidate the load/store of two consecutive floats by a 64 bit |
| 467 integer (execution is memory/L1 bound). |
| 468 (c) Do a mix of floats and 64 bit integer to maximize register |
| 469 utilization (execution is memory/L1 bound). |
| 470 (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5). |
| 471 (e) Hard-coding of the offsets to completely eliminate index |
| 472 calculations. |
| 473 */ |
| 474 |
| 475 unsigned int j, j1, k, k1; |
| 476 float xr, xi, yr, yi; |
| 477 |
| 478 const int ip[4] = {0, 64, 32, 96}; |
| 479 for (k = 0; k < 4; k++) { |
| 480 for (j = 0; j < k; j++) { |
| 481 j1 = 2 * j + ip[k]; |
| 482 k1 = 2 * k + ip[j]; |
| 483 xr = a[j1 + 0]; |
| 484 xi = a[j1 + 1]; |
| 485 yr = a[k1 + 0]; |
| 486 yi = a[k1 + 1]; |
| 487 a[j1 + 0] = yr; |
| 488 a[j1 + 1] = yi; |
| 489 a[k1 + 0] = xr; |
| 490 a[k1 + 1] = xi; |
| 491 j1 += 8; |
| 492 k1 += 16; |
| 493 xr = a[j1 + 0]; |
| 494 xi = a[j1 + 1]; |
| 495 yr = a[k1 + 0]; |
| 496 yi = a[k1 + 1]; |
| 497 a[j1 + 0] = yr; |
| 498 a[j1 + 1] = yi; |
| 499 a[k1 + 0] = xr; |
| 500 a[k1 + 1] = xi; |
| 501 j1 += 8; |
| 502 k1 -= 8; |
| 503 xr = a[j1 + 0]; |
| 504 xi = a[j1 + 1]; |
| 505 yr = a[k1 + 0]; |
| 506 yi = a[k1 + 1]; |
| 507 a[j1 + 0] = yr; |
| 508 a[j1 + 1] = yi; |
| 509 a[k1 + 0] = xr; |
| 510 a[k1 + 1] = xi; |
| 511 j1 += 8; |
| 512 k1 += 16; |
| 513 xr = a[j1 + 0]; |
| 514 xi = a[j1 + 1]; |
| 515 yr = a[k1 + 0]; |
| 516 yi = a[k1 + 1]; |
| 517 a[j1 + 0] = yr; |
| 518 a[j1 + 1] = yi; |
| 519 a[k1 + 0] = xr; |
| 520 a[k1 + 1] = xi; |
| 521 } |
| 522 j1 = 2 * k + 8 + ip[k]; |
| 523 k1 = j1 + 8; |
| 524 xr = a[j1 + 0]; |
| 525 xi = a[j1 + 1]; |
| 526 yr = a[k1 + 0]; |
| 527 yi = a[k1 + 1]; |
| 528 a[j1 + 0] = yr; |
| 529 a[j1 + 1] = yi; |
| 530 a[k1 + 0] = xr; |
| 531 a[k1 + 1] = xi; |
| 532 } |
| 533 } |
| 534 |
| 535 } // namespace webrtc |
OLD | NEW |