Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(717)

Side by Side Diff: webrtc/modules/audio_processing/utility/ooura_fft.cc

Issue 2348213002: Move the aec_rdft* files to a more proper place beneath APM and make them thread-safe. (Closed)
Patch Set: Rebase Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html 2 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
3 * Copyright Takuya OOURA, 1996-2001 3 * Copyright Takuya OOURA, 1996-2001
4 * 4 *
5 * You may use, copy, modify and distribute this code for any purpose (include 5 * You may use, copy, modify and distribute this code for any purpose (include
6 * commercial use) and without fee. Please refer to this package when you modify 6 * commercial use) and without fee. Please refer to this package when you modify
7 * this code. 7 * this code.
8 * 8 *
9 * Changes by the WebRTC authors: 9 * Changes by the WebRTC authors:
10 * - Trivial type modifications. 10 * - Trivial type modifications.
11 * - Minimal code subset to do rdft of length 128. 11 * - Minimal code subset to do rdft of length 128.
12 * - Optimizations because of known length. 12 * - Optimizations because of known length.
13 * - Removed the global variables by moving the code into a class in order
14 * to make it thread-safe.
13 * 15 *
14 * All changes are covered by the WebRTC license and IP grant: 16 * All changes are covered by the WebRTC license and IP grant:
15 * Use of this source code is governed by a BSD-style license 17 * Use of this source code is governed by a BSD-style license
16 * that can be found in the LICENSE file in the root of the source 18 * that can be found in the LICENSE file in the root of the source
17 * tree. An additional intellectual property rights grant can be found 19 * tree. An additional intellectual property rights grant can be found
18 * in the file PATENTS. All contributing project authors may 20 * in the file PATENTS. All contributing project authors may
19 * be found in the AUTHORS file in the root of the source tree. 21 * be found in the AUTHORS file in the root of the source tree.
20 */ 22 */
21 23
22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" 24 #include "webrtc/modules/audio_processing//utility/ooura_fft.h"
23 25
24 #include <math.h> 26 #include <math.h>
25 27
28 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h"
26 #include "webrtc/system_wrappers/include/cpu_features_wrapper.h" 29 #include "webrtc/system_wrappers/include/cpu_features_wrapper.h"
27 #include "webrtc/typedefs.h" 30 #include "webrtc/typedefs.h"
28 31
29 // These tables used to be computed at run-time. For example, refer to: 32 namespace webrtc {
30 // https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564
31 // to see the initialization code.
32 const float rdft_w[64] = {
33 1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f,
34 0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f,
35 0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f,
36 0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f,
37 0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
38 0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f,
39 0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f,
40 0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f,
41 0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f,
42 0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
43 0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f,
44 0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f,
45 0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f,
46 0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f,
47 0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
48 0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
49 };
50 const float rdft_wk3ri_first[16] = {
51 1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
52 0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
53 0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
54 0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
55 };
56 const float rdft_wk3ri_second[16] = {
57 -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
58 -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
59 -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
60 -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
61 };
62 ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
63 1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
64 0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
65 0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
66 0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
67 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
68 0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
69 0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
70 0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
71 };
72 ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
73 1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f,
74 0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f,
75 0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f,
76 0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
77 0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
78 0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f,
79 0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f,
80 0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f,
81 };
82 ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
83 1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f,
84 0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
85 0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f,
86 -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
87 0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
88 0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f,
89 0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f,
90 -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
91 };
92 ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
93 -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
94 -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
95 -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
96 -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
97 -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
98 -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
99 -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
100 -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
101 };
102 ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
103 -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
104 -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
105 -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
106 -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
107 -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
108 -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
109 -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
110 -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
111 };
112 ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
113 -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
114 -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f,
115 -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
116 -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f,
117 -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
118 -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f,
119 -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f,
120 -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f,
121 };
122 ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
123 0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
124 };
125 33
126 static void bitrv2_128_C(float* a) { 34 namespace {
127 /*
128 Following things have been attempted but are no faster:
129 (a) Storing the swap indexes in a LUT (index calculations are done
130 for 'free' while waiting on memory/L1).
131 (b) Consolidate the load/store of two consecutive floats by a 64 bit
132 integer (execution is memory/L1 bound).
133 (c) Do a mix of floats and 64 bit integer to maximize register
134 utilization (execution is memory/L1 bound).
135 (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
136 (e) Hard-coding of the offsets to completely eliminate index
137 calculations.
138 */
139 35
140 unsigned int j, j1, k, k1; 36 #if !(defined(MIPS_FPU_LE) || defined(WEBRTC_HAS_NEON))
141 float xr, xi, yr, yi;
142
143 static const int ip[4] = {0, 64, 32, 96};
144 for (k = 0; k < 4; k++) {
145 for (j = 0; j < k; j++) {
146 j1 = 2 * j + ip[k];
147 k1 = 2 * k + ip[j];
148 xr = a[j1 + 0];
149 xi = a[j1 + 1];
150 yr = a[k1 + 0];
151 yi = a[k1 + 1];
152 a[j1 + 0] = yr;
153 a[j1 + 1] = yi;
154 a[k1 + 0] = xr;
155 a[k1 + 1] = xi;
156 j1 += 8;
157 k1 += 16;
158 xr = a[j1 + 0];
159 xi = a[j1 + 1];
160 yr = a[k1 + 0];
161 yi = a[k1 + 1];
162 a[j1 + 0] = yr;
163 a[j1 + 1] = yi;
164 a[k1 + 0] = xr;
165 a[k1 + 1] = xi;
166 j1 += 8;
167 k1 -= 8;
168 xr = a[j1 + 0];
169 xi = a[j1 + 1];
170 yr = a[k1 + 0];
171 yi = a[k1 + 1];
172 a[j1 + 0] = yr;
173 a[j1 + 1] = yi;
174 a[k1 + 0] = xr;
175 a[k1 + 1] = xi;
176 j1 += 8;
177 k1 += 16;
178 xr = a[j1 + 0];
179 xi = a[j1 + 1];
180 yr = a[k1 + 0];
181 yi = a[k1 + 1];
182 a[j1 + 0] = yr;
183 a[j1 + 1] = yi;
184 a[k1 + 0] = xr;
185 a[k1 + 1] = xi;
186 }
187 j1 = 2 * k + 8 + ip[k];
188 k1 = j1 + 8;
189 xr = a[j1 + 0];
190 xi = a[j1 + 1];
191 yr = a[k1 + 0];
192 yi = a[k1 + 1];
193 a[j1 + 0] = yr;
194 a[j1 + 1] = yi;
195 a[k1 + 0] = xr;
196 a[k1 + 1] = xi;
197 }
198 }
199
200 static void cft1st_128_C(float* a) { 37 static void cft1st_128_C(float* a) {
201 const int n = 128; 38 const int n = 128;
202 int j, k1, k2; 39 int j, k1, k2;
203 float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; 40 float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
204 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; 41 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
205 42
206 // The processing of the first set of elements was simplified in C to avoid 43 // The processing of the first set of elements was simplified in C to avoid
207 // some operations (multiplication by zero or one, addition of two elements 44 // some operations (multiplication by zero or one, addition of two elements
208 // multiplied by the same weight, ...). 45 // multiplied by the same weight, ...).
209 x0r = a[0] + a[2]; 46 x0r = a[0] + a[2];
(...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after
424 a[j1 + 0] = wk1r * x0r - wk1i * x0i; 261 a[j1 + 0] = wk1r * x0r - wk1i * x0i;
425 a[j1 + 1] = wk1r * x0i + wk1i * x0r; 262 a[j1 + 1] = wk1r * x0i + wk1i * x0r;
426 x0r = x1r + x3i; 263 x0r = x1r + x3i;
427 x0i = x1i - x3r; 264 x0i = x1i - x3r;
428 a[j3 + 0] = wk3r * x0r - wk3i * x0i; 265 a[j3 + 0] = wk3r * x0r - wk3i * x0i;
429 a[j3 + 1] = wk3r * x0i + wk3i * x0r; 266 a[j3 + 1] = wk3r * x0i + wk3i * x0r;
430 } 267 }
431 } 268 }
432 } 269 }
433 270
434 static void cftfsub_128_C(float* a) {
435 int j, j1, j2, j3, l;
436 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
437
438 cft1st_128(a);
439 cftmdl_128(a);
440 l = 32;
441 for (j = 0; j < l; j += 2) {
442 j1 = j + l;
443 j2 = j1 + l;
444 j3 = j2 + l;
445 x0r = a[j] + a[j1];
446 x0i = a[j + 1] + a[j1 + 1];
447 x1r = a[j] - a[j1];
448 x1i = a[j + 1] - a[j1 + 1];
449 x2r = a[j2] + a[j3];
450 x2i = a[j2 + 1] + a[j3 + 1];
451 x3r = a[j2] - a[j3];
452 x3i = a[j2 + 1] - a[j3 + 1];
453 a[j] = x0r + x2r;
454 a[j + 1] = x0i + x2i;
455 a[j2] = x0r - x2r;
456 a[j2 + 1] = x0i - x2i;
457 a[j1] = x1r - x3i;
458 a[j1 + 1] = x1i + x3r;
459 a[j3] = x1r + x3i;
460 a[j3 + 1] = x1i - x3r;
461 }
462 }
463
464 static void cftbsub_128_C(float* a) {
465 int j, j1, j2, j3, l;
466 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
467
468 cft1st_128(a);
469 cftmdl_128(a);
470 l = 32;
471
472 for (j = 0; j < l; j += 2) {
473 j1 = j + l;
474 j2 = j1 + l;
475 j3 = j2 + l;
476 x0r = a[j] + a[j1];
477 x0i = -a[j + 1] - a[j1 + 1];
478 x1r = a[j] - a[j1];
479 x1i = -a[j + 1] + a[j1 + 1];
480 x2r = a[j2] + a[j3];
481 x2i = a[j2 + 1] + a[j3 + 1];
482 x3r = a[j2] - a[j3];
483 x3i = a[j2 + 1] - a[j3 + 1];
484 a[j] = x0r + x2r;
485 a[j + 1] = x0i - x2i;
486 a[j2] = x0r - x2r;
487 a[j2 + 1] = x0i + x2i;
488 a[j1] = x1r - x3i;
489 a[j1 + 1] = x1i - x3r;
490 a[j3] = x1r + x3i;
491 a[j3 + 1] = x1i + x3r;
492 }
493 }
494
495 static void rftfsub_128_C(float* a) { 271 static void rftfsub_128_C(float* a) {
496 const float* c = rdft_w + 32; 272 const float* c = rdft_w + 32;
497 int j1, j2, k1, k2; 273 int j1, j2, k1, k2;
498 float wkr, wki, xr, xi, yr, yi; 274 float wkr, wki, xr, xi, yr, yi;
499 275
500 for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) { 276 for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
501 k2 = 128 - j2; 277 k2 = 128 - j2;
502 k1 = 32 - j1; 278 k1 = 32 - j1;
503 wkr = 0.5f - c[k1]; 279 wkr = 0.5f - c[k1];
504 wki = c[j1]; 280 wki = c[j1];
(...skipping 23 matching lines...) Expand all
528 xi = a[j2 + 1] + a[k2 + 1]; 304 xi = a[j2 + 1] + a[k2 + 1];
529 yr = wkr * xr + wki * xi; 305 yr = wkr * xr + wki * xi;
530 yi = wkr * xi - wki * xr; 306 yi = wkr * xi - wki * xr;
531 a[j2 + 0] = a[j2 + 0] - yr; 307 a[j2 + 0] = a[j2 + 0] - yr;
532 a[j2 + 1] = yi - a[j2 + 1]; 308 a[j2 + 1] = yi - a[j2 + 1];
533 a[k2 + 0] = yr + a[k2 + 0]; 309 a[k2 + 0] = yr + a[k2 + 0];
534 a[k2 + 1] = yi - a[k2 + 1]; 310 a[k2 + 1] = yi - a[k2 + 1];
535 } 311 }
536 a[65] = -a[65]; 312 a[65] = -a[65];
537 } 313 }
314 #endif
538 315
539 void aec_rdft_forward_128(float* a) { 316
317 } // namespace
318
319 OouraFft::OouraFft() {
320 #if defined(WEBRTC_ARCH_X86_FAMILY)
321 use_sse2_ = (WebRtc_GetCPUInfo(kSSE2) != 0);
322 #else
323 use_sse2_ = false;
324 #endif
325 }
326
327 OouraFft::~OouraFft() = default;
328
329 void OouraFft::Fft(float* a) const {
540 float xi; 330 float xi;
541 bitrv2_128(a); 331 bitrv2_128(a);
542 cftfsub_128(a); 332 cftfsub_128(a);
543 rftfsub_128(a); 333 rftfsub_128(a);
544 xi = a[0] - a[1]; 334 xi = a[0] - a[1];
545 a[0] += a[1]; 335 a[0] += a[1];
546 a[1] = xi; 336 a[1] = xi;
547 } 337 }
548 338 void OouraFft::InverseFft(float* a) const {
549 void aec_rdft_inverse_128(float* a) {
550 a[1] = 0.5f * (a[0] - a[1]); 339 a[1] = 0.5f * (a[0] - a[1]);
551 a[0] -= a[1]; 340 a[0] -= a[1];
552 rftbsub_128(a); 341 rftbsub_128(a);
553 bitrv2_128(a); 342 bitrv2_128(a);
554 cftbsub_128(a); 343 cftbsub_128(a);
555 } 344 }
556 345
557 // code path selection 346 void OouraFft::cft1st_128(float* a) const {
558 RftSub128 cft1st_128; 347 #if defined(MIPS_FPU_LE)
559 RftSub128 cftmdl_128; 348 cft1st_128_mips(a);
560 RftSub128 rftfsub_128; 349 #elif defined(WEBRTC_HAS_NEON)
561 RftSub128 rftbsub_128; 350 cft1st_128_neon(a);
562 RftSub128 cftfsub_128; 351 #else
563 RftSub128 cftbsub_128; 352 if (use_sse2_) {
564 RftSub128 bitrv2_128; 353 cft1st_128_SSE2(a);
565 354 } else {
566 void aec_rdft_init(void) { 355 cft1st_128_C(a);
567 cft1st_128 = cft1st_128_C;
568 cftmdl_128 = cftmdl_128_C;
569 rftfsub_128 = rftfsub_128_C;
570 rftbsub_128 = rftbsub_128_C;
571 cftfsub_128 = cftfsub_128_C;
572 cftbsub_128 = cftbsub_128_C;
573 bitrv2_128 = bitrv2_128_C;
574 #if defined(WEBRTC_ARCH_X86_FAMILY)
575 if (WebRtc_GetCPUInfo(kSSE2)) {
576 aec_rdft_init_sse2();
577 } 356 }
578 #endif 357 #endif
358 }
359 void OouraFft::cftmdl_128(float* a) const {
579 #if defined(MIPS_FPU_LE) 360 #if defined(MIPS_FPU_LE)
580 aec_rdft_init_mips(); 361 cftmdl_128_mips(a);
581 #endif 362 #elif defined(WEBRTC_HAS_NEON)
582 #if defined(WEBRTC_HAS_NEON) 363 cftmdl_128_neon(a);
583 aec_rdft_init_neon(); 364 #else
365 if (use_sse2_) {
366 cftmdl_128_SSE2(a);
367 } else {
368 cftmdl_128_C(a);
369 }
584 #endif 370 #endif
585 } 371 }
372 void OouraFft::rftfsub_128(float* a) const {
373 #if defined(MIPS_FPU_LE)
374 rftfsub_128_mips(a);
375 #elif defined(WEBRTC_HAS_NEON)
376 rftfsub_128_neon(a);
377 #else
378 if (use_sse2_) {
379 rftfsub_128_SSE2(a);
380 } else {
381 rftfsub_128_C(a);
382 }
383 #endif
384 }
385
386 void OouraFft::rftbsub_128(float* a) const {
387 #if defined(MIPS_FPU_LE)
388 rftbsub_128_mips(a);
389 #elif defined(WEBRTC_HAS_NEON)
390 rftbsub_128_neon(a);
391 #else
392 if (use_sse2_) {
393 rftbsub_128_SSE2(a);
394 } else {
395 rftbsub_128_C(a);
396 }
397 #endif
398 }
399
400 void OouraFft::cftbsub_128(float* a) const {
401 int j, j1, j2, j3, l;
402 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
403
404 cft1st_128(a);
405 cftmdl_128(a);
406 l = 32;
407
408 for (j = 0; j < l; j += 2) {
409 j1 = j + l;
410 j2 = j1 + l;
411 j3 = j2 + l;
412 x0r = a[j] + a[j1];
413 x0i = -a[j + 1] - a[j1 + 1];
414 x1r = a[j] - a[j1];
415 x1i = -a[j + 1] + a[j1 + 1];
416 x2r = a[j2] + a[j3];
417 x2i = a[j2 + 1] + a[j3 + 1];
418 x3r = a[j2] - a[j3];
419 x3i = a[j2 + 1] - a[j3 + 1];
420 a[j] = x0r + x2r;
421 a[j + 1] = x0i - x2i;
422 a[j2] = x0r - x2r;
423 a[j2 + 1] = x0i + x2i;
424 a[j1] = x1r - x3i;
425 a[j1 + 1] = x1i - x3r;
426 a[j3] = x1r + x3i;
427 a[j3 + 1] = x1i + x3r;
428 }
429 }
430
431 void OouraFft::cftfsub_128(float* a) const {
432 int j, j1, j2, j3, l;
433 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
434
435 cft1st_128(a);
436 cftmdl_128(a);
437 l = 32;
438 for (j = 0; j < l; j += 2) {
439 j1 = j + l;
440 j2 = j1 + l;
441 j3 = j2 + l;
442 x0r = a[j] + a[j1];
443 x0i = a[j + 1] + a[j1 + 1];
444 x1r = a[j] - a[j1];
445 x1i = a[j + 1] - a[j1 + 1];
446 x2r = a[j2] + a[j3];
447 x2i = a[j2 + 1] + a[j3 + 1];
448 x3r = a[j2] - a[j3];
449 x3i = a[j2 + 1] - a[j3 + 1];
450 a[j] = x0r + x2r;
451 a[j + 1] = x0i + x2i;
452 a[j2] = x0r - x2r;
453 a[j2 + 1] = x0i - x2i;
454 a[j1] = x1r - x3i;
455 a[j1 + 1] = x1i + x3r;
456 a[j3] = x1r + x3i;
457 a[j3 + 1] = x1i - x3r;
458 }
459 }
460
461 void OouraFft::bitrv2_128(float* a) const {
462 /*
463 Following things have been attempted but are no faster:
464 (a) Storing the swap indexes in a LUT (index calculations are done
465 for 'free' while waiting on memory/L1).
466 (b) Consolidate the load/store of two consecutive floats by a 64 bit
467 integer (execution is memory/L1 bound).
468 (c) Do a mix of floats and 64 bit integer to maximize register
469 utilization (execution is memory/L1 bound).
470 (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
471 (e) Hard-coding of the offsets to completely eliminate index
472 calculations.
473 */
474
475 unsigned int j, j1, k, k1;
476 float xr, xi, yr, yi;
477
478 const int ip[4] = {0, 64, 32, 96};
479 for (k = 0; k < 4; k++) {
480 for (j = 0; j < k; j++) {
481 j1 = 2 * j + ip[k];
482 k1 = 2 * k + ip[j];
483 xr = a[j1 + 0];
484 xi = a[j1 + 1];
485 yr = a[k1 + 0];
486 yi = a[k1 + 1];
487 a[j1 + 0] = yr;
488 a[j1 + 1] = yi;
489 a[k1 + 0] = xr;
490 a[k1 + 1] = xi;
491 j1 += 8;
492 k1 += 16;
493 xr = a[j1 + 0];
494 xi = a[j1 + 1];
495 yr = a[k1 + 0];
496 yi = a[k1 + 1];
497 a[j1 + 0] = yr;
498 a[j1 + 1] = yi;
499 a[k1 + 0] = xr;
500 a[k1 + 1] = xi;
501 j1 += 8;
502 k1 -= 8;
503 xr = a[j1 + 0];
504 xi = a[j1 + 1];
505 yr = a[k1 + 0];
506 yi = a[k1 + 1];
507 a[j1 + 0] = yr;
508 a[j1 + 1] = yi;
509 a[k1 + 0] = xr;
510 a[k1 + 1] = xi;
511 j1 += 8;
512 k1 += 16;
513 xr = a[j1 + 0];
514 xi = a[j1 + 1];
515 yr = a[k1 + 0];
516 yi = a[k1 + 1];
517 a[j1 + 0] = yr;
518 a[j1 + 1] = yi;
519 a[k1 + 0] = xr;
520 a[k1 + 1] = xi;
521 }
522 j1 = 2 * k + 8 + ip[k];
523 k1 = j1 + 8;
524 xr = a[j1 + 0];
525 xi = a[j1 + 1];
526 yr = a[k1 + 0];
527 yi = a[k1 + 1];
528 a[j1 + 0] = yr;
529 a[j1 + 1] = yi;
530 a[k1 + 0] = xr;
531 a[k1 + 1] = xi;
532 }
533 }
534
535 } // namespace webrtc
OLDNEW
« no previous file with comments | « webrtc/modules/audio_processing/utility/ooura_fft.h ('k') | webrtc/modules/audio_processing/utility/ooura_fft_mips.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698