| OLD | NEW |
| 1 /* | 1 /* |
| 2 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html | 2 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html |
| 3 * Copyright Takuya OOURA, 1996-2001 | 3 * Copyright Takuya OOURA, 1996-2001 |
| 4 * | 4 * |
| 5 * You may use, copy, modify and distribute this code for any purpose (include | 5 * You may use, copy, modify and distribute this code for any purpose (include |
| 6 * commercial use) and without fee. Please refer to this package when you modify | 6 * commercial use) and without fee. Please refer to this package when you modify |
| 7 * this code. | 7 * this code. |
| 8 * | 8 * |
| 9 * Changes by the WebRTC authors: | 9 * Changes by the WebRTC authors: |
| 10 * - Trivial type modifications. | 10 * - Trivial type modifications. |
| 11 * - Minimal code subset to do rdft of length 128. | 11 * - Minimal code subset to do rdft of length 128. |
| 12 * - Optimizations because of known length. | 12 * - Optimizations because of known length. |
| 13 * - Removed the global variables by moving the code into a class in order |
| 14 * to make it thread safe. |
| 13 * | 15 * |
| 14 * All changes are covered by the WebRTC license and IP grant: | 16 * All changes are covered by the WebRTC license and IP grant: |
| 15 * Use of this source code is governed by a BSD-style license | 17 * Use of this source code is governed by a BSD-style license |
| 16 * that can be found in the LICENSE file in the root of the source | 18 * that can be found in the LICENSE file in the root of the source |
| 17 * tree. An additional intellectual property rights grant can be found | 19 * tree. An additional intellectual property rights grant can be found |
| 18 * in the file PATENTS. All contributing project authors may | 20 * in the file PATENTS. All contributing project authors may |
| 19 * be found in the AUTHORS file in the root of the source tree. | 21 * be found in the AUTHORS file in the root of the source tree. |
| 20 */ | 22 */ |
| 21 | 23 |
| 22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 24 #include "webrtc/modules/audio_processing//utility/ooura_fft.h" |
| 23 | 25 |
| 24 #include <math.h> | 26 #include <math.h> |
| 25 | 27 |
| 28 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h" |
| 26 #include "webrtc/system_wrappers/include/cpu_features_wrapper.h" | 29 #include "webrtc/system_wrappers/include/cpu_features_wrapper.h" |
| 27 #include "webrtc/typedefs.h" | 30 #include "webrtc/typedefs.h" |
| 28 | 31 |
| 29 // These tables used to be computed at run-time. For example, refer to: | 32 namespace webrtc { |
| 30 // https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564 | |
| 31 // to see the initialization code. | |
| 32 const float rdft_w[64] = { | |
| 33 1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f, | |
| 34 0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f, | |
| 35 0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f, | |
| 36 0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f, | |
| 37 0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f, | |
| 38 0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f, | |
| 39 0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f, | |
| 40 0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f, | |
| 41 0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f, | |
| 42 0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f, | |
| 43 0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f, | |
| 44 0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f, | |
| 45 0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f, | |
| 46 0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f, | |
| 47 0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f, | |
| 48 0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f, | |
| 49 }; | |
| 50 const float rdft_wk3ri_first[16] = { | |
| 51 1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f, | |
| 52 0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f, | |
| 53 0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f, | |
| 54 0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f, | |
| 55 }; | |
| 56 const float rdft_wk3ri_second[16] = { | |
| 57 -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f, | |
| 58 -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f, | |
| 59 -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f, | |
| 60 -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f, | |
| 61 }; | |
| 62 ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = { | |
| 63 1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, | |
| 64 0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f, | |
| 65 0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f, | |
| 66 0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f, | |
| 67 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f, | |
| 68 0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, | |
| 69 0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f, | |
| 70 0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f, | |
| 71 }; | |
| 72 ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = { | |
| 73 1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f, | |
| 74 0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f, | |
| 75 0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f, | |
| 76 0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f, | |
| 77 0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f, | |
| 78 0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f, | |
| 79 0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f, | |
| 80 0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f, | |
| 81 }; | |
| 82 ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = { | |
| 83 1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f, | |
| 84 0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f, | |
| 85 0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f, | |
| 86 -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f, | |
| 87 0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f, | |
| 88 0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f, | |
| 89 0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f, | |
| 90 -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f, | |
| 91 }; | |
| 92 ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = { | |
| 93 -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, | |
| 94 -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f, | |
| 95 -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f, | |
| 96 -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f, | |
| 97 -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f, | |
| 98 -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f, | |
| 99 -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f, | |
| 100 -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f, | |
| 101 }; | |
| 102 ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = { | |
| 103 -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f, | |
| 104 -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f, | |
| 105 -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f, | |
| 106 -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f, | |
| 107 -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f, | |
| 108 -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, | |
| 109 -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f, | |
| 110 -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f, | |
| 111 }; | |
| 112 ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = { | |
| 113 -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, | |
| 114 -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f, | |
| 115 -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f, | |
| 116 -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f, | |
| 117 -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f, | |
| 118 -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f, | |
| 119 -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f, | |
| 120 -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f, | |
| 121 }; | |
| 122 ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = { | |
| 123 0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f, | |
| 124 }; | |
| 125 | 33 |
| 126 static void bitrv2_128_C(float* a) { | 34 namespace { |
| 127 /* | |
| 128 Following things have been attempted but are no faster: | |
| 129 (a) Storing the swap indexes in a LUT (index calculations are done | |
| 130 for 'free' while waiting on memory/L1). | |
| 131 (b) Consolidate the load/store of two consecutive floats by a 64 bit | |
| 132 integer (execution is memory/L1 bound). | |
| 133 (c) Do a mix of floats and 64 bit integer to maximize register | |
| 134 utilization (execution is memory/L1 bound). | |
| 135 (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5). | |
| 136 (e) Hard-coding of the offsets to completely eliminate index | |
| 137 calculations. | |
| 138 */ | |
| 139 | 35 |
| 140 unsigned int j, j1, k, k1; | 36 #if !(defined(MIPS_FPU_LE) || defined(WEBRTC_HAS_NEON)) |
| 141 float xr, xi, yr, yi; | |
| 142 | |
| 143 static const int ip[4] = {0, 64, 32, 96}; | |
| 144 for (k = 0; k < 4; k++) { | |
| 145 for (j = 0; j < k; j++) { | |
| 146 j1 = 2 * j + ip[k]; | |
| 147 k1 = 2 * k + ip[j]; | |
| 148 xr = a[j1 + 0]; | |
| 149 xi = a[j1 + 1]; | |
| 150 yr = a[k1 + 0]; | |
| 151 yi = a[k1 + 1]; | |
| 152 a[j1 + 0] = yr; | |
| 153 a[j1 + 1] = yi; | |
| 154 a[k1 + 0] = xr; | |
| 155 a[k1 + 1] = xi; | |
| 156 j1 += 8; | |
| 157 k1 += 16; | |
| 158 xr = a[j1 + 0]; | |
| 159 xi = a[j1 + 1]; | |
| 160 yr = a[k1 + 0]; | |
| 161 yi = a[k1 + 1]; | |
| 162 a[j1 + 0] = yr; | |
| 163 a[j1 + 1] = yi; | |
| 164 a[k1 + 0] = xr; | |
| 165 a[k1 + 1] = xi; | |
| 166 j1 += 8; | |
| 167 k1 -= 8; | |
| 168 xr = a[j1 + 0]; | |
| 169 xi = a[j1 + 1]; | |
| 170 yr = a[k1 + 0]; | |
| 171 yi = a[k1 + 1]; | |
| 172 a[j1 + 0] = yr; | |
| 173 a[j1 + 1] = yi; | |
| 174 a[k1 + 0] = xr; | |
| 175 a[k1 + 1] = xi; | |
| 176 j1 += 8; | |
| 177 k1 += 16; | |
| 178 xr = a[j1 + 0]; | |
| 179 xi = a[j1 + 1]; | |
| 180 yr = a[k1 + 0]; | |
| 181 yi = a[k1 + 1]; | |
| 182 a[j1 + 0] = yr; | |
| 183 a[j1 + 1] = yi; | |
| 184 a[k1 + 0] = xr; | |
| 185 a[k1 + 1] = xi; | |
| 186 } | |
| 187 j1 = 2 * k + 8 + ip[k]; | |
| 188 k1 = j1 + 8; | |
| 189 xr = a[j1 + 0]; | |
| 190 xi = a[j1 + 1]; | |
| 191 yr = a[k1 + 0]; | |
| 192 yi = a[k1 + 1]; | |
| 193 a[j1 + 0] = yr; | |
| 194 a[j1 + 1] = yi; | |
| 195 a[k1 + 0] = xr; | |
| 196 a[k1 + 1] = xi; | |
| 197 } | |
| 198 } | |
| 199 | |
| 200 static void cft1st_128_C(float* a) { | 37 static void cft1st_128_C(float* a) { |
| 201 const int n = 128; | 38 const int n = 128; |
| 202 int j, k1, k2; | 39 int j, k1, k2; |
| 203 float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; | 40 float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; |
| 204 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; | 41 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
| 205 | 42 |
| 206 // The processing of the first set of elements was simplified in C to avoid | 43 // The processing of the first set of elements was simplified in C to avoid |
| 207 // some operations (multiplication by zero or one, addition of two elements | 44 // some operations (multiplication by zero or one, addition of two elements |
| 208 // multiplied by the same weight, ...). | 45 // multiplied by the same weight, ...). |
| 209 x0r = a[0] + a[2]; | 46 x0r = a[0] + a[2]; |
| (...skipping 214 matching lines...) |
| 424 a[j1 + 0] = wk1r * x0r - wk1i * x0i; | 261 a[j1 + 0] = wk1r * x0r - wk1i * x0i; |
| 425 a[j1 + 1] = wk1r * x0i + wk1i * x0r; | 262 a[j1 + 1] = wk1r * x0i + wk1i * x0r; |
| 426 x0r = x1r + x3i; | 263 x0r = x1r + x3i; |
| 427 x0i = x1i - x3r; | 264 x0i = x1i - x3r; |
| 428 a[j3 + 0] = wk3r * x0r - wk3i * x0i; | 265 a[j3 + 0] = wk3r * x0r - wk3i * x0i; |
| 429 a[j3 + 1] = wk3r * x0i + wk3i * x0r; | 266 a[j3 + 1] = wk3r * x0i + wk3i * x0r; |
| 430 } | 267 } |
| 431 } | 268 } |
| 432 } | 269 } |
| 433 | 270 |
| 434 static void cftfsub_128_C(float* a) { | |
| 435 int j, j1, j2, j3, l; | |
| 436 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; | |
| 437 | |
| 438 cft1st_128(a); | |
| 439 cftmdl_128(a); | |
| 440 l = 32; | |
| 441 for (j = 0; j < l; j += 2) { | |
| 442 j1 = j + l; | |
| 443 j2 = j1 + l; | |
| 444 j3 = j2 + l; | |
| 445 x0r = a[j] + a[j1]; | |
| 446 x0i = a[j + 1] + a[j1 + 1]; | |
| 447 x1r = a[j] - a[j1]; | |
| 448 x1i = a[j + 1] - a[j1 + 1]; | |
| 449 x2r = a[j2] + a[j3]; | |
| 450 x2i = a[j2 + 1] + a[j3 + 1]; | |
| 451 x3r = a[j2] - a[j3]; | |
| 452 x3i = a[j2 + 1] - a[j3 + 1]; | |
| 453 a[j] = x0r + x2r; | |
| 454 a[j + 1] = x0i + x2i; | |
| 455 a[j2] = x0r - x2r; | |
| 456 a[j2 + 1] = x0i - x2i; | |
| 457 a[j1] = x1r - x3i; | |
| 458 a[j1 + 1] = x1i + x3r; | |
| 459 a[j3] = x1r + x3i; | |
| 460 a[j3 + 1] = x1i - x3r; | |
| 461 } | |
| 462 } | |
| 463 | |
| 464 static void cftbsub_128_C(float* a) { | |
| 465 int j, j1, j2, j3, l; | |
| 466 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; | |
| 467 | |
| 468 cft1st_128(a); | |
| 469 cftmdl_128(a); | |
| 470 l = 32; | |
| 471 | |
| 472 for (j = 0; j < l; j += 2) { | |
| 473 j1 = j + l; | |
| 474 j2 = j1 + l; | |
| 475 j3 = j2 + l; | |
| 476 x0r = a[j] + a[j1]; | |
| 477 x0i = -a[j + 1] - a[j1 + 1]; | |
| 478 x1r = a[j] - a[j1]; | |
| 479 x1i = -a[j + 1] + a[j1 + 1]; | |
| 480 x2r = a[j2] + a[j3]; | |
| 481 x2i = a[j2 + 1] + a[j3 + 1]; | |
| 482 x3r = a[j2] - a[j3]; | |
| 483 x3i = a[j2 + 1] - a[j3 + 1]; | |
| 484 a[j] = x0r + x2r; | |
| 485 a[j + 1] = x0i - x2i; | |
| 486 a[j2] = x0r - x2r; | |
| 487 a[j2 + 1] = x0i + x2i; | |
| 488 a[j1] = x1r - x3i; | |
| 489 a[j1 + 1] = x1i - x3r; | |
| 490 a[j3] = x1r + x3i; | |
| 491 a[j3 + 1] = x1i + x3r; | |
| 492 } | |
| 493 } | |
| 494 | |
| 495 static void rftfsub_128_C(float* a) { | 271 static void rftfsub_128_C(float* a) { |
| 496 const float* c = rdft_w + 32; | 272 const float* c = rdft_w + 32; |
| 497 int j1, j2, k1, k2; | 273 int j1, j2, k1, k2; |
| 498 float wkr, wki, xr, xi, yr, yi; | 274 float wkr, wki, xr, xi, yr, yi; |
| 499 | 275 |
| 500 for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) { | 276 for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) { |
| 501 k2 = 128 - j2; | 277 k2 = 128 - j2; |
| 502 k1 = 32 - j1; | 278 k1 = 32 - j1; |
| 503 wkr = 0.5f - c[k1]; | 279 wkr = 0.5f - c[k1]; |
| 504 wki = c[j1]; | 280 wki = c[j1]; |
| (...skipping 23 matching lines...) |
| 528 xi = a[j2 + 1] + a[k2 + 1]; | 304 xi = a[j2 + 1] + a[k2 + 1]; |
| 529 yr = wkr * xr + wki * xi; | 305 yr = wkr * xr + wki * xi; |
| 530 yi = wkr * xi - wki * xr; | 306 yi = wkr * xi - wki * xr; |
| 531 a[j2 + 0] = a[j2 + 0] - yr; | 307 a[j2 + 0] = a[j2 + 0] - yr; |
| 532 a[j2 + 1] = yi - a[j2 + 1]; | 308 a[j2 + 1] = yi - a[j2 + 1]; |
| 533 a[k2 + 0] = yr + a[k2 + 0]; | 309 a[k2 + 0] = yr + a[k2 + 0]; |
| 534 a[k2 + 1] = yi - a[k2 + 1]; | 310 a[k2 + 1] = yi - a[k2 + 1]; |
| 535 } | 311 } |
| 536 a[65] = -a[65]; | 312 a[65] = -a[65]; |
| 537 } | 313 } |
| 314 #endif |
| 538 | 315 |
| 539 void aec_rdft_forward_128(float* a) { | 316 |
| 317 } // namespace |
| 318 |
| 319 OouraFft::OouraFft() { |
| 320 #if defined(WEBRTC_ARCH_X86_FAMILY) |
| 321 use_sse2_ = (WebRtc_GetCPUInfo(kSSE2) != 0); |
| 322 #else |
| 323 use_sse2_ = false; |
| 324 #endif |
| 325 } |
| 326 |
| 327 OouraFft::~OouraFft() = default; |
| 328 |
| 329 void OouraFft::Fft(float* a) const { |
| 540 float xi; | 330 float xi; |
| 541 bitrv2_128(a); | 331 bitrv2_128(a); |
| 542 cftfsub_128(a); | 332 cftfsub_128(a); |
| 543 rftfsub_128(a); | 333 rftfsub_128(a); |
| 544 xi = a[0] - a[1]; | 334 xi = a[0] - a[1]; |
| 545 a[0] += a[1]; | 335 a[0] += a[1]; |
| 546 a[1] = xi; | 336 a[1] = xi; |
| 547 } | 337 } |
| 548 | 338 void OouraFft::InverseFft(float* a) const { |
| 549 void aec_rdft_inverse_128(float* a) { | |
| 550 a[1] = 0.5f * (a[0] - a[1]); | 339 a[1] = 0.5f * (a[0] - a[1]); |
| 551 a[0] -= a[1]; | 340 a[0] -= a[1]; |
| 552 rftbsub_128(a); | 341 rftbsub_128(a); |
| 553 bitrv2_128(a); | 342 bitrv2_128(a); |
| 554 cftbsub_128(a); | 343 cftbsub_128(a); |
| 555 } | 344 } |
| 556 | 345 |
| 557 // code path selection | 346 void OouraFft::cft1st_128(float* a) const { |
| 558 RftSub128 cft1st_128; | 347 #if defined(MIPS_FPU_LE) |
| 559 RftSub128 cftmdl_128; | 348 cft1st_128_mips(a); |
| 560 RftSub128 rftfsub_128; | 349 #elif defined(WEBRTC_HAS_NEON) |
| 561 RftSub128 rftbsub_128; | 350 cft1st_128_neon(a); |
| 562 RftSub128 cftfsub_128; | 351 #else |
| 563 RftSub128 cftbsub_128; | 352 if (use_sse2_) { |
| 564 RftSub128 bitrv2_128; | 353 cft1st_128_SSE2(a); |
| 565 | 354 } else { |
| 566 void aec_rdft_init(void) { | 355 cft1st_128_C(a); |
| 567 cft1st_128 = cft1st_128_C; | |
| 568 cftmdl_128 = cftmdl_128_C; | |
| 569 rftfsub_128 = rftfsub_128_C; | |
| 570 rftbsub_128 = rftbsub_128_C; | |
| 571 cftfsub_128 = cftfsub_128_C; | |
| 572 cftbsub_128 = cftbsub_128_C; | |
| 573 bitrv2_128 = bitrv2_128_C; | |
| 574 #if defined(WEBRTC_ARCH_X86_FAMILY) | |
| 575 if (WebRtc_GetCPUInfo(kSSE2)) { | |
| 576 aec_rdft_init_sse2(); | |
| 577 } | 356 } |
| 578 #endif | 357 #endif |
| 358 } |
| 359 void OouraFft::cftmdl_128(float* a) const { |
| 579 #if defined(MIPS_FPU_LE) | 360 #if defined(MIPS_FPU_LE) |
| 580 aec_rdft_init_mips(); | 361 cftmdl_128_mips(a); |
| 581 #endif | 362 #elif defined(WEBRTC_HAS_NEON) |
| 582 #if defined(WEBRTC_HAS_NEON) | 363 cftmdl_128_neon(a); |
| 583 aec_rdft_init_neon(); | 364 #else |
| 365 if (use_sse2_) { |
| 366 cftmdl_128_SSE2(a); |
| 367 } else { |
| 368 cftmdl_128_C(a); |
| 369 } |
| 584 #endif | 370 #endif |
| 585 } | 371 } |
| 372 void OouraFft::rftfsub_128(float* a) const { |
| 373 #if defined(MIPS_FPU_LE) |
| 374 rftfsub_128_mips(a); |
| 375 #elif defined(WEBRTC_HAS_NEON) |
| 376 rftfsub_128_neon(a); |
| 377 #else |
| 378 if (use_sse2_) { |
| 379 rftfsub_128_SSE2(a); |
| 380 } else { |
| 381 rftfsub_128_C(a); |
| 382 } |
| 383 #endif |
| 384 } |
| 385 |
| 386 void OouraFft::rftbsub_128(float* a) const { |
| 387 #if defined(MIPS_FPU_LE) |
| 388 rftbsub_128_mips(a); |
| 389 #elif defined(WEBRTC_HAS_NEON) |
| 390 rftbsub_128_neon(a); |
| 391 #else |
| 392 if (use_sse2_) { |
| 393 rftbsub_128_SSE2(a); |
| 394 } else { |
| 395 rftbsub_128_C(a); |
| 396 } |
| 397 #endif |
| 398 } |
| 399 |
| 400 void OouraFft::cftbsub_128(float* a) const { |
| 401 int j, j1, j2, j3, l; |
| 402 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
| 403 |
| 404 cft1st_128(a); |
| 405 cftmdl_128(a); |
| 406 l = 32; |
| 407 |
| 408 for (j = 0; j < l; j += 2) { |
| 409 j1 = j + l; |
| 410 j2 = j1 + l; |
| 411 j3 = j2 + l; |
| 412 x0r = a[j] + a[j1]; |
| 413 x0i = -a[j + 1] - a[j1 + 1]; |
| 414 x1r = a[j] - a[j1]; |
| 415 x1i = -a[j + 1] + a[j1 + 1]; |
| 416 x2r = a[j2] + a[j3]; |
| 417 x2i = a[j2 + 1] + a[j3 + 1]; |
| 418 x3r = a[j2] - a[j3]; |
| 419 x3i = a[j2 + 1] - a[j3 + 1]; |
| 420 a[j] = x0r + x2r; |
| 421 a[j + 1] = x0i - x2i; |
| 422 a[j2] = x0r - x2r; |
| 423 a[j2 + 1] = x0i + x2i; |
| 424 a[j1] = x1r - x3i; |
| 425 a[j1 + 1] = x1i - x3r; |
| 426 a[j3] = x1r + x3i; |
| 427 a[j3 + 1] = x1i + x3r; |
| 428 } |
| 429 } |
| 430 |
| 431 void OouraFft::cftfsub_128(float* a) const { |
| 432 int j, j1, j2, j3, l; |
| 433 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
| 434 |
| 435 cft1st_128(a); |
| 436 cftmdl_128(a); |
| 437 l = 32; |
| 438 for (j = 0; j < l; j += 2) { |
| 439 j1 = j + l; |
| 440 j2 = j1 + l; |
| 441 j3 = j2 + l; |
| 442 x0r = a[j] + a[j1]; |
| 443 x0i = a[j + 1] + a[j1 + 1]; |
| 444 x1r = a[j] - a[j1]; |
| 445 x1i = a[j + 1] - a[j1 + 1]; |
| 446 x2r = a[j2] + a[j3]; |
| 447 x2i = a[j2 + 1] + a[j3 + 1]; |
| 448 x3r = a[j2] - a[j3]; |
| 449 x3i = a[j2 + 1] - a[j3 + 1]; |
| 450 a[j] = x0r + x2r; |
| 451 a[j + 1] = x0i + x2i; |
| 452 a[j2] = x0r - x2r; |
| 453 a[j2 + 1] = x0i - x2i; |
| 454 a[j1] = x1r - x3i; |
| 455 a[j1 + 1] = x1i + x3r; |
| 456 a[j3] = x1r + x3i; |
| 457 a[j3 + 1] = x1i - x3r; |
| 458 } |
| 459 } |
| 460 |
| 461 void OouraFft::bitrv2_128(float* a) const { |
| 462 /* |
| 463 Following things have been attempted but are no faster: |
| 464 (a) Storing the swap indexes in a LUT (index calculations are done |
| 465 for 'free' while waiting on memory/L1). |
| 466 (b) Consolidate the load/store of two consecutive floats by a 64 bit |
| 467 integer (execution is memory/L1 bound). |
| 468 (c) Do a mix of floats and 64 bit integer to maximize register |
| 469 utilization (execution is memory/L1 bound). |
| 470 (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5). |
| 471 (e) Hard-coding of the offsets to completely eliminate index |
| 472 calculations. |
| 473 */ |
| 474 |
| 475 unsigned int j, j1, k, k1; |
| 476 float xr, xi, yr, yi; |
| 477 |
| 478 const int ip[4] = {0, 64, 32, 96}; |
| 479 for (k = 0; k < 4; k++) { |
| 480 for (j = 0; j < k; j++) { |
| 481 j1 = 2 * j + ip[k]; |
| 482 k1 = 2 * k + ip[j]; |
| 483 xr = a[j1 + 0]; |
| 484 xi = a[j1 + 1]; |
| 485 yr = a[k1 + 0]; |
| 486 yi = a[k1 + 1]; |
| 487 a[j1 + 0] = yr; |
| 488 a[j1 + 1] = yi; |
| 489 a[k1 + 0] = xr; |
| 490 a[k1 + 1] = xi; |
| 491 j1 += 8; |
| 492 k1 += 16; |
| 493 xr = a[j1 + 0]; |
| 494 xi = a[j1 + 1]; |
| 495 yr = a[k1 + 0]; |
| 496 yi = a[k1 + 1]; |
| 497 a[j1 + 0] = yr; |
| 498 a[j1 + 1] = yi; |
| 499 a[k1 + 0] = xr; |
| 500 a[k1 + 1] = xi; |
| 501 j1 += 8; |
| 502 k1 -= 8; |
| 503 xr = a[j1 + 0]; |
| 504 xi = a[j1 + 1]; |
| 505 yr = a[k1 + 0]; |
| 506 yi = a[k1 + 1]; |
| 507 a[j1 + 0] = yr; |
| 508 a[j1 + 1] = yi; |
| 509 a[k1 + 0] = xr; |
| 510 a[k1 + 1] = xi; |
| 511 j1 += 8; |
| 512 k1 += 16; |
| 513 xr = a[j1 + 0]; |
| 514 xi = a[j1 + 1]; |
| 515 yr = a[k1 + 0]; |
| 516 yi = a[k1 + 1]; |
| 517 a[j1 + 0] = yr; |
| 518 a[j1 + 1] = yi; |
| 519 a[k1 + 0] = xr; |
| 520 a[k1 + 1] = xi; |
| 521 } |
| 522 j1 = 2 * k + 8 + ip[k]; |
| 523 k1 = j1 + 8; |
| 524 xr = a[j1 + 0]; |
| 525 xi = a[j1 + 1]; |
| 526 yr = a[k1 + 0]; |
| 527 yi = a[k1 + 1]; |
| 528 a[j1 + 0] = yr; |
| 529 a[j1 + 1] = yi; |
| 530 a[k1 + 0] = xr; |
| 531 a[k1 + 1] = xi; |
| 532 } |
| 533 } |
| 534 |
| 535 } // namespace webrtc |
| OLD | NEW |