OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 11 matching lines...)
22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" |
23 | 23 |
24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { | 24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { |
25 return aRe * bRe - aIm * bIm; | 25 return aRe * bRe - aIm * bIm; |
26 } | 26 } |
27 | 27 |
28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { | 28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { |
29 return aRe * bIm + aIm * bRe; | 29 return aRe * bIm + aIm * bRe; |
30 } | 30 } |
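A quick sanity check on these helpers: they are the real and imaginary parts of the complex product, (aRe + i*aIm)(bRe + i*bIm) = (aRe*bRe - aIm*bIm) + i*(aRe*bIm + aIm*bRe). For example (a sketch, not part of the file):

    float re = MulRe(1.0f, 2.0f, 3.0f, 4.0f);  // 1*3 - 2*4 = -5
    float im = MulIm(1.0f, 2.0f, 3.0f, 4.0f);  // 1*4 + 2*3 = 10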
31 | 31 |
32 static void FilterFarSSE2( | 32 static void FilterFarSSE2(int num_partitions, |
33 int num_partitions, | 33 int x_fft_buf_block_pos, |
34 int x_fft_buf_block_pos, | 34 float x_fft_buf[2] |
35 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], | 35 [kExtendedNumPartitions * PART_LEN1], |
36 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], | 36 float h_fft_buf[2] |
37 float y_fft[2][PART_LEN1]) { | 37 [kExtendedNumPartitions * PART_LEN1], |
38 | 38 float y_fft[2][PART_LEN1]) { |
39 int i; | 39 int i; |
40 for (i = 0; i < num_partitions; i++) { | 40 for (i = 0; i < num_partitions; i++) { |
41 int j; | 41 int j; |
42 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; | 42 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; |
43 int pos = i * PART_LEN1; | 43 int pos = i * PART_LEN1; |
44 // Check for wrap | 44 // Check for wrap |
45 if (i + x_fft_buf_block_pos >= num_partitions) { | 45 if (i + x_fft_buf_block_pos >= num_partitions) { |
46 xPos -= num_partitions * (PART_LEN1); | 46 xPos -= num_partitions * (PART_LEN1); |
47 } | 47 } |
48 | 48 |
(...skipping 11 matching lines...)
60 const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re); | 60 const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re); |
61 const __m128 e = _mm_sub_ps(a, b); | 61 const __m128 e = _mm_sub_ps(a, b); |
62 const __m128 f = _mm_add_ps(c, d); | 62 const __m128 f = _mm_add_ps(c, d); |
63 const __m128 g = _mm_add_ps(y_fft_re, e); | 63 const __m128 g = _mm_add_ps(y_fft_re, e); |
64 const __m128 h = _mm_add_ps(y_fft_im, f); | 64 const __m128 h = _mm_add_ps(y_fft_im, f); |
65 _mm_storeu_ps(&y_fft[0][j], g); | 65 _mm_storeu_ps(&y_fft[0][j], g); |
66 _mm_storeu_ps(&y_fft[1][j], h); | 66 _mm_storeu_ps(&y_fft[1][j], h); |
67 } | 67 } |
68 // scalar code for the remaining items. | 68 // scalar code for the remaining items. |
69 for (; j < PART_LEN1; j++) { | 69 for (; j < PART_LEN1; j++) { |
70 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], | 70 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], |
71 x_fft_buf[1][xPos + j], | 71 h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); |
72 h_fft_buf[0][pos + j], | 72 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], |
73 h_fft_buf[1][pos + j]); | 73 h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); |
74 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], | |
75 x_fft_buf[1][xPos + j], | |
76 h_fft_buf[0][pos + j], | |
77 h_fft_buf[1][pos + j]); | |
78 } | 74 } |
79 } | 75 } |
80 } | 76 } |
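For reference, the operation this function vectorizes is a per-bin complex multiply-accumulate over all filter partitions: Y[j] += X_p[j] * H_p[j] for each partition p, with the far-end FFT history indexed as a circular buffer. A plain-C sketch with the same indexing (mirrors the generic path; the vectorized lines elided above compute four bins at a time):

    for (i = 0; i < num_partitions; i++) {
      int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
      int pos = i * PART_LEN1;
      if (i + x_fft_buf_block_pos >= num_partitions)
        xPos -= num_partitions * PART_LEN1;  // wrap in the circular buffer
      for (j = 0; j < PART_LEN1; j++) {
        y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                             h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
        y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                             h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
      }
    }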
81 | 77 |
82 static void ScaleErrorSignalSSE2(int extended_filter_enabled, | 78 static void ScaleErrorSignalSSE2(int extended_filter_enabled, |
83 float normal_mu, | 79 float normal_mu, |
84 float normal_error_threshold, | 80 float normal_error_threshold, |
85 float x_pow[PART_LEN1], | 81 float x_pow[PART_LEN1], |
86 float ef[2][PART_LEN1]) { | 82 float ef[2][PART_LEN1]) { |
87 const __m128 k1e_10f = _mm_set1_ps(1e-10f); | 83 const __m128 k1e_10f = _mm_set1_ps(1e-10f); |
88 const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) | 84 const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) |
89 : _mm_set1_ps(normal_mu); | 85 : _mm_set1_ps(normal_mu); |
90 const __m128 kThresh = extended_filter_enabled | 86 const __m128 kThresh = extended_filter_enabled |
91 ? _mm_set1_ps(kExtendedErrorThreshold) | 87 ? _mm_set1_ps(kExtendedErrorThreshold) |
92 : _mm_set1_ps(normal_error_threshold); | 88 : _mm_set1_ps(normal_error_threshold); |
93 | 89 |
94 int i; | 90 int i; |
95 // vectorized code (four at once) | 91 // vectorized code (four at once) |
96 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 92 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
97 const __m128 x_pow_local = _mm_loadu_ps(&x_pow[i]); | 93 const __m128 x_pow_local = _mm_loadu_ps(&x_pow[i]); |
98 const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]); | 94 const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]); |
99 const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]); | 95 const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]); |
(...skipping 17 matching lines...)
117 ef_re = _mm_or_ps(ef_re, ef_re_if); | 113 ef_re = _mm_or_ps(ef_re, ef_re_if); |
118 ef_im = _mm_or_ps(ef_im, ef_im_if); | 114 ef_im = _mm_or_ps(ef_im, ef_im_if); |
119 ef_re = _mm_mul_ps(ef_re, kMu); | 115 ef_re = _mm_mul_ps(ef_re, kMu); |
120 ef_im = _mm_mul_ps(ef_im, kMu); | 116 ef_im = _mm_mul_ps(ef_im, kMu); |
121 | 117 |
122 _mm_storeu_ps(&ef[0][i], ef_re); | 118 _mm_storeu_ps(&ef[0][i], ef_re); |
123 _mm_storeu_ps(&ef[1][i], ef_im); | 119 _mm_storeu_ps(&ef[1][i], ef_im); |
124 } | 120 } |
125 // scalar code for the remaining items. | 121 // scalar code for the remaining items. |
126 { | 122 { |
127 const float mu = | 123 const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; |
128 extended_filter_enabled ? kExtendedMu : normal_mu; | |
129 const float error_threshold = extended_filter_enabled | 124 const float error_threshold = extended_filter_enabled |
130 ? kExtendedErrorThreshold | 125 ? kExtendedErrorThreshold |
131 : normal_error_threshold; | 126 : normal_error_threshold; |
132 for (; i < (PART_LEN1); i++) { | 127 for (; i < (PART_LEN1); i++) { |
133 float abs_ef; | 128 float abs_ef; |
134 ef[0][i] /= (x_pow[i] + 1e-10f); | 129 ef[0][i] /= (x_pow[i] + 1e-10f); |
135 ef[1][i] /= (x_pow[i] + 1e-10f); | 130 ef[1][i] /= (x_pow[i] + 1e-10f); |
136 abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]); | 131 abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]); |
137 | 132 |
138 if (abs_ef > error_threshold) { | 133 if (abs_ef > error_threshold) { |
(...skipping 42 matching lines...)
181 const __m128 e = _mm_add_ps(a, b); | 176 const __m128 e = _mm_add_ps(a, b); |
182 const __m128 f = _mm_sub_ps(c, d); | 177 const __m128 f = _mm_sub_ps(c, d); |
183 // Interleave real and imaginary parts. | 178 // Interleave real and imaginary parts. |
184 const __m128 g = _mm_unpacklo_ps(e, f); | 179 const __m128 g = _mm_unpacklo_ps(e, f); |
185 const __m128 h = _mm_unpackhi_ps(e, f); | 180 const __m128 h = _mm_unpackhi_ps(e, f); |
186 // Store | 181 // Store |
187 _mm_storeu_ps(&fft[2 * j + 0], g); | 182 _mm_storeu_ps(&fft[2 * j + 0], g); |
188 _mm_storeu_ps(&fft[2 * j + 4], h); | 183 _mm_storeu_ps(&fft[2 * j + 4], h); |
189 } | 184 } |
190 // ... and fixup the first imaginary entry. | 185 // ... and fixup the first imaginary entry. |
191 fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], | 186 fft[1] = |
192 -x_fft_buf[1][xPos + PART_LEN], | 187 MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN], |
193 e_fft[0][PART_LEN], | 188 e_fft[0][PART_LEN], e_fft[1][PART_LEN]); |
194 e_fft[1][PART_LEN]); | |
195 | 189 |
196 aec_rdft_inverse_128(fft); | 190 aec_rdft_inverse_128(fft); |
197 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); | 191 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); |
198 | 192 |
199 // fft scaling | 193 // fft scaling |
200 { | 194 { |
201 float scale = 2.0f / PART_LEN2; | 195 float scale = 2.0f / PART_LEN2; |
202 const __m128 scale_ps = _mm_load_ps1(&scale); | 196 const __m128 scale_ps = _mm_load_ps1(&scale); |
203 for (j = 0; j < PART_LEN; j += 4) { | 197 for (j = 0; j < PART_LEN; j += 4) { |
204 const __m128 fft_ps = _mm_loadu_ps(&fft[j]); | 198 const __m128 fft_ps = _mm_loadu_ps(&fft[j]); |
(...skipping 69 matching lines...)
274 static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = { | 268 static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = { |
275 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; | 269 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; |
276 const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask)); | 270 const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask)); |
277 const __m128 y = | 271 const __m128 y = |
278 _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one)); | 272 _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one)); |
279 | 273 |
280 // Approximate log2(y) ~= (y - 1) * pol5(y). | 274 // Approximate log2(y) ~= (y - 1) * pol5(y). |
281 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 | 275 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 |
282 static const ALIGN16_BEG float ALIGN16_END C5[4] = { | 276 static const ALIGN16_BEG float ALIGN16_END C5[4] = { |
283 -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; | 277 -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; |
284 static const ALIGN16_BEG float ALIGN16_END | 278 static const ALIGN16_BEG float ALIGN16_END C4[4] = { |
285 C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; | 279 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; |
286 static const ALIGN16_BEG float ALIGN16_END | 280 static const ALIGN16_BEG float ALIGN16_END C3[4] = { |
287 C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; | 281 -1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; |
288 static const ALIGN16_BEG float ALIGN16_END | 282 static const ALIGN16_BEG float ALIGN16_END C2[4] = {2.5988452f, 2.5988452f, |
289 C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f}; | 283 2.5988452f, 2.5988452f}; |
290 static const ALIGN16_BEG float ALIGN16_END | 284 static const ALIGN16_BEG float ALIGN16_END C1[4] = { |
291 C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; | 285 -3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; |
292 static const ALIGN16_BEG float ALIGN16_END | 286 static const ALIGN16_BEG float ALIGN16_END C0[4] = {3.1157899f, 3.1157899f, |
293 C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f}; | 287 3.1157899f, 3.1157899f}; |
294 const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5)); | 288 const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5)); |
295 const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4)); | 289 const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4)); |
296 const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); | 290 const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); |
297 const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3)); | 291 const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3)); |
298 const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); | 292 const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); |
299 const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2)); | 293 const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2)); |
300 const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); | 294 const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); |
301 const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1)); | 295 const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1)); |
302 const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); | 296 const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); |
303 const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0)); | 297 const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0)); |
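The mul/add chain above is Horner evaluation of the degree-5 polynomial named in the comment. A scalar sketch with the same constants:

    static float pol5(float y) {
      return ((((-3.4436006e-2f * y + 3.1821337e-1f) * y - 1.2315303f) * y +
               2.5988452f) * y - 3.3241990f) * y + 3.1157899f;
    }
    // log2(a) ~= exponent(a) + (y - 1) * pol5(y), with y = mantissa(a) in [1, 2)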
(...skipping 23 matching lines...)
327 // maximum relative error of 0.17%. | 321 // maximum relative error of 0.17%. |
328 | 322 |
329 // To avoid over/underflow, we reduce the range of input to ]-127, 129]. | 323 // To avoid over/underflow, we reduce the range of input to ]-127, 129]. |
330 static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, | 324 static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, |
331 129.f, 129.f}; | 325 129.f, 129.f}; |
332 static const ALIGN16_BEG float min_input[4] ALIGN16_END = { | 326 static const ALIGN16_BEG float min_input[4] ALIGN16_END = { |
333 -126.99999f, -126.99999f, -126.99999f, -126.99999f}; | 327 -126.99999f, -126.99999f, -126.99999f, -126.99999f}; |
334 const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input)); | 328 const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input)); |
335 const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input)); | 329 const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input)); |
336 // Compute n. | 330 // Compute n. |
337 static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, | 331 static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, 0.5f, |
338 0.5f, 0.5f}; | 332 0.5f}; |
339 const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half)); | 333 const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half)); |
340 const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); | 334 const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); |
341 // Compute 2^n. | 335 // Compute 2^n. |
342 static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = { | 336 static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = { |
343 127, 127, 127, 127}; | 337 127, 127, 127, 127}; |
344 static const int float_exponent_shift = 23; | 338 static const int float_exponent_shift = 23; |
345 const __m128i two_n_exponent = | 339 const __m128i two_n_exponent = |
346 _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias)); | 340 _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias)); |
347 const __m128 two_n = | 341 const __m128 two_n = |
348 _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift)); | 342 _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift)); |
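The 2^n step builds the float directly from its bit pattern: the biased exponent n + 127 is shifted into the IEEE-754 exponent field (bit 23 upward). A scalar sketch of the same trick, assuming single-precision IEEE-754 floats (not part of the file):

    #include <stdint.h>
    #include <string.h>
    static float two_to_n(int n) {  // exact for -126 <= n <= 127
      uint32_t bits = (uint32_t)(n + 127) << 23;
      float result;
      memcpy(&result, &bits, sizeof(result));  // bit cast without aliasing UB
      return result;
    }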
(...skipping 76 matching lines...)
425 // Suppress error signal | 419 // Suppress error signal |
426 efw[0][i] *= hNl[i]; | 420 efw[0][i] *= hNl[i]; |
427 efw[1][i] *= hNl[i]; | 421 efw[1][i] *= hNl[i]; |
428 | 422 |
429 // Ooura fft returns incorrect sign on imaginary component. It matters | 423 // Ooura fft returns incorrect sign on imaginary component. It matters |
430 // here because we are making an additive change with comfort noise. | 424 // here because we are making an additive change with comfort noise. |
431 efw[1][i] *= -1; | 425 efw[1][i] *= -1; |
432 } | 426 } |
433 } | 427 } |
434 | 428 |
435 __inline static void _mm_add_ps_4x1(__m128 sum, float *dst) { | 429 __inline static void _mm_add_ps_4x1(__m128 sum, float* dst) { |
436 // A+B C+D | 430 // A+B C+D |
437 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); | 431 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); |
438 // A+B+C+D A+B+C+D | 432 // A+B+C+D A+B+C+D |
439 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); | 433 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); |
440 _mm_store_ss(dst, sum); | 434 _mm_store_ss(dst, sum); |
441 } | 435 } |
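Despite the name, this helper stores (rather than adds) the horizontal sum of the four lanes into *dst. A scalar equivalent with the same association order (a sketch):

    static void add_ps_4x1_scalar(const float v[4], float* dst) {
      *dst = (v[0] + v[2]) + (v[1] + v[3]);
    }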
442 | 436 |
443 static int PartitionDelaySSE2(const AecCore* aec) { | 437 static int PartitionDelaySSE2(const AecCore* aec) { |
444 // Measures the energy in each filter partition and returns the partition with | 438 // Measures the energy in each filter partition and returns the partition with |
445 // highest energy. | 439 // highest energy. |
(...skipping 39 matching lines...)
485 // - sxd : cross-PSD of near-end and far-end | 479 // - sxd : cross-PSD of near-end and far-end |
486 // | 480 // |
487 // In addition to updating the PSDs, the filter divergence state is | 481 // In addition to updating the PSDs, the filter divergence state is |
488 // determined, upon which actions are taken. | 482 // determined, upon which actions are taken. |
489 static void SmoothedPSD(AecCore* aec, | 483 static void SmoothedPSD(AecCore* aec, |
490 float efw[2][PART_LEN1], | 484 float efw[2][PART_LEN1], |
491 float dfw[2][PART_LEN1], | 485 float dfw[2][PART_LEN1], |
492 float xfw[2][PART_LEN1], | 486 float xfw[2][PART_LEN1], |
493 int* extreme_filter_divergence) { | 487 int* extreme_filter_divergence) { |
494 // Power estimate smoothing coefficients. | 488 // Power estimate smoothing coefficients. |
495 const float* ptrGCoh = aec->extended_filter_enabled | 489 const float* ptrGCoh = |
496 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] | 490 aec->extended_filter_enabled |
497 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; | 491 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] |
| 492 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; |
498 int i; | 493 int i; |
499 float sdSum = 0, seSum = 0; | 494 float sdSum = 0, seSum = 0; |
500 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); | 495 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); |
501 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); | 496 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); |
502 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); | 497 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); |
503 __m128 vec_sdSum = _mm_set1_ps(0.0f); | 498 __m128 vec_sdSum = _mm_set1_ps(0.0f); |
504 __m128 vec_seSum = _mm_set1_ps(0.0f); | 499 __m128 vec_seSum = _mm_set1_ps(0.0f); |
505 | 500 |
506 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 501 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
507 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); | 502 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); |
508 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); | 503 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); |
509 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); | 504 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); |
510 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); | 505 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); |
(...skipping 12 matching lines...)
523 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); | 518 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); |
524 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); | 519 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); |
525 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); | 520 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); |
526 _mm_storeu_ps(&aec->sd[i], vec_sd); | 521 _mm_storeu_ps(&aec->sd[i], vec_sd); |
527 _mm_storeu_ps(&aec->se[i], vec_se); | 522 _mm_storeu_ps(&aec->se[i], vec_se); |
528 _mm_storeu_ps(&aec->sx[i], vec_sx); | 523 _mm_storeu_ps(&aec->sx[i], vec_sx); |
529 | 524 |
530 { | 525 { |
531 const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); | 526 const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); |
532 const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | 527 const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); |
533 __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, | 528 __m128 vec_a = |
534 _MM_SHUFFLE(2, 0, 2, 0)); | 529 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
535 __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, | 530 __m128 vec_b = |
536 _MM_SHUFFLE(3, 1, 3, 1)); | 531 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
537 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); | 532 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); |
538 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); | 533 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); |
539 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | 534 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |
540 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | 535 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |
541 vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011, | 536 vec_dfwefw0011 = |
542 _mm_mul_ps(vec_dfw1, vec_efw1)); | 537 _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1)); |
543 vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110, | 538 vec_dfwefw0110 = |
544 _mm_mul_ps(vec_dfw1, vec_efw0)); | 539 _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0)); |
545 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); | 540 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); |
546 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); | 541 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); |
547 _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | 542 _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |
548 _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | 543 _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); |
549 } | 544 } |
550 | 545 |
551 { | 546 { |
552 const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | 547 const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); |
553 const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | 548 const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); |
554 __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, | 549 __m128 vec_a = |
555 _MM_SHUFFLE(2, 0, 2, 0)); | 550 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
556 __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, | 551 __m128 vec_b = |
557 _MM_SHUFFLE(3, 1, 3, 1)); | 552 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
558 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); | 553 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); |
559 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); | 554 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); |
560 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | 555 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |
561 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | 556 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |
562 vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011, | 557 vec_dfwxfw0011 = |
563 _mm_mul_ps(vec_dfw1, vec_xfw1)); | 558 _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1)); |
564 vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110, | 559 vec_dfwxfw0110 = |
565 _mm_mul_ps(vec_dfw1, vec_xfw0)); | 560 _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); |
566 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); | 561 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); |
567 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); | 562 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); |
568 _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | 563 _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |
569 _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | 564 _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); |
570 } | 565 } |
571 | 566 |
572 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); | 567 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); |
573 vec_seSum = _mm_add_ps(vec_seSum, vec_se); | 568 vec_seSum = _mm_add_ps(vec_seSum, vec_se); |
574 } | 569 } |
575 | 570 |
576 _mm_add_ps_4x1(vec_sdSum, &sdSum); | 571 _mm_add_ps_4x1(vec_sdSum, &sdSum); |
577 _mm_add_ps_4x1(vec_seSum, &seSum); | 572 _mm_add_ps_4x1(vec_seSum, &seSum); |
578 | 573 |
579 for (; i < PART_LEN1; i++) { | 574 for (; i < PART_LEN1; i++) { |
580 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + | 575 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + |
581 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); | 576 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); |
582 aec->se[i] = ptrGCoh[0] * aec->se[i] + | 577 aec->se[i] = ptrGCoh[0] * aec->se[i] + |
583 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); | 578 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); |
584 // We threshold here to protect against the ill-effects of a zero farend. | 579 // We threshold here to protect against the ill-effects of a zero farend. |
585 // The threshold is not arbitrarily chosen, but balances protection and | 580 // The threshold is not arbitrarily chosen, but balances protection and |
586 // adverse interaction with the algorithm's tuning. | 581 // adverse interaction with the algorithm's tuning. |
587 // TODO(bjornv): investigate further why this is so sensitive. | 582 // TODO(bjornv): investigate further why this is so sensitive. |
588 aec->sx[i] = | 583 aec->sx[i] = ptrGCoh[0] * aec->sx[i] + |
589 ptrGCoh[0] * aec->sx[i] + | 584 ptrGCoh[1] * WEBRTC_SPL_MAX( |
590 ptrGCoh[1] * WEBRTC_SPL_MAX( | 585 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], |
591 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], | 586 WebRtcAec_kMinFarendPSD); |
592 WebRtcAec_kMinFarendPSD); | |
593 | 587 |
594 aec->sde[i][0] = | 588 aec->sde[i][0] = |
595 ptrGCoh[0] * aec->sde[i][0] + | 589 ptrGCoh[0] * aec->sde[i][0] + |
596 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); | 590 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); |
597 aec->sde[i][1] = | 591 aec->sde[i][1] = |
598 ptrGCoh[0] * aec->sde[i][1] + | 592 ptrGCoh[0] * aec->sde[i][1] + |
599 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); | 593 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); |
600 | 594 |
601 aec->sxd[i][0] = | 595 aec->sxd[i][0] = |
602 ptrGCoh[0] * aec->sxd[i][0] + | 596 ptrGCoh[0] * aec->sxd[i][0] + |
(...skipping 18 matching lines...)
621 static void WindowDataSSE2(float* x_windowed, const float* x) { | 615 static void WindowDataSSE2(float* x_windowed, const float* x) { |
622 int i; | 616 int i; |
623 for (i = 0; i < PART_LEN; i += 4) { | 617 for (i = 0; i < PART_LEN; i += 4) { |
624 const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); | 618 const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); |
625 const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); | 619 const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); |
626 const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); | 620 const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); |
627 // A B C D | 621 // A B C D |
628 __m128 vec_sqrtHanning_rev = | 622 __m128 vec_sqrtHanning_rev = |
629 _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); | 623 _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); |
630 // D C B A | 624 // D C B A |
631 vec_sqrtHanning_rev = | 625 vec_sqrtHanning_rev = _mm_shuffle_ps( |
632 _mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev, | 626 vec_sqrtHanning_rev, vec_sqrtHanning_rev, _MM_SHUFFLE(0, 1, 2, 3)); |
633 _MM_SHUFFLE(0, 1, 2, 3)); | |
634 _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); | 627 _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); |
635 _mm_storeu_ps(&x_windowed[PART_LEN + i], | 628 _mm_storeu_ps(&x_windowed[PART_LEN + i], |
636 _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); | 629 _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); |
637 } | 630 } |
638 } | 631 } |
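Scalar view of the windowing above (a sketch): the first half-frame is multiplied by the square-root Hanning window and the second half by the same window reversed, so overlapped analysis frames sum back to unity.

    for (i = 0; i < PART_LEN; i++) {
      x_windowed[i] = x[i] * WebRtcAec_sqrtHanning[i];
      x_windowed[PART_LEN + i] =
          x[PART_LEN + i] * WebRtcAec_sqrtHanning[PART_LEN - i];
    }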
639 | 632 |
640 // Puts fft output data into a complex valued array. | 633 // Puts fft output data into a complex valued array. |
641 static void StoreAsComplexSSE2(const float* data, | 634 static void StoreAsComplexSSE2(const float* data, |
642 float data_complex[2][PART_LEN1]) { | 635 float data_complex[2][PART_LEN1]) { |
643 int i; | 636 int i; |
644 for (i = 0; i < PART_LEN; i += 4) { | 637 for (i = 0; i < PART_LEN; i += 4) { |
645 const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); | 638 const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); |
646 const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); | 639 const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); |
647 const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4, | 640 const __m128 vec_a = |
648 _MM_SHUFFLE(2, 0, 2, 0)); | 641 _mm_shuffle_ps(vec_fft0, vec_fft4, _MM_SHUFFLE(2, 0, 2, 0)); |
649 const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4, | 642 const __m128 vec_b = |
650 _MM_SHUFFLE(3, 1, 3, 1)); | 643 _mm_shuffle_ps(vec_fft0, vec_fft4, _MM_SHUFFLE(3, 1, 3, 1)); |
651 _mm_storeu_ps(&data_complex[0][i], vec_a); | 644 _mm_storeu_ps(&data_complex[0][i], vec_a); |
652 _mm_storeu_ps(&data_complex[1][i], vec_b); | 645 _mm_storeu_ps(&data_complex[1][i], vec_b); |
653 } | 646 } |
654 // fix beginning/end values | 647 // fix beginning/end values |
655 data_complex[1][0] = 0; | 648 data_complex[1][0] = 0; |
656 data_complex[1][PART_LEN] = 0; | 649 data_complex[1][PART_LEN] = 0; |
657 data_complex[0][0] = data[0]; | 650 data_complex[0][0] = data[0]; |
658 data_complex[0][PART_LEN] = data[1]; | 651 data_complex[0][PART_LEN] = data[1]; |
659 } | 652 } |
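The shuffles above deinterleave the rdft output into separate real and imaginary planes. The scalar picture (a sketch), remembering that Ooura's packing stores the DC real part in data[0] and the Nyquist real part in data[1]:

    for (i = 1; i < PART_LEN; i++) {
      data_complex[0][i] = data[2 * i];      // real part of bin i
      data_complex[1][i] = data[2 * i + 1];  // imaginary part, Ooura's sign
    }
    data_complex[0][0] = data[0];         // DC bin, purely real
    data_complex[0][PART_LEN] = data[1];  // Nyquist bin, purely real
    data_complex[1][0] = 0;
    data_complex[1][PART_LEN] = 0;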
660 | 653 |
661 static void SubbandCoherenceSSE2(AecCore* aec, | 654 static void SubbandCoherenceSSE2(AecCore* aec, |
662 float efw[2][PART_LEN1], | 655 float efw[2][PART_LEN1], |
663 float dfw[2][PART_LEN1], | 656 float dfw[2][PART_LEN1], |
664 float xfw[2][PART_LEN1], | 657 float xfw[2][PART_LEN1], |
665 float* fft, | 658 float* fft, |
666 float* cohde, | 659 float* cohde, |
667 float* cohxd, | 660 float* cohxd, |
668 int* extreme_filter_divergence) { | 661 int* extreme_filter_divergence) { |
669 int i; | 662 int i; |
670 | 663 |
671 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); | 664 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); |
672 | 665 |
673 { | 666 { |
674 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); | 667 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); |
675 | 668 |
676 // Subband coherence | 669 // Subband coherence |
677 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 670 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
678 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); | 671 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); |
679 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); | 672 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); |
680 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); | 673 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); |
681 const __m128 vec_sdse = _mm_add_ps(vec_1eminus10, | 674 const __m128 vec_sdse = |
682 _mm_mul_ps(vec_sd, vec_se)); | 675 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); |
683 const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10, | 676 const __m128 vec_sdsx = |
684 _mm_mul_ps(vec_sd, vec_sx)); | 677 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); |
685 const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); | 678 const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); |
686 const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | 679 const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); |
687 const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | 680 const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); |
688 const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | 681 const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); |
689 const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, | 682 const __m128 vec_sde_0 = |
690 _MM_SHUFFLE(2, 0, 2, 0)); | 683 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
691 const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, | 684 const __m128 vec_sde_1 = |
692 _MM_SHUFFLE(3, 1, 3, 1)); | 685 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
693 const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, | 686 const __m128 vec_sxd_0 = |
694 _MM_SHUFFLE(2, 0, 2, 0)); | 687 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
695 const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, | 688 const __m128 vec_sxd_1 = |
696 _MM_SHUFFLE(3, 1, 3, 1)); | 689 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
697 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); | 690 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); |
698 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); | 691 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); |
699 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); | 692 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); |
700 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); | 693 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); |
701 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); | 694 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); |
702 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); | 695 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); |
703 _mm_storeu_ps(&cohde[i], vec_cohde); | 696 _mm_storeu_ps(&cohde[i], vec_cohde); |
704 _mm_storeu_ps(&cohxd[i], vec_cohxd); | 697 _mm_storeu_ps(&cohxd[i], vec_cohxd); |
705 } | 698 } |
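Per bin, the vector loop above evaluates the magnitude-squared coherence, regularized by 1e-10 to guard against division by zero. In scalar form (a sketch):

    // cohde[i] = |sde[i]|^2 / (sd[i] * se[i] + 1e-10f)
    // cohxd[i] = |sxd[i]|^2 / (sd[i] * sx[i] + 1e-10f)
    cohde[i] =
        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
        (aec->sd[i] * aec->se[i] + 1e-10f);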
706 | 699 |
(...skipping 12 matching lines...)
719 void WebRtcAec_InitAec_SSE2(void) { | 712 void WebRtcAec_InitAec_SSE2(void) { |
720 WebRtcAec_FilterFar = FilterFarSSE2; | 713 WebRtcAec_FilterFar = FilterFarSSE2; |
721 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 714 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
722 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 715 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
723 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 716 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
724 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 717 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
725 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 718 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
726 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 719 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
727 WebRtcAec_WindowData = WindowDataSSE2; | 720 WebRtcAec_WindowData = WindowDataSSE2; |
728 } | 721 } |
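These assignments rebind the module's function pointers to the SSE2 kernels. The call site is expected to gate on CPU support, roughly like this (a sketch; the exact guard in aec_core.c is an assumption here):

    #if defined(WEBRTC_ARCH_X86_FAMILY)
      if (WebRtc_GetCPUInfo(kSSE2)) {
        WebRtcAec_InitAec_SSE2();
      }
    #endif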