OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 /* | 11 /* |
12 * The core AEC algorithm, SSE2 version of speed-critical functions. | 12 * The core AEC algorithm, SSE2 version of speed-critical functions. |
13 */ | 13 */ |
14 | 14 |
15 #include <emmintrin.h> | 15 #include <emmintrin.h> |
16 #include <math.h> | 16 #include <math.h> |
17 #include <string.h> // memset | 17 #include <string.h> // memset |
18 | 18 |
| 19 extern "C" { |
19 #include "webrtc/common_audio/signal_processing/include/signal_processing_librar
y.h" | 20 #include "webrtc/common_audio/signal_processing/include/signal_processing_librar
y.h" |
| 21 } |
20 #include "webrtc/modules/audio_processing/aec/aec_common.h" | 22 #include "webrtc/modules/audio_processing/aec/aec_common.h" |
21 #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" | 23 #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" |
| 24 extern "C" { |
22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 25 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" |
| 26 } |
23 | 27 |
24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { | 28 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { |
25 return aRe * bRe - aIm * bIm; | 29 return aRe * bRe - aIm * bIm; |
26 } | 30 } |
27 | 31 |
28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { | 32 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { |
29 return aRe * bIm + aIm * bRe; | 33 return aRe * bIm + aIm * bRe; |
30 } | 34 } |
31 | 35 |
32 static void FilterFarSSE2(int num_partitions, | 36 static void FilterFarSSE2(int num_partitions, |
(...skipping 216 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
249 // compensate the fact that the exponent has been shifted in the top/ | 253 // compensate the fact that the exponent has been shifted in the top/ |
250 // fractional part and finally getting rid of the implicit leading one | 254 // fractional part and finally getting rid of the implicit leading one |
251 // from the mantissa by substracting it out. | 255 // from the mantissa by substracting it out. |
252 static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = { | 256 static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = { |
253 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; | 257 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; |
254 static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = { | 258 static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = { |
255 0x43800000, 0x43800000, 0x43800000, 0x43800000}; | 259 0x43800000, 0x43800000, 0x43800000, 0x43800000}; |
256 static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = { | 260 static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = { |
257 0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000}; | 261 0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000}; |
258 static const int shift_exponent_into_top_mantissa = 8; | 262 static const int shift_exponent_into_top_mantissa = 8; |
259 const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask)); | 263 const __m128 two_n = |
| 264 _mm_and_ps(a, *(reinterpret_cast<const __m128*>(float_exponent_mask))); |
260 const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32( | 265 const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32( |
261 _mm_castps_si128(two_n), shift_exponent_into_top_mantissa)); | 266 _mm_castps_si128(two_n), shift_exponent_into_top_mantissa)); |
262 const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent)); | 267 const __m128 n_0 = |
263 const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one)); | 268 _mm_or_ps(n_1, *(reinterpret_cast<const __m128*>(eight_biased_exponent))); |
| 269 const __m128 n = |
| 270 _mm_sub_ps(n_0, *(reinterpret_cast<const __m128*>(implicit_leading_one))); |
264 | 271 |
265 // Compute y. | 272 // Compute y. |
266 static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = { | 273 static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = { |
267 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; | 274 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; |
268 static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = { | 275 static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = { |
269 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; | 276 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; |
270 const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask)); | 277 const __m128 mantissa = |
| 278 _mm_and_ps(a, *(reinterpret_cast<const __m128*>(mantissa_mask))); |
271 const __m128 y = | 279 const __m128 y = |
272 _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one)); | 280 _mm_or_ps(mantissa, |
| 281 *(reinterpret_cast<const __m128*>(zero_biased_exponent_is_one))); |
273 | 282 |
274 // Approximate log2(y) ~= (y - 1) * pol5(y). | 283 // Approximate log2(y) ~= (y - 1) * pol5(y). |
275 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 | 284 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 |
276 static const ALIGN16_BEG float ALIGN16_END C5[4] = { | 285 static const ALIGN16_BEG float ALIGN16_END C5[4] = { |
277 -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; | 286 -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; |
278 static const ALIGN16_BEG float ALIGN16_END C4[4] = { | 287 static const ALIGN16_BEG float ALIGN16_END C4[4] = { |
279 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; | 288 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; |
280 static const ALIGN16_BEG float ALIGN16_END C3[4] = { | 289 static const ALIGN16_BEG float ALIGN16_END C3[4] = { |
281 -1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; | 290 -1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; |
282 static const ALIGN16_BEG float ALIGN16_END C2[4] = {2.5988452f, 2.5988452f, | 291 static const ALIGN16_BEG float ALIGN16_END C2[4] = {2.5988452f, 2.5988452f, |
283 2.5988452f, 2.5988452f}; | 292 2.5988452f, 2.5988452f}; |
284 static const ALIGN16_BEG float ALIGN16_END C1[4] = { | 293 static const ALIGN16_BEG float ALIGN16_END C1[4] = { |
285 -3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; | 294 -3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; |
286 static const ALIGN16_BEG float ALIGN16_END C0[4] = {3.1157899f, 3.1157899f, | 295 static const ALIGN16_BEG float ALIGN16_END C0[4] = {3.1157899f, 3.1157899f, |
287 3.1157899f, 3.1157899f}; | 296 3.1157899f, 3.1157899f}; |
288 const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5)); | 297 const __m128 pol5_y_0 = |
289 const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4)); | 298 _mm_mul_ps(y, *(reinterpret_cast<const __m128*>(C5))); |
| 299 const __m128 pol5_y_1 = |
| 300 _mm_add_ps(pol5_y_0, *(reinterpret_cast<const __m128*>(C4))); |
290 const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); | 301 const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); |
291 const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3)); | 302 const __m128 pol5_y_3 = |
| 303 _mm_add_ps(pol5_y_2, *(reinterpret_cast<const __m128*>(C3))); |
292 const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); | 304 const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); |
293 const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2)); | 305 const __m128 pol5_y_5 = |
| 306 _mm_add_ps(pol5_y_4, *(reinterpret_cast<const __m128*>(C2))); |
294 const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); | 307 const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); |
295 const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1)); | 308 const __m128 pol5_y_7 = |
| 309 _mm_add_ps(pol5_y_6, *(reinterpret_cast<const __m128*>(C1))); |
296 const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); | 310 const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); |
297 const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0)); | 311 const __m128 pol5_y = |
| 312 _mm_add_ps(pol5_y_8, *(reinterpret_cast<const __m128*>(C0))); |
298 const __m128 y_minus_one = | 313 const __m128 y_minus_one = |
299 _mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one)); | 314 _mm_sub_ps(y, |
| 315 *(reinterpret_cast<const __m128*>(zero_biased_exponent_is_one))); |
300 const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y); | 316 const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y); |
301 | 317 |
302 // Combine parts. | 318 // Combine parts. |
303 log2_a = _mm_add_ps(n, log2_y); | 319 log2_a = _mm_add_ps(n, log2_y); |
304 } | 320 } |
305 | 321 |
306 // b * log2(a) | 322 // b * log2(a) |
307 b_log2_a = _mm_mul_ps(b, log2_a); | 323 b_log2_a = _mm_mul_ps(b, log2_a); |
308 | 324 |
309 // Calculate exp2(x), x = b * log2(a). | 325 // Calculate exp2(x), x = b * log2(a). |
310 { | 326 { |
311 // To calculate 2^x, we decompose x like this: | 327 // To calculate 2^x, we decompose x like this: |
312 // x = n + y | 328 // x = n + y |
313 // n is an integer, the value of x - 0.5 rounded down, therefore | 329 // n is an integer, the value of x - 0.5 rounded down, therefore |
314 // y is in the [0.5, 1.5) range | 330 // y is in the [0.5, 1.5) range |
315 // | 331 // |
316 // 2^x = 2^n * 2^y | 332 // 2^x = 2^n * 2^y |
317 // 2^n can be evaluated by playing with float representation. | 333 // 2^n can be evaluated by playing with float representation. |
318 // 2^y in a small range can be approximated, this code uses an order two | 334 // 2^y in a small range can be approximated, this code uses an order two |
319 // polynomial approximation. The coefficients have been estimated | 335 // polynomial approximation. The coefficients have been estimated |
320 // with the Remez algorithm and the resulting polynomial has a | 336 // with the Remez algorithm and the resulting polynomial has a |
321 // maximum relative error of 0.17%. | 337 // maximum relative error of 0.17%. |
322 | 338 |
323 // To avoid over/underflow, we reduce the range of input to ]-127, 129]. | 339 // To avoid over/underflow, we reduce the range of input to ]-127, 129]. |
324 static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, | 340 static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, |
325 129.f, 129.f}; | 341 129.f, 129.f}; |
326 static const ALIGN16_BEG float min_input[4] ALIGN16_END = { | 342 static const ALIGN16_BEG float min_input[4] ALIGN16_END = { |
327 -126.99999f, -126.99999f, -126.99999f, -126.99999f}; | 343 -126.99999f, -126.99999f, -126.99999f, -126.99999f}; |
328 const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input)); | 344 const __m128 x_min = |
329 const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input)); | 345 _mm_min_ps(b_log2_a, *(reinterpret_cast<const __m128*>(max_input))); |
| 346 const __m128 x_max = |
| 347 _mm_max_ps(x_min, *(reinterpret_cast<const __m128*>(min_input))); |
330 // Compute n. | 348 // Compute n. |
331 static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, 0.5f, | 349 static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, 0.5f, |
332 0.5f}; | 350 0.5f}; |
333 const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half)); | 351 const __m128 x_minus_half = |
| 352 _mm_sub_ps(x_max, *(reinterpret_cast<const __m128*>(half))); |
334 const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); | 353 const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); |
335 // Compute 2^n. | 354 // Compute 2^n. |
336 static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = { | 355 static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = { |
337 127, 127, 127, 127}; | 356 127, 127, 127, 127}; |
338 static const int float_exponent_shift = 23; | 357 static const int float_exponent_shift = 23; |
339 const __m128i two_n_exponent = | 358 const __m128i two_n_exponent = |
340 _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias)); | 359 _mm_add_epi32(x_minus_half_floor, |
| 360 *(reinterpret_cast<const __m128i*>(float_exponent_bias))); |
341 const __m128 two_n = | 361 const __m128 two_n = |
342 _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift)); | 362 _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift)); |
343 // Compute y. | 363 // Compute y. |
344 const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor)); | 364 const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor)); |
345 // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. | 365 // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. |
346 static const ALIGN16_BEG float C2[4] ALIGN16_END = { | 366 static const ALIGN16_BEG float C2[4] ALIGN16_END = { |
347 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f}; | 367 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f}; |
348 static const ALIGN16_BEG float C1[4] ALIGN16_END = { | 368 static const ALIGN16_BEG float C1[4] ALIGN16_END = { |
349 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f}; | 369 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f}; |
350 static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f, | 370 static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f, |
351 1.0017247f, 1.0017247f}; | 371 1.0017247f, 1.0017247f}; |
352 const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2)); | 372 const __m128 exp2_y_0 = |
353 const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1)); | 373 _mm_mul_ps(y, *(reinterpret_cast<const __m128*>(C2))); |
| 374 const __m128 exp2_y_1 = |
| 375 _mm_add_ps(exp2_y_0, *(reinterpret_cast<const __m128*>(C1))); |
354 const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); | 376 const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); |
355 const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0)); | 377 const __m128 exp2_y = |
| 378 _mm_add_ps(exp2_y_2, *(reinterpret_cast<const __m128*>(C0))); |
356 | 379 |
357 // Combine parts. | 380 // Combine parts. |
358 a_exp_b = _mm_mul_ps(exp2_y, two_n); | 381 a_exp_b = _mm_mul_ps(exp2_y, two_n); |
359 } | 382 } |
360 return a_exp_b; | 383 return a_exp_b; |
361 } | 384 } |
362 | 385 |
363 static void OverdriveAndSuppressSSE2(AecCore* aec, | 386 static void OverdriveAndSuppressSSE2(AecCore* aec, |
364 float hNl[PART_LEN1], | 387 float hNl[PART_LEN1], |
365 const float hNlFb, | 388 const float hNlFb, |
(...skipping 346 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
712 void WebRtcAec_InitAec_SSE2(void) { | 735 void WebRtcAec_InitAec_SSE2(void) { |
713 WebRtcAec_FilterFar = FilterFarSSE2; | 736 WebRtcAec_FilterFar = FilterFarSSE2; |
714 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 737 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
715 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 738 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
716 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 739 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
717 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 740 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
718 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 741 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
719 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 742 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
720 WebRtcAec_WindowData = WindowDataSSE2; | 743 WebRtcAec_WindowData = WindowDataSSE2; |
721 } | 744 } |
OLD | NEW |