| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 /* | 11 /* |
| 12 * The core AEC algorithm, SSE2 version of speed-critical functions. | 12 * The core AEC algorithm, SSE2 version of speed-critical functions. |
| 13 */ | 13 */ |
| 14 | 14 |
| 15 #include <emmintrin.h> | 15 #include <emmintrin.h> |
| 16 #include <math.h> | 16 #include <math.h> |
| 17 #include <string.h> // memset | 17 #include <string.h> // memset |
| 18 | 18 |
| 19 extern "C" { |
| 19 #include "webrtc/common_audio/signal_processing/include/signal_processing_librar
y.h" | 20 #include "webrtc/common_audio/signal_processing/include/signal_processing_librar
y.h" |
| 21 } |
| 20 #include "webrtc/modules/audio_processing/aec/aec_common.h" | 22 #include "webrtc/modules/audio_processing/aec/aec_common.h" |
| 21 #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" | 23 #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" |
| 24 extern "C" { |
| 22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 25 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" |
| 26 } |
| 23 | 27 |
| 24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { | 28 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { |
| 25 return aRe * bRe - aIm * bIm; | 29 return aRe * bRe - aIm * bIm; |
| 26 } | 30 } |
| 27 | 31 |
| 28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { | 32 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { |
| 29 return aRe * bIm + aIm * bRe; | 33 return aRe * bIm + aIm * bRe; |
| 30 } | 34 } |
| 31 | 35 |
| 32 static void FilterFarSSE2(int num_partitions, | 36 static void FilterFarSSE2(int num_partitions, |
| (...skipping 216 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 249 // compensate the fact that the exponent has been shifted in the top/ | 253 // compensate the fact that the exponent has been shifted in the top/ |
| 250 // fractional part and finally getting rid of the implicit leading one | 254 // fractional part and finally getting rid of the implicit leading one |
| 251 // from the mantissa by substracting it out. | 255 // from the mantissa by substracting it out. |
| 252 static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = { | 256 static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = { |
| 253 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; | 257 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; |
| 254 static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = { | 258 static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = { |
| 255 0x43800000, 0x43800000, 0x43800000, 0x43800000}; | 259 0x43800000, 0x43800000, 0x43800000, 0x43800000}; |
| 256 static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = { | 260 static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = { |
| 257 0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000}; | 261 0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000}; |
| 258 static const int shift_exponent_into_top_mantissa = 8; | 262 static const int shift_exponent_into_top_mantissa = 8; |
| 259 const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask)); | 263 const __m128 two_n = |
| 264 _mm_and_ps(a, *(reinterpret_cast<const __m128*>(float_exponent_mask))); |
| 260 const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32( | 265 const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32( |
| 261 _mm_castps_si128(two_n), shift_exponent_into_top_mantissa)); | 266 _mm_castps_si128(two_n), shift_exponent_into_top_mantissa)); |
| 262 const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent)); | 267 const __m128 n_0 = |
| 263 const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one)); | 268 _mm_or_ps(n_1, *(reinterpret_cast<const __m128*>(eight_biased_exponent))); |
| 269 const __m128 n = |
| 270 _mm_sub_ps(n_0, *(reinterpret_cast<const __m128*>(implicit_leading_one))); |
| 264 | 271 |
| 265 // Compute y. | 272 // Compute y. |
| 266 static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = { | 273 static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = { |
| 267 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; | 274 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; |
| 268 static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = { | 275 static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = { |
| 269 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; | 276 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; |
| 270 const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask)); | 277 const __m128 mantissa = |
| 278 _mm_and_ps(a, *(reinterpret_cast<const __m128*>(mantissa_mask))); |
| 271 const __m128 y = | 279 const __m128 y = |
| 272 _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one)); | 280 _mm_or_ps(mantissa, |
| 281 *(reinterpret_cast<const __m128*>(zero_biased_exponent_is_one))); |
| 273 | 282 |
| 274 // Approximate log2(y) ~= (y - 1) * pol5(y). | 283 // Approximate log2(y) ~= (y - 1) * pol5(y). |
| 275 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 | 284 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 |
| 276 static const ALIGN16_BEG float ALIGN16_END C5[4] = { | 285 static const ALIGN16_BEG float ALIGN16_END C5[4] = { |
| 277 -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; | 286 -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; |
| 278 static const ALIGN16_BEG float ALIGN16_END C4[4] = { | 287 static const ALIGN16_BEG float ALIGN16_END C4[4] = { |
| 279 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; | 288 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; |
| 280 static const ALIGN16_BEG float ALIGN16_END C3[4] = { | 289 static const ALIGN16_BEG float ALIGN16_END C3[4] = { |
| 281 -1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; | 290 -1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; |
| 282 static const ALIGN16_BEG float ALIGN16_END C2[4] = {2.5988452f, 2.5988452f, | 291 static const ALIGN16_BEG float ALIGN16_END C2[4] = {2.5988452f, 2.5988452f, |
| 283 2.5988452f, 2.5988452f}; | 292 2.5988452f, 2.5988452f}; |
| 284 static const ALIGN16_BEG float ALIGN16_END C1[4] = { | 293 static const ALIGN16_BEG float ALIGN16_END C1[4] = { |
| 285 -3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; | 294 -3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; |
| 286 static const ALIGN16_BEG float ALIGN16_END C0[4] = {3.1157899f, 3.1157899f, | 295 static const ALIGN16_BEG float ALIGN16_END C0[4] = {3.1157899f, 3.1157899f, |
| 287 3.1157899f, 3.1157899f}; | 296 3.1157899f, 3.1157899f}; |
| 288 const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5)); | 297 const __m128 pol5_y_0 = |
| 289 const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4)); | 298 _mm_mul_ps(y, *(reinterpret_cast<const __m128*>(C5))); |
| 299 const __m128 pol5_y_1 = |
| 300 _mm_add_ps(pol5_y_0, *(reinterpret_cast<const __m128*>(C4))); |
| 290 const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); | 301 const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); |
| 291 const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3)); | 302 const __m128 pol5_y_3 = |
| 303 _mm_add_ps(pol5_y_2, *(reinterpret_cast<const __m128*>(C3))); |
| 292 const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); | 304 const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); |
| 293 const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2)); | 305 const __m128 pol5_y_5 = |
| 306 _mm_add_ps(pol5_y_4, *(reinterpret_cast<const __m128*>(C2))); |
| 294 const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); | 307 const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); |
| 295 const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1)); | 308 const __m128 pol5_y_7 = |
| 309 _mm_add_ps(pol5_y_6, *(reinterpret_cast<const __m128*>(C1))); |
| 296 const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); | 310 const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); |
| 297 const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0)); | 311 const __m128 pol5_y = |
| 312 _mm_add_ps(pol5_y_8, *(reinterpret_cast<const __m128*>(C0))); |
| 298 const __m128 y_minus_one = | 313 const __m128 y_minus_one = |
| 299 _mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one)); | 314 _mm_sub_ps(y, |
| 315 *(reinterpret_cast<const __m128*>(zero_biased_exponent_is_one))); |
| 300 const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y); | 316 const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y); |
| 301 | 317 |
| 302 // Combine parts. | 318 // Combine parts. |
| 303 log2_a = _mm_add_ps(n, log2_y); | 319 log2_a = _mm_add_ps(n, log2_y); |
| 304 } | 320 } |
| 305 | 321 |
| 306 // b * log2(a) | 322 // b * log2(a) |
| 307 b_log2_a = _mm_mul_ps(b, log2_a); | 323 b_log2_a = _mm_mul_ps(b, log2_a); |
| 308 | 324 |
| 309 // Calculate exp2(x), x = b * log2(a). | 325 // Calculate exp2(x), x = b * log2(a). |
| 310 { | 326 { |
| 311 // To calculate 2^x, we decompose x like this: | 327 // To calculate 2^x, we decompose x like this: |
| 312 // x = n + y | 328 // x = n + y |
| 313 // n is an integer, the value of x - 0.5 rounded down, therefore | 329 // n is an integer, the value of x - 0.5 rounded down, therefore |
| 314 // y is in the [0.5, 1.5) range | 330 // y is in the [0.5, 1.5) range |
| 315 // | 331 // |
| 316 // 2^x = 2^n * 2^y | 332 // 2^x = 2^n * 2^y |
| 317 // 2^n can be evaluated by playing with float representation. | 333 // 2^n can be evaluated by playing with float representation. |
| 318 // 2^y in a small range can be approximated, this code uses an order two | 334 // 2^y in a small range can be approximated, this code uses an order two |
| 319 // polynomial approximation. The coefficients have been estimated | 335 // polynomial approximation. The coefficients have been estimated |
| 320 // with the Remez algorithm and the resulting polynomial has a | 336 // with the Remez algorithm and the resulting polynomial has a |
| 321 // maximum relative error of 0.17%. | 337 // maximum relative error of 0.17%. |
| 322 | 338 |
| 323 // To avoid over/underflow, we reduce the range of input to ]-127, 129]. | 339 // To avoid over/underflow, we reduce the range of input to ]-127, 129]. |
| 324 static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, | 340 static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, |
| 325 129.f, 129.f}; | 341 129.f, 129.f}; |
| 326 static const ALIGN16_BEG float min_input[4] ALIGN16_END = { | 342 static const ALIGN16_BEG float min_input[4] ALIGN16_END = { |
| 327 -126.99999f, -126.99999f, -126.99999f, -126.99999f}; | 343 -126.99999f, -126.99999f, -126.99999f, -126.99999f}; |
| 328 const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input)); | 344 const __m128 x_min = |
| 329 const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input)); | 345 _mm_min_ps(b_log2_a, *(reinterpret_cast<const __m128*>(max_input))); |
| 346 const __m128 x_max = |
| 347 _mm_max_ps(x_min, *(reinterpret_cast<const __m128*>(min_input))); |
| 330 // Compute n. | 348 // Compute n. |
| 331 static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, 0.5f, | 349 static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, 0.5f, |
| 332 0.5f}; | 350 0.5f}; |
| 333 const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half)); | 351 const __m128 x_minus_half = |
| 352 _mm_sub_ps(x_max, *(reinterpret_cast<const __m128*>(half))); |
| 334 const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); | 353 const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); |
| 335 // Compute 2^n. | 354 // Compute 2^n. |
| 336 static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = { | 355 static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = { |
| 337 127, 127, 127, 127}; | 356 127, 127, 127, 127}; |
| 338 static const int float_exponent_shift = 23; | 357 static const int float_exponent_shift = 23; |
| 339 const __m128i two_n_exponent = | 358 const __m128i two_n_exponent = |
| 340 _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias)); | 359 _mm_add_epi32(x_minus_half_floor, |
| 360 *(reinterpret_cast<const __m128i*>(float_exponent_bias))); |
| 341 const __m128 two_n = | 361 const __m128 two_n = |
| 342 _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift)); | 362 _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift)); |
| 343 // Compute y. | 363 // Compute y. |
| 344 const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor)); | 364 const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor)); |
| 345 // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. | 365 // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. |
| 346 static const ALIGN16_BEG float C2[4] ALIGN16_END = { | 366 static const ALIGN16_BEG float C2[4] ALIGN16_END = { |
| 347 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f}; | 367 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f}; |
| 348 static const ALIGN16_BEG float C1[4] ALIGN16_END = { | 368 static const ALIGN16_BEG float C1[4] ALIGN16_END = { |
| 349 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f}; | 369 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f}; |
| 350 static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f, | 370 static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f, |
| 351 1.0017247f, 1.0017247f}; | 371 1.0017247f, 1.0017247f}; |
| 352 const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2)); | 372 const __m128 exp2_y_0 = |
| 353 const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1)); | 373 _mm_mul_ps(y, *(reinterpret_cast<const __m128*>(C2))); |
| 374 const __m128 exp2_y_1 = |
| 375 _mm_add_ps(exp2_y_0, *(reinterpret_cast<const __m128*>(C1))); |
| 354 const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); | 376 const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); |
| 355 const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0)); | 377 const __m128 exp2_y = |
| 378 _mm_add_ps(exp2_y_2, *(reinterpret_cast<const __m128*>(C0))); |
| 356 | 379 |
| 357 // Combine parts. | 380 // Combine parts. |
| 358 a_exp_b = _mm_mul_ps(exp2_y, two_n); | 381 a_exp_b = _mm_mul_ps(exp2_y, two_n); |
| 359 } | 382 } |
| 360 return a_exp_b; | 383 return a_exp_b; |
| 361 } | 384 } |
| 362 | 385 |
| 363 static void OverdriveAndSuppressSSE2(AecCore* aec, | 386 static void OverdriveAndSuppressSSE2(AecCore* aec, |
| 364 float hNl[PART_LEN1], | 387 float hNl[PART_LEN1], |
| 365 const float hNlFb, | 388 const float hNlFb, |
| (...skipping 346 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 712 void WebRtcAec_InitAec_SSE2(void) { | 735 void WebRtcAec_InitAec_SSE2(void) { |
| 713 WebRtcAec_FilterFar = FilterFarSSE2; | 736 WebRtcAec_FilterFar = FilterFarSSE2; |
| 714 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 737 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
| 715 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 738 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
| 716 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 739 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
| 717 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 740 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
| 718 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 741 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
| 719 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 742 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
| 720 WebRtcAec_WindowData = WindowDataSSE2; | 743 WebRtcAec_WindowData = WindowDataSSE2; |
| 721 } | 744 } |
| OLD | NEW |