| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 357 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 368 const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); | 368 const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); |
| 369 const __m128 exp2_y = | 369 const __m128 exp2_y = |
| 370 _mm_add_ps(exp2_y_2, *(reinterpret_cast<const __m128*>(C0))); | 370 _mm_add_ps(exp2_y_2, *(reinterpret_cast<const __m128*>(C0))); |
| 371 | 371 |
| 372 // Combine parts. | 372 // Combine parts. |
| 373 a_exp_b = _mm_mul_ps(exp2_y, two_n); | 373 a_exp_b = _mm_mul_ps(exp2_y, two_n); |
| 374 } | 374 } |
| 375 return a_exp_b; | 375 return a_exp_b; |
| 376 } | 376 } |
| 377 | 377 |
| 378 static void OverdriveAndSuppressSSE2(float overdrive_scaling, | 378 static void OverdriveSSE2(float overdrive_scaling, |
| 379 float hNl[PART_LEN1], | 379 float hNlFb, |
| 380 const float hNlFb, | 380 float hNl[PART_LEN1]) { |
| 381 float efw[2][PART_LEN1]) { | |
| 382 int i; | 381 int i; |
| 383 const __m128 vec_hNlFb = _mm_set1_ps(hNlFb); | 382 const __m128 vec_hNlFb = _mm_set1_ps(hNlFb); |
| 384 const __m128 vec_one = _mm_set1_ps(1.0f); | 383 const __m128 vec_one = _mm_set1_ps(1.0f); |
| 385 const __m128 vec_minus_one = _mm_set1_ps(-1.0f); | |
| 386 const __m128 vec_overdrive_scaling = _mm_set1_ps(overdrive_scaling); | 384 const __m128 vec_overdrive_scaling = _mm_set1_ps(overdrive_scaling); |
| 387 // vectorized code (four at once) | 385 // vectorized code (four at once) |
| 388 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 386 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
| 389 // Weight subbands | 387 // Weight subbands |
| 390 __m128 vec_hNl = _mm_loadu_ps(&hNl[i]); | 388 __m128 vec_hNl = _mm_loadu_ps(&hNl[i]); |
| 391 const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]); | 389 const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]); |
| 392 const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb); | 390 const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb); |
| 393 const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb); | 391 const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb); |
| 394 const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve); | 392 const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve); |
| 395 const __m128 vec_one_weightCurve_hNl = | 393 const __m128 vec_one_weightCurve_hNl = |
| 396 _mm_mul_ps(vec_one_weightCurve, vec_hNl); | 394 _mm_mul_ps(vec_one_weightCurve, vec_hNl); |
| 397 const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl); | 395 const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl); |
| 398 const __m128 vec_if1 = _mm_and_ps( | 396 const __m128 vec_if1 = _mm_and_ps( |
| 399 bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl)); | 397 bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl)); |
| 400 vec_hNl = _mm_or_ps(vec_if0, vec_if1); | 398 vec_hNl = _mm_or_ps(vec_if0, vec_if1); |
| 401 | 399 |
| 402 { | 400 const __m128 vec_overDriveCurve = |
| 403 const __m128 vec_overDriveCurve = | 401 _mm_loadu_ps(&WebRtcAec_overDriveCurve[i]); |
| 404 _mm_loadu_ps(&WebRtcAec_overDriveCurve[i]); | 402 const __m128 vec_overDriveSm_overDriveCurve = |
| 405 const __m128 vec_overDriveSm_overDriveCurve = | 403 _mm_mul_ps(vec_overdrive_scaling, vec_overDriveCurve); |
| 406 _mm_mul_ps(vec_overdrive_scaling, vec_overDriveCurve); | 404 vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve); |
| 407 vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve); | 405 _mm_storeu_ps(&hNl[i], vec_hNl); |
| 408 _mm_storeu_ps(&hNl[i], vec_hNl); | |
| 409 } | |
| 410 | |
| 411 // Suppress error signal | |
| 412 { | |
| 413 __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]); | |
| 414 __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]); | |
| 415 vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl); | |
| 416 vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl); | |
| 417 | |
| 418 // Ooura fft returns incorrect sign on imaginary component. It matters | |
| 419 // here because we are making an additive change with comfort noise. | |
| 420 vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one); | |
| 421 _mm_storeu_ps(&efw[0][i], vec_efw_re); | |
| 422 _mm_storeu_ps(&efw[1][i], vec_efw_im); | |
| 423 } | |
| 424 } | 406 } |
| 425 // scalar code for the remaining items. | 407 // scalar code for the remaining items. |
| 426 for (; i < PART_LEN1; i++) { | 408 for (; i < PART_LEN1; i++) { |
| 427 // Weight subbands | 409 // Weight subbands |
| 428 if (hNl[i] > hNlFb) { | 410 if (hNl[i] > hNlFb) { |
| 429 hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + | 411 hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + |
| 430 (1 - WebRtcAec_weightCurve[i]) * hNl[i]; | 412 (1 - WebRtcAec_weightCurve[i]) * hNl[i]; |
| 431 } | 413 } |
| 432 hNl[i] = powf(hNl[i], overdrive_scaling * WebRtcAec_overDriveCurve[i]); | 414 hNl[i] = powf(hNl[i], overdrive_scaling * WebRtcAec_overDriveCurve[i]); |
| 415 } |
| 416 } |
| 433 | 417 |
| 418 static void SuppressSSE2(const float hNl[PART_LEN1], float efw[2][PART_LEN1]) { |
| 419 int i; |
| 420 const __m128 vec_minus_one = _mm_set1_ps(-1.0f); |
| 421 // vectorized code (four at once) |
| 422 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
| 423 // Suppress error signal |
| 424 __m128 vec_hNl = _mm_loadu_ps(&hNl[i]); |
| 425 __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]); |
| 426 __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]); |
| 427 vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl); |
| 428 vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl); |
| 429 |
| 430 // Ooura fft returns incorrect sign on imaginary component. It matters |
| 431 // here because we are making an additive change with comfort noise. |
| 432 vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one); |
| 433 _mm_storeu_ps(&efw[0][i], vec_efw_re); |
| 434 _mm_storeu_ps(&efw[1][i], vec_efw_im); |
| 435 } |
| 436 // scalar code for the remaining items. |
| 437 for (; i < PART_LEN1; i++) { |
| 434 // Suppress error signal | 438 // Suppress error signal |
| 435 efw[0][i] *= hNl[i]; | 439 efw[0][i] *= hNl[i]; |
| 436 efw[1][i] *= hNl[i]; | 440 efw[1][i] *= hNl[i]; |
| 437 | 441 |
| 438 // Ooura fft returns incorrect sign on imaginary component. It matters | 442 // Ooura fft returns incorrect sign on imaginary component. It matters |
| 439 // here because we are making an additive change with comfort noise. | 443 // here because we are making an additive change with comfort noise. |
| 440 efw[1][i] *= -1; | 444 efw[1][i] *= -1; |
| 441 } | 445 } |
| 442 } | 446 } |
| 443 | 447 |
| (...skipping 284 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 728 coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) / | 732 coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) / |
| 729 (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f); | 733 (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f); |
| 730 } | 734 } |
| 731 } | 735 } |
| 732 } | 736 } |
| 733 | 737 |
| 734 void WebRtcAec_InitAec_SSE2(void) { | 738 void WebRtcAec_InitAec_SSE2(void) { |
| 735 WebRtcAec_FilterFar = FilterFarSSE2; | 739 WebRtcAec_FilterFar = FilterFarSSE2; |
| 736 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 740 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
| 737 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 741 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
| 738 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 742 WebRtcAec_Overdrive = OverdriveSSE2; |
| 743 WebRtcAec_Suppress = SuppressSSE2; |
| 739 WebRtcAec_ComputeCoherence = ComputeCoherenceSSE2; | 744 WebRtcAec_ComputeCoherence = ComputeCoherenceSSE2; |
| 740 WebRtcAec_UpdateCoherenceSpectra = UpdateCoherenceSpectraSSE2; | 745 WebRtcAec_UpdateCoherenceSpectra = UpdateCoherenceSpectraSSE2; |
| 741 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 746 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
| 742 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 747 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
| 743 WebRtcAec_WindowData = WindowDataSSE2; | 748 WebRtcAec_WindowData = WindowDataSSE2; |
| 744 } | 749 } |
| 745 } // namespace webrtc | 750 } // namespace webrtc |
| OLD | NEW |