OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 357 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
368 const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); | 368 const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); |
369 const __m128 exp2_y = | 369 const __m128 exp2_y = |
370 _mm_add_ps(exp2_y_2, *(reinterpret_cast<const __m128*>(C0))); | 370 _mm_add_ps(exp2_y_2, *(reinterpret_cast<const __m128*>(C0))); |
371 | 371 |
372 // Combine parts. | 372 // Combine parts. |
373 a_exp_b = _mm_mul_ps(exp2_y, two_n); | 373 a_exp_b = _mm_mul_ps(exp2_y, two_n); |
374 } | 374 } |
375 return a_exp_b; | 375 return a_exp_b; |
376 } | 376 } |
377 | 377 |
378 static void OverdriveAndSuppressSSE2(float overdrive_scaling, | 378 static void OverdriveSSE2(float overdrive_scaling, |
379 float hNl[PART_LEN1], | 379 float hNlFb, |
380 const float hNlFb, | 380 float hNl[PART_LEN1]) { |
381 float efw[2][PART_LEN1]) { | |
382 int i; | 381 int i; |
383 const __m128 vec_hNlFb = _mm_set1_ps(hNlFb); | 382 const __m128 vec_hNlFb = _mm_set1_ps(hNlFb); |
384 const __m128 vec_one = _mm_set1_ps(1.0f); | 383 const __m128 vec_one = _mm_set1_ps(1.0f); |
385 const __m128 vec_minus_one = _mm_set1_ps(-1.0f); | |
386 const __m128 vec_overdrive_scaling = _mm_set1_ps(overdrive_scaling); | 384 const __m128 vec_overdrive_scaling = _mm_set1_ps(overdrive_scaling); |
387 // vectorized code (four at once) | 385 // vectorized code (four at once) |
388 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 386 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
389 // Weight subbands | 387 // Weight subbands |
390 __m128 vec_hNl = _mm_loadu_ps(&hNl[i]); | 388 __m128 vec_hNl = _mm_loadu_ps(&hNl[i]); |
391 const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]); | 389 const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]); |
392 const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb); | 390 const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb); |
393 const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb); | 391 const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb); |
394 const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve); | 392 const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve); |
395 const __m128 vec_one_weightCurve_hNl = | 393 const __m128 vec_one_weightCurve_hNl = |
396 _mm_mul_ps(vec_one_weightCurve, vec_hNl); | 394 _mm_mul_ps(vec_one_weightCurve, vec_hNl); |
397 const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl); | 395 const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl); |
398 const __m128 vec_if1 = _mm_and_ps( | 396 const __m128 vec_if1 = _mm_and_ps( |
399 bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl)); | 397 bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl)); |
400 vec_hNl = _mm_or_ps(vec_if0, vec_if1); | 398 vec_hNl = _mm_or_ps(vec_if0, vec_if1); |
401 | 399 |
402 { | 400 const __m128 vec_overDriveCurve = |
403 const __m128 vec_overDriveCurve = | 401 _mm_loadu_ps(&WebRtcAec_overDriveCurve[i]); |
404 _mm_loadu_ps(&WebRtcAec_overDriveCurve[i]); | 402 const __m128 vec_overDriveSm_overDriveCurve = |
405 const __m128 vec_overDriveSm_overDriveCurve = | 403 _mm_mul_ps(vec_overdrive_scaling, vec_overDriveCurve); |
406 _mm_mul_ps(vec_overdrive_scaling, vec_overDriveCurve); | 404 vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve); |
407 vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve); | 405 _mm_storeu_ps(&hNl[i], vec_hNl); |
408 _mm_storeu_ps(&hNl[i], vec_hNl); | |
409 } | |
410 | |
411 // Suppress error signal | |
412 { | |
413 __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]); | |
414 __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]); | |
415 vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl); | |
416 vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl); | |
417 | |
418 // Ooura fft returns incorrect sign on imaginary component. It matters | |
419 // here because we are making an additive change with comfort noise. | |
420 vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one); | |
421 _mm_storeu_ps(&efw[0][i], vec_efw_re); | |
422 _mm_storeu_ps(&efw[1][i], vec_efw_im); | |
423 } | |
424 } | 406 } |
425 // scalar code for the remaining items. | 407 // scalar code for the remaining items. |
426 for (; i < PART_LEN1; i++) { | 408 for (; i < PART_LEN1; i++) { |
427 // Weight subbands | 409 // Weight subbands |
428 if (hNl[i] > hNlFb) { | 410 if (hNl[i] > hNlFb) { |
429 hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + | 411 hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + |
430 (1 - WebRtcAec_weightCurve[i]) * hNl[i]; | 412 (1 - WebRtcAec_weightCurve[i]) * hNl[i]; |
431 } | 413 } |
432 hNl[i] = powf(hNl[i], overdrive_scaling * WebRtcAec_overDriveCurve[i]); | 414 hNl[i] = powf(hNl[i], overdrive_scaling * WebRtcAec_overDriveCurve[i]); |
| 415 } |
| 416 } |
433 | 417 |
| 418 static void SuppressSSE2(const float hNl[PART_LEN1], float efw[2][PART_LEN1]) { |
| 419 int i; |
| 420 const __m128 vec_minus_one = _mm_set1_ps(-1.0f); |
| 421 // vectorized code (four at once) |
| 422 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
| 423 // Suppress error signal |
| 424 __m128 vec_hNl = _mm_loadu_ps(&hNl[i]); |
| 425 __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]); |
| 426 __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]); |
| 427 vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl); |
| 428 vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl); |
| 429 |
| 430 // Ooura fft returns incorrect sign on imaginary component. It matters |
| 431 // here because we are making an additive change with comfort noise. |
| 432 vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one); |
| 433 _mm_storeu_ps(&efw[0][i], vec_efw_re); |
| 434 _mm_storeu_ps(&efw[1][i], vec_efw_im); |
| 435 } |
| 436 // scalar code for the remaining items. |
| 437 for (; i < PART_LEN1; i++) { |
434 // Suppress error signal | 438 // Suppress error signal |
435 efw[0][i] *= hNl[i]; | 439 efw[0][i] *= hNl[i]; |
436 efw[1][i] *= hNl[i]; | 440 efw[1][i] *= hNl[i]; |
437 | 441 |
438 // Ooura fft returns incorrect sign on imaginary component. It matters | 442 // Ooura fft returns incorrect sign on imaginary component. It matters |
439 // here because we are making an additive change with comfort noise. | 443 // here because we are making an additive change with comfort noise. |
440 efw[1][i] *= -1; | 444 efw[1][i] *= -1; |
441 } | 445 } |
442 } | 446 } |
443 | 447 |
(...skipping 284 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
728 coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) / | 732 coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) / |
729 (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f); | 733 (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f); |
730 } | 734 } |
731 } | 735 } |
732 } | 736 } |
733 | 737 |
734 void WebRtcAec_InitAec_SSE2(void) { | 738 void WebRtcAec_InitAec_SSE2(void) { |
735 WebRtcAec_FilterFar = FilterFarSSE2; | 739 WebRtcAec_FilterFar = FilterFarSSE2; |
736 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 740 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
737 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 741 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
738 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 742 WebRtcAec_Overdrive = OverdriveSSE2; |
| 743 WebRtcAec_Suppress = SuppressSSE2; |
739 WebRtcAec_ComputeCoherence = ComputeCoherenceSSE2; | 744 WebRtcAec_ComputeCoherence = ComputeCoherenceSSE2; |
740 WebRtcAec_UpdateCoherenceSpectra = UpdateCoherenceSpectraSSE2; | 745 WebRtcAec_UpdateCoherenceSpectra = UpdateCoherenceSpectraSSE2; |
741 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 746 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
742 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 747 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
743 WebRtcAec_WindowData = WindowDataSSE2; | 748 WebRtcAec_WindowData = WindowDataSSE2; |
744 } | 749 } |
745 } // namespace webrtc | 750 } // namespace webrtc |
OLD | NEW |