OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 421 matching lines...) Loading... |
432 } | 432 } |
433 } | 433 } |
434 | 434 |
435 __inline static void _mm_add_ps_4x1(__m128 sum, float *dst) { | 435 __inline static void _mm_add_ps_4x1(__m128 sum, float *dst) { |
436 // A+B C+D | 436 // A+B C+D |
437 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); | 437 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); |
438 // A+B+C+D A+B+C+D | 438 // A+B+C+D A+B+C+D |
439 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); | 439 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); |
440 _mm_store_ss(dst, sum); | 440 _mm_store_ss(dst, sum); |
441 } | 441 } |
442 static int PartitionDelay(const AecCore* aec) { | 442 |
| 443 static int PartitionDelaySSE2(const AecCore* aec) { |
443 // Measures the energy in each filter partition and returns the partition with | 444 // Measures the energy in each filter partition and returns the partition with |
444 // highest energy. | 445 // highest energy. |
445 // TODO(bjornv): Spread computational cost by computing one partition per | 446 // TODO(bjornv): Spread computational cost by computing one partition per |
446 // block? | 447 // block? |
447 float wfEnMax = 0; | 448 float wfEnMax = 0; |
448 int i; | 449 int i; |
449 int delay = 0; | 450 int delay = 0; |
450 | 451 |
451 for (i = 0; i < aec->num_partitions; i++) { | 452 for (i = 0; i < aec->num_partitions; i++) { |
452 int j; | 453 int j; |
(...skipping 159 matching lines...) Loading... |
612 | 613 |
613 if (aec->divergeState) | 614 if (aec->divergeState) |
614 memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1); | 615 memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1); |
615 | 616 |
616 // Reset if error is significantly larger than nearend (13 dB). | 617 // Reset if error is significantly larger than nearend (13 dB). |
617 if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum)) | 618 if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum)) |
618 memset(aec->wfBuf, 0, sizeof(aec->wfBuf)); | 619 memset(aec->wfBuf, 0, sizeof(aec->wfBuf)); |
619 } | 620 } |
620 | 621 |
621 // Window time domain data to be used by the fft. | 622 // Window time domain data to be used by the fft. |
622 __inline static void WindowData(float* x_windowed, const float* x) { | 623 static void WindowDataSSE2(float* x_windowed, const float* x) { |
623 int i; | 624 int i; |
624 for (i = 0; i < PART_LEN; i += 4) { | 625 for (i = 0; i < PART_LEN; i += 4) { |
625 const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); | 626 const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); |
626 const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); | 627 const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); |
627 const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); | 628 const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); |
628 // A B C D | 629 // A B C D |
629 __m128 vec_sqrtHanning_rev = | 630 __m128 vec_sqrtHanning_rev = |
630 _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); | 631 _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); |
631 // D C B A | 632 // D C B A |
632 vec_sqrtHanning_rev = | 633 vec_sqrtHanning_rev = |
633 _mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev, | 634 _mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev, |
634 _MM_SHUFFLE(0, 1, 2, 3)); | 635 _MM_SHUFFLE(0, 1, 2, 3)); |
635 _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); | 636 _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); |
636 _mm_storeu_ps(&x_windowed[PART_LEN + i], | 637 _mm_storeu_ps(&x_windowed[PART_LEN + i], |
637 _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); | 638 _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); |
638 } | 639 } |
639 } | 640 } |
640 | 641 |
641 // Puts fft output data into a complex valued array. | 642 // Puts fft output data into a complex valued array. |
642 __inline static void StoreAsComplex(const float* data, | 643 static void StoreAsComplexSSE2(const float* data, |
643 float data_complex[2][PART_LEN1]) { | 644 float data_complex[2][PART_LEN1]) { |
644 int i; | 645 int i; |
645 for (i = 0; i < PART_LEN; i += 4) { | 646 for (i = 0; i < PART_LEN; i += 4) { |
646 const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); | 647 const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); |
647 const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); | 648 const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); |
648 const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4, | 649 const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4, |
649 _MM_SHUFFLE(2, 0, 2, 0)); | 650 _MM_SHUFFLE(2, 0, 2, 0)); |
650 const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4, | 651 const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4, |
651 _MM_SHUFFLE(3, 1, 3, 1)); | 652 _MM_SHUFFLE(3, 1, 3, 1)); |
652 _mm_storeu_ps(&data_complex[0][i], vec_a); | 653 _mm_storeu_ps(&data_complex[0][i], vec_a); |
653 _mm_storeu_ps(&data_complex[1][i], vec_b); | 654 _mm_storeu_ps(&data_complex[1][i], vec_b); |
654 } | 655 } |
655 // fix beginning/end values | 656 // fix beginning/end values |
656 data_complex[1][0] = 0; | 657 data_complex[1][0] = 0; |
657 data_complex[1][PART_LEN] = 0; | 658 data_complex[1][PART_LEN] = 0; |
658 data_complex[0][0] = data[0]; | 659 data_complex[0][0] = data[0]; |
659 data_complex[0][PART_LEN] = data[1]; | 660 data_complex[0][PART_LEN] = data[1]; |
660 } | 661 } |
661 | 662 |
662 static void SubbandCoherenceSSE2(AecCore* aec, | 663 static void SubbandCoherenceSSE2(AecCore* aec, |
663 float efw[2][PART_LEN1], | 664 float efw[2][PART_LEN1], |
| 665 float dfw[2][PART_LEN1], |
664 float xfw[2][PART_LEN1], | 666 float xfw[2][PART_LEN1], |
665 float* fft, | 667 float* fft, |
666 float* cohde, | 668 float* cohde, |
667 float* cohxd) { | 669 float* cohxd) { |
668 float dfw[2][PART_LEN1]; | |
669 int i; | 670 int i; |
670 | 671 |
671 if (aec->delayEstCtr == 0) | |
672 aec->delayIdx = PartitionDelay(aec); | |
673 | |
674 // Use delayed far. | |
675 memcpy(xfw, | |
676 aec->xfwBuf + aec->delayIdx * PART_LEN1, | |
677 sizeof(xfw[0][0]) * 2 * PART_LEN1); | |
678 | |
679 // Windowed near fft | |
680 WindowData(fft, aec->dBuf); | |
681 aec_rdft_forward_128(fft); | |
682 StoreAsComplex(fft, dfw); | |
683 | |
684 // Windowed error fft | |
685 WindowData(fft, aec->eBuf); | |
686 aec_rdft_forward_128(fft); | |
687 StoreAsComplex(fft, efw); | |
688 | |
689 SmoothedPSD(aec, efw, dfw, xfw); | 672 SmoothedPSD(aec, efw, dfw, xfw); |
690 | 673 |
691 { | 674 { |
692 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); | 675 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); |
693 | 676 |
694 // Subband coherence | 677 // Subband coherence |
695 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 678 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
696 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); | 679 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); |
697 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); | 680 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); |
698 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); | 681 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); |
(...skipping 34 matching lines...) Loading... |
733 } | 716 } |
734 } | 717 } |
735 } | 718 } |
736 | 719 |
737 void WebRtcAec_InitAec_SSE2(void) { | 720 void WebRtcAec_InitAec_SSE2(void) { |
738 WebRtcAec_FilterFar = FilterFarSSE2; | 721 WebRtcAec_FilterFar = FilterFarSSE2; |
739 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 722 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
740 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 723 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
741 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 724 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
742 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 725 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
| 726 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
| 727 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
| 728 WebRtcAec_WindowData = WindowDataSSE2; |
743 } | 729 } |
OLD | NEW |