| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 421 matching lines...) | (...skipping 421 matching lines...) |
| 432 } | 432 } |
| 433 } | 433 } |
| 434 | 434 |
| 435 __inline static void _mm_add_ps_4x1(__m128 sum, float *dst) { | 435 __inline static void _mm_add_ps_4x1(__m128 sum, float *dst) { |
| 436 // A+B C+D | 436 // A+B C+D |
| 437 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); | 437 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); |
| 438 // A+B+C+D A+B+C+D | 438 // A+B+C+D A+B+C+D |
| 439 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); | 439 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); |
| 440 _mm_store_ss(dst, sum); | 440 _mm_store_ss(dst, sum); |
| 441 } | 441 } |
| 442 static int PartitionDelay(const AecCore* aec) { | 442 |
| | 443 static int PartitionDelaySSE2(const AecCore* aec) { |
| 443 // Measures the energy in each filter partition and returns the partition with | 444 // Measures the energy in each filter partition and returns the partition with |
| 444 // highest energy. | 445 // highest energy. |
| 445 // TODO(bjornv): Spread computational cost by computing one partition per | 446 // TODO(bjornv): Spread computational cost by computing one partition per |
| 446 // block? | 447 // block? |
| 447 float wfEnMax = 0; | 448 float wfEnMax = 0; |
| 448 int i; | 449 int i; |
| 449 int delay = 0; | 450 int delay = 0; |
| 450 | 451 |
| 451 for (i = 0; i < aec->num_partitions; i++) { | 452 for (i = 0; i < aec->num_partitions; i++) { |
| 452 int j; | 453 int j; |
| (...skipping 159 matching lines...) | (...skipping 159 matching lines...) |
| 612 | 613 |
| 613 if (aec->divergeState) | 614 if (aec->divergeState) |
| 614 memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1); | 615 memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1); |
| 615 | 616 |
| 616 // Reset if error is significantly larger than nearend (13 dB). | 617 // Reset if error is significantly larger than nearend (13 dB). |
| 617 if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum)) | 618 if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum)) |
| 618 memset(aec->wfBuf, 0, sizeof(aec->wfBuf)); | 619 memset(aec->wfBuf, 0, sizeof(aec->wfBuf)); |
| 619 } | 620 } |
| 620 | 621 |
| 621 // Window time domain data to be used by the fft. | 622 // Window time domain data to be used by the fft. |
| 622 __inline static void WindowData(float* x_windowed, const float* x) { | 623 static void WindowDataSSE2(float* x_windowed, const float* x) { |
| 623 int i; | 624 int i; |
| 624 for (i = 0; i < PART_LEN; i += 4) { | 625 for (i = 0; i < PART_LEN; i += 4) { |
| 625 const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); | 626 const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); |
| 626 const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); | 627 const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); |
| 627 const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); | 628 const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); |
| 628 // A B C D | 629 // A B C D |
| 629 __m128 vec_sqrtHanning_rev = | 630 __m128 vec_sqrtHanning_rev = |
| 630 _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); | 631 _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); |
| 631 // D C B A | 632 // D C B A |
| 632 vec_sqrtHanning_rev = | 633 vec_sqrtHanning_rev = |
| 633 _mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev, | 634 _mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev, |
| 634 _MM_SHUFFLE(0, 1, 2, 3)); | 635 _MM_SHUFFLE(0, 1, 2, 3)); |
| 635 _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); | 636 _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); |
| 636 _mm_storeu_ps(&x_windowed[PART_LEN + i], | 637 _mm_storeu_ps(&x_windowed[PART_LEN + i], |
| 637 _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); | 638 _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); |
| 638 } | 639 } |
| 639 } | 640 } |
| 640 | 641 |
| 641 // Puts fft output data into a complex valued array. | 642 // Puts fft output data into a complex valued array. |
| 642 __inline static void StoreAsComplex(const float* data, | 643 static void StoreAsComplexSSE2(const float* data, |
| 643 float data_complex[2][PART_LEN1]) { | 644 float data_complex[2][PART_LEN1]) { |
| 644 int i; | 645 int i; |
| 645 for (i = 0; i < PART_LEN; i += 4) { | 646 for (i = 0; i < PART_LEN; i += 4) { |
| 646 const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); | 647 const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); |
| 647 const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); | 648 const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); |
| 648 const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4, | 649 const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4, |
| 649 _MM_SHUFFLE(2, 0, 2, 0)); | 650 _MM_SHUFFLE(2, 0, 2, 0)); |
| 650 const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4, | 651 const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4, |
| 651 _MM_SHUFFLE(3, 1, 3, 1)); | 652 _MM_SHUFFLE(3, 1, 3, 1)); |
| 652 _mm_storeu_ps(&data_complex[0][i], vec_a); | 653 _mm_storeu_ps(&data_complex[0][i], vec_a); |
| 653 _mm_storeu_ps(&data_complex[1][i], vec_b); | 654 _mm_storeu_ps(&data_complex[1][i], vec_b); |
| 654 } | 655 } |
| 655 // fix beginning/end values | 656 // fix beginning/end values |
| 656 data_complex[1][0] = 0; | 657 data_complex[1][0] = 0; |
| 657 data_complex[1][PART_LEN] = 0; | 658 data_complex[1][PART_LEN] = 0; |
| 658 data_complex[0][0] = data[0]; | 659 data_complex[0][0] = data[0]; |
| 659 data_complex[0][PART_LEN] = data[1]; | 660 data_complex[0][PART_LEN] = data[1]; |
| 660 } | 661 } |
| 661 | 662 |
| 662 static void SubbandCoherenceSSE2(AecCore* aec, | 663 static void SubbandCoherenceSSE2(AecCore* aec, |
| 663 float efw[2][PART_LEN1], | 664 float efw[2][PART_LEN1], |
| | 665 float dfw[2][PART_LEN1], |
| 664 float xfw[2][PART_LEN1], | 666 float xfw[2][PART_LEN1], |
| 665 float* fft, | 667 float* fft, |
| 666 float* cohde, | 668 float* cohde, |
| 667 float* cohxd) { | 669 float* cohxd) { |
| 668 float dfw[2][PART_LEN1]; | |
| 669 int i; | 670 int i; |
| 670 | 671 |
| 671 if (aec->delayEstCtr == 0) | |
| 672 aec->delayIdx = PartitionDelay(aec); | |
| 673 | |
| 674 // Use delayed far. | |
| 675 memcpy(xfw, | |
| 676 aec->xfwBuf + aec->delayIdx * PART_LEN1, | |
| 677 sizeof(xfw[0][0]) * 2 * PART_LEN1); | |
| 678 | |
| 679 // Windowed near fft | |
| 680 WindowData(fft, aec->dBuf); | |
| 681 aec_rdft_forward_128(fft); | |
| 682 StoreAsComplex(fft, dfw); | |
| 683 | |
| 684 // Windowed error fft | |
| 685 WindowData(fft, aec->eBuf); | |
| 686 aec_rdft_forward_128(fft); | |
| 687 StoreAsComplex(fft, efw); | |
| 688 | |
| 689 SmoothedPSD(aec, efw, dfw, xfw); | 672 SmoothedPSD(aec, efw, dfw, xfw); |
| 690 | 673 |
| 691 { | 674 { |
| 692 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); | 675 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); |
| 693 | 676 |
| 694 // Subband coherence | 677 // Subband coherence |
| 695 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 678 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
| 696 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); | 679 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); |
| 697 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); | 680 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); |
| 698 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); | 681 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); |
| (...skipping 34 matching lines...) | (...skipping 34 matching lines...) |
| 733 } | 716 } |
| 734 } | 717 } |
| 735 } | 718 } |
| 736 | 719 |
| 737 void WebRtcAec_InitAec_SSE2(void) { | 720 void WebRtcAec_InitAec_SSE2(void) { |
| 738 WebRtcAec_FilterFar = FilterFarSSE2; | 721 WebRtcAec_FilterFar = FilterFarSSE2; |
| 739 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 722 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
| 740 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 723 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
| 741 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 724 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
| 742 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 725 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
| | 726 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
| | 727 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
| | 728 WebRtcAec_WindowData = WindowDataSSE2; |
| 743 } | 729 } |
| OLD | NEW |