| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 431 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 442 } | 442 } |
| 443 | 443 |
| 444 __inline static void _mm_add_ps_4x1(__m128 sum, float* dst) { | 444 __inline static void _mm_add_ps_4x1(__m128 sum, float* dst) { |
| 445 // A+B C+D | 445 // A+B C+D |
| 446 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); | 446 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); |
| 447 // A+B+C+D A+B+C+D | 447 // A+B+C+D A+B+C+D |
| 448 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); | 448 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); |
| 449 _mm_store_ss(dst, sum); | 449 _mm_store_ss(dst, sum); |
| 450 } | 450 } |
| 451 | 451 |
| 452 static int PartitionDelaySSE2(const AecCore* aec) { | 452 static int PartitionDelaySSE2( |
| 453 int num_partitions, |
| 454 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { |
| 453 // Measures the energy in each filter partition and returns the partition with | 455 // Measures the energy in each filter partition and returns the partition with |
| 454 // highest energy. | 456 // highest energy. |
| 455 // TODO(bjornv): Spread computational cost by computing one partition per | 457 // TODO(bjornv): Spread computational cost by computing one partition per |
| 456 // block? | 458 // block? |
| 457 float wfEnMax = 0; | 459 float wfEnMax = 0; |
| 458 int i; | 460 int i; |
| 459 int delay = 0; | 461 int delay = 0; |
| 460 | 462 |
| 461 for (i = 0; i < aec->num_partitions; i++) { | 463 for (i = 0; i < num_partitions; i++) { |
| 462 int j; | 464 int j; |
| 463 int pos = i * PART_LEN1; | 465 int pos = i * PART_LEN1; |
| 464 float wfEn = 0; | 466 float wfEn = 0; |
| 465 __m128 vec_wfEn = _mm_set1_ps(0.0f); | 467 __m128 vec_wfEn = _mm_set1_ps(0.0f); |
| 466 // vectorized code (four at once) | 468 // vectorized code (four at once) |
| 467 for (j = 0; j + 3 < PART_LEN1; j += 4) { | 469 for (j = 0; j + 3 < PART_LEN1; j += 4) { |
| 468 const __m128 vec_wfBuf0 = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); | 470 const __m128 vec_wfBuf0 = _mm_loadu_ps(&h_fft_buf[0][pos + j]); |
| 469 const __m128 vec_wfBuf1 = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); | 471 const __m128 vec_wfBuf1 = _mm_loadu_ps(&h_fft_buf[1][pos + j]); |
| 470 vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0)); | 472 vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0)); |
| 471 vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1)); | 473 vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1)); |
| 472 } | 474 } |
| 473 _mm_add_ps_4x1(vec_wfEn, &wfEn); | 475 _mm_add_ps_4x1(vec_wfEn, &wfEn); |
| 474 | 476 |
| 475 // scalar code for the remaining items. | 477 // scalar code for the remaining items. |
| 476 for (; j < PART_LEN1; j++) { | 478 for (; j < PART_LEN1; j++) { |
| 477 wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] + | 479 wfEn += h_fft_buf[0][pos + j] * h_fft_buf[0][pos + j] + |
| 478 aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j]; | 480 h_fft_buf[1][pos + j] * h_fft_buf[1][pos + j]; |
| 479 } | 481 } |
| 480 | 482 |
| 481 if (wfEn > wfEnMax) { | 483 if (wfEn > wfEnMax) { |
| 482 wfEnMax = wfEn; | 484 wfEnMax = wfEn; |
| 483 delay = i; | 485 delay = i; |
| 484 } | 486 } |
| 485 } | 487 } |
| 486 return delay; | 488 return delay; |
| 487 } | 489 } |
| 488 | 490 |
| (...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 744 WebRtcAec_FilterFar = FilterFarSSE2; | 746 WebRtcAec_FilterFar = FilterFarSSE2; |
| 745 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 747 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
| 746 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 748 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
| 747 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 749 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
| 748 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 750 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
| 749 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 751 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
| 750 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 752 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
| 751 WebRtcAec_WindowData = WindowDataSSE2; | 753 WebRtcAec_WindowData = WindowDataSSE2; |
| 752 } | 754 } |
| 753 } // namespace webrtc | 755 } // namespace webrtc |
| OLD | NEW |