OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 431 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
442 } | 442 } |
443 | 443 |
444 __inline static void _mm_add_ps_4x1(__m128 sum, float* dst) { | 444 __inline static void _mm_add_ps_4x1(__m128 sum, float* dst) { |
445 // A+B C+D | 445 // A+B C+D |
446 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); | 446 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); |
447 // A+B+C+D A+B+C+D | 447 // A+B+C+D A+B+C+D |
448 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); | 448 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); |
449 _mm_store_ss(dst, sum); | 449 _mm_store_ss(dst, sum); |
450 } | 450 } |
451 | 451 |
452 static int PartitionDelaySSE2(const AecCore* aec) { | 452 static int PartitionDelaySSE2( |
| 453 int num_partitions, |
| 454 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { |
453 // Measures the energy in each filter partition and returns the partition with | 455 // Measures the energy in each filter partition and returns the partition with |
454 // highest energy. | 456 // highest energy. |
455 // TODO(bjornv): Spread computational cost by computing one partition per | 457 // TODO(bjornv): Spread computational cost by computing one partition per |
456 // block? | 458 // block? |
457 float wfEnMax = 0; | 459 float wfEnMax = 0; |
458 int i; | 460 int i; |
459 int delay = 0; | 461 int delay = 0; |
460 | 462 |
461 for (i = 0; i < aec->num_partitions; i++) { | 463 for (i = 0; i < num_partitions; i++) { |
462 int j; | 464 int j; |
463 int pos = i * PART_LEN1; | 465 int pos = i * PART_LEN1; |
464 float wfEn = 0; | 466 float wfEn = 0; |
465 __m128 vec_wfEn = _mm_set1_ps(0.0f); | 467 __m128 vec_wfEn = _mm_set1_ps(0.0f); |
466 // vectorized code (four at once) | 468 // vectorized code (four at once) |
467 for (j = 0; j + 3 < PART_LEN1; j += 4) { | 469 for (j = 0; j + 3 < PART_LEN1; j += 4) { |
468 const __m128 vec_wfBuf0 = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); | 470 const __m128 vec_wfBuf0 = _mm_loadu_ps(&h_fft_buf[0][pos + j]); |
469 const __m128 vec_wfBuf1 = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); | 471 const __m128 vec_wfBuf1 = _mm_loadu_ps(&h_fft_buf[1][pos + j]); |
470 vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0)); | 472 vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0)); |
471 vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1)); | 473 vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1)); |
472 } | 474 } |
473 _mm_add_ps_4x1(vec_wfEn, &wfEn); | 475 _mm_add_ps_4x1(vec_wfEn, &wfEn); |
474 | 476 |
475 // scalar code for the remaining items. | 477 // scalar code for the remaining items. |
476 for (; j < PART_LEN1; j++) { | 478 for (; j < PART_LEN1; j++) { |
477 wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] + | 479 wfEn += h_fft_buf[0][pos + j] * h_fft_buf[0][pos + j] + |
478 aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j]; | 480 h_fft_buf[1][pos + j] * h_fft_buf[1][pos + j]; |
479 } | 481 } |
480 | 482 |
481 if (wfEn > wfEnMax) { | 483 if (wfEn > wfEnMax) { |
482 wfEnMax = wfEn; | 484 wfEnMax = wfEn; |
483 delay = i; | 485 delay = i; |
484 } | 486 } |
485 } | 487 } |
486 return delay; | 488 return delay; |
487 } | 489 } |
488 | 490 |
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
744 WebRtcAec_FilterFar = FilterFarSSE2; | 746 WebRtcAec_FilterFar = FilterFarSSE2; |
745 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 747 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
746 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 748 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
747 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 749 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
748 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 750 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
749 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 751 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
750 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 752 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
751 WebRtcAec_WindowData = WindowDataSSE2; | 753 WebRtcAec_WindowData = WindowDataSSE2; |
752 } | 754 } |
753 } // namespace webrtc | 755 } // namespace webrtc |
OLD | NEW |