OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 477 matching lines...)
488 | 488 |
489 // Updates the following smoothed Power Spectral Densities (PSD): | 489 // Updates the following smoothed Power Spectral Densities (PSD): |
490 // - sd : near-end | 490 // - sd : near-end |
491 // - se : residual echo | 491 // - se : residual echo |
492 // - sx : far-end | 492 // - sx : far-end |
493 // - sde : cross-PSD of near-end and residual echo | 493 // - sde : cross-PSD of near-end and residual echo |
494 // - sxd : cross-PSD of near-end and far-end | 494 // - sxd : cross-PSD of near-end and far-end |
495 // | 495 // |
496 // In addition to updating the PSDs, the filter divergence state is determined, | 496 // In addition to updating the PSDs, the filter divergence state is determined, |
497 // upon which actions are taken. | 497 // upon which actions are taken. |
498 static void SmoothedPSD(AecCore* aec, | 498 static void SmoothedPSD(int mult, |
| 499 bool extended_filter_enabled, |
499 float efw[2][PART_LEN1], | 500 float efw[2][PART_LEN1], |
500 float dfw[2][PART_LEN1], | 501 float dfw[2][PART_LEN1], |
501 float xfw[2][PART_LEN1], | 502 float xfw[2][PART_LEN1], |
| 503 CoherenceState* coherence_state, |
| 504 short* filter_divergence_state, |
502 int* extreme_filter_divergence) { | 505 int* extreme_filter_divergence) { |
503 // Power estimate smoothing coefficients. | 506 // Power estimate smoothing coefficients. |
504 const float* ptrGCoh = | 507 const float* ptrGCoh = |
505 aec->extended_filter_enabled | 508 extended_filter_enabled |
506 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] | 509 ? WebRtcAec_kExtendedSmoothingCoefficients[mult - 1] |
507 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; | 510 : WebRtcAec_kNormalSmoothingCoefficients[mult - 1]; |
508 int i; | 511 int i; |
509 float sdSum = 0, seSum = 0; | 512 float sdSum = 0, seSum = 0; |
510 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); | 513 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); |
511 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); | 514 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); |
512 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); | 515 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); |
513 __m128 vec_sdSum = _mm_set1_ps(0.0f); | 516 __m128 vec_sdSum = _mm_set1_ps(0.0f); |
514 __m128 vec_seSum = _mm_set1_ps(0.0f); | 517 __m128 vec_seSum = _mm_set1_ps(0.0f); |
515 | 518 |
516 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 519 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
517 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); | 520 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); |
518 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); | 521 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); |
519 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); | 522 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); |
520 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); | 523 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); |
521 const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); | 524 const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); |
522 const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); | 525 const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); |
523 __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0); | 526 __m128 vec_sd = |
524 __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0); | 527 _mm_mul_ps(_mm_loadu_ps(&coherence_state->sd[i]), vec_GCoh0); |
525 __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0); | 528 __m128 vec_se = |
| 529 _mm_mul_ps(_mm_loadu_ps(&coherence_state->se[i]), vec_GCoh0); |
| 530 __m128 vec_sx = |
| 531 _mm_mul_ps(_mm_loadu_ps(&coherence_state->sx[i]), vec_GCoh0); |
526 __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); | 532 __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); |
527 __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); | 533 __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); |
528 __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); | 534 __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); |
529 vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); | 535 vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); |
530 vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); | 536 vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); |
531 vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); | 537 vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); |
532 vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); | 538 vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); |
533 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); | 539 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); |
534 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); | 540 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); |
535 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); | 541 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); |
536 _mm_storeu_ps(&aec->sd[i], vec_sd); | 542 _mm_storeu_ps(&coherence_state->sd[i], vec_sd); |
537 _mm_storeu_ps(&aec->se[i], vec_se); | 543 _mm_storeu_ps(&coherence_state->se[i], vec_se); |
538 _mm_storeu_ps(&aec->sx[i], vec_sx); | 544 _mm_storeu_ps(&coherence_state->sx[i], vec_sx); |
539 | 545 |
540 { | 546 { |
541 const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); | 547 const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]); |
542 const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | 548 const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]); |
543 __m128 vec_a = | 549 __m128 vec_a = |
544 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 550 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
545 __m128 vec_b = | 551 __m128 vec_b = |
546 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 552 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
547 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); | 553 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); |
548 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); | 554 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); |
549 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | 555 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |
550 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | 556 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |
551 vec_dfwefw0011 = | 557 vec_dfwefw0011 = |
552 _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1)); | 558 _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1)); |
553 vec_dfwefw0110 = | 559 vec_dfwefw0110 = |
554 _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0)); | 560 _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0)); |
555 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); | 561 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); |
556 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); | 562 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); |
557 _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | 563 _mm_storeu_ps(&coherence_state->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |
558 _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | 564 _mm_storeu_ps(&coherence_state->sde[i + 2][0], |
| 565 _mm_unpackhi_ps(vec_a, vec_b)); |
559 } | 566 } |
560 | 567 |
561 { | 568 { |
562 const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | 569 const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]); |
563 const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | 570 const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]); |
564 __m128 vec_a = | 571 __m128 vec_a = |
565 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 572 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
566 __m128 vec_b = | 573 __m128 vec_b = |
567 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 574 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
568 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); | 575 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); |
569 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); | 576 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); |
570 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | 577 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |
571 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | 578 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |
572 vec_dfwxfw0011 = | 579 vec_dfwxfw0011 = |
573 _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1)); | 580 _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1)); |
574 vec_dfwxfw0110 = | 581 vec_dfwxfw0110 = |
575 _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); | 582 _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); |
576 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); | 583 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); |
577 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); | 584 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); |
578 _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | 585 _mm_storeu_ps(&coherence_state->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |
579 _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | 586 _mm_storeu_ps(&coherence_state->sxd[i + 2][0], |
| 587 _mm_unpackhi_ps(vec_a, vec_b)); |
580 } | 588 } |
581 | 589 |
582 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); | 590 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); |
583 vec_seSum = _mm_add_ps(vec_seSum, vec_se); | 591 vec_seSum = _mm_add_ps(vec_seSum, vec_se); |
584 } | 592 } |
585 | 593 |
586 _mm_add_ps_4x1(vec_sdSum, &sdSum); | 594 _mm_add_ps_4x1(vec_sdSum, &sdSum); |
587 _mm_add_ps_4x1(vec_seSum, &seSum); | 595 _mm_add_ps_4x1(vec_seSum, &seSum); |
588 | 596 |
589 for (; i < PART_LEN1; i++) { | 597 for (; i < PART_LEN1; i++) { |
590 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + | 598 coherence_state->sd[i] = |
591 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); | 599 ptrGCoh[0] * coherence_state->sd[i] + |
592 aec->se[i] = ptrGCoh[0] * aec->se[i] + | 600 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); |
593 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); | 601 coherence_state->se[i] = |
| 602 ptrGCoh[0] * coherence_state->se[i] + |
| 603 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); |
594 // We threshold here to protect against the ill-effects of a zero farend. | 604 // We threshold here to protect against the ill-effects of a zero farend. |
595 // The threshold is not arbitrarily chosen, but balances protection and | 605 // The threshold is not arbitrarily chosen, but balances protection and |
596 // adverse interaction with the algorithm's tuning. | 606 // adverse interaction with the algorithm's tuning. |
597 // TODO(bjornv): investigate further why this is so sensitive. | 607 // TODO(bjornv): investigate further why this is so sensitive. |
598 aec->sx[i] = ptrGCoh[0] * aec->sx[i] + | 608 coherence_state->sx[i] = |
599 ptrGCoh[1] * WEBRTC_SPL_MAX( | 609 ptrGCoh[0] * coherence_state->sx[i] + |
600 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], | 610 ptrGCoh[1] * |
601 WebRtcAec_kMinFarendPSD); | 611 WEBRTC_SPL_MAX(xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], |
| 612 WebRtcAec_kMinFarendPSD); |
602 | 613 |
603 aec->sde[i][0] = | 614 coherence_state->sde[i][0] = |
604 ptrGCoh[0] * aec->sde[i][0] + | 615 ptrGCoh[0] * coherence_state->sde[i][0] + |
605 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); | 616 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); |
606 aec->sde[i][1] = | 617 coherence_state->sde[i][1] = |
607 ptrGCoh[0] * aec->sde[i][1] + | 618 ptrGCoh[0] * coherence_state->sde[i][1] + |
608 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); | 619 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); |
609 | 620 |
610 aec->sxd[i][0] = | 621 coherence_state->sxd[i][0] = |
611 ptrGCoh[0] * aec->sxd[i][0] + | 622 ptrGCoh[0] * coherence_state->sxd[i][0] + |
612 ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); | 623 ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); |
613 aec->sxd[i][1] = | 624 coherence_state->sxd[i][1] = |
614 ptrGCoh[0] * aec->sxd[i][1] + | 625 ptrGCoh[0] * coherence_state->sxd[i][1] + |
615 ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); | 626 ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); |
616 | 627 |
617 sdSum += aec->sd[i]; | 628 sdSum += coherence_state->sd[i]; |
618 seSum += aec->se[i]; | 629 seSum += coherence_state->se[i]; |
619 } | 630 } |
620 | 631 |
621 // Divergent filter safeguard update. | 632 // Divergent filter safeguard update. |
622 aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; | 633 *filter_divergence_state = |
| 634 (*filter_divergence_state ? 1.05f : 1.0f) * seSum > sdSum; |
623 | 635 |
624 // Signal extreme filter divergence if the error is significantly larger | 636 // Signal extreme filter divergence if the error is significantly larger |
625 // than the nearend (13 dB). | 637 // than the nearend (13 dB). |
626 *extreme_filter_divergence = (seSum > (19.95f * sdSum)); | 638 *extreme_filter_divergence = (seSum > (19.95f * sdSum)); |
627 } | 639 } |
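
For reference, the per-bin update carried out by both the SSE2 loop and the scalar tail above is a first-order exponential smoothing of the (cross-)power spectra, and the two tests at the end compare the summed residual-echo PSD against the summed near-end PSD. The 1.05 factor adds a little hysteresis so the diverge flag does not toggle on every block, and 19.95 is simply 13 dB expressed linearly (10^(13/10) ≈ 19.95). Written out as a sketch, with g0 = ptrGCoh[0], g1 = ptrGCoh[1], and D, E, X the near-end, residual-echo, and far-end spectra (dfw, efw, xfw):

    sd[k] <- g0 * sd[k] + g1 * (D_re(k)^2 + D_im(k)^2)
    se[k] <- g0 * se[k] + g1 * (E_re(k)^2 + E_im(k)^2)
    sx[k] <- g0 * sx[k] + g1 * max(X_re(k)^2 + X_im(k)^2, kMinFarendPSD)
    sde[k], sxd[k]: the same smoothing applied componentwise to the cross
                    spectra of D with E and X, exactly as in the scalar tail.

    filter_divergence_state   = (state ? 1.05 : 1.0) * sum(se) > sum(sd)
    extreme_filter_divergence = sum(se) > 19.95 * sum(sd)   // i.e. 13 dB above
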
628 | 640 |
629 // Window time domain data to be used by the fft. | 641 // Window time domain data to be used by the fft. |
630 static void WindowDataSSE2(float* x_windowed, const float* x) { | 642 static void WindowDataSSE2(float* x_windowed, const float* x) { |
631 int i; | 643 int i; |
632 for (i = 0; i < PART_LEN; i += 4) { | 644 for (i = 0; i < PART_LEN; i += 4) { |
(...skipping 26 matching lines...)
659 _mm_storeu_ps(&data_complex[0][i], vec_a); | 671 _mm_storeu_ps(&data_complex[0][i], vec_a); |
660 _mm_storeu_ps(&data_complex[1][i], vec_b); | 672 _mm_storeu_ps(&data_complex[1][i], vec_b); |
661 } | 673 } |
662 // fix beginning/end values | 674 // fix beginning/end values |
663 data_complex[1][0] = 0; | 675 data_complex[1][0] = 0; |
664 data_complex[1][PART_LEN] = 0; | 676 data_complex[1][PART_LEN] = 0; |
665 data_complex[0][0] = data[0]; | 677 data_complex[0][0] = data[0]; |
666 data_complex[0][PART_LEN] = data[1]; | 678 data_complex[0][PART_LEN] = data[1]; |
667 } | 679 } |
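
The boundary fix-ups above reflect the packed output layout of the real FFT used here: the DC and Nyquist bins are purely real, and their real parts are packed into data[0] and data[1]. A minimal scalar sketch of the same unpacking, assuming that layout (PART_LEN and PART_LEN1 as defined by this file's headers); this is a reference sketch, not the shipped scalar implementation:

    // Scalar reference for StoreAsComplex (sketch; assumes the packed
    // real-FFT layout where data[0] = Re(DC) and data[1] = Re(Nyquist)).
    static void StoreAsComplexScalar(const float* data,
                                     float data_complex[2][PART_LEN1]) {
      data_complex[0][0] = data[0];  // DC bin, purely real.
      data_complex[1][0] = 0;
      for (int i = 1; i < PART_LEN; i++) {
        data_complex[0][i] = data[2 * i];      // Real part of bin i.
        data_complex[1][i] = data[2 * i + 1];  // Imaginary part of bin i.
      }
      data_complex[0][PART_LEN] = data[1];  // Nyquist bin, purely real.
      data_complex[1][PART_LEN] = 0;
    }
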
668 | 680 |
669 static void SubbandCoherenceSSE2(AecCore* aec, | 681 static void SubbandCoherenceSSE2(int mult, |
| 682 bool extended_filter_enabled, |
670 float efw[2][PART_LEN1], | 683 float efw[2][PART_LEN1], |
671 float dfw[2][PART_LEN1], | 684 float dfw[2][PART_LEN1], |
672 float xfw[2][PART_LEN1], | 685 float xfw[2][PART_LEN1], |
673 float* fft, | 686 float* fft, |
674 float* cohde, | 687 float* cohde, |
675 float* cohxd, | 688 float* cohxd, |
| 689 CoherenceState* coherence_state, |
| 690 short* filter_divergence_state, |
676 int* extreme_filter_divergence) { | 691 int* extreme_filter_divergence) { |
677 int i; | 692 int i; |
678 | 693 |
679 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); | 694 SmoothedPSD(mult, extended_filter_enabled, efw, dfw, xfw, coherence_state, |
| 695 filter_divergence_state, extreme_filter_divergence); |
680 | 696 |
681 { | 697 { |
682 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); | 698 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); |
683 | 699 |
684 // Subband coherence | 700 // Subband coherence |
685 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 701 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
686 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); | 702 const __m128 vec_sd = _mm_loadu_ps(&coherence_state->sd[i]); |
687 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); | 703 const __m128 vec_se = _mm_loadu_ps(&coherence_state->se[i]); |
688 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); | 704 const __m128 vec_sx = _mm_loadu_ps(&coherence_state->sx[i]); |
689 const __m128 vec_sdse = | 705 const __m128 vec_sdse = |
690 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); | 706 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); |
691 const __m128 vec_sdsx = | 707 const __m128 vec_sdsx = |
692 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); | 708 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); |
693 const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); | 709 const __m128 vec_sde_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]); |
694 const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | 710 const __m128 vec_sde_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]); |
695 const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | 711 const __m128 vec_sxd_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]); |
696 const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | 712 const __m128 vec_sxd_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]); |
697 const __m128 vec_sde_0 = | 713 const __m128 vec_sde_0 = |
698 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 714 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
699 const __m128 vec_sde_1 = | 715 const __m128 vec_sde_1 = |
700 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 716 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
701 const __m128 vec_sxd_0 = | 717 const __m128 vec_sxd_0 = |
702 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 718 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
703 const __m128 vec_sxd_1 = | 719 const __m128 vec_sxd_1 = |
704 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 720 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
705 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); | 721 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); |
706 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); | 722 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); |
707 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); | 723 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); |
708 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); | 724 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); |
709 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); | 725 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); |
710 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); | 726 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); |
711 _mm_storeu_ps(&cohde[i], vec_cohde); | 727 _mm_storeu_ps(&cohde[i], vec_cohde); |
712 _mm_storeu_ps(&cohxd[i], vec_cohxd); | 728 _mm_storeu_ps(&cohxd[i], vec_cohxd); |
713 } | 729 } |
714 | 730 |
715 // scalar code for the remaining items. | 731 // scalar code for the remaining items. |
716 for (; i < PART_LEN1; i++) { | 732 for (; i < PART_LEN1; i++) { |
717 cohde[i] = | 733 cohde[i] = (coherence_state->sde[i][0] * coherence_state->sde[i][0] + |
718 (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / | 734 coherence_state->sde[i][1] * coherence_state->sde[i][1]) / |
719 (aec->sd[i] * aec->se[i] + 1e-10f); | 735 (coherence_state->sd[i] * coherence_state->se[i] + 1e-10f); |
720 cohxd[i] = | 736 cohxd[i] = (coherence_state->sxd[i][0] * coherence_state->sxd[i][0] + |
721 (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / | 737 coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) / |
722 (aec->sx[i] * aec->sd[i] + 1e-10f); | 738 (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f); |
723 } | 739 } |
724 } | 740 } |
725 } | 741 } |
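
Both the vectorized loop and the scalar tail above evaluate the same per-bin magnitude-squared coherence, with the 1e-10 term guarding against division by zero:

    cohde[k] = (sde_re[k]^2 + sde_im[k]^2) / (sd[k] * se[k] + 1e-10)
    cohxd[k] = (sxd_re[k]^2 + sxd_im[k]^2) / (sx[k] * sd[k] + 1e-10)

cohde is the coherence between the near end and the residual echo, cohxd between the near end and the far end; both are (approximately) bounded by [0, 1] and are consumed downstream when deciding how strongly to suppress.
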
726 | 742 |
727 void WebRtcAec_InitAec_SSE2(void) { | 743 void WebRtcAec_InitAec_SSE2(void) { |
728 WebRtcAec_FilterFar = FilterFarSSE2; | 744 WebRtcAec_FilterFar = FilterFarSSE2; |
729 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 745 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
730 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 746 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
731 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 747 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
732 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 748 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
733 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 749 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
734 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 750 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
735 WebRtcAec_WindowData = WindowDataSSE2; | 751 WebRtcAec_WindowData = WindowDataSSE2; |
736 } | 752 } |
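
These assignments override the generic C kernels through the module's function pointers; the initializer itself performs no CPU detection, so the caller is expected to invoke it only when SSE2 is available. A hedged caller sketch, assuming the usual WebRTC CPU-feature helper (the real call site is in the generic AEC init code, outside this diff):

    // Hypothetical dispatch sketch; WebRtc_GetCPUInfo()/kSSE2 are assumed
    // helpers from the CPU features wrapper, not part of this file.
    #if defined(WEBRTC_ARCH_X86_FAMILY)
      if (WebRtc_GetCPUInfo(kSSE2) > 0) {
        WebRtcAec_InitAec_SSE2();  // Swap in the vectorized kernels.
      }
    #endif
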
737 } // namespace webrtc | 753 } // namespace webrtc |