| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 477 matching lines...) | (...skipping 477 matching lines...) |
| 488 | 488 |
| 489 // Updates the following smoothed Power Spectral Densities (PSD): | 489 // Updates the following smoothed Power Spectral Densities (PSD): |
| 490 // - sd : near-end | 490 // - sd : near-end |
| 491 // - se : residual echo | 491 // - se : residual echo |
| 492 // - sx : far-end | 492 // - sx : far-end |
| 493 // - sde : cross-PSD of near-end and residual echo | 493 // - sde : cross-PSD of near-end and residual echo |
| 494 // - sxd : cross-PSD of near-end and far-end | 494 // - sxd : cross-PSD of near-end and far-end |
| 495 // | 495 // |
| 496 // In addition to updating the PSDs, the filter divergence state is also | 496 // In addition to updating the PSDs, the filter divergence state is also |
| 497 // determined, upon which actions are taken. | 497 // determined, upon which actions are taken. |
| 498 static void SmoothedPSD(AecCore* aec, | 498 static void SmoothedPSD(int mult, |
| 499 bool extended_filter_enabled, |
| 499 float efw[2][PART_LEN1], | 500 float efw[2][PART_LEN1], |
| 500 float dfw[2][PART_LEN1], | 501 float dfw[2][PART_LEN1], |
| 501 float xfw[2][PART_LEN1], | 502 float xfw[2][PART_LEN1], |
| 503 CoherenceState* coherence_state, |
| 504 short* filter_divergence_state, |
| 502 int* extreme_filter_divergence) { | 505 int* extreme_filter_divergence) { |
| 503 // Power estimate smoothing coefficients. | 506 // Power estimate smoothing coefficients. |
| 504 const float* ptrGCoh = | 507 const float* ptrGCoh = |
| 505 aec->extended_filter_enabled | 508 extended_filter_enabled |
| 506 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] | 509 ? WebRtcAec_kExtendedSmoothingCoefficients[mult - 1] |
| 507 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; | 510 : WebRtcAec_kNormalSmoothingCoefficients[mult - 1]; |
| 508 int i; | 511 int i; |
| 509 float sdSum = 0, seSum = 0; | 512 float sdSum = 0, seSum = 0; |
| 510 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); | 513 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); |
| 511 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); | 514 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); |
| 512 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); | 515 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); |
| 513 __m128 vec_sdSum = _mm_set1_ps(0.0f); | 516 __m128 vec_sdSum = _mm_set1_ps(0.0f); |
| 514 __m128 vec_seSum = _mm_set1_ps(0.0f); | 517 __m128 vec_seSum = _mm_set1_ps(0.0f); |
| 515 | 518 |
| 516 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 519 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
| 517 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); | 520 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); |
| 518 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); | 521 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); |
| 519 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); | 522 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); |
| 520 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); | 523 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); |
| 521 const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); | 524 const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); |
| 522 const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); | 525 const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); |
| 523 __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0); | 526 __m128 vec_sd = |
| 524 __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0); | 527 _mm_mul_ps(_mm_loadu_ps(&coherence_state->sd[i]), vec_GCoh0); |
| 525 __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0); | 528 __m128 vec_se = |
| 529 _mm_mul_ps(_mm_loadu_ps(&coherence_state->se[i]), vec_GCoh0); |
| 530 __m128 vec_sx = |
| 531 _mm_mul_ps(_mm_loadu_ps(&coherence_state->sx[i]), vec_GCoh0); |
| 526 __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); | 532 __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); |
| 527 __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); | 533 __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); |
| 528 __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); | 534 __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); |
| 529 vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); | 535 vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); |
| 530 vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); | 536 vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); |
| 531 vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); | 537 vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); |
| 532 vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); | 538 vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); |
| 533 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); | 539 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); |
| 534 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); | 540 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); |
| 535 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); | 541 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); |
| 536 _mm_storeu_ps(&aec->sd[i], vec_sd); | 542 _mm_storeu_ps(&coherence_state->sd[i], vec_sd); |
| 537 _mm_storeu_ps(&aec->se[i], vec_se); | 543 _mm_storeu_ps(&coherence_state->se[i], vec_se); |
| 538 _mm_storeu_ps(&aec->sx[i], vec_sx); | 544 _mm_storeu_ps(&coherence_state->sx[i], vec_sx); |
| 539 | 545 |
| 540 { | 546 { |
| 541 const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); | 547 const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]); |
| 542 const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | 548 const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]); |
| 543 __m128 vec_a = | 549 __m128 vec_a = |
| 544 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 550 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
| 545 __m128 vec_b = | 551 __m128 vec_b = |
| 546 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 552 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
| 547 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); | 553 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); |
| 548 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); | 554 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); |
| 549 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | 555 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |
| 550 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | 556 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |
| 551 vec_dfwefw0011 = | 557 vec_dfwefw0011 = |
| 552 _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1)); | 558 _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1)); |
| 553 vec_dfwefw0110 = | 559 vec_dfwefw0110 = |
| 554 _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0)); | 560 _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0)); |
| 555 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); | 561 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); |
| 556 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); | 562 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); |
| 557 _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | 563 _mm_storeu_ps(&coherence_state->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |
| 558 _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | 564 _mm_storeu_ps(&coherence_state->sde[i + 2][0], |
| 565 _mm_unpackhi_ps(vec_a, vec_b)); |
| 559 } | 566 } |
| 560 | 567 |
| 561 { | 568 { |
| 562 const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | 569 const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]); |
| 563 const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | 570 const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]); |
| 564 __m128 vec_a = | 571 __m128 vec_a = |
| 565 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 572 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
| 566 __m128 vec_b = | 573 __m128 vec_b = |
| 567 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 574 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
| 568 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); | 575 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); |
| 569 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); | 576 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); |
| 570 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | 577 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |
| 571 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | 578 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |
| 572 vec_dfwxfw0011 = | 579 vec_dfwxfw0011 = |
| 573 _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1)); | 580 _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1)); |
| 574 vec_dfwxfw0110 = | 581 vec_dfwxfw0110 = |
| 575 _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); | 582 _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); |
| 576 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); | 583 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); |
| 577 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); | 584 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); |
| 578 _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | 585 _mm_storeu_ps(&coherence_state->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |
| 579 _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | 586 _mm_storeu_ps(&coherence_state->sxd[i + 2][0], |
| 587 _mm_unpackhi_ps(vec_a, vec_b)); |
| 580 } | 588 } |
| 581 | 589 |
| 582 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); | 590 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); |
| 583 vec_seSum = _mm_add_ps(vec_seSum, vec_se); | 591 vec_seSum = _mm_add_ps(vec_seSum, vec_se); |
| 584 } | 592 } |
| 585 | 593 |
| 586 _mm_add_ps_4x1(vec_sdSum, &sdSum); | 594 _mm_add_ps_4x1(vec_sdSum, &sdSum); |
| 587 _mm_add_ps_4x1(vec_seSum, &seSum); | 595 _mm_add_ps_4x1(vec_seSum, &seSum); |
| 588 | 596 |
| 589 for (; i < PART_LEN1; i++) { | 597 for (; i < PART_LEN1; i++) { |
| 590 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + | 598 coherence_state->sd[i] = |
| 591 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); | 599 ptrGCoh[0] * coherence_state->sd[i] + |
| 592 aec->se[i] = ptrGCoh[0] * aec->se[i] + | 600 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); |
| 593 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); | 601 coherence_state->se[i] = |
| 602 ptrGCoh[0] * coherence_state->se[i] + |
| 603 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); |
| 594 // We threshold here to protect against the ill-effects of a zero farend. | 604 // We threshold here to protect against the ill-effects of a zero farend. |
| 595 // The threshold is not arbitrarily chosen, but balances protection and | 605 // The threshold is not arbitrarily chosen, but balances protection and |
| 596 // adverse interaction with the algorithm's tuning. | 606 // adverse interaction with the algorithm's tuning. |
| 597 // TODO(bjornv): investigate further why this is so sensitive. | 607 // TODO(bjornv): investigate further why this is so sensitive. |
| 598 aec->sx[i] = ptrGCoh[0] * aec->sx[i] + | 608 coherence_state->sx[i] = |
| 599 ptrGCoh[1] * WEBRTC_SPL_MAX( | 609 ptrGCoh[0] * coherence_state->sx[i] + |
| 600 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], | 610 ptrGCoh[1] * |
| 601 WebRtcAec_kMinFarendPSD); | 611 WEBRTC_SPL_MAX(xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], |
| 612 WebRtcAec_kMinFarendPSD); |
| 602 | 613 |
| 603 aec->sde[i][0] = | 614 coherence_state->sde[i][0] = |
| 604 ptrGCoh[0] * aec->sde[i][0] + | 615 ptrGCoh[0] * coherence_state->sde[i][0] + |
| 605 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); | 616 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); |
| 606 aec->sde[i][1] = | 617 coherence_state->sde[i][1] = |
| 607 ptrGCoh[0] * aec->sde[i][1] + | 618 ptrGCoh[0] * coherence_state->sde[i][1] + |
| 608 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); | 619 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); |
| 609 | 620 |
| 610 aec->sxd[i][0] = | 621 coherence_state->sxd[i][0] = |
| 611 ptrGCoh[0] * aec->sxd[i][0] + | 622 ptrGCoh[0] * coherence_state->sxd[i][0] + |
| 612 ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); | 623 ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); |
| 613 aec->sxd[i][1] = | 624 coherence_state->sxd[i][1] = |
| 614 ptrGCoh[0] * aec->sxd[i][1] + | 625 ptrGCoh[0] * coherence_state->sxd[i][1] + |
| 615 ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); | 626 ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); |
| 616 | 627 |
| 617 sdSum += aec->sd[i]; | 628 sdSum += coherence_state->sd[i]; |
| 618 seSum += aec->se[i]; | 629 seSum += coherence_state->se[i]; |
| 619 } | 630 } |
| 620 | 631 |
| 621 // Divergent filter safeguard update. | 632 // Divergent filter safeguard update. |
| 622 aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; | 633 *filter_divergence_state = |
| 634 (*filter_divergence_state ? 1.05f : 1.0f) * seSum > sdSum; |
| 623 | 635 |
| 624 // Signal extreme filter divergence if the error is significantly larger | 636 // Signal extreme filter divergence if the error is significantly larger |
| 625 // than the nearend (13 dB). | 637 // than the nearend (13 dB). |
| 626 *extreme_filter_divergence = (seSum > (19.95f * sdSum)); | 638 *extreme_filter_divergence = (seSum > (19.95f * sdSum)); |
| 627 } | 639 } |
| 628 | 640 |
| 629 // Window time domain data to be used by the fft. | 641 // Window time domain data to be used by the fft. |
| 630 static void WindowDataSSE2(float* x_windowed, const float* x) { | 642 static void WindowDataSSE2(float* x_windowed, const float* x) { |
| 631 int i; | 643 int i; |
| 632 for (i = 0; i < PART_LEN; i += 4) { | 644 for (i = 0; i < PART_LEN; i += 4) { |
| (...skipping 26 matching lines...) | (...skipping 26 matching lines...) |
| 659 _mm_storeu_ps(&data_complex[0][i], vec_a); | 671 _mm_storeu_ps(&data_complex[0][i], vec_a); |
| 660 _mm_storeu_ps(&data_complex[1][i], vec_b); | 672 _mm_storeu_ps(&data_complex[1][i], vec_b); |
| 661 } | 673 } |
| 662 // fix beginning/end values | 674 // fix beginning/end values |
| 663 data_complex[1][0] = 0; | 675 data_complex[1][0] = 0; |
| 664 data_complex[1][PART_LEN] = 0; | 676 data_complex[1][PART_LEN] = 0; |
| 665 data_complex[0][0] = data[0]; | 677 data_complex[0][0] = data[0]; |
| 666 data_complex[0][PART_LEN] = data[1]; | 678 data_complex[0][PART_LEN] = data[1]; |
| 667 } | 679 } |
| 668 | 680 |
| 669 static void SubbandCoherenceSSE2(AecCore* aec, | 681 static void SubbandCoherenceSSE2(int mult, |
| 682 bool extended_filter_enabled, |
| 670 float efw[2][PART_LEN1], | 683 float efw[2][PART_LEN1], |
| 671 float dfw[2][PART_LEN1], | 684 float dfw[2][PART_LEN1], |
| 672 float xfw[2][PART_LEN1], | 685 float xfw[2][PART_LEN1], |
| 673 float* fft, | 686 float* fft, |
| 674 float* cohde, | 687 float* cohde, |
| 675 float* cohxd, | 688 float* cohxd, |
| 689 CoherenceState* coherence_state, |
| 690 short* filter_divergence_state, |
| 676 int* extreme_filter_divergence) { | 691 int* extreme_filter_divergence) { |
| 677 int i; | 692 int i; |
| 678 | 693 |
| 679 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); | 694 SmoothedPSD(mult, extended_filter_enabled, efw, dfw, xfw, coherence_state, |
| 695 filter_divergence_state, extreme_filter_divergence); |
| 680 | 696 |
| 681 { | 697 { |
| 682 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); | 698 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); |
| 683 | 699 |
| 684 // Subband coherence | 700 // Subband coherence |
| 685 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 701 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
| 686 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); | 702 const __m128 vec_sd = _mm_loadu_ps(&coherence_state->sd[i]); |
| 687 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); | 703 const __m128 vec_se = _mm_loadu_ps(&coherence_state->se[i]); |
| 688 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); | 704 const __m128 vec_sx = _mm_loadu_ps(&coherence_state->sx[i]); |
| 689 const __m128 vec_sdse = | 705 const __m128 vec_sdse = |
| 690 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); | 706 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); |
| 691 const __m128 vec_sdsx = | 707 const __m128 vec_sdsx = |
| 692 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); | 708 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); |
| 693 const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); | 709 const __m128 vec_sde_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]); |
| 694 const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | 710 const __m128 vec_sde_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]); |
| 695 const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | 711 const __m128 vec_sxd_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]); |
| 696 const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | 712 const __m128 vec_sxd_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]); |
| 697 const __m128 vec_sde_0 = | 713 const __m128 vec_sde_0 = |
| 698 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 714 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
| 699 const __m128 vec_sde_1 = | 715 const __m128 vec_sde_1 = |
| 700 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 716 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
| 701 const __m128 vec_sxd_0 = | 717 const __m128 vec_sxd_0 = |
| 702 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 718 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
| 703 const __m128 vec_sxd_1 = | 719 const __m128 vec_sxd_1 = |
| 704 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 720 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
| 705 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); | 721 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); |
| 706 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); | 722 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); |
| 707 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); | 723 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); |
| 708 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); | 724 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); |
| 709 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); | 725 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); |
| 710 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); | 726 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); |
| 711 _mm_storeu_ps(&cohde[i], vec_cohde); | 727 _mm_storeu_ps(&cohde[i], vec_cohde); |
| 712 _mm_storeu_ps(&cohxd[i], vec_cohxd); | 728 _mm_storeu_ps(&cohxd[i], vec_cohxd); |
| 713 } | 729 } |
| 714 | 730 |
| 715 // scalar code for the remaining items. | 731 // scalar code for the remaining items. |
| 716 for (; i < PART_LEN1; i++) { | 732 for (; i < PART_LEN1; i++) { |
| 717 cohde[i] = | 733 cohde[i] = (coherence_state->sde[i][0] * coherence_state->sde[i][0] + |
| 718 (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / | 734 coherence_state->sde[i][1] * coherence_state->sde[i][1]) / |
| 719 (aec->sd[i] * aec->se[i] + 1e-10f); | 735 (coherence_state->sd[i] * coherence_state->se[i] + 1e-10f); |
| 720 cohxd[i] = | 736 cohxd[i] = (coherence_state->sxd[i][0] * coherence_state->sxd[i][0] + |
| 721 (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / | 737 coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) / |
| 722 (aec->sx[i] * aec->sd[i] + 1e-10f); | 738 (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f); |
| 723 } | 739 } |
| 724 } | 740 } |
| 725 } | 741 } |
| 726 | 742 |
| 727 void WebRtcAec_InitAec_SSE2(void) { | 743 void WebRtcAec_InitAec_SSE2(void) { |
| 728 WebRtcAec_FilterFar = FilterFarSSE2; | 744 WebRtcAec_FilterFar = FilterFarSSE2; |
| 729 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 745 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
| 730 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 746 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
| 731 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 747 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
| 732 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 748 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
| 733 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 749 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
| 734 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 750 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
| 735 WebRtcAec_WindowData = WindowDataSSE2; | 751 WebRtcAec_WindowData = WindowDataSSE2; |
| 736 } | 752 } |
| 737 } // namespace webrtc | 753 } // namespace webrtc |
| OLD | NEW |