Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(104)

Side by Side Diff: webrtc/modules/audio_processing/aec/aec_core_sse2.cc

Issue 1936173002: Changed the AEC SubbandCoherence function to not use the full aec state (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@RefactorAec1_CL
Patch Set: Fixed bad merge Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « webrtc/modules/audio_processing/aec/aec_core_neon.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 477 matching lines...) Expand 10 before | Expand all | Expand 10 after
488 488
489 // Updates the following smoothed Power Spectral Densities (PSD): 489 // Updates the following smoothed Power Spectral Densities (PSD):
490 // - sd : near-end 490 // - sd : near-end
491 // - se : residual echo 491 // - se : residual echo
492 // - sx : far-end 492 // - sx : far-end
493 // - sde : cross-PSD of near-end and residual echo 493 // - sde : cross-PSD of near-end and residual echo
494 // - sxd : cross-PSD of near-end and far-end 494 // - sxd : cross-PSD of near-end and far-end
495 // 495 //
496 // In addition to updating the PSDs, the filter divergence state is also 496 // In addition to updating the PSDs, the filter divergence state is also
497 // determined, upon which actions are taken. 497 // determined, upon which actions are taken.
498 static void SmoothedPSD(AecCore* aec, 498 static void SmoothedPSD(int mult,
499 bool extended_filter_enabled,
499 float efw[2][PART_LEN1], 500 float efw[2][PART_LEN1],
500 float dfw[2][PART_LEN1], 501 float dfw[2][PART_LEN1],
501 float xfw[2][PART_LEN1], 502 float xfw[2][PART_LEN1],
503 CoherenceState* coherence_state,
504 short* filter_divergence_state,
502 int* extreme_filter_divergence) { 505 int* extreme_filter_divergence) {
503 // Power estimate smoothing coefficients. 506 // Power estimate smoothing coefficients.
504 const float* ptrGCoh = 507 const float* ptrGCoh =
505 aec->extended_filter_enabled 508 extended_filter_enabled
506 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] 509 ? WebRtcAec_kExtendedSmoothingCoefficients[mult - 1]
507 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; 510 : WebRtcAec_kNormalSmoothingCoefficients[mult - 1];
508 int i; 511 int i;
509 float sdSum = 0, seSum = 0; 512 float sdSum = 0, seSum = 0;
510 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); 513 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD);
511 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); 514 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]);
512 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); 515 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]);
513 __m128 vec_sdSum = _mm_set1_ps(0.0f); 516 __m128 vec_sdSum = _mm_set1_ps(0.0f);
514 __m128 vec_seSum = _mm_set1_ps(0.0f); 517 __m128 vec_seSum = _mm_set1_ps(0.0f);
515 518
516 for (i = 0; i + 3 < PART_LEN1; i += 4) { 519 for (i = 0; i + 3 < PART_LEN1; i += 4) {
517 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); 520 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]);
518 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); 521 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]);
519 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); 522 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]);
520 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); 523 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]);
521 const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); 524 const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]);
522 const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); 525 const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]);
523 __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0); 526 __m128 vec_sd =
524 __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0); 527 _mm_mul_ps(_mm_loadu_ps(&coherence_state->sd[i]), vec_GCoh0);
525 __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0); 528 __m128 vec_se =
529 _mm_mul_ps(_mm_loadu_ps(&coherence_state->se[i]), vec_GCoh0);
530 __m128 vec_sx =
531 _mm_mul_ps(_mm_loadu_ps(&coherence_state->sx[i]), vec_GCoh0);
526 __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); 532 __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0);
527 __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); 533 __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0);
528 __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); 534 __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0);
529 vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); 535 vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1));
530 vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); 536 vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1));
531 vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); 537 vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1));
532 vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); 538 vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15);
533 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); 539 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1));
534 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); 540 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1));
535 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); 541 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1));
536 _mm_storeu_ps(&aec->sd[i], vec_sd); 542 _mm_storeu_ps(&coherence_state->sd[i], vec_sd);
537 _mm_storeu_ps(&aec->se[i], vec_se); 543 _mm_storeu_ps(&coherence_state->se[i], vec_se);
538 _mm_storeu_ps(&aec->sx[i], vec_sx); 544 _mm_storeu_ps(&coherence_state->sx[i], vec_sx);
539 545
540 { 546 {
541 const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); 547 const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]);
542 const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); 548 const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]);
543 __m128 vec_a = 549 __m128 vec_a =
544 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); 550 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0));
545 __m128 vec_b = 551 __m128 vec_b =
546 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); 552 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1));
547 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); 553 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0);
548 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); 554 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1);
549 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); 555 vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
550 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); 556 vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
551 vec_dfwefw0011 = 557 vec_dfwefw0011 =
552 _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1)); 558 _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1));
553 vec_dfwefw0110 = 559 vec_dfwefw0110 =
554 _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0)); 560 _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0));
555 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); 561 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1));
556 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); 562 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1));
557 _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); 563 _mm_storeu_ps(&coherence_state->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b));
558 _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); 564 _mm_storeu_ps(&coherence_state->sde[i + 2][0],
565 _mm_unpackhi_ps(vec_a, vec_b));
559 } 566 }
560 567
561 { 568 {
562 const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); 569 const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]);
563 const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); 570 const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]);
564 __m128 vec_a = 571 __m128 vec_a =
565 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); 572 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0));
566 __m128 vec_b = 573 __m128 vec_b =
567 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); 574 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1));
568 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); 575 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0);
569 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); 576 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1);
570 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); 577 vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
571 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); 578 vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
572 vec_dfwxfw0011 = 579 vec_dfwxfw0011 =
573 _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1)); 580 _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1));
574 vec_dfwxfw0110 = 581 vec_dfwxfw0110 =
575 _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); 582 _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0));
576 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); 583 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1));
577 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); 584 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1));
578 _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); 585 _mm_storeu_ps(&coherence_state->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b));
579 _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); 586 _mm_storeu_ps(&coherence_state->sxd[i + 2][0],
587 _mm_unpackhi_ps(vec_a, vec_b));
580 } 588 }
581 589
582 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); 590 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd);
583 vec_seSum = _mm_add_ps(vec_seSum, vec_se); 591 vec_seSum = _mm_add_ps(vec_seSum, vec_se);
584 } 592 }
585 593
586 _mm_add_ps_4x1(vec_sdSum, &sdSum); 594 _mm_add_ps_4x1(vec_sdSum, &sdSum);
587 _mm_add_ps_4x1(vec_seSum, &seSum); 595 _mm_add_ps_4x1(vec_seSum, &seSum);
588 596
589 for (; i < PART_LEN1; i++) { 597 for (; i < PART_LEN1; i++) {
590 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + 598 coherence_state->sd[i] =
591 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); 599 ptrGCoh[0] * coherence_state->sd[i] +
592 aec->se[i] = ptrGCoh[0] * aec->se[i] + 600 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
593 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); 601 coherence_state->se[i] =
602 ptrGCoh[0] * coherence_state->se[i] +
603 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
594 // We threshold here to protect against the ill-effects of a zero farend. 604 // We threshold here to protect against the ill-effects of a zero farend.
595 // The threshold is not arbitrarily chosen, but balances protection and 605 // The threshold is not arbitrarily chosen, but balances protection and
596 // adverse interaction with the algorithm's tuning. 606 // adverse interaction with the algorithm's tuning.
597 // TODO(bjornv): investigate further why this is so sensitive. 607 // TODO(bjornv): investigate further why this is so sensitive.
598 aec->sx[i] = ptrGCoh[0] * aec->sx[i] + 608 coherence_state->sx[i] =
599 ptrGCoh[1] * WEBRTC_SPL_MAX( 609 ptrGCoh[0] * coherence_state->sx[i] +
600 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], 610 ptrGCoh[1] *
601 WebRtcAec_kMinFarendPSD); 611 WEBRTC_SPL_MAX(xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
612 WebRtcAec_kMinFarendPSD);
602 613
603 aec->sde[i][0] = 614 coherence_state->sde[i][0] =
604 ptrGCoh[0] * aec->sde[i][0] + 615 ptrGCoh[0] * coherence_state->sde[i][0] +
605 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); 616 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
606 aec->sde[i][1] = 617 coherence_state->sde[i][1] =
607 ptrGCoh[0] * aec->sde[i][1] + 618 ptrGCoh[0] * coherence_state->sde[i][1] +
608 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); 619 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
609 620
610 aec->sxd[i][0] = 621 coherence_state->sxd[i][0] =
611 ptrGCoh[0] * aec->sxd[i][0] + 622 ptrGCoh[0] * coherence_state->sxd[i][0] +
612 ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); 623 ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
613 aec->sxd[i][1] = 624 coherence_state->sxd[i][1] =
614 ptrGCoh[0] * aec->sxd[i][1] + 625 ptrGCoh[0] * coherence_state->sxd[i][1] +
615 ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); 626 ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
616 627
617 sdSum += aec->sd[i]; 628 sdSum += coherence_state->sd[i];
618 seSum += aec->se[i]; 629 seSum += coherence_state->se[i];
619 } 630 }
620 631
621 // Divergent filter safeguard update. 632 // Divergent filter safeguard update.
622 aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; 633 *filter_divergence_state =
634 (*filter_divergence_state ? 1.05f : 1.0f) * seSum > sdSum;
623 635
624 // Signal extreme filter divergence if the error is significantly larger 636 // Signal extreme filter divergence if the error is significantly larger
625 // than the nearend (13 dB). 637 // than the nearend (13 dB).
626 *extreme_filter_divergence = (seSum > (19.95f * sdSum)); 638 *extreme_filter_divergence = (seSum > (19.95f * sdSum));
627 } 639 }
628 640
629 // Window time domain data to be used by the fft. 641 // Window time domain data to be used by the fft.
630 static void WindowDataSSE2(float* x_windowed, const float* x) { 642 static void WindowDataSSE2(float* x_windowed, const float* x) {
631 int i; 643 int i;
632 for (i = 0; i < PART_LEN; i += 4) { 644 for (i = 0; i < PART_LEN; i += 4) {
(...skipping 26 matching lines...) Expand all
659 _mm_storeu_ps(&data_complex[0][i], vec_a); 671 _mm_storeu_ps(&data_complex[0][i], vec_a);
660 _mm_storeu_ps(&data_complex[1][i], vec_b); 672 _mm_storeu_ps(&data_complex[1][i], vec_b);
661 } 673 }
662 // fix beginning/end values 674 // fix beginning/end values
663 data_complex[1][0] = 0; 675 data_complex[1][0] = 0;
664 data_complex[1][PART_LEN] = 0; 676 data_complex[1][PART_LEN] = 0;
665 data_complex[0][0] = data[0]; 677 data_complex[0][0] = data[0];
666 data_complex[0][PART_LEN] = data[1]; 678 data_complex[0][PART_LEN] = data[1];
667 } 679 }
668 680
669 static void SubbandCoherenceSSE2(AecCore* aec, 681 static void SubbandCoherenceSSE2(int mult,
682 bool extended_filter_enabled,
670 float efw[2][PART_LEN1], 683 float efw[2][PART_LEN1],
671 float dfw[2][PART_LEN1], 684 float dfw[2][PART_LEN1],
672 float xfw[2][PART_LEN1], 685 float xfw[2][PART_LEN1],
673 float* fft, 686 float* fft,
674 float* cohde, 687 float* cohde,
675 float* cohxd, 688 float* cohxd,
689 CoherenceState* coherence_state,
690 short* filter_divergence_state,
676 int* extreme_filter_divergence) { 691 int* extreme_filter_divergence) {
677 int i; 692 int i;
678 693
679 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); 694 SmoothedPSD(mult, extended_filter_enabled, efw, dfw, xfw, coherence_state,
695 filter_divergence_state, extreme_filter_divergence);
680 696
681 { 697 {
682 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); 698 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f);
683 699
684 // Subband coherence 700 // Subband coherence
685 for (i = 0; i + 3 < PART_LEN1; i += 4) { 701 for (i = 0; i + 3 < PART_LEN1; i += 4) {
686 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); 702 const __m128 vec_sd = _mm_loadu_ps(&coherence_state->sd[i]);
687 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); 703 const __m128 vec_se = _mm_loadu_ps(&coherence_state->se[i]);
688 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); 704 const __m128 vec_sx = _mm_loadu_ps(&coherence_state->sx[i]);
689 const __m128 vec_sdse = 705 const __m128 vec_sdse =
690 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); 706 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se));
691 const __m128 vec_sdsx = 707 const __m128 vec_sdsx =
692 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); 708 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx));
693 const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); 709 const __m128 vec_sde_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]);
694 const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); 710 const __m128 vec_sde_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]);
695 const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); 711 const __m128 vec_sxd_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]);
696 const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); 712 const __m128 vec_sxd_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]);
697 const __m128 vec_sde_0 = 713 const __m128 vec_sde_0 =
698 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); 714 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0));
699 const __m128 vec_sde_1 = 715 const __m128 vec_sde_1 =
700 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1)); 716 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1));
701 const __m128 vec_sxd_0 = 717 const __m128 vec_sxd_0 =
702 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0)); 718 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0));
703 const __m128 vec_sxd_1 = 719 const __m128 vec_sxd_1 =
704 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1)); 720 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1));
705 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); 721 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0);
706 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); 722 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0);
707 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); 723 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1));
708 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); 724 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse);
709 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); 725 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1));
710 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); 726 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx);
711 _mm_storeu_ps(&cohde[i], vec_cohde); 727 _mm_storeu_ps(&cohde[i], vec_cohde);
712 _mm_storeu_ps(&cohxd[i], vec_cohxd); 728 _mm_storeu_ps(&cohxd[i], vec_cohxd);
713 } 729 }
714 730
715 // scalar code for the remaining items. 731 // scalar code for the remaining items.
716 for (; i < PART_LEN1; i++) { 732 for (; i < PART_LEN1; i++) {
717 cohde[i] = 733 cohde[i] = (coherence_state->sde[i][0] * coherence_state->sde[i][0] +
718 (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / 734 coherence_state->sde[i][1] * coherence_state->sde[i][1]) /
719 (aec->sd[i] * aec->se[i] + 1e-10f); 735 (coherence_state->sd[i] * coherence_state->se[i] + 1e-10f);
720 cohxd[i] = 736 cohxd[i] = (coherence_state->sxd[i][0] * coherence_state->sxd[i][0] +
721 (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / 737 coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) /
722 (aec->sx[i] * aec->sd[i] + 1e-10f); 738 (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f);
723 } 739 }
724 } 740 }
725 } 741 }
726 742
727 void WebRtcAec_InitAec_SSE2(void) { 743 void WebRtcAec_InitAec_SSE2(void) {
728 WebRtcAec_FilterFar = FilterFarSSE2; 744 WebRtcAec_FilterFar = FilterFarSSE2;
729 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; 745 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
730 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; 746 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
731 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; 747 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
732 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; 748 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2;
733 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; 749 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2;
734 WebRtcAec_PartitionDelay = PartitionDelaySSE2; 750 WebRtcAec_PartitionDelay = PartitionDelaySSE2;
735 WebRtcAec_WindowData = WindowDataSSE2; 751 WebRtcAec_WindowData = WindowDataSSE2;
736 } 752 }
737 } // namespace webrtc 753 } // namespace webrtc
OLDNEW
« no previous file with comments | « webrtc/modules/audio_processing/aec/aec_core_neon.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698