OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 477 matching lines...)
488 | 488 |
489 // Updates the following smoothed Power Spectral Densities (PSD): | 489 // Updates the following smoothed Power Spectral Densities (PSD): |
490 // - sd : near-end | 490 // - sd : near-end |
491 // - se : residual echo | 491 // - se : residual echo |
492 // - sx : far-end | 492 // - sx : far-end |
493 // - sde : cross-PSD of near-end and residual echo | 493 // - sde : cross-PSD of near-end and residual echo |
494 // - sxd : cross-PSD of near-end and far-end | 494 // - sxd : cross-PSD of near-end and far-end |
495 // | 495 // |
496 // In addition to updating the PSDs, the filter divergence state is determined, | 496 // In addition to updating the PSDs, the filter divergence state is determined, |
497 // upon which actions are taken. | 497 // upon which actions are taken. |
498 static void SmoothedPSD(AecCore* aec, | 498 static void SmoothedPSD(int mult, |
| 499 bool extended_filter_enabled, |
499 float efw[2][PART_LEN1], | 500 float efw[2][PART_LEN1], |
500 float dfw[2][PART_LEN1], | 501 float dfw[2][PART_LEN1], |
501 float xfw[2][PART_LEN1], | 502 float xfw[2][PART_LEN1], |
| 503 CoherenceState* coherence_state, |
| 504 short* filter_divergence_state, |
502 int* extreme_filter_divergence) { | 505 int* extreme_filter_divergence) { |
503 // Power estimate smoothing coefficients. | 506 // Power estimate smoothing coefficients. |
504 const float* ptrGCoh = | 507 const float* ptrGCoh = |
505 aec->extended_filter_enabled | 508 extended_filter_enabled |
506 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] | 509 ? WebRtcAec_kExtendedSmoothingCoefficients[mult - 1] |
507 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; | 510 : WebRtcAec_kNormalSmoothingCoefficients[mult - 1]; |
508 int i; | 511 int i; |
509 float sdSum = 0, seSum = 0; | 512 float sdSum = 0, seSum = 0; |
510 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); | 513 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); |
511 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); | 514 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); |
512 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); | 515 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); |
513 __m128 vec_sdSum = _mm_set1_ps(0.0f); | 516 __m128 vec_sdSum = _mm_set1_ps(0.0f); |
514 __m128 vec_seSum = _mm_set1_ps(0.0f); | 517 __m128 vec_seSum = _mm_set1_ps(0.0f); |
515 | 518 |
516 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 519 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
517 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); | 520 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); |
518 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); | 521 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); |
519 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); | 522 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); |
520 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); | 523 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); |
521 const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); | 524 const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); |
522 const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); | 525 const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); |
523 __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0); | 526 __m128 vec_sd = |
524 __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0); | 527 _mm_mul_ps(_mm_loadu_ps(&coherence_state->sd[i]), vec_GCoh0); |
525 __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0); | 528 __m128 vec_se = |
| 529 _mm_mul_ps(_mm_loadu_ps(&coherence_state->se[i]), vec_GCoh0); |
| 530 __m128 vec_sx = |
| 531 _mm_mul_ps(_mm_loadu_ps(&coherence_state->sx[i]), vec_GCoh0); |
526 __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); | 532 __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); |
527 __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); | 533 __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); |
528 __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); | 534 __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); |
529 vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); | 535 vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); |
530 vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); | 536 vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); |
531 vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); | 537 vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); |
532 vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); | 538 vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); |
533 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); | 539 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); |
534 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); | 540 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); |
535 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); | 541 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); |
536 _mm_storeu_ps(&aec->sd[i], vec_sd); | 542 _mm_storeu_ps(&coherence_state->sd[i], vec_sd); |
537 _mm_storeu_ps(&aec->se[i], vec_se); | 543 _mm_storeu_ps(&coherence_state->se[i], vec_se); |
538 _mm_storeu_ps(&aec->sx[i], vec_sx); | 544 _mm_storeu_ps(&coherence_state->sx[i], vec_sx); |
539 | 545 |
540 { | 546 { |
541 const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); | 547 const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]); |
542 const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | 548 const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]); |
543 __m128 vec_a = | 549 __m128 vec_a = |
544 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 550 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
545 __m128 vec_b = | 551 __m128 vec_b = |
546 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 552 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
547 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); | 553 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); |
548 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); | 554 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); |
549 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | 555 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |
550 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | 556 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |
551 vec_dfwefw0011 = | 557 vec_dfwefw0011 = |
552 _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1)); | 558 _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1)); |
553 vec_dfwefw0110 = | 559 vec_dfwefw0110 = |
554 _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0)); | 560 _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0)); |
555 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); | 561 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); |
556 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); | 562 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); |
557 _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | 563 _mm_storeu_ps(&coherence_state->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |
558 _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | 564 _mm_storeu_ps(&coherence_state->sde[i + 2][0], |
| 565 _mm_unpackhi_ps(vec_a, vec_b)); |
559 } | 566 } |
560 | 567 |
561 { | 568 { |
562 const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | 569 const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]); |
563 const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | 570 const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]); |
564 __m128 vec_a = | 571 __m128 vec_a = |
565 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 572 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
566 __m128 vec_b = | 573 __m128 vec_b = |
567 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 574 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
568 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); | 575 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); |
569 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); | 576 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); |
570 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | 577 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |
571 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | 578 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |
572 vec_dfwxfw0011 = | 579 vec_dfwxfw0011 = |
573 _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1)); | 580 _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1)); |
574 vec_dfwxfw0110 = | 581 vec_dfwxfw0110 = |
575 _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); | 582 _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); |
576 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); | 583 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); |
577 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); | 584 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); |
578 _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | 585 _mm_storeu_ps(&coherence_state->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |
579 _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | 586 _mm_storeu_ps(&coherence_state->sxd[i + 2][0], |
| 587 _mm_unpackhi_ps(vec_a, vec_b)); |
580 } | 588 } |
581 | 589 |
582 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); | 590 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); |
583 vec_seSum = _mm_add_ps(vec_seSum, vec_se); | 591 vec_seSum = _mm_add_ps(vec_seSum, vec_se); |
584 } | 592 } |
585 | 593 |
586 _mm_add_ps_4x1(vec_sdSum, &sdSum); | 594 _mm_add_ps_4x1(vec_sdSum, &sdSum); |
587 _mm_add_ps_4x1(vec_seSum, &seSum); | 595 _mm_add_ps_4x1(vec_seSum, &seSum); |
588 | 596 |
589 for (; i < PART_LEN1; i++) { | 597 for (; i < PART_LEN1; i++) { |
590 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + | 598 coherence_state->sd[i] = |
591 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); | 599 ptrGCoh[0] * coherence_state->sd[i] + |
592 aec->se[i] = ptrGCoh[0] * aec->se[i] + | 600 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); |
593 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); | 601 coherence_state->se[i] = |
| 602 ptrGCoh[0] * coherence_state->se[i] + |
| 603 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); |
594 // We threshold here to protect against the ill-effects of a zero farend. | 604 // We threshold here to protect against the ill-effects of a zero farend. |
595 // The threshold is not arbitrarily chosen, but balances protection and | 605 // The threshold is not arbitrarily chosen, but balances protection and |
596 // adverse interaction with the algorithm's tuning. | 606 // adverse interaction with the algorithm's tuning. |
597 // TODO(bjornv): investigate further why this is so sensitive. | 607 // TODO(bjornv): investigate further why this is so sensitive. |
598 aec->sx[i] = ptrGCoh[0] * aec->sx[i] + | 608 coherence_state->sx[i] = |
599 ptrGCoh[1] * WEBRTC_SPL_MAX( | 609 ptrGCoh[0] * coherence_state->sx[i] + |
600 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], | 610 ptrGCoh[1] * |
601 WebRtcAec_kMinFarendPSD); | 611 WEBRTC_SPL_MAX(xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], |
| 612 WebRtcAec_kMinFarendPSD); |
602 | 613 |
603 aec->sde[i][0] = | 614 coherence_state->sde[i][0] = |
604 ptrGCoh[0] * aec->sde[i][0] + | 615 ptrGCoh[0] * coherence_state->sde[i][0] + |
605 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); | 616 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); |
606 aec->sde[i][1] = | 617 coherence_state->sde[i][1] = |
607 ptrGCoh[0] * aec->sde[i][1] + | 618 ptrGCoh[0] * coherence_state->sde[i][1] + |
608 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); | 619 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); |
609 | 620 |
610 aec->sxd[i][0] = | 621 coherence_state->sxd[i][0] = |
611 ptrGCoh[0] * aec->sxd[i][0] + | 622 ptrGCoh[0] * coherence_state->sxd[i][0] + |
612 ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); | 623 ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); |
613 aec->sxd[i][1] = | 624 coherence_state->sxd[i][1] = |
614 ptrGCoh[0] * aec->sxd[i][1] + | 625 ptrGCoh[0] * coherence_state->sxd[i][1] + |
615 ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); | 626 ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); |
616 | 627 |
617 sdSum += aec->sd[i]; | 628 sdSum += coherence_state->sd[i]; |
618 seSum += aec->se[i]; | 629 seSum += coherence_state->se[i]; |
619 } | 630 } |
620 | 631 |
621 // Divergent filter safeguard update. | 632 // Divergent filter safeguard update. |
622 aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; | 633 *filter_divergence_state = |
| 634 (*filter_divergence_state ? 1.05f : 1.0f) * seSum > sdSum; |
623 | 635 |
624 // Signal extreme filter divergence if the error is significantly larger | 636 // Signal extreme filter divergence if the error is significantly larger |
625 // than the nearend (13 dB). | 637 // than the nearend (13 dB). |
626 *extreme_filter_divergence = (seSum > (19.95f * sdSum)); | 638 *extreme_filter_divergence = (seSum > (19.95f * sdSum)); |
627 } | 639 } |
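
For reference, the per-bin update carried out by both the SSE2 loop and the scalar tail above is a first-order exponential smoothing of the (cross-)power spectra, and the two tests at the end compare the summed residual-echo PSD against the summed near-end PSD. The 1.05 factor adds a little hysteresis so the diverge flag does not toggle on every block, and 19.95 is simply 13 dB expressed linearly (10^(13/10) ≈ 19.95). Written out as a sketch, with g0 = ptrGCoh[0], g1 = ptrGCoh[1], and D, E, X the near-end, residual-echo, and far-end spectra (dfw, efw, xfw):

    sd[k] <- g0 * sd[k] + g1 * (D_re(k)^2 + D_im(k)^2)
    se[k] <- g0 * se[k] + g1 * (E_re(k)^2 + E_im(k)^2)
    sx[k] <- g0 * sx[k] + g1 * max(X_re(k)^2 + X_im(k)^2, kMinFarendPSD)
    sde[k], sxd[k]: the same smoothing applied componentwise to the cross
                    spectra of D with E and X, exactly as in the scalar tail.

    filter_divergence_state   = (state ? 1.05 : 1.0) * sum(se) > sum(sd)
    extreme_filter_divergence = sum(se) > 19.95 * sum(sd)   // i.e. 13 dB above
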
628 | 640 |
629 // Window time domain data to be used by the fft. | 641 // Window time domain data to be used by the fft. |
630 static void WindowDataSSE2(float* x_windowed, const float* x) { | 642 static void WindowDataSSE2(float* x_windowed, const float* x) { |
631 int i; | 643 int i; |
632 for (i = 0; i < PART_LEN; i += 4) { | 644 for (i = 0; i < PART_LEN; i += 4) { |
(...skipping 26 matching lines...)
659 _mm_storeu_ps(&data_complex[0][i], vec_a); | 671 _mm_storeu_ps(&data_complex[0][i], vec_a); |
660 _mm_storeu_ps(&data_complex[1][i], vec_b); | 672 _mm_storeu_ps(&data_complex[1][i], vec_b); |
661 } | 673 } |
662 // fix beginning/end values | 674 // fix beginning/end values |
663 data_complex[1][0] = 0; | 675 data_complex[1][0] = 0; |
664 data_complex[1][PART_LEN] = 0; | 676 data_complex[1][PART_LEN] = 0; |
665 data_complex[0][0] = data[0]; | 677 data_complex[0][0] = data[0]; |
666 data_complex[0][PART_LEN] = data[1]; | 678 data_complex[0][PART_LEN] = data[1]; |
667 } | 679 } |
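
The boundary fix-ups above reflect the packed output layout of the real FFT used here: the DC and Nyquist bins are purely real, and their real parts are packed into data[0] and data[1]. A minimal scalar sketch of the same unpacking, assuming that layout (PART_LEN and PART_LEN1 as defined by this file's headers); this is a reference sketch, not the shipped scalar implementation:

    // Scalar reference for StoreAsComplex (sketch; assumes the packed
    // real-FFT layout where data[0] = Re(DC) and data[1] = Re(Nyquist)).
    static void StoreAsComplexScalar(const float* data,
                                     float data_complex[2][PART_LEN1]) {
      data_complex[0][0] = data[0];  // DC bin, purely real.
      data_complex[1][0] = 0;
      for (int i = 1; i < PART_LEN; i++) {
        data_complex[0][i] = data[2 * i];      // Real part of bin i.
        data_complex[1][i] = data[2 * i + 1];  // Imaginary part of bin i.
      }
      data_complex[0][PART_LEN] = data[1];  // Nyquist bin, purely real.
      data_complex[1][PART_LEN] = 0;
    }
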
668 | 680 |
669 static void SubbandCoherenceSSE2(AecCore* aec, | 681 static void SubbandCoherenceSSE2(int mult, |
| 682 bool extended_filter_enabled, |
670 float efw[2][PART_LEN1], | 683 float efw[2][PART_LEN1], |
671 float dfw[2][PART_LEN1], | 684 float dfw[2][PART_LEN1], |
672 float xfw[2][PART_LEN1], | 685 float xfw[2][PART_LEN1], |
673 float* fft, | 686 float* fft, |
674 float* cohde, | 687 float* cohde, |
675 float* cohxd, | 688 float* cohxd, |
| 689 CoherenceState* coherence_state, |
| 690 short* filter_divergence_state, |
676 int* extreme_filter_divergence) { | 691 int* extreme_filter_divergence) { |
677 int i; | 692 int i; |
678 | 693 |
679 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); | 694 SmoothedPSD(mult, extended_filter_enabled, efw, dfw, xfw, coherence_state, |
| 695 filter_divergence_state, extreme_filter_divergence); |
680 | 696 |
681 { | 697 { |
682 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); | 698 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); |
683 | 699 |
684 // Subband coherence | 700 // Subband coherence |
685 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 701 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
686 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); | 702 const __m128 vec_sd = _mm_loadu_ps(&coherence_state->sd[i]); |
687 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); | 703 const __m128 vec_se = _mm_loadu_ps(&coherence_state->se[i]); |
688 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); | 704 const __m128 vec_sx = _mm_loadu_ps(&coherence_state->sx[i]); |
689 const __m128 vec_sdse = | 705 const __m128 vec_sdse = |
690 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); | 706 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); |
691 const __m128 vec_sdsx = | 707 const __m128 vec_sdsx = |
692 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); | 708 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); |
693 const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); | 709 const __m128 vec_sde_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]); |
694 const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | 710 const __m128 vec_sde_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]); |
695 const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | 711 const __m128 vec_sxd_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]); |
696 const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | 712 const __m128 vec_sxd_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]); |
697 const __m128 vec_sde_0 = | 713 const __m128 vec_sde_0 = |
698 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 714 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
699 const __m128 vec_sde_1 = | 715 const __m128 vec_sde_1 = |
700 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 716 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
701 const __m128 vec_sxd_0 = | 717 const __m128 vec_sxd_0 = |
702 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0)); | 718 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
703 const __m128 vec_sxd_1 = | 719 const __m128 vec_sxd_1 = |
704 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1)); | 720 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
705 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); | 721 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); |
706 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); | 722 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); |
707 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); | 723 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); |
708 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); | 724 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); |
709 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); | 725 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); |
710 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); | 726 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); |
711 _mm_storeu_ps(&cohde[i], vec_cohde); | 727 _mm_storeu_ps(&cohde[i], vec_cohde); |
712 _mm_storeu_ps(&cohxd[i], vec_cohxd); | 728 _mm_storeu_ps(&cohxd[i], vec_cohxd); |
713 } | 729 } |
714 | 730 |
715 // scalar code for the remaining items. | 731 // scalar code for the remaining items. |
716 for (; i < PART_LEN1; i++) { | 732 for (; i < PART_LEN1; i++) { |
717 cohde[i] = | 733 cohde[i] = (coherence_state->sde[i][0] * coherence_state->sde[i][0] + |
718 (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / | 734 coherence_state->sde[i][1] * coherence_state->sde[i][1]) / |
719 (aec->sd[i] * aec->se[i] + 1e-10f); | 735 (coherence_state->sd[i] * coherence_state->se[i] + 1e-10f); |
720 cohxd[i] = | 736 cohxd[i] = (coherence_state->sxd[i][0] * coherence_state->sxd[i][0] + |
721 (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / | 737 coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) / |
722 (aec->sx[i] * aec->sd[i] + 1e-10f); | 738 (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f); |
723 } | 739 } |
724 } | 740 } |
725 } | 741 } |
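
Both the vectorized loop and the scalar tail above evaluate the same per-bin magnitude-squared coherence, with the 1e-10 term guarding against division by zero:

    cohde[k] = (sde_re[k]^2 + sde_im[k]^2) / (sd[k] * se[k] + 1e-10)
    cohxd[k] = (sxd_re[k]^2 + sxd_im[k]^2) / (sx[k] * sd[k] + 1e-10)

cohde is the coherence between the near end and the residual echo, cohxd between the near end and the far end; both are (approximately) bounded by [0, 1] and are consumed downstream when deciding how strongly to suppress.
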
726 | 742 |
727 void WebRtcAec_InitAec_SSE2(void) { | 743 void WebRtcAec_InitAec_SSE2(void) { |
728 WebRtcAec_FilterFar = FilterFarSSE2; | 744 WebRtcAec_FilterFar = FilterFarSSE2; |
729 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 745 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
730 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 746 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
731 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 747 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
732 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 748 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
733 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 749 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
734 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 750 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
735 WebRtcAec_WindowData = WindowDataSSE2; | 751 WebRtcAec_WindowData = WindowDataSSE2; |
736 } | 752 } |
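
These assignments override the generic C kernels through the module's function pointers; the initializer itself performs no CPU detection, so the caller is expected to invoke it only when SSE2 is available. A hedged caller sketch, assuming the usual WebRTC CPU-feature helper (the real call site is in the generic AEC init code, outside this diff):

    // Hypothetical dispatch sketch; WebRtc_GetCPUInfo()/kSSE2 are assumed
    // helpers from the CPU features wrapper, not part of this file.
    #if defined(WEBRTC_ARCH_X86_FAMILY)
      if (WebRtc_GetCPUInfo(kSSE2) > 0) {
        WebRtcAec_InitAec_SSE2();  // Swap in the vectorized kernels.
      }
    #endif
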
737 } // namespace webrtc | 753 } // namespace webrtc |