OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 11 matching lines...)
22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" |
23 | 23 |
24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { | 24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { |
25 return aRe * bRe - aIm * bIm; | 25 return aRe * bRe - aIm * bIm; |
26 } | 26 } |
27 | 27 |
28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { | 28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { |
29 return aRe * bIm + aIm * bRe; | 29 return aRe * bIm + aIm * bRe; |
30 } | 30 } |
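A quick sanity check on these helpers: they are the real and imaginary parts of the complex product, (aRe + i*aIm)(bRe + i*bIm) = (aRe*bRe - aIm*bIm) + i*(aRe*bIm + aIm*bRe). For example (a sketch, not part of the file):

    float re = MulRe(1.0f, 2.0f, 3.0f, 4.0f);  // 1*3 - 2*4 = -5
    float im = MulIm(1.0f, 2.0f, 3.0f, 4.0f);  // 1*4 + 2*3 = 10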
31 | 31 |
32 static void FilterFarSSE2( | 32 static void FilterFarSSE2(int num_partitions, |
33 int num_partitions, | 33 int x_fft_buf_block_pos, |
34 int x_fft_buf_block_pos, | 34 float x_fft_buf[2] |
35 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], | 35 [kExtendedNumPartitions * PART_LEN1], |
36 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], | 36 float h_fft_buf[2] |
37 float y_fft[2][PART_LEN1]) { | 37 [kExtendedNumPartitions * PART_LEN1], |
38 | 38 float y_fft[2][PART_LEN1]) { |
39 int i; | 39 int i; |
40 for (i = 0; i < num_partitions; i++) { | 40 for (i = 0; i < num_partitions; i++) { |
41 int j; | 41 int j; |
42 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; | 42 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; |
43 int pos = i * PART_LEN1; | 43 int pos = i * PART_LEN1; |
44 // Check for wrap | 44 // Check for wrap |
45 if (i + x_fft_buf_block_pos >= num_partitions) { | 45 if (i + x_fft_buf_block_pos >= num_partitions) { |
46 xPos -= num_partitions * (PART_LEN1); | 46 xPos -= num_partitions * (PART_LEN1); |
47 } | 47 } |
48 | 48 |
(...skipping 11 matching lines...)
60 const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re); | 60 const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re); |
61 const __m128 e = _mm_sub_ps(a, b); | 61 const __m128 e = _mm_sub_ps(a, b); |
62 const __m128 f = _mm_add_ps(c, d); | 62 const __m128 f = _mm_add_ps(c, d); |
63 const __m128 g = _mm_add_ps(y_fft_re, e); | 63 const __m128 g = _mm_add_ps(y_fft_re, e); |
64 const __m128 h = _mm_add_ps(y_fft_im, f); | 64 const __m128 h = _mm_add_ps(y_fft_im, f); |
65 _mm_storeu_ps(&y_fft[0][j], g); | 65 _mm_storeu_ps(&y_fft[0][j], g); |
66 _mm_storeu_ps(&y_fft[1][j], h); | 66 _mm_storeu_ps(&y_fft[1][j], h); |
67 } | 67 } |
68 // scalar code for the remaining items. | 68 // scalar code for the remaining items. |
69 for (; j < PART_LEN1; j++) { | 69 for (; j < PART_LEN1; j++) { |
70 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], | 70 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], |
71 x_fft_buf[1][xPos + j], | 71 h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); |
72 h_fft_buf[0][pos + j], | 72 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], |
73 h_fft_buf[1][pos + j]); | 73 h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); |
74 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], | |
75 x_fft_buf[1][xPos + j], | |
76 h_fft_buf[0][pos + j], | |
77 h_fft_buf[1][pos + j]); | |
78 } | 74 } |
79 } | 75 } |
80 } | 76 } |
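For reference, the operation this function vectorizes is a per-bin complex multiply-accumulate over all filter partitions: Y[j] += X_p[j] * H_p[j] for each partition p, with the far-end FFT history indexed as a circular buffer. A plain-C sketch with the same indexing (mirrors the generic path; the vectorized lines elided above compute four bins at a time):

    for (i = 0; i < num_partitions; i++) {
      int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
      int pos = i * PART_LEN1;
      if (i + x_fft_buf_block_pos >= num_partitions)
        xPos -= num_partitions * PART_LEN1;  // wrap in the circular buffer
      for (j = 0; j < PART_LEN1; j++) {
        y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                             h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
        y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                             h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
      }
    }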
81 | 77 |
82 static void ScaleErrorSignalSSE2(int extended_filter_enabled, | 78 static void ScaleErrorSignalSSE2(int extended_filter_enabled, |
83 float normal_mu, | 79 float normal_mu, |
84 float normal_error_threshold, | 80 float normal_error_threshold, |
85 float x_pow[PART_LEN1], | 81 float x_pow[PART_LEN1], |
86 float ef[2][PART_LEN1]) { | 82 float ef[2][PART_LEN1]) { |
87 const __m128 k1e_10f = _mm_set1_ps(1e-10f); | 83 const __m128 k1e_10f = _mm_set1_ps(1e-10f); |
88 const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) | 84 const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) |
89 : _mm_set1_ps(normal_mu); | 85 : _mm_set1_ps(normal_mu); |
90 const __m128 kThresh = extended_filter_enabled | 86 const __m128 kThresh = extended_filter_enabled |
91 ? _mm_set1_ps(kExtendedErrorThreshold) | 87 ? _mm_set1_ps(kExtendedErrorThreshold) |
92 : _mm_set1_ps(normal_error_threshold); | 88 : _mm_set1_ps(normal_error_threshold); |
93 | 89 |
94 int i; | 90 int i; |
95 // vectorized code (four at once) | 91 // vectorized code (four at once) |
96 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 92 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
97 const __m128 x_pow_local = _mm_loadu_ps(&x_pow[i]); | 93 const __m128 x_pow_local = _mm_loadu_ps(&x_pow[i]); |
98 const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]); | 94 const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]); |
99 const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]); | 95 const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]); |
(...skipping 17 matching lines...)
117 ef_re = _mm_or_ps(ef_re, ef_re_if); | 113 ef_re = _mm_or_ps(ef_re, ef_re_if); |
118 ef_im = _mm_or_ps(ef_im, ef_im_if); | 114 ef_im = _mm_or_ps(ef_im, ef_im_if); |
119 ef_re = _mm_mul_ps(ef_re, kMu); | 115 ef_re = _mm_mul_ps(ef_re, kMu); |
120 ef_im = _mm_mul_ps(ef_im, kMu); | 116 ef_im = _mm_mul_ps(ef_im, kMu); |
121 | 117 |
122 _mm_storeu_ps(&ef[0][i], ef_re); | 118 _mm_storeu_ps(&ef[0][i], ef_re); |
123 _mm_storeu_ps(&ef[1][i], ef_im); | 119 _mm_storeu_ps(&ef[1][i], ef_im); |
124 } | 120 } |
125 // scalar code for the remaining items. | 121 // scalar code for the remaining items. |
126 { | 122 { |
127 const float mu = | 123 const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; |
128 extended_filter_enabled ? kExtendedMu : normal_mu; | |
129 const float error_threshold = extended_filter_enabled | 124 const float error_threshold = extended_filter_enabled |
130 ? kExtendedErrorThreshold | 125 ? kExtendedErrorThreshold |
131 : normal_error_threshold; | 126 : normal_error_threshold; |
132 for (; i < (PART_LEN1); i++) { | 127 for (; i < (PART_LEN1); i++) { |
133 float abs_ef; | 128 float abs_ef; |
134 ef[0][i] /= (x_pow[i] + 1e-10f); | 129 ef[0][i] /= (x_pow[i] + 1e-10f); |
135 ef[1][i] /= (x_pow[i] + 1e-10f); | 130 ef[1][i] /= (x_pow[i] + 1e-10f); |
136 abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]); | 131 abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]); |
137 | 132 |
138 if (abs_ef > error_threshold) { | 133 if (abs_ef > error_threshold) { |
(...skipping 42 matching lines...)
181 const __m128 e = _mm_add_ps(a, b); | 176 const __m128 e = _mm_add_ps(a, b); |
182 const __m128 f = _mm_sub_ps(c, d); | 177 const __m128 f = _mm_sub_ps(c, d); |
183 // Interleave real and imaginary parts. | 178 // Interleave real and imaginary parts. |
184 const __m128 g = _mm_unpacklo_ps(e, f); | 179 const __m128 g = _mm_unpacklo_ps(e, f); |
185 const __m128 h = _mm_unpackhi_ps(e, f); | 180 const __m128 h = _mm_unpackhi_ps(e, f); |
186 // Store | 181 // Store |
187 _mm_storeu_ps(&fft[2 * j + 0], g); | 182 _mm_storeu_ps(&fft[2 * j + 0], g); |
188 _mm_storeu_ps(&fft[2 * j + 4], h); | 183 _mm_storeu_ps(&fft[2 * j + 4], h); |
189 } | 184 } |
190 // ... and fixup the first imaginary entry. | 185 // ... and fixup the first imaginary entry. |
191 fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], | 186 fft[1] = |
192 -x_fft_buf[1][xPos + PART_LEN], | 187 MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN], |
193 e_fft[0][PART_LEN], | 188 e_fft[0][PART_LEN], e_fft[1][PART_LEN]); |
194 e_fft[1][PART_LEN]); | |
195 | 189 |
196 aec_rdft_inverse_128(fft); | 190 aec_rdft_inverse_128(fft); |
197 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); | 191 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); |
198 | 192 |
199 // fft scaling | 193 // fft scaling |
200 { | 194 { |
201 float scale = 2.0f / PART_LEN2; | 195 float scale = 2.0f / PART_LEN2; |
202 const __m128 scale_ps = _mm_load_ps1(&scale); | 196 const __m128 scale_ps = _mm_load_ps1(&scale); |
203 for (j = 0; j < PART_LEN; j += 4) { | 197 for (j = 0; j < PART_LEN; j += 4) { |
204 const __m128 fft_ps = _mm_loadu_ps(&fft[j]); | 198 const __m128 fft_ps = _mm_loadu_ps(&fft[j]); |
(...skipping 69 matching lines...)
274 static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = { | 268 static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = { |
275 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; | 269 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; |
276 const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask)); | 270 const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask)); |
277 const __m128 y = | 271 const __m128 y = |
278 _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one)); | 272 _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one)); |
279 | 273 |
280 // Approximate log2(y) ~= (y - 1) * pol5(y). | 274 // Approximate log2(y) ~= (y - 1) * pol5(y). |
281 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 | 275 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 |
282 static const ALIGN16_BEG float ALIGN16_END C5[4] = { | 276 static const ALIGN16_BEG float ALIGN16_END C5[4] = { |
283 -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; | 277 -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; |
284 static const ALIGN16_BEG float ALIGN16_END | 278 static const ALIGN16_BEG float ALIGN16_END C4[4] = { |
285 C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; | 279 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; |
286 static const ALIGN16_BEG float ALIGN16_END | 280 static const ALIGN16_BEG float ALIGN16_END C3[4] = { |
287 C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; | 281 -1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; |
288 static const ALIGN16_BEG float ALIGN16_END | 282 static const ALIGN16_BEG float ALIGN16_END C2[4] = {2.5988452f, 2.5988452f, |
289 C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f}; | 283 2.5988452f, 2.5988452f}; |
290 static const ALIGN16_BEG float ALIGN16_END | 284 static const ALIGN16_BEG float ALIGN16_END C1[4] = { |
291 C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; | 285 -3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; |
292 static const ALIGN16_BEG float ALIGN16_END | 286 static const ALIGN16_BEG float ALIGN16_END C0[4] = {3.1157899f, 3.1157899f, |
293 C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f}; | 287 3.1157899f, 3.1157899f}; |
294 const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5)); | 288 const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5)); |
295 const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4)); | 289 const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4)); |
296 const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); | 290 const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); |
297 const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3)); | 291 const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3)); |
298 const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); | 292 const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); |
299 const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2)); | 293 const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2)); |
300 const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); | 294 const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); |
301 const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1)); | 295 const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1)); |
302 const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); | 296 const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); |
303 const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0)); | 297 const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0)); |
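The mul/add chain above is Horner evaluation of the degree-5 polynomial named in the comment. A scalar sketch with the same constants:

    static float pol5(float y) {
      return ((((-3.4436006e-2f * y + 3.1821337e-1f) * y - 1.2315303f) * y +
               2.5988452f) * y - 3.3241990f) * y + 3.1157899f;
    }
    // log2(a) ~= exponent(a) + (y - 1) * pol5(y), with y = mantissa(a) in [1, 2)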
(...skipping 23 matching lines...)
327 // maximum relative error of 0.17%. | 321 // maximum relative error of 0.17%. |
328 | 322 |
329 // To avoid over/underflow, we reduce the range of input to ]-127, 129]. | 323 // To avoid over/underflow, we reduce the range of input to ]-127, 129]. |
330 static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, | 324 static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, |
331 129.f, 129.f}; | 325 129.f, 129.f}; |
332 static const ALIGN16_BEG float min_input[4] ALIGN16_END = { | 326 static const ALIGN16_BEG float min_input[4] ALIGN16_END = { |
333 -126.99999f, -126.99999f, -126.99999f, -126.99999f}; | 327 -126.99999f, -126.99999f, -126.99999f, -126.99999f}; |
334 const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input)); | 328 const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input)); |
335 const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input)); | 329 const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input)); |
336 // Compute n. | 330 // Compute n. |
337 static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, | 331 static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, 0.5f, |
338 0.5f, 0.5f}; | 332 0.5f}; |
339 const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half)); | 333 const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half)); |
340 const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); | 334 const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); |
341 // Compute 2^n. | 335 // Compute 2^n. |
342 static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = { | 336 static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = { |
343 127, 127, 127, 127}; | 337 127, 127, 127, 127}; |
344 static const int float_exponent_shift = 23; | 338 static const int float_exponent_shift = 23; |
345 const __m128i two_n_exponent = | 339 const __m128i two_n_exponent = |
346 _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias)); | 340 _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias)); |
347 const __m128 two_n = | 341 const __m128 two_n = |
348 _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift)); | 342 _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift)); |
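The 2^n step builds the float directly from its bit pattern: the biased exponent n + 127 is shifted into the IEEE-754 exponent field (bit 23 upward). A scalar sketch of the same trick, assuming single-precision IEEE-754 floats (not part of the file):

    #include <stdint.h>
    #include <string.h>
    static float two_to_n(int n) {  // exact for -126 <= n <= 127
      uint32_t bits = (uint32_t)(n + 127) << 23;
      float result;
      memcpy(&result, &bits, sizeof(result));  // bit cast without aliasing UB
      return result;
    }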
(...skipping 76 matching lines...)
425 // Suppress error signal | 419 // Suppress error signal |
426 efw[0][i] *= hNl[i]; | 420 efw[0][i] *= hNl[i]; |
427 efw[1][i] *= hNl[i]; | 421 efw[1][i] *= hNl[i]; |
428 | 422 |
429 // Ooura fft returns incorrect sign on imaginary component. It matters | 423 // Ooura fft returns incorrect sign on imaginary component. It matters |
430 // here because we are making an additive change with comfort noise. | 424 // here because we are making an additive change with comfort noise. |
431 efw[1][i] *= -1; | 425 efw[1][i] *= -1; |
432 } | 426 } |
433 } | 427 } |
434 | 428 |
435 __inline static void _mm_add_ps_4x1(__m128 sum, float *dst) { | 429 __inline static void _mm_add_ps_4x1(__m128 sum, float* dst) { |
436 // A+B C+D | 430 // A+B C+D |
437 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); | 431 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); |
438 // A+B+C+D A+B+C+D | 432 // A+B+C+D A+B+C+D |
439 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); | 433 sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); |
440 _mm_store_ss(dst, sum); | 434 _mm_store_ss(dst, sum); |
441 } | 435 } |
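Despite the name, this helper stores (rather than adds) the horizontal sum of the four lanes into *dst. A scalar equivalent with the same association order (a sketch):

    static void add_ps_4x1_scalar(const float v[4], float* dst) {
      *dst = (v[0] + v[2]) + (v[1] + v[3]);
    }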
442 | 436 |
443 static int PartitionDelaySSE2(const AecCore* aec) { | 437 static int PartitionDelaySSE2(const AecCore* aec) { |
444 // Measures the energy in each filter partition and returns the partition with | 438 // Measures the energy in each filter partition and returns the partition with |
445 // highest energy. | 439 // highest energy. |
(...skipping 39 matching lines...)
485 // - sxd : cross-PSD of near-end and far-end | 479 // - sxd : cross-PSD of near-end and far-end |
486 // | 480 // |
487 // In addition to updating the PSDs, the filter divergence state is | 481 // In addition to updating the PSDs, the filter divergence state is |
488 // determined, upon which actions are taken. | 482 // determined, upon which actions are taken. |
489 static void SmoothedPSD(AecCore* aec, | 483 static void SmoothedPSD(AecCore* aec, |
490 float efw[2][PART_LEN1], | 484 float efw[2][PART_LEN1], |
491 float dfw[2][PART_LEN1], | 485 float dfw[2][PART_LEN1], |
492 float xfw[2][PART_LEN1], | 486 float xfw[2][PART_LEN1], |
493 int* extreme_filter_divergence) { | 487 int* extreme_filter_divergence) { |
494 // Power estimate smoothing coefficients. | 488 // Power estimate smoothing coefficients. |
495 const float* ptrGCoh = aec->extended_filter_enabled | 489 const float* ptrGCoh = |
496 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] | 490 aec->extended_filter_enabled |
497 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; | 491 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] |
| 492 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; |
498 int i; | 493 int i; |
499 float sdSum = 0, seSum = 0; | 494 float sdSum = 0, seSum = 0; |
500 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); | 495 const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); |
501 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); | 496 const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); |
502 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); | 497 const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); |
503 __m128 vec_sdSum = _mm_set1_ps(0.0f); | 498 __m128 vec_sdSum = _mm_set1_ps(0.0f); |
504 __m128 vec_seSum = _mm_set1_ps(0.0f); | 499 __m128 vec_seSum = _mm_set1_ps(0.0f); |
505 | 500 |
506 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 501 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
507 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); | 502 const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); |
508 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); | 503 const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); |
509 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); | 504 const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); |
510 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); | 505 const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); |
(...skipping 12 matching lines...)
523 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); | 518 vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); |
524 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); | 519 vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); |
525 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); | 520 vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); |
526 _mm_storeu_ps(&aec->sd[i], vec_sd); | 521 _mm_storeu_ps(&aec->sd[i], vec_sd); |
527 _mm_storeu_ps(&aec->se[i], vec_se); | 522 _mm_storeu_ps(&aec->se[i], vec_se); |
528 _mm_storeu_ps(&aec->sx[i], vec_sx); | 523 _mm_storeu_ps(&aec->sx[i], vec_sx); |
529 | 524 |
530 { | 525 { |
531 const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); | 526 const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); |
532 const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | 527 const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); |
533 __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, | 528 __m128 vec_a = |
534 _MM_SHUFFLE(2, 0, 2, 0)); | 529 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
535 __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, | 530 __m128 vec_b = |
536 _MM_SHUFFLE(3, 1, 3, 1)); | 531 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
537 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); | 532 __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); |
538 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); | 533 __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); |
539 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | 534 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |
540 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | 535 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |
541 vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011, | 536 vec_dfwefw0011 = |
542 _mm_mul_ps(vec_dfw1, vec_efw1)); | 537 _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1)); |
543 vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110, | 538 vec_dfwefw0110 = |
544 _mm_mul_ps(vec_dfw1, vec_efw0)); | 539 _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0)); |
545 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); | 540 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); |
546 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); | 541 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); |
547 _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | 542 _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |
548 _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | 543 _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); |
549 } | 544 } |
550 | 545 |
551 { | 546 { |
552 const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | 547 const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); |
553 const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | 548 const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); |
554 __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, | 549 __m128 vec_a = |
555 _MM_SHUFFLE(2, 0, 2, 0)); | 550 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
556 __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, | 551 __m128 vec_b = |
557 _MM_SHUFFLE(3, 1, 3, 1)); | 552 _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
558 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); | 553 __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); |
559 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); | 554 __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); |
560 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | 555 vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |
561 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | 556 vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |
562 vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011, | 557 vec_dfwxfw0011 = |
563 _mm_mul_ps(vec_dfw1, vec_xfw1)); | 558 _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1)); |
564 vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110, | 559 vec_dfwxfw0110 = |
565 _mm_mul_ps(vec_dfw1, vec_xfw0)); | 560 _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); |
566 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); | 561 vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); |
567 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); | 562 vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); |
568 _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | 563 _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |
569 _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | 564 _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); |
570 } | 565 } |
571 | 566 |
572 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); | 567 vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); |
573 vec_seSum = _mm_add_ps(vec_seSum, vec_se); | 568 vec_seSum = _mm_add_ps(vec_seSum, vec_se); |
574 } | 569 } |
575 | 570 |
576 _mm_add_ps_4x1(vec_sdSum, &sdSum); | 571 _mm_add_ps_4x1(vec_sdSum, &sdSum); |
577 _mm_add_ps_4x1(vec_seSum, &seSum); | 572 _mm_add_ps_4x1(vec_seSum, &seSum); |
578 | 573 |
579 for (; i < PART_LEN1; i++) { | 574 for (; i < PART_LEN1; i++) { |
580 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + | 575 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + |
581 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); | 576 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); |
582 aec->se[i] = ptrGCoh[0] * aec->se[i] + | 577 aec->se[i] = ptrGCoh[0] * aec->se[i] + |
583 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); | 578 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); |
584 // We threshold here to protect against the ill-effects of a zero farend. | 579 // We threshold here to protect against the ill-effects of a zero farend. |
585 // The threshold is not arbitrarily chosen, but balances protection and | 580 // The threshold is not arbitrarily chosen, but balances protection and |
586 // adverse interaction with the algorithm's tuning. | 581 // adverse interaction with the algorithm's tuning. |
587 // TODO(bjornv): investigate further why this is so sensitive. | 582 // TODO(bjornv): investigate further why this is so sensitive. |
588 aec->sx[i] = | 583 aec->sx[i] = ptrGCoh[0] * aec->sx[i] + |
589 ptrGCoh[0] * aec->sx[i] + | 584 ptrGCoh[1] * WEBRTC_SPL_MAX( |
590 ptrGCoh[1] * WEBRTC_SPL_MAX( | 585 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], |
591 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], | 586 WebRtcAec_kMinFarendPSD); |
592 WebRtcAec_kMinFarendPSD); | |
593 | 587 |
594 aec->sde[i][0] = | 588 aec->sde[i][0] = |
595 ptrGCoh[0] * aec->sde[i][0] + | 589 ptrGCoh[0] * aec->sde[i][0] + |
596 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); | 590 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); |
597 aec->sde[i][1] = | 591 aec->sde[i][1] = |
598 ptrGCoh[0] * aec->sde[i][1] + | 592 ptrGCoh[0] * aec->sde[i][1] + |
599 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); | 593 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); |
600 | 594 |
601 aec->sxd[i][0] = | 595 aec->sxd[i][0] = |
602 ptrGCoh[0] * aec->sxd[i][0] + | 596 ptrGCoh[0] * aec->sxd[i][0] + |
(...skipping 18 matching lines...)
621 static void WindowDataSSE2(float* x_windowed, const float* x) { | 615 static void WindowDataSSE2(float* x_windowed, const float* x) { |
622 int i; | 616 int i; |
623 for (i = 0; i < PART_LEN; i += 4) { | 617 for (i = 0; i < PART_LEN; i += 4) { |
624 const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); | 618 const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); |
625 const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); | 619 const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); |
626 const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); | 620 const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); |
627 // A B C D | 621 // A B C D |
628 __m128 vec_sqrtHanning_rev = | 622 __m128 vec_sqrtHanning_rev = |
629 _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); | 623 _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); |
630 // D C B A | 624 // D C B A |
631 vec_sqrtHanning_rev = | 625 vec_sqrtHanning_rev = _mm_shuffle_ps( |
632 _mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev, | 626 vec_sqrtHanning_rev, vec_sqrtHanning_rev, _MM_SHUFFLE(0, 1, 2, 3)); |
633 _MM_SHUFFLE(0, 1, 2, 3)); | |
634 _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); | 627 _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); |
635 _mm_storeu_ps(&x_windowed[PART_LEN + i], | 628 _mm_storeu_ps(&x_windowed[PART_LEN + i], |
636 _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); | 629 _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); |
637 } | 630 } |
638 } | 631 } |
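Scalar view of the windowing above (a sketch): the first half-frame is multiplied by the square-root Hanning window and the second half by the same window reversed, so overlapped analysis frames sum back to unity.

    for (i = 0; i < PART_LEN; i++) {
      x_windowed[i] = x[i] * WebRtcAec_sqrtHanning[i];
      x_windowed[PART_LEN + i] =
          x[PART_LEN + i] * WebRtcAec_sqrtHanning[PART_LEN - i];
    }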
639 | 632 |
640 // Puts fft output data into a complex valued array. | 633 // Puts fft output data into a complex valued array. |
641 static void StoreAsComplexSSE2(const float* data, | 634 static void StoreAsComplexSSE2(const float* data, |
642 float data_complex[2][PART_LEN1]) { | 635 float data_complex[2][PART_LEN1]) { |
643 int i; | 636 int i; |
644 for (i = 0; i < PART_LEN; i += 4) { | 637 for (i = 0; i < PART_LEN; i += 4) { |
645 const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); | 638 const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); |
646 const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); | 639 const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); |
647 const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4, | 640 const __m128 vec_a = |
648 _MM_SHUFFLE(2, 0, 2, 0)); | 641 _mm_shuffle_ps(vec_fft0, vec_fft4, _MM_SHUFFLE(2, 0, 2, 0)); |
649 const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4, | 642 const __m128 vec_b = |
650 _MM_SHUFFLE(3, 1, 3, 1)); | 643 _mm_shuffle_ps(vec_fft0, vec_fft4, _MM_SHUFFLE(3, 1, 3, 1)); |
651 _mm_storeu_ps(&data_complex[0][i], vec_a); | 644 _mm_storeu_ps(&data_complex[0][i], vec_a); |
652 _mm_storeu_ps(&data_complex[1][i], vec_b); | 645 _mm_storeu_ps(&data_complex[1][i], vec_b); |
653 } | 646 } |
654 // fix beginning/end values | 647 // fix beginning/end values |
655 data_complex[1][0] = 0; | 648 data_complex[1][0] = 0; |
656 data_complex[1][PART_LEN] = 0; | 649 data_complex[1][PART_LEN] = 0; |
657 data_complex[0][0] = data[0]; | 650 data_complex[0][0] = data[0]; |
658 data_complex[0][PART_LEN] = data[1]; | 651 data_complex[0][PART_LEN] = data[1]; |
659 } | 652 } |
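The shuffles above deinterleave the rdft output into separate real and imaginary planes. The scalar picture (a sketch), remembering that Ooura's packing stores the DC real part in data[0] and the Nyquist real part in data[1]:

    for (i = 1; i < PART_LEN; i++) {
      data_complex[0][i] = data[2 * i];      // real part of bin i
      data_complex[1][i] = data[2 * i + 1];  // imaginary part, Ooura's sign
    }
    data_complex[0][0] = data[0];         // DC bin, purely real
    data_complex[0][PART_LEN] = data[1];  // Nyquist bin, purely real
    data_complex[1][0] = 0;
    data_complex[1][PART_LEN] = 0;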
660 | 653 |
661 static void SubbandCoherenceSSE2(AecCore* aec, | 654 static void SubbandCoherenceSSE2(AecCore* aec, |
662 float efw[2][PART_LEN1], | 655 float efw[2][PART_LEN1], |
663 float dfw[2][PART_LEN1], | 656 float dfw[2][PART_LEN1], |
664 float xfw[2][PART_LEN1], | 657 float xfw[2][PART_LEN1], |
665 float* fft, | 658 float* fft, |
666 float* cohde, | 659 float* cohde, |
667 float* cohxd, | 660 float* cohxd, |
668 int* extreme_filter_divergence) { | 661 int* extreme_filter_divergence) { |
669 int i; | 662 int i; |
670 | 663 |
671 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); | 664 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); |
672 | 665 |
673 { | 666 { |
674 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); | 667 const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); |
675 | 668 |
676 // Subband coherence | 669 // Subband coherence |
677 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 670 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
678 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); | 671 const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); |
679 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); | 672 const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); |
680 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); | 673 const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); |
681 const __m128 vec_sdse = _mm_add_ps(vec_1eminus10, | 674 const __m128 vec_sdse = |
682 _mm_mul_ps(vec_sd, vec_se)); | 675 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); |
683 const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10, | 676 const __m128 vec_sdsx = |
684 _mm_mul_ps(vec_sd, vec_sx)); | 677 _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); |
685 const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); | 678 const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); |
686 const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | 679 const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); |
687 const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | 680 const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); |
688 const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | 681 const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); |
689 const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, | 682 const __m128 vec_sde_0 = |
690 _MM_SHUFFLE(2, 0, 2, 0)); | 683 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
691 const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, | 684 const __m128 vec_sde_1 = |
692 _MM_SHUFFLE(3, 1, 3, 1)); | 685 _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
693 const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, | 686 const __m128 vec_sxd_0 = |
694 _MM_SHUFFLE(2, 0, 2, 0)); | 687 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0)); |
695 const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, | 688 const __m128 vec_sxd_1 = |
696 _MM_SHUFFLE(3, 1, 3, 1)); | 689 _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1)); |
697 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); | 690 __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); |
698 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); | 691 __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); |
699 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); | 692 vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); |
700 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); | 693 vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); |
701 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); | 694 vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); |
702 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); | 695 vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); |
703 _mm_storeu_ps(&cohde[i], vec_cohde); | 696 _mm_storeu_ps(&cohde[i], vec_cohde); |
704 _mm_storeu_ps(&cohxd[i], vec_cohxd); | 697 _mm_storeu_ps(&cohxd[i], vec_cohxd); |
705 } | 698 } |
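Per bin, the vector loop above evaluates the magnitude-squared coherence, regularized by 1e-10 to guard against division by zero. In scalar form (a sketch):

    // cohde[i] = |sde[i]|^2 / (sd[i] * se[i] + 1e-10f)
    // cohxd[i] = |sxd[i]|^2 / (sd[i] * sx[i] + 1e-10f)
    cohde[i] =
        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
        (aec->sd[i] * aec->se[i] + 1e-10f);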
706 | 699 |
(...skipping 12 matching lines...)
719 void WebRtcAec_InitAec_SSE2(void) { | 712 void WebRtcAec_InitAec_SSE2(void) { |
720 WebRtcAec_FilterFar = FilterFarSSE2; | 713 WebRtcAec_FilterFar = FilterFarSSE2; |
721 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 714 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
722 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 715 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
723 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 716 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
724 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 717 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
725 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; | 718 WebRtcAec_StoreAsComplex = StoreAsComplexSSE2; |
726 WebRtcAec_PartitionDelay = PartitionDelaySSE2; | 719 WebRtcAec_PartitionDelay = PartitionDelaySSE2; |
727 WebRtcAec_WindowData = WindowDataSSE2; | 720 WebRtcAec_WindowData = WindowDataSSE2; |
728 } | 721 } |
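These assignments rebind the module's function pointers to the SSE2 kernels. The call site is expected to gate on CPU support, roughly like this (a sketch; the exact guard in aec_core.c is an assumption here):

    #if defined(WEBRTC_ARCH_X86_FAMILY)
      if (WebRtc_GetCPUInfo(kSSE2)) {
        WebRtcAec_InitAec_SSE2();
      }
    #endif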