OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 11 matching lines...)
22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" |
23 | 23 |
24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { | 24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { |
25 return aRe * bRe - aIm * bIm; | 25 return aRe * bRe - aIm * bIm; |
26 } | 26 } |
27 | 27 |
28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { | 28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { |
29 return aRe * bIm + aIm * bRe; | 29 return aRe * bIm + aIm * bRe; |
30 } | 30 } |
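
Review note: MulRe and MulIm are the two halves of a complex multiply, (aRe + i*aIm)(bRe + i*bIm) = (aRe*bRe - aIm*bIm) + i*(aRe*bIm + aIm*bRe). A minimal sketch of how they compose (ComplexMul is a hypothetical helper, not part of this file):

    /* Hypothetical helper: writes the complex product a * b to (cRe, cIm). */
    static void ComplexMul(float aRe, float aIm, float bRe, float bIm,
                           float* cRe, float* cIm) {
      *cRe = MulRe(aRe, aIm, bRe, bIm);  /* aRe * bRe - aIm * bIm */
      *cIm = MulIm(aRe, aIm, bRe, bIm);  /* aRe * bIm + aIm * bRe */
    }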
31 | 31 |
32 static void FilterFarSSE2(int num_partitions, | 32 static void FilterFarSSE2( |
33 int xfBufBlockPos, | 33 int num_partitions, |
34 float xfBuf[2][kExtendedNumPartitions * PART_LEN1], | 34 int x_fft_buf_block_pos, |
35 float wfBuf[2][kExtendedNumPartitions * PART_LEN1], | 35 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], |
36 float yf[2][PART_LEN1]) { | 36 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], |
| 37 float y_fft[2][PART_LEN1]) { |
37 | 38 |
38 int i; | 39 int i; |
39 for (i = 0; i < num_partitions; i++) { | 40 for (i = 0; i < num_partitions; i++) { |
40 int j; | 41 int j; |
41 int xPos = (i + xfBufBlockPos) * PART_LEN1; | 42 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; |
42 int pos = i * PART_LEN1; | 43 int pos = i * PART_LEN1; |
43 // Check for wrap | 44 // Check for wrap |
44 if (i + xfBufBlockPos >= num_partitions) { | 45 if (i + x_fft_buf_block_pos >= num_partitions) { |
45 xPos -= num_partitions * (PART_LEN1); | 46 xPos -= num_partitions * (PART_LEN1); |
46 } | 47 } |
47 | 48 |
48 // vectorized code (four at once) | 49 // vectorized code (four at once) |
49 for (j = 0; j + 3 < PART_LEN1; j += 4) { | 50 for (j = 0; j + 3 < PART_LEN1; j += 4) { |
50 const __m128 xfBuf_re = _mm_loadu_ps(&xfBuf[0][xPos + j]); | 51 const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]); |
51 const __m128 xfBuf_im = _mm_loadu_ps(&xfBuf[1][xPos + j]); | 52 const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]); |
52 const __m128 wfBuf_re = _mm_loadu_ps(&wfBuf[0][pos + j]); | 53 const __m128 h_fft_buf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]); |
53 const __m128 wfBuf_im = _mm_loadu_ps(&wfBuf[1][pos + j]); | 54 const __m128 h_fft_buf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]); |
54 const __m128 yf_re = _mm_loadu_ps(&yf[0][j]); | 55 const __m128 y_fft_re = _mm_loadu_ps(&y_fft[0][j]); |
55 const __m128 yf_im = _mm_loadu_ps(&yf[1][j]); | 56 const __m128 y_fft_im = _mm_loadu_ps(&y_fft[1][j]); |
56 const __m128 a = _mm_mul_ps(xfBuf_re, wfBuf_re); | 57 const __m128 a = _mm_mul_ps(x_fft_buf_re, h_fft_buf_re); |
57 const __m128 b = _mm_mul_ps(xfBuf_im, wfBuf_im); | 58 const __m128 b = _mm_mul_ps(x_fft_buf_im, h_fft_buf_im); |
58 const __m128 c = _mm_mul_ps(xfBuf_re, wfBuf_im); | 59 const __m128 c = _mm_mul_ps(x_fft_buf_re, h_fft_buf_im); |
59 const __m128 d = _mm_mul_ps(xfBuf_im, wfBuf_re); | 60 const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re); |
60 const __m128 e = _mm_sub_ps(a, b); | 61 const __m128 e = _mm_sub_ps(a, b); |
61 const __m128 f = _mm_add_ps(c, d); | 62 const __m128 f = _mm_add_ps(c, d); |
62 const __m128 g = _mm_add_ps(yf_re, e); | 63 const __m128 g = _mm_add_ps(y_fft_re, e); |
63 const __m128 h = _mm_add_ps(yf_im, f); | 64 const __m128 h = _mm_add_ps(y_fft_im, f); |
64 _mm_storeu_ps(&yf[0][j], g); | 65 _mm_storeu_ps(&y_fft[0][j], g); |
65 _mm_storeu_ps(&yf[1][j], h); | 66 _mm_storeu_ps(&y_fft[1][j], h); |
66 } | 67 } |
67 // scalar code for the remaining items. | 68 // scalar code for the remaining items. |
68 for (; j < PART_LEN1; j++) { | 69 for (; j < PART_LEN1; j++) { |
69 yf[0][j] += MulRe(xfBuf[0][xPos + j], | 70 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], |
70 xfBuf[1][xPos + j], | 71 x_fft_buf[1][xPos + j], |
71 wfBuf[0][pos + j], | 72 h_fft_buf[0][pos + j], |
72 wfBuf[1][pos + j]); | 73 h_fft_buf[1][pos + j]); |
73 yf[1][j] += MulIm(xfBuf[0][xPos + j], | 74 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], |
74 xfBuf[1][xPos + j], | 75 x_fft_buf[1][xPos + j], |
75 wfBuf[0][pos + j], | 76 h_fft_buf[0][pos + j], |
76 wfBuf[1][pos + j]); | 77 h_fft_buf[1][pos + j]); |
77 } | 78 } |
78 } | 79 } |
79 } | 80 } |
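
Review note: FilterFarSSE2 accumulates the frequency-domain echo estimate y_fft += X * H across all filter partitions, four bins per SSE2 iteration plus a scalar tail (PART_LEN1 = PART_LEN + 1 is not a multiple of 4). The xPos wrap is just modular indexing into the circular far-end history; a sketch of the equivalent index computation, assuming the buffer holds num_partitions blocks:

    /* Equivalent modular form of the wrap check above (sketch only). */
    int block = (x_fft_buf_block_pos + i) % num_partitions;
    int xPos = block * PART_LEN1;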
80 | 81 |
81 static void ScaleErrorSignalSSE2(int extended_filter_enabled, | 82 static void ScaleErrorSignalSSE2(int extended_filter_enabled, |
82 float normal_mu, | 83 float normal_mu, |
83 float normal_error_threshold, | 84 float normal_error_threshold, |
84 float *x_pow, | 85 float x_pow[PART_LEN1], |
85 float ef[2][PART_LEN1]) { | 86 float ef[2][PART_LEN1]) { |
86 const __m128 k1e_10f = _mm_set1_ps(1e-10f); | 87 const __m128 k1e_10f = _mm_set1_ps(1e-10f); |
87 const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) | 88 const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) |
88 : _mm_set1_ps(normal_mu); | 89 : _mm_set1_ps(normal_mu); |
89 const __m128 kThresh = extended_filter_enabled | 90 const __m128 kThresh = extended_filter_enabled |
90 ? _mm_set1_ps(kExtendedErrorThreshold) | 91 ? _mm_set1_ps(kExtendedErrorThreshold) |
91 : _mm_set1_ps(normal_error_threshold); | 92 : _mm_set1_ps(normal_error_threshold); |
92 | 93 |
93 int i; | 94 int i; |
94 // vectorized code (four at once) | 95 // vectorized code (four at once) |
(...skipping 45 matching lines...)
140 ef[1][i] *= abs_ef; | 141 ef[1][i] *= abs_ef; |
141 } | 142 } |
142 | 143 |
143 // Stepsize factor | 144 // Stepsize factor |
144 ef[0][i] *= mu; | 145 ef[0][i] *= mu; |
145 ef[1][i] *= mu; | 146 ef[1][i] *= mu; |
146 } | 147 } |
147 } | 148 } |
148 } | 149 } |
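
Review note: the elided vector body normalizes the error spectrum by the far-end power, clamps its magnitude to the error threshold, and applies the step size; the visible scalar remainder loop does the same per bin. A hedged scalar sketch of the whole per-bin computation (mu and error_threshold stand for the extended-or-normal values selected above; assumes <math.h> for sqrtf):

    for (i = 0; i < PART_LEN1; i++) {
      float abs_ef;
      ef[0][i] /= (x_pow[i] + 1e-10f);  /* normalize by far-end power */
      ef[1][i] /= (x_pow[i] + 1e-10f);
      abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
      if (abs_ef > error_threshold) {   /* clamp the error magnitude */
        abs_ef = error_threshold / (abs_ef + 1e-10f);
        ef[0][i] *= abs_ef;
        ef[1][i] *= abs_ef;
      }
      /* Stepsize factor */
      ef[0][i] *= mu;
      ef[1][i] *= mu;
    }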
149 | 150 |
150 static void FilterAdaptationSSE2(AecCore* aec, | 151 static void FilterAdaptationSSE2( |
151 float* fft, | 152 int num_partitions, |
152 float ef[2][PART_LEN1]) { | 153 int x_fft_buf_block_pos, |
| 154 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], |
| 155 float e_fft[2][PART_LEN1], |
| 156 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { |
| 157 float fft[PART_LEN2]; |
153 int i, j; | 158 int i, j; |
154 const int num_partitions = aec->num_partitions; | |
155 for (i = 0; i < num_partitions; i++) { | 159 for (i = 0; i < num_partitions; i++) { |
156 int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1); | 160 int xPos = (i + x_fft_buf_block_pos) * (PART_LEN1); |
157 int pos = i * PART_LEN1; | 161 int pos = i * PART_LEN1; |
158 // Check for wrap | 162 // Check for wrap |
159 if (i + aec->xfBufBlockPos >= num_partitions) { | 163 if (i + x_fft_buf_block_pos >= num_partitions) { |
160 xPos -= num_partitions * PART_LEN1; | 164 xPos -= num_partitions * PART_LEN1; |
161 } | 165 } |
162 | 166 |
163 // Process the whole array... | 167 // Process the whole array... |
164 for (j = 0; j < PART_LEN; j += 4) { | 168 for (j = 0; j < PART_LEN; j += 4) { |
165 // Load xfBuf and ef. | 169 // Load x_fft_buf and e_fft. |
166 const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]); | 170 const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]); |
167 const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]); | 171 const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]); |
168 const __m128 ef_re = _mm_loadu_ps(&ef[0][j]); | 172 const __m128 e_fft_re = _mm_loadu_ps(&e_fft[0][j]); |
169 const __m128 ef_im = _mm_loadu_ps(&ef[1][j]); | 173 const __m128 e_fft_im = _mm_loadu_ps(&e_fft[1][j]); |
170 // Calculate the product of conjugate(xfBuf) and ef. | 174 // Calculate the product of conjugate(x_fft_buf) and e_fft. |
171 // re(conjugate(a) * b) = aRe * bRe + aIm * bIm | 175 // re(conjugate(a) * b) = aRe * bRe + aIm * bIm |
172 // im(conjugate(a) * b) = aRe * bIm - aIm * bRe | 176 // im(conjugate(a) * b) = aRe * bIm - aIm * bRe |
173 const __m128 a = _mm_mul_ps(xfBuf_re, ef_re); | 177 const __m128 a = _mm_mul_ps(x_fft_buf_re, e_fft_re); |
174 const __m128 b = _mm_mul_ps(xfBuf_im, ef_im); | 178 const __m128 b = _mm_mul_ps(x_fft_buf_im, e_fft_im); |
175 const __m128 c = _mm_mul_ps(xfBuf_re, ef_im); | 179 const __m128 c = _mm_mul_ps(x_fft_buf_re, e_fft_im); |
176 const __m128 d = _mm_mul_ps(xfBuf_im, ef_re); | 180 const __m128 d = _mm_mul_ps(x_fft_buf_im, e_fft_re); |
177 const __m128 e = _mm_add_ps(a, b); | 181 const __m128 e = _mm_add_ps(a, b); |
178 const __m128 f = _mm_sub_ps(c, d); | 182 const __m128 f = _mm_sub_ps(c, d); |
179 // Interleave real and imaginary parts. | 183 // Interleave real and imaginary parts. |
180 const __m128 g = _mm_unpacklo_ps(e, f); | 184 const __m128 g = _mm_unpacklo_ps(e, f); |
181 const __m128 h = _mm_unpackhi_ps(e, f); | 185 const __m128 h = _mm_unpackhi_ps(e, f); |
182 // Store | 186 // Store |
183 _mm_storeu_ps(&fft[2 * j + 0], g); | 187 _mm_storeu_ps(&fft[2 * j + 0], g); |
184 _mm_storeu_ps(&fft[2 * j + 4], h); | 188 _mm_storeu_ps(&fft[2 * j + 4], h); |
185 } | 189 } |
186 // ... and fixup the first imaginary entry. | 190 // ... and fixup the first imaginary entry. |
187 fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN], | 191 fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], |
188 -aec->xfBuf[1][xPos + PART_LEN], | 192 -x_fft_buf[1][xPos + PART_LEN], |
189 ef[0][PART_LEN], | 193 e_fft[0][PART_LEN], |
190 ef[1][PART_LEN]); | 194 e_fft[1][PART_LEN]); |
191 | 195 |
192 aec_rdft_inverse_128(fft); | 196 aec_rdft_inverse_128(fft); |
193 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); | 197 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); |
194 | 198 |
195 // fft scaling | 199 // fft scaling |
196 { | 200 { |
197 float scale = 2.0f / PART_LEN2; | 201 float scale = 2.0f / PART_LEN2; |
198 const __m128 scale_ps = _mm_load_ps1(&scale); | 202 const __m128 scale_ps = _mm_load_ps1(&scale); |
199 for (j = 0; j < PART_LEN; j += 4) { | 203 for (j = 0; j < PART_LEN; j += 4) { |
200 const __m128 fft_ps = _mm_loadu_ps(&fft[j]); | 204 const __m128 fft_ps = _mm_loadu_ps(&fft[j]); |
201 const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps); | 205 const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps); |
202 _mm_storeu_ps(&fft[j], fft_scale); | 206 _mm_storeu_ps(&fft[j], fft_scale); |
203 } | 207 } |
204 } | 208 } |
205 aec_rdft_forward_128(fft); | 209 aec_rdft_forward_128(fft); |
206 | 210 |
207 { | 211 { |
208 float wt1 = aec->wfBuf[1][pos]; | 212 float wt1 = h_fft_buf[1][pos]; |
209 aec->wfBuf[0][pos + PART_LEN] += fft[1]; | 213 h_fft_buf[0][pos + PART_LEN] += fft[1]; |
210 for (j = 0; j < PART_LEN; j += 4) { | 214 for (j = 0; j < PART_LEN; j += 4) { |
211 __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); | 215 __m128 wtBuf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]); |
212 __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); | 216 __m128 wtBuf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]); |
213 const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]); | 217 const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]); |
214 const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]); | 218 const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]); |
215 const __m128 fft_re = | 219 const __m128 fft_re = |
216 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0)); | 220 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0)); |
217 const __m128 fft_im = | 221 const __m128 fft_im = |
218 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1)); | 222 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1)); |
219 wtBuf_re = _mm_add_ps(wtBuf_re, fft_re); | 223 wtBuf_re = _mm_add_ps(wtBuf_re, fft_re); |
220 wtBuf_im = _mm_add_ps(wtBuf_im, fft_im); | 224 wtBuf_im = _mm_add_ps(wtBuf_im, fft_im); |
221 _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re); | 225 _mm_storeu_ps(&h_fft_buf[0][pos + j], wtBuf_re); |
222 _mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im); | 226 _mm_storeu_ps(&h_fft_buf[1][pos + j], wtBuf_im); |
223 } | 227 } |
224 aec->wfBuf[1][pos] = wt1; | 228 h_fft_buf[1][pos] = wt1; |
225 } | 229 } |
226 } | 230 } |
227 } | 231 } |
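
Review note: each partition's update follows the constrained frequency-domain LMS recipe: correlate conj(X) with the error E, transform to the time domain, zero the second half of the block (the constraint that keeps each partition's impulse response PART_LEN samples long), rescale, transform back, and accumulate into H. A hedged scalar sketch of the conjugate correlation that fills fft[] before aec_rdft_inverse_128 (interleaved layout: fft[2*j] = re, fft[2*j+1] = im, with fft[1] carrying the Nyquist bin):

    for (j = 0; j < PART_LEN; j++) {
      /* conj(x) * e: negating the imaginary part of x conjugates it. */
      fft[2 * j]     = MulRe(x_fft_buf[0][xPos + j], -x_fft_buf[1][xPos + j],
                             e_fft[0][j], e_fft[1][j]);
      fft[2 * j + 1] = MulIm(x_fft_buf[0][xPos + j], -x_fft_buf[1][xPos + j],
                             e_fft[0][j], e_fft[1][j]);
    }
    fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN],
                   e_fft[0][PART_LEN], e_fft[1][PART_LEN]);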
228 | 232 |
229 static __m128 mm_pow_ps(__m128 a, __m128 b) { | 233 static __m128 mm_pow_ps(__m128 a, __m128 b) { |
230 // a^b = exp2(b * log2(a)) | 234 // a^b = exp2(b * log2(a)) |
231 // exp2(x) and log2(x) are calculated using polynomial approximations. | 235 // exp2(x) and log2(x) are calculated using polynomial approximations. |
232 __m128 log2_a, b_log2_a, a_exp_b; | 236 __m128 log2_a, b_log2_a, a_exp_b; |
233 | 237 |
234 // Calculate log2(x), x = a. | 238 // Calculate log2(x), x = a. |
(...skipping 495 matching lines...)
730 } | 734 } |
731 } | 735 } |
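
Review note on mm_pow_ps above: it vectorizes the identity a^b = exp2(b * log2(a)), replacing exp2 and log2 with polynomial approximations. The scalar equivalent, valid for a > 0 (a libm-based sketch, not the approximation the SSE2 code uses):

    #include <math.h>

    static float pow_via_exp2(float a, float b) {
      /* a^b = exp2(b * log2(a)), defined for a > 0. */
      return exp2f(b * log2f(a));
    }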
732 | 736 |
733 void WebRtcAec_InitAec_SSE2(void) { | 737 void WebRtcAec_InitAec_SSE2(void) { |
734 WebRtcAec_FilterFar = FilterFarSSE2; | 738 WebRtcAec_FilterFar = FilterFarSSE2; |
735 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 739 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
736 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 740 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
737 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 741 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
738 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 742 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
739 } | 743 } |
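
Review note: WebRtcAec_FilterFar and the other names assigned here are module-level function pointers, so this init routine swaps the generic C kernels for their SSE2 counterparts once at startup. A hedged sketch of the dispatch pattern (FilterFarC and the WebRtcAecFilterFar typedef are illustrative names, not the file's actual declarations; the CPU check mirrors WebRtc_GetCPUInfo from the cpu_features module):

    typedef void (*WebRtcAecFilterFar)(
        int num_partitions,
        int x_fft_buf_block_pos,
        float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
        float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
        float y_fft[2][PART_LEN1]);

    WebRtcAecFilterFar WebRtcAec_FilterFar = FilterFarC;  /* generic default */

    void WebRtcAec_InitAec(void) {
      WebRtcAec_FilterFar = FilterFarC;
    #if defined(WEBRTC_ARCH_X86_FAMILY)
      if (WebRtc_GetCPUInfo(kSSE2)) {
        WebRtcAec_InitAec_SSE2();  /* override with the SSE2 variants */
      }
    #endif
    }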