OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 11 matching lines...) Expand all Loading... | |
22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" |
23 | 23 |
24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { | 24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { |
25 return aRe * bRe - aIm * bIm; | 25 return aRe * bRe - aIm * bIm; |
26 } | 26 } |
27 | 27 |
28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { | 28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { |
29 return aRe * bIm + aIm * bRe; | 29 return aRe * bIm + aIm * bRe; |
30 } | 30 } |
31 | 31 |
32 static void FilterFarSSE2(int num_partitions, | 32 static void FilterFarSSE2( |
33 int xfBufBlockPos, | 33 int num_partitions, |
34 float xfBuf[2][kExtendedNumPartitions * PART_LEN1], | 34 int x_fft_buf_block_pos, |
35 float wfBuf[2][kExtendedNumPartitions * PART_LEN1], | 35 const float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], |
36 float yf[2][PART_LEN1]) { | 36 const float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], |
37 float y_fft[2][PART_LEN1]) { | |
37 | 38 |
38 int i; | 39 int i; |
39 const int num_partitions_local = num_partitions; | 40 const int num_partitions_local = num_partitions; |
40 for (i = 0; i < num_partitions_local; i++) { | 41 for (i = 0; i < num_partitions_local; i++) { |
41 int j; | 42 int j; |
42 int xPos = (i + xfBufBlockPos) * PART_LEN1; | 43 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; |
43 int pos = i * PART_LEN1; | 44 int pos = i * PART_LEN1; |
44 // Check for wrap | 45 // Check for wrap |
45 if (i + xfBufBlockPos >= num_partitions_local) { | 46 if (i + x_fft_buf_block_pos >= num_partitions_local) { |
46 xPos -= num_partitions_local * (PART_LEN1); | 47 xPos -= num_partitions_local * (PART_LEN1); |
47 } | 48 } |
48 | 49 |
49 // vectorized code (four at once) | 50 // vectorized code (four at once) |
50 for (j = 0; j + 3 < PART_LEN1; j += 4) { | 51 for (j = 0; j + 3 < PART_LEN1; j += 4) { |
51 const __m128 xfBuf_re = _mm_loadu_ps(&xfBuf[0][xPos + j]); | 52 const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]); |
52 const __m128 xfBuf_im = _mm_loadu_ps(&xfBuf[1][xPos + j]); | 53 const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]); |
53 const __m128 wfBuf_re = _mm_loadu_ps(&wfBuf[0][pos + j]); | 54 const __m128 h_fft_buf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]); |
54 const __m128 wfBuf_im = _mm_loadu_ps(&wfBuf[1][pos + j]); | 55 const __m128 h_fft_buf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]); |
55 const __m128 yf_re = _mm_loadu_ps(&yf[0][j]); | 56 const __m128 y_fft_re = _mm_loadu_ps(&y_fft[0][j]); |
56 const __m128 yf_im = _mm_loadu_ps(&yf[1][j]); | 57 const __m128 y_fft_im = _mm_loadu_ps(&y_fft[1][j]); |
57 const __m128 a = _mm_mul_ps(xfBuf_re, wfBuf_re); | 58 const __m128 a = _mm_mul_ps(x_fft_buf_re, h_fft_buf_re); |
58 const __m128 b = _mm_mul_ps(xfBuf_im, wfBuf_im); | 59 const __m128 b = _mm_mul_ps(x_fft_buf_im, h_fft_buf_im); |
59 const __m128 c = _mm_mul_ps(xfBuf_re, wfBuf_im); | 60 const __m128 c = _mm_mul_ps(x_fft_buf_re, h_fft_buf_im); |
60 const __m128 d = _mm_mul_ps(xfBuf_im, wfBuf_re); | 61 const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re); |
61 const __m128 e = _mm_sub_ps(a, b); | 62 const __m128 e = _mm_sub_ps(a, b); |
62 const __m128 f = _mm_add_ps(c, d); | 63 const __m128 f = _mm_add_ps(c, d); |
63 const __m128 g = _mm_add_ps(yf_re, e); | 64 const __m128 g = _mm_add_ps(y_fft_re, e); |
64 const __m128 h = _mm_add_ps(yf_im, f); | 65 const __m128 h = _mm_add_ps(y_fft_im, f); |
65 _mm_storeu_ps(&yf[0][j], g); | 66 _mm_storeu_ps(&y_fft[0][j], g); |
66 _mm_storeu_ps(&yf[1][j], h); | 67 _mm_storeu_ps(&y_fft[1][j], h); |
67 } | 68 } |
68 // scalar code for the remaining items. | 69 // scalar code for the remaining items. |
69 for (; j < PART_LEN1; j++) { | 70 for (; j < PART_LEN1; j++) { |
70 yf[0][j] += MulRe(xfBuf[0][xPos + j], | 71 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], |
71 xfBuf[1][xPos + j], | 72 x_fft_buf[1][xPos + j], |
72 wfBuf[0][pos + j], | 73 h_fft_buf[0][pos + j], |
73 wfBuf[1][pos + j]); | 74 h_fft_buf[1][pos + j]); |
74 yf[1][j] += MulIm(xfBuf[0][xPos + j], | 75 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], |
75 xfBuf[1][xPos + j], | 76 x_fft_buf[1][xPos + j], |
76 wfBuf[0][pos + j], | 77 h_fft_buf[0][pos + j], |
77 wfBuf[1][pos + j]); | 78 h_fft_buf[1][pos + j]); |
78 } | 79 } |
79 } | 80 } |
80 } | 81 } |
81 | 82 |
82 static void ScaleErrorSignalSSE2(int extended_filter_enabled, | 83 static void ScaleErrorSignalSSE2(int extended_filter_enabled, |
83 float normal_mu, | 84 float normal_mu, |
84 float normal_error_threshold, | 85 float normal_error_threshold, |
85 float *xPow, | 86 const float xPow[PART_LEN1], |
[inline review comment] hlundin-webrtc (2015/11/24 13:51:44): rename to `x_pow` — peah-webrtc (2015/11/26 05:55:17): Done.
| |
86 float ef[2][PART_LEN1]) { | 87 float ef[2][PART_LEN1]) { |
87 const __m128 k1e_10f = _mm_set1_ps(1e-10f); | 88 const __m128 k1e_10f = _mm_set1_ps(1e-10f); |
88 const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) | 89 const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) |
89 : _mm_set1_ps(normal_mu); | 90 : _mm_set1_ps(normal_mu); |
90 const __m128 kThresh = extended_filter_enabled | 91 const __m128 kThresh = extended_filter_enabled |
91 ? _mm_set1_ps(kExtendedErrorThreshold) | 92 ? _mm_set1_ps(kExtendedErrorThreshold) |
92 : _mm_set1_ps(normal_error_threshold); | 93 : _mm_set1_ps(normal_error_threshold); |
93 | 94 |
94 int i; | 95 int i; |
95 // vectorized code (four at once) | 96 // vectorized code (four at once) |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
141 ef[1][i] *= abs_ef; | 142 ef[1][i] *= abs_ef; |
142 } | 143 } |
143 | 144 |
144 // Stepsize factor | 145 // Stepsize factor |
145 ef[0][i] *= mu; | 146 ef[0][i] *= mu; |
146 ef[1][i] *= mu; | 147 ef[1][i] *= mu; |
147 } | 148 } |
148 } | 149 } |
149 } | 150 } |
150 | 151 |
151 static void FilterAdaptationSSE2(AecCore* aec, | 152 static void FilterAdaptationSSE2( |
152 float* fft, | 153 int num_partitions, |
153 float ef[2][PART_LEN1]) { | 154 int x_fft_buf_block_pos, |
155 const float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], | |
156 const float e_fft[2][PART_LEN1], | |
157 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { | |
158 float fft[PART_LEN2]; | |
154 int i, j; | 159 int i, j; |
155 const int num_partitions = aec->num_partitions; | |
156 for (i = 0; i < num_partitions; i++) { | 160 for (i = 0; i < num_partitions; i++) { |
157 int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1); | 161 int xPos = (i + x_fft_buf_block_pos) * (PART_LEN1); |
158 int pos = i * PART_LEN1; | 162 int pos = i * PART_LEN1; |
159 // Check for wrap | 163 // Check for wrap |
160 if (i + aec->xfBufBlockPos >= num_partitions) { | 164 if (i + x_fft_buf_block_pos >= num_partitions) { |
161 xPos -= num_partitions * PART_LEN1; | 165 xPos -= num_partitions * PART_LEN1; |
162 } | 166 } |
163 | 167 |
164 // Process the whole array... | 168 // Process the whole array... |
165 for (j = 0; j < PART_LEN; j += 4) { | 169 for (j = 0; j < PART_LEN; j += 4) { |
166 // Load xfBuf and ef. | 170 // Load x_fft_buf and e_fft. |
167 const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]); | 171 const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]); |
168 const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]); | 172 const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]); |
169 const __m128 ef_re = _mm_loadu_ps(&ef[0][j]); | 173 const __m128 e_fft_re = _mm_loadu_ps(&e_fft[0][j]); |
170 const __m128 ef_im = _mm_loadu_ps(&ef[1][j]); | 174 const __m128 e_fft_im = _mm_loadu_ps(&e_fft[1][j]); |
171 // Calculate the product of conjugate(xfBuf) by ef. | 175 // Calculate the product of conjugate(x_fft_buf) by e_fft. |
172 // re(conjugate(a) * b) = aRe * bRe + aIm * bIm | 176 // re(conjugate(a) * b) = aRe * bRe + aIm * bIm |
173 // im(conjugate(a) * b)= aRe * bIm - aIm * bRe | 177 // im(conjugate(a) * b)= aRe * bIm - aIm * bRe |
174 const __m128 a = _mm_mul_ps(xfBuf_re, ef_re); | 178 const __m128 a = _mm_mul_ps(x_fft_buf_re, e_fft_re); |
175 const __m128 b = _mm_mul_ps(xfBuf_im, ef_im); | 179 const __m128 b = _mm_mul_ps(x_fft_buf_im, e_fft_im); |
176 const __m128 c = _mm_mul_ps(xfBuf_re, ef_im); | 180 const __m128 c = _mm_mul_ps(x_fft_buf_re, e_fft_im); |
177 const __m128 d = _mm_mul_ps(xfBuf_im, ef_re); | 181 const __m128 d = _mm_mul_ps(x_fft_buf_im, e_fft_re); |
178 const __m128 e = _mm_add_ps(a, b); | 182 const __m128 e = _mm_add_ps(a, b); |
179 const __m128 f = _mm_sub_ps(c, d); | 183 const __m128 f = _mm_sub_ps(c, d); |
180 // Interleave real and imaginary parts. | 184 // Interleave real and imaginary parts. |
181 const __m128 g = _mm_unpacklo_ps(e, f); | 185 const __m128 g = _mm_unpacklo_ps(e, f); |
182 const __m128 h = _mm_unpackhi_ps(e, f); | 186 const __m128 h = _mm_unpackhi_ps(e, f); |
183 // Store | 187 // Store |
184 _mm_storeu_ps(&fft[2 * j + 0], g); | 188 _mm_storeu_ps(&fft[2 * j + 0], g); |
185 _mm_storeu_ps(&fft[2 * j + 4], h); | 189 _mm_storeu_ps(&fft[2 * j + 4], h); |
186 } | 190 } |
187 // ... and fixup the first imaginary entry. | 191 // ... and fixup the first imaginary entry. |
188 fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN], | 192 fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], |
189 -aec->xfBuf[1][xPos + PART_LEN], | 193 -x_fft_buf[1][xPos + PART_LEN], |
190 ef[0][PART_LEN], | 194 e_fft[0][PART_LEN], |
191 ef[1][PART_LEN]); | 195 e_fft[1][PART_LEN]); |
192 | 196 |
193 aec_rdft_inverse_128(fft); | 197 aec_rdft_inverse_128(fft); |
194 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); | 198 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); |
195 | 199 |
196 // fft scaling | 200 // fft scaling |
197 { | 201 { |
198 float scale = 2.0f / PART_LEN2; | 202 float scale = 2.0f / PART_LEN2; |
199 const __m128 scale_ps = _mm_load_ps1(&scale); | 203 const __m128 scale_ps = _mm_load_ps1(&scale); |
200 for (j = 0; j < PART_LEN; j += 4) { | 204 for (j = 0; j < PART_LEN; j += 4) { |
201 const __m128 fft_ps = _mm_loadu_ps(&fft[j]); | 205 const __m128 fft_ps = _mm_loadu_ps(&fft[j]); |
202 const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps); | 206 const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps); |
203 _mm_storeu_ps(&fft[j], fft_scale); | 207 _mm_storeu_ps(&fft[j], fft_scale); |
204 } | 208 } |
205 } | 209 } |
206 aec_rdft_forward_128(fft); | 210 aec_rdft_forward_128(fft); |
207 | 211 |
208 { | 212 { |
209 float wt1 = aec->wfBuf[1][pos]; | 213 float wt1 = h_fft_buf[1][pos]; |
210 aec->wfBuf[0][pos + PART_LEN] += fft[1]; | 214 h_fft_buf[0][pos + PART_LEN] += fft[1]; |
211 for (j = 0; j < PART_LEN; j += 4) { | 215 for (j = 0; j < PART_LEN; j += 4) { |
212 __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); | 216 __m128 wtBuf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]); |
213 __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); | 217 __m128 wtBuf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]); |
214 const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]); | 218 const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]); |
215 const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]); | 219 const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]); |
216 const __m128 fft_re = | 220 const __m128 fft_re = |
217 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0)); | 221 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0)); |
218 const __m128 fft_im = | 222 const __m128 fft_im = |
219 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1)); | 223 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1)); |
220 wtBuf_re = _mm_add_ps(wtBuf_re, fft_re); | 224 wtBuf_re = _mm_add_ps(wtBuf_re, fft_re); |
221 wtBuf_im = _mm_add_ps(wtBuf_im, fft_im); | 225 wtBuf_im = _mm_add_ps(wtBuf_im, fft_im); |
222 _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re); | 226 _mm_storeu_ps(&h_fft_buf[0][pos + j], wtBuf_re); |
223 _mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im); | 227 _mm_storeu_ps(&h_fft_buf[1][pos + j], wtBuf_im); |
224 } | 228 } |
225 aec->wfBuf[1][pos] = wt1; | 229 h_fft_buf[1][pos] = wt1; |
226 } | 230 } |
227 } | 231 } |
228 } | 232 } |
229 | 233 |
230 static __m128 mm_pow_ps(__m128 a, __m128 b) { | 234 static __m128 mm_pow_ps(__m128 a, __m128 b) { |
231 // a^b = exp2(b * log2(a)) | 235 // a^b = exp2(b * log2(a)) |
232 // exp2(x) and log2(x) are calculated using polynomial approximations. | 236 // exp2(x) and log2(x) are calculated using polynomial approximations. |
233 __m128 log2_a, b_log2_a, a_exp_b; | 237 __m128 log2_a, b_log2_a, a_exp_b; |
234 | 238 |
235 // Calculate log2(x), x = a. | 239 // Calculate log2(x), x = a. |
(...skipping 495 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
731 } | 735 } |
732 } | 736 } |
733 | 737 |
734 void WebRtcAec_InitAec_SSE2(void) { | 738 void WebRtcAec_InitAec_SSE2(void) { |
735 WebRtcAec_FilterFar = FilterFarSSE2; | 739 WebRtcAec_FilterFar = FilterFarSSE2; |
736 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 740 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
737 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 741 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
738 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 742 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
739 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 743 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
740 } | 744 } |
OLD | NEW |