/*
 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/*
 * The core AEC algorithm, neon version of speed-critical functions.
 *
 * Based on aec_core_sse2.c.
 */

#include <arm_neon.h>
#include <math.h>
#include <string.h>  // memset

#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
#include "webrtc/modules/audio_processing/aec/aec_common.h"
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"

enum { kShiftExponentIntoTopMantissa = 8 };
enum { kFloatExponentShift = 23 };

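// Scalar helpers for one complex multiply:
//   (aRe + i*aIm) * (bRe + i*bIm) = (aRe*bRe - aIm*bIm) + i*(aRe*bIm + aIm*bRe)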
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bRe - aIm * bIm;
}

__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bIm + aIm * bRe;
}

static void FilterFarNEON(int num_partitions,
                          int x_fft_buf_block_pos,
                          float x_fft_buf[2]
                                         [kExtendedNumPartitions * PART_LEN1],
                          float h_fft_buf[2]
                                         [kExtendedNumPartitions * PART_LEN1],
                          float y_fft[2][PART_LEN1]) {
  int i;
  for (i = 0; i < num_partitions; i++) {
    int j;
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

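    // Each partition accumulates x * h into y, one complex
    // multiply-accumulate per frequency bin:
    //   re(x*h) = xRe*hRe - xIm*hIm,  im(x*h) = xRe*hIm + xIm*hRe.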
    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
      const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
      const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]);
      const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]);
      const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re);
      const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im);
      const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re);
      const float32x4_t g = vaddq_f32(y_fft_re, e);
      const float32x4_t h = vaddq_f32(y_fft_im, f);
      vst1q_f32(&y_fft[0][j], g);
      vst1q_f32(&y_fft[1][j], h);
    }
    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
      y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
    }
  }
}

// ARM64's arm_neon.h has already defined vdivq_f32 and vsqrtq_f32.
#if !defined(WEBRTC_ARCH_ARM64)
static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) {
  int i;
  float32x4_t x = vrecpeq_f32(b);
  // From the ARM documentation:
  // The Newton-Raphson iteration
  //   x[n+1] = x[n] * (2 - d * x[n])
  // converges to (1/d) if x0 is the result of VRECPE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrecpsq_f32(b, x), x);
  }
  // a/b = a * (1/b)
  return vmulq_f32(a, x);
}
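
// For reference, the same refinement in scalar form (an illustrative sketch,
// not part of this file's NEON path; `recip_estimate` is a hypothetical
// stand-in for the hardware VRECPE approximation):
//
//   float div_ref(float a, float b) {
//     float x = recip_estimate(b);   // ~ vrecpeq_f32
//     for (int k = 0; k < 2; k++)
//       x = x * (2.f - b * x);       // ~ vrecpsq_f32 then vmulq_f32
//     return a * x;                  // a/b = a * (1/b)
//   }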

static float32x4_t vsqrtq_f32(float32x4_t s) {
  int i;
  float32x4_t x = vrsqrteq_f32(s);

  // Code to handle sqrt(0).
  // If the input to sqrtf() is zero, a zero will be returned.
  // If the input to vrsqrteq_f32() is zero, positive infinity is returned.
  const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
  // check for divide by zero
  const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
  // zero out the positive infinity results
  x = vreinterpretq_f32_u32(
      vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(x)));
  // From the ARM documentation:
  // The Newton-Raphson iteration
  //   x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2
  // converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
  }
  // sqrt(s) = s * 1/sqrt(s)
  return vmulq_f32(s, x);
}
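
// For reference, the same refinement in scalar form (an illustrative sketch;
// `rsqrt_estimate` is a hypothetical stand-in for the hardware VRSQRTE
// approximation):
//
//   float sqrt_ref(float s) {
//     float x = rsqrt_estimate(s);        // ~ vrsqrteq_f32
//     for (int k = 0; k < 2; k++)
//       x = x * (3.f - s * x * x) / 2.f;  // ~ vrsqrtsq_f32 then vmulq_f32
//     return s * x;                       // sqrt(s) = s * (1/sqrt(s))
//   }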
#endif  // WEBRTC_ARCH_ARM64

static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled
                                    ? kExtendedErrorThreshold
                                    : normal_error_threshold;
  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
  const float32x4_t kMu = vmovq_n_f32(mu);
  const float32x4_t kThresh = vmovq_n_f32(error_threshold);
  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]);
    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
    const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f);
    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
    const float32x4_t absEf = vsqrtq_f32(ef_sum2);
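    // Branchless select: where |ef| exceeds the threshold, replace ef by
    // ef * (threshold / |ef|); elsewhere keep ef. The comparison mask blends
    // the two candidates bitwise, mirroring the scalar branch below.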
    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
    uint32x4_t ef_re_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re));
    uint32x4_t ef_im_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im));
    ef_re_if = vandq_u32(bigger, ef_re_if);
    ef_im_if = vandq_u32(bigger, ef_im_if);
    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
    ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
    vst1q_f32(&ef[0][i], ef_re);
    vst1q_f32(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    float abs_ef;
    ef[0][i] /= (x_pow[i] + 1e-10f);
    ef[1][i] /= (x_pow[i] + 1e-10f);
    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (abs_ef > error_threshold) {
      abs_ef = error_threshold / (abs_ef + 1e-10f);
      ef[0][i] *= abs_ef;
      ef[1][i] *= abs_ef;
    }

    // Stepsize factor
    ef[0][i] *= mu;
    ef[1][i] *= mu;
  }
}

static void FilterAdaptationNEON(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float e_fft[2][PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
  float fft[PART_LEN2];
  int i;
  for (i = 0; i < num_partitions; i++) {
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    int j;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // Process the whole array...
    for (j = 0; j < PART_LEN; j += 4) {
      // Load x_fft_buf and e_fft.
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t e_fft_re = vld1q_f32(&e_fft[0][j]);
      const float32x4_t e_fft_im = vld1q_f32(&e_fft[1][j]);
      // Calculate the product of conjugate(x_fft_buf) by e_fft.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
      //   im(conjugate(a) * b) = aRe * bIm - aIm * bRe
      const float32x4_t a = vmulq_f32(x_fft_buf_re, e_fft_re);
      const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im);
      const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re);
      // Interleave real and imaginary parts.
      const float32x4x2_t g_n_h = vzipq_f32(e, f);
      // Store
      vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
      vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
    }
    // ... and fixup the first imaginary entry.
    fft[1] =
        MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN],
              e_fft[0][PART_LEN], e_fft[1][PART_LEN]);
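    // (In the rdft packing, fft[1] carries the real part of the Nyquist bin,
    // so that bin is handled by the scalar fixup above.)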

    aec_rdft_inverse_128(fft);
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
    {
      const float scale = 2.0f / PART_LEN2;
      const float32x4_t scale_ps = vmovq_n_f32(scale);
      for (j = 0; j < PART_LEN; j += 4) {
        const float32x4_t fft_ps = vld1q_f32(&fft[j]);
        const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
        vst1q_f32(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

    {
      const float wt1 = h_fft_buf[1][pos];
      h_fft_buf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j += 4) {
        float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
        float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
        const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
        const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
        const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
        wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
        wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);

        vst1q_f32(&h_fft_buf[0][pos + j], wtBuf_re);
        vst1q_f32(&h_fft_buf[1][pos + j], wtBuf_im);
      }
      h_fft_buf[1][pos] = wt1;
    }
  }
}

static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  // a^b = exp2(b * log2(a))
  // exp2(x) and log2(x) are calculated using polynomial approximations.
  float32x4_t log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated; this code uses an
    //     order five polynomial approximation. The coefficients have been
    //     estimated with the Remez algorithm and the resulting polynomial
    //     has a maximum relative error of 0.00086%.

    // Compute n.
    // This is done by masking the exponent, shifting it into the top bit of
    // the mantissa, putting eight into the biased exponent (to compensate
    // for the fact that the exponent has been shifted into the top of the
    // fractional part) and finally getting rid of the implicit leading one
    // from the mantissa by subtracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
    const uint32x4_t two_n =
        vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));
    // Compute y.
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa =
        vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask);
    const float32x4_t y = vreinterpretq_f32_u32(
        vorrq_u32(mantissa, vec_zero_biased_exponent_is_one));
    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //   pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
    pol5_y = vmlaq_f32(C3, y, pol5_y);
    pol5_y = vmlaq_f32(C2, y, pol5_y);
    pol5_y = vmlaq_f32(C1, y, pol5_y);
    pol5_y = vmlaq_f32(C0, y, pol5_y);
    const float32x4_t y_minus_one =
        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = vaddq_f32(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = vmulq_f32(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated; this code uses an order
    //     two polynomial approximation. The coefficients have been estimated
    //     with the Remez algorithm and the resulting polynomial has a
    //     maximum relative error of 0.17%.
    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);
    // Compute n.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n =
        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }

  return a_exp_b;
}
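
// For reference, the factoring above in scalar form (an illustrative sketch
// that uses libm instead of the polynomial approximations):
//
//   float pow_ref(float a, float b) { return exp2f(b * log2f(a)); }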

static void OverdriveAndSuppressNEON(AecCore* aec,
                                     float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
  const float32x4_t vec_one = vdupq_n_f32(1.0f);
  const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
  const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);

  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands
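    // Branchless form of the scalar branch below:
    //   if (hNl > hNlFb)
    //     hNl = weightCurve * hNlFb + (1 - weightCurve) * hNl;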
    float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
    const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
    const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
    const float32x4_t vec_weightCurve_hNlFb =
        vmulq_f32(vec_weightCurve, vec_hNlFb);
    const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
    const float32x4_t vec_one_weightCurve_hNl =
        vmulq_f32(vec_one_weightCurve, vec_hNl);
    const uint32x4_t vec_if0 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(vec_hNl));
    const float32x4_t vec_one_weightCurve_add =
        vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
    const uint32x4_t vec_if1 =
        vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));

    vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));

    {
      const float32x4_t vec_overDriveCurve =
          vld1q_f32(&WebRtcAec_overDriveCurve[i]);
      const float32x4_t vec_overDriveSm_overDriveCurve =
          vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
      vst1q_f32(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
      float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
      vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
      vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
      vst1q_f32(&efw[0][i], vec_efw_re);
      vst1q_f32(&efw[1][i], vec_efw_im);
    }
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }

    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}

// Measures the energy in each filter partition and returns the partition
// with the highest energy.
// TODO(bjornv): Spread computational cost by computing one partition per
// block?
static int PartitionDelayNEON(const AecCore* aec) {
  float wfEnMax = 0;
  int i;
  int delay = 0;

  for (i = 0; i < aec->num_partitions; i++) {
    int j;
    int pos = i * PART_LEN1;
    float wfEn = 0;
    float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
      const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
    }
    {
      // Horizontal add of the four partial sums.
      float32x2_t vec_total;
      // A B C D
      vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
      // A+B C+D
      vec_total = vpadd_f32(vec_total, vec_total);
      // A+B+C+D A+B+C+D
      wfEn = vget_lane_f32(vec_total, 0);
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
              aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
    }

    if (wfEn > wfEnMax) {
      wfEnMax = wfEn;
      delay = i;
    }
  }
  return delay;
}

// Updates the following smoothed Power Spectral Densities (PSD):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, the filter divergence state is
// determined, and actions are taken based on it.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh =
      aec->extended_filter_enabled
          ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
          : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

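  // Each smoothed estimate is updated as s = ptrGCoh[0] * s + ptrGCoh[1] * p,
  // where p is the current (cross-)power of the bin.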
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);

    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);

    vst1q_f32(&aec->sd[i], vec_sd);
    vst1q_f32(&aec->se[i], vec_se);
    vst1q_f32(&aec->sx[i], vec_sx);

    {
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sde[i][0], vec_sde);
    }

    {
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }
  {
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total =
        vpadd_f32(vget_low_f32(vec_sdSum), vget_high_f32(vec_sdSum));
    vec_seSum_total =
        vpadd_f32(vget_low_f32(vec_seSum), vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] = ptrGCoh[0] * aec->sx[i] +
                 ptrGCoh[1] * WEBRTC_SPL_MAX(
                                  xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
                                  WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] =
        ptrGCoh[0] * aec->sde[i][0] +
        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] =
        ptrGCoh[0] * aec->sde[i][1] +
        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] =
        ptrGCoh[0] * aec->sxd[i][0] +
        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] =
        ptrGCoh[0] * aec->sxd[i][1] +
        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}

// Window time domain data to be used by the fft.
static void WindowDataNEON(float* x_windowed, const float* x) {
  int i;
  for (i = 0; i < PART_LEN; i += 4) {
    const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
    const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);
    const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);
    // A B C D
    float32x4_t vec_sqrtHanning_rev =
        vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
    // B A D C
    vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
    // D C B A
    vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
                                       vget_low_f32(vec_sqrtHanning_rev));
    vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
    vst1q_f32(&x_windowed[PART_LEN + i],
              vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
  }
}

// Puts fft output data into a complex valued array.
static void StoreAsComplexNEON(const float* data,
                               float data_complex[2][PART_LEN1]) {
  int i;
  for (i = 0; i < PART_LEN; i += 4) {
    const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]);
    vst1q_f32(&data_complex[0][i], vec_data.val[0]);
    vst1q_f32(&data_complex[1][i], vec_data.val[1]);
  }
  // Fix beginning/end values: the rdft packs the real part of the Nyquist
  // bin into data[1], and the DC and Nyquist bins have no imaginary part.
  data_complex[1][0] = 0;
  data_complex[1][PART_LEN] = 0;
  data_complex[0][0] = data[0];
  data_complex[0][PART_LEN] = data[1];
}

static void SubbandCoherenceNEON(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float dfw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  int i;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  {
    const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f);

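    // Per-bin magnitude-squared coherence:
    //   cohde = |sde|^2 / (sd * se + 1e-10f)
    //   cohxd = |sxd|^2 / (sd * sx + 1e-10f)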
    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
      const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
      const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
      const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
      const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
      float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
      vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
      vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
      vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
      vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);

      vst1q_f32(&cohde[i], vec_cohde);
      vst1q_f32(&cohxd[i], vec_cohxd);
    }
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    cohde[i] =
        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
        (aec->sd[i] * aec->se[i] + 1e-10f);
    cohxd[i] =
        (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
        (aec->sx[i] * aec->sd[i] + 1e-10f);
  }
}

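// Installs the NEON implementations of the speed-critical AEC functions
// into the shared function pointers.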
void WebRtcAec_InitAec_neon(void) {
  WebRtcAec_FilterFar = FilterFarNEON;
  WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON;
  WebRtcAec_FilterAdaptation = FilterAdaptationNEON;
  WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
  WebRtcAec_SubbandCoherence = SubbandCoherenceNEON;
  WebRtcAec_StoreAsComplex = StoreAsComplexNEON;
  WebRtcAec_PartitionDelay = PartitionDelayNEON;
  WebRtcAec_WindowData = WindowDataNEON;
}