| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license | |
| 5 * that can be found in the LICENSE file in the root of the source | |
| 6 * tree. An additional intellectual property rights grant can be found | |
| 7 * in the file PATENTS. All contributing project authors may | |
| 8 * be found in the AUTHORS file in the root of the source tree. | |
| 9 */ | |
| 10 | |
| 11 /* | |
| 12 * The core AEC algorithm, neon version of speed-critical functions. | |
| 13 * | |
| 14 * Based on aec_core_sse2.c. | |
| 15 */ | |
| 16 | |
| 17 #include <arm_neon.h> | |
| 18 #include <math.h> | |
| 19 #include <string.h> // memset | |
| 20 | |
| 21 #include "webrtc/common_audio/signal_processing/include/signal_processing_librar
y.h" | |
| 22 #include "webrtc/modules/audio_processing/aec/aec_common.h" | |
| 23 #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" | |
| 24 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | |
| 25 | |
// Shift amount that moves the IEEE-754 exponent field into the top of the
// mantissa; used by the log2 approximation in vpowq_f32.
enum { kShiftExponentIntoTopMantissa = 8 };
// Bit position of the exponent field in an IEEE-754 single-precision float.
enum { kFloatExponentShift = 23 };
| 28 | |
// Real part of the complex product (aRe + i*aIm) * (bRe + i*bIm).
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
  const float real_product = aRe * bRe;
  const float imag_product = aIm * bIm;
  return real_product - imag_product;
}
| 32 | |
// Imaginary part of the complex product (aRe + i*aIm) * (bRe + i*bIm).
__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
  const float cross_a = aRe * bIm;
  const float cross_b = aIm * bRe;
  return cross_a + cross_b;
}
| 36 | |
// Accumulates into |y_fft| the per-bin complex products of the buffered
// far-end spectra |x_fft_buf| and the adaptive filter partitions |h_fft_buf|
// (the filtered far-end estimate). |x_fft_buf| is a circular buffer of
// |num_partitions| spectra read starting at |x_fft_buf_block_pos|.
// In all spectra, index [0] holds real parts and [1] imaginary parts.
static void FilterFarNEON(int num_partitions,
                          int x_fft_buf_block_pos,
                          float x_fft_buf[2]
                                         [kExtendedNumPartitions * PART_LEN1],
                          float h_fft_buf[2]
                                         [kExtendedNumPartitions * PART_LEN1],
                          float y_fft[2][PART_LEN1]) {
  int i;
  for (i = 0; i < num_partitions; i++) {
    int j;
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
      const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
      const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]);
      const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]);
      // e = re(x * h), f = im(x * h), via multiply-accumulate/subtract.
      const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re);
      const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im);
      const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re);
      // Accumulate into the output spectrum.
      const float32x4_t g = vaddq_f32(y_fft_re, e);
      const float32x4_t h = vaddq_f32(y_fft_im, f);
      vst1q_f32(&y_fft[0][j], g);
      vst1q_f32(&y_fft[1][j], h);
    }
    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
      y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
    }
  }
}
| 80 | |
| 81 // ARM64's arm_neon.h has already defined vdivq_f32 vsqrtq_f32. | |
| 82 #if !defined(WEBRTC_ARCH_ARM64) | |
// Element-wise a / b computed as a * (1/b), where 1/b comes from the NEON
// reciprocal estimate refined with two Newton-Raphson steps. Only compiled
// where the toolchain does not provide vdivq_f32 natively (see #if above).
// NOTE: b == 0 is not special-cased here; behavior follows vrecpeq_f32.
static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) {
  int i;
  float32x4_t x = vrecpeq_f32(b);
  // from arm documentation
  // The Newton-Raphson iteration:
  //   x[n+1] = x[n] * (2 - d * x[n])
  // converges to (1/d) if x0 is the result of VRECPE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrecpsq_f32(b, x), x);
  }
  // a/b = a*(1/b)
  return vmulq_f32(a, x);
}
| 98 | |
// Element-wise square root of |s| computed as s * (1/sqrt(s)), where the
// reciprocal square root comes from vrsqrteq_f32 refined with two
// Newton-Raphson steps. sqrt(0) is handled explicitly: the estimate for a
// zero input is +infinity, which is masked to zero below so the final
// multiply yields 0 instead of NaN (0 * inf).
// (Fix: removed a stray empty statement after the return.)
static float32x4_t vsqrtq_f32(float32x4_t s) {
  int i;
  float32x4_t x = vrsqrteq_f32(s);

  // Code to handle sqrt(0).
  // If the input to sqrtf() is zero, a zero will be returned.
  // If the input to vrsqrteq_f32() is zero, positive infinity is returned.
  const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
  // check for divide by zero
  const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
  // zero out the positive infinity results
  x = vreinterpretq_f32_u32(
      vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(x)));
  // from arm documentation
  // The Newton-Raphson iteration:
  //   x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2)
  // converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
  }
  // sqrt(s) = s * 1/sqrt(s)
  return vmulq_f32(s, x);
}
| 125 #endif // WEBRTC_ARCH_ARM64 | |
| 126 | |
// Normalizes the error spectrum |ef| by the far-end power |x_pow|, limits
// its per-bin magnitude to the error threshold, and scales it by the step
// size |mu|. In extended-filter mode the kExtended* constants override the
// normal_* arguments. |ef| is stored as [0] = real, [1] = imaginary.
static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled
                                    ? kExtendedErrorThreshold
                                    : normal_error_threshold;
  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
  const float32x4_t kMu = vmovq_n_f32(mu);
  const float32x4_t kThresh = vmovq_n_f32(error_threshold);
  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]);
    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
    // Normalize by far-end power; 1e-10 avoids division by zero.
    const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f);
    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
    // |ef| = sqrt(re^2 + im^2).
    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
    const float32x4_t absEf = vsqrtq_f32(ef_sum2);
    // Branchless select: where |ef| > threshold, rescale ef by
    // threshold / (|ef| + 1e-10); elsewhere keep ef unchanged.
    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
    uint32x4_t ef_re_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re));
    uint32x4_t ef_im_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im));
    ef_re_if = vandq_u32(bigger, ef_re_if);
    ef_im_if = vandq_u32(bigger, ef_im_if);
    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
    // Apply the step size.
    ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
    vst1q_f32(&ef[0][i], ef_re);
    vst1q_f32(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    float abs_ef;
    ef[0][i] /= (x_pow[i] + 1e-10f);
    ef[1][i] /= (x_pow[i] + 1e-10f);
    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (abs_ef > error_threshold) {
      abs_ef = error_threshold / (abs_ef + 1e-10f);
      ef[0][i] *= abs_ef;
      ef[1][i] *= abs_ef;
    }

    // Stepsize factor
    ef[0][i] *= mu;
    ef[1][i] *= mu;
  }
}
| 187 | |
// NLMS filter adaptation: correlates the buffered far-end spectra with the
// error spectrum |e_fft|, applies the gradient constraint (zeroing the
// second half of the time-domain impulse response between an inverse and a
// forward FFT), and accumulates the result into the filter |h_fft_buf|.
static void FilterAdaptationNEON(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float e_fft[2][PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
  float fft[PART_LEN2];
  int i;
  for (i = 0; i < num_partitions; i++) {
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    int j;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // Process the whole array...
    for (j = 0; j < PART_LEN; j += 4) {
      // Load x_fft_buf and e_fft.
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t e_fft_re = vld1q_f32(&e_fft[0][j]);
      const float32x4_t e_fft_im = vld1q_f32(&e_fft[1][j]);
      // Calculate the product of conjugate(x_fft_buf) by e_fft.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
      //   im(conjugate(a) * b) = aRe * bIm - aIm * bRe
      const float32x4_t a = vmulq_f32(x_fft_buf_re, e_fft_re);
      const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im);
      const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re);
      // Interleave real and imaginary parts.
      const float32x4x2_t g_n_h = vzipq_f32(e, f);
      // Store
      vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
      vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
    }
    // ... and fixup the first imaginary entry. In the packed rdft format,
    // fft[1] carries the (real-valued) PART_LEN bin.
    fft[1] =
        MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN],
              e_fft[0][PART_LEN], e_fft[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    // Gradient constraint: zero the second half of the impulse response.
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
    {
      const float scale = 2.0f / PART_LEN2;
      const float32x4_t scale_ps = vmovq_n_f32(scale);
      for (j = 0; j < PART_LEN; j += 4) {
        const float32x4_t fft_ps = vld1q_f32(&fft[j]);
        const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
        vst1q_f32(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

    {
      // The vector loop below also adds fft[1] (the packed PART_LEN bin)
      // into h_fft_buf[1][pos]; save that slot and restore it afterwards,
      // since fft[1] belongs to h_fft_buf[0][pos + PART_LEN] instead.
      const float wt1 = h_fft_buf[1][pos];
      h_fft_buf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j += 4) {
        float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
        float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
        const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
        const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
        // De-interleave back into separate real/imaginary planes.
        const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
        wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
        wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);

        vst1q_f32(&h_fft_buf[0][pos + j], wtBuf_re);
        vst1q_f32(&h_fft_buf[1][pos + j], wtBuf_im);
      }
      h_fft_buf[1][pos] = wt1;
    }
  }
}
| 264 | |
// Element-wise approximation of a^b, computed as exp2(b * log2(a)) with
// polynomial approximations for log2 and exp2 (details below).
// NOTE(review): the exponent/mantissa bit manipulation assumes |a| holds
// positive, normal floats; zero, negative or denormal inputs are not
// handled — confirm against callers (here: hNl values in [0, 1]).
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  // a^b = exp2(b * log2(a))
  // exp2(x) and log2(x) are calculated using polynomial approximations.
  float32x4_t log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an
    //             order five polynomial approximation. The coefficients have
    //             been estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking the exponent, shifting it into the top bit
    //    of the mantissa, putting eight into the biased exponent (to shift/
    //    compensate the fact that the exponent has been shifted in the top/
    //    fractional part and finally getting rid of the implicit leading one
    //    from the mantissa by substracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
    const uint32x4_t two_n =
        vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));
    // Compute y: keep the mantissa and force the exponent to zero (biased),
    // yielding a float in [1.0, 2.0).
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa =
        vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask);
    const float32x4_t y = vreinterpretq_f32_u32(
        vorrq_u32(mantissa, vec_zero_biased_exponent_is_one));
    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    // evaluated below in Horner form.
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
    pol5_y = vmlaq_f32(C3, y, pol5_y);
    pol5_y = vmlaq_f32(C2, y, pol5_y);
    pol5_y = vmlaq_f32(C1, y, pol5_y);
    pol5_y = vmlaq_f32(C0, y, pol5_y);
    const float32x4_t y_minus_one =
        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = vaddq_f32(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = vmulq_f32(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order
    //         two polynomial approximation. The coefficients have been
    //         estimated with the Remez algorithm and the resulting
    //         polynomial has a maximum relative error of 0.17%.
    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);
    // Compute n.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n by building its bit pattern: biased exponent shifted into
    // the exponent field, zero mantissa.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n =
        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0 (Horner form).
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }

  return a_exp_b;
}
| 378 | |
// Applies overdrive to the suppression gains |hNl| and suppresses the error
// spectrum |efw| with them. Per bin: gains above the feedback level |hNlFb|
// are pulled toward it via the weight curve, then raised to the power
// overDriveSm * overDriveCurve, and finally multiplied into efw. The sign of
// the imaginary part is flipped (see the Ooura note below).
static void OverdriveAndSuppressNEON(AecCore* aec,
                                     float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
  const float32x4_t vec_one = vdupq_n_f32(1.0f);
  const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
  const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);

  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands: branchless select between the original gain and
    // weightCurve * hNlFb + (1 - weightCurve) * hNl, taken where
    // hNl > hNlFb.
    float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
    const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
    const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
    const float32x4_t vec_weightCurve_hNlFb =
        vmulq_f32(vec_weightCurve, vec_hNlFb);
    const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
    const float32x4_t vec_one_weightCurve_hNl =
        vmulq_f32(vec_one_weightCurve, vec_hNl);
    const uint32x4_t vec_if0 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(vec_hNl));
    const float32x4_t vec_one_weightCurve_add =
        vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
    const uint32x4_t vec_if1 =
        vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));

    vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));

    // Overdrive: hNl = hNl ^ (overDriveSm * overDriveCurve).
    {
      const float32x4_t vec_overDriveCurve =
          vld1q_f32(&WebRtcAec_overDriveCurve[i]);
      const float32x4_t vec_overDriveSm_overDriveCurve =
          vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
      vst1q_f32(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
      float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
      vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
      vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
      vst1q_f32(&efw[0][i], vec_efw_re);
      vst1q_f32(&efw[1][i], vec_efw_im);
    }
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }

    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}
| 452 | |
// Returns the index of the adaptive-filter partition in aec->wfBuf with the
// highest energy (sum of |wfBuf|^2 over the partition's bins). Used as an
// estimate of the echo path delay in partitions.
static int PartitionDelayNEON(const AecCore* aec) {
  // Measures the energy in each filter partition and returns the partition
  // with highest energy.
  // TODO(bjornv): Spread computational cost by computing one partition per
  // block?
  float wfEnMax = 0;
  int i;
  int delay = 0;

  for (i = 0; i < aec->num_partitions; i++) {
    int j;
    int pos = i * PART_LEN1;
    float wfEn = 0;
    float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
      const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
    }
    {
      // Horizontal sum of the four accumulator lanes.
      float32x2_t vec_total;
      // A B C D
      vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
      // A+B C+D
      vec_total = vpadd_f32(vec_total, vec_total);
      // A+B+C+D A+B+C+D
      wfEn = vget_lane_f32(vec_total, 0);
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
              aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
    }

    if (wfEn > wfEnMax) {
      wfEnMax = wfEn;
      delay = i;
    }
  }
  return delay;
}
| 497 | |
// Updates the following smoothed Power Spectral Densities (PSD):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, also the filter diverge state is
// determined upon actions are taken.
//
// Smoothing is a first-order IIR per bin: new = G0 * old + G1 * instant,
// with (G0, G1) = ptrGCoh. Inputs dfw/efw/xfw are stored as [0] = real,
// [1] = imaginary.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh =
      aec->extended_filter_enabled
          ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
          : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  // Lower bound applied to the instantaneous far-end power (see comment in
  // the scalar loop below).
  const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
    // Instantaneous powers |dfw|^2, |efw|^2, |xfw|^2.
    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);

    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);

    vst1q_f32(&aec->sd[i], vec_sd);
    vst1q_f32(&aec->se[i], vec_se);
    vst1q_f32(&aec->sx[i], vec_sx);

    {
      // Cross-PSD near-end/residual echo: re = d·e* real part,
      // im = d·e* imaginary part (sde is stored interleaved re/im).
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sde[i][0], vec_sde);
    }

    {
      // Cross-PSD near-end/far-end, same layout as sde above.
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }
  {
    // Horizontal sums of the vector accumulators.
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total =
        vpadd_f32(vget_low_f32(vec_sdSum), vget_high_f32(vec_sdSum));
    vec_seSum_total =
        vpadd_f32(vget_low_f32(vec_seSum), vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] = ptrGCoh[0] * aec->sx[i] +
                 ptrGCoh[1] * WEBRTC_SPL_MAX(
                                  xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
                                  WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] =
        ptrGCoh[0] * aec->sde[i][0] +
        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] =
        ptrGCoh[0] * aec->sde[i][1] +
        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] =
        ptrGCoh[0] * aec->sxd[i][0] +
        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] =
        ptrGCoh[0] * aec->sxd[i][1] +
        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}
| 634 | |
// Window time domain data to be used by the fft.
// Applies the square-root Hanning window to a 2*PART_LEN block: the first
// half is multiplied by the window read forward, the second half by the
// window read in reverse (loaded four at a time and lane-reversed).
static void WindowDataNEON(float* x_windowed, const float* x) {
  int i;
  for (i = 0; i < PART_LEN; i += 4) {
    const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
    const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);
    const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);
    // A B C D
    float32x4_t vec_sqrtHanning_rev =
        vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
    // B A D C
    vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
    // D C B A
    vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
                                       vget_low_f32(vec_sqrtHanning_rev));
    vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
    vst1q_f32(&x_windowed[PART_LEN + i],
              vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
  }
}
| 655 | |
// Puts fft output data into a complex valued array.
// De-interleaves the packed rdft output |data| (re/im pairs) into separate
// real/imaginary planes, then fixes up the packed DC and Nyquist bins:
// data[0] is the DC real part and data[1] the PART_LEN real part; both have
// zero imaginary parts.
static void StoreAsComplexNEON(const float* data,
                               float data_complex[2][PART_LEN1]) {
  int i;
  for (i = 0; i < PART_LEN; i += 4) {
    const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]);
    vst1q_f32(&data_complex[0][i], vec_data.val[0]);
    vst1q_f32(&data_complex[1][i], vec_data.val[1]);
  }
  // fix beginning/end values
  data_complex[1][0] = 0;
  data_complex[1][PART_LEN] = 0;
  data_complex[0][0] = data[0];
  data_complex[0][PART_LEN] = data[1];
}
| 671 | |
// Computes per-bin coherence between near-end and residual echo (|cohde|)
// and between near-end and far-end (|cohxd|) after updating the smoothed
// PSDs via SmoothedPSD(): coh = |cross-PSD|^2 / (auto-PSD product + 1e-10).
// |fft| is unused by this implementation (presumably kept to match the
// WebRtcAec_SubbandCoherence function pointer signature — confirm).
static void SubbandCoherenceNEON(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float dfw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  int i;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  {
    const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f);

    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
      const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
      const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
      const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
      const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
      float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
      vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
      vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
      vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
      vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);

      vst1q_f32(&cohde[i], vec_cohde);
      vst1q_f32(&cohxd[i], vec_cohxd);
    }
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    cohde[i] =
        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
        (aec->sd[i] * aec->se[i] + 1e-10f);
    cohxd[i] =
        (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
        (aec->sx[i] * aec->sd[i] + 1e-10f);
  }
}
| 717 | |
| 718 void WebRtcAec_InitAec_neon(void) { | |
| 719 WebRtcAec_FilterFar = FilterFarNEON; | |
| 720 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON; | |
| 721 WebRtcAec_FilterAdaptation = FilterAdaptationNEON; | |
| 722 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON; | |
| 723 WebRtcAec_SubbandCoherence = SubbandCoherenceNEON; | |
| 724 WebRtcAec_StoreAsComplex = StoreAsComplexNEON; | |
| 725 WebRtcAec_PartitionDelay = PartitionDelayNEON; | |
| 726 WebRtcAec_WindowData = WindowDataNEON; | |
| 727 } | |
| OLD | NEW |