| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 16 matching lines...) |
| 27 enum { kFloatExponentShift = 23 }; | 27 enum { kFloatExponentShift = 23 }; |
| 28 | 28 |
| 29 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { | 29 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { |
| 30 return aRe * bRe - aIm * bIm; | 30 return aRe * bRe - aIm * bIm; |
| 31 } | 31 } |
| 32 | 32 |
| 33 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { | 33 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { |
| 34 return aRe * bIm + aIm * bRe; | 34 return aRe * bIm + aIm * bRe; |
| 35 } | 35 } |
| 36 | 36 |
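
MulRe and MulIm above are the two halves of a complex product: (aRe + i*aIm)(bRe + i*bIm) = (aRe*bRe - aIm*bIm) + i*(aRe*bIm + aIm*bRe). A quick scalar check of that identity (illustrative only, not part of this change):

```c
#include <stdio.h>

int main(void) {
  /* (1 + 2i) * (3 + 4i) = -5 + 10i */
  float re = 1.0f * 3.0f - 2.0f * 4.0f; /* MulRe(1, 2, 3, 4) */
  float im = 1.0f * 4.0f + 2.0f * 3.0f; /* MulIm(1, 2, 3, 4) */
  printf("%.1f %+.1fi\n", re, im);      /* prints -5.0 +10.0i */
  return 0;
}
```
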
| 37 static void FilterFarNEON( | 37 static void FilterFarNEON(int num_partitions, |
| 38 int num_partitions, | 38 int x_fft_buf_block_pos, |
| 39 int x_fft_buf_block_pos, | 39 float x_fft_buf[2] |
| 40 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], | 40 [kExtendedNumPartitions * PART_LEN1], |
| 41 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], | 41 float h_fft_buf[2] |
| 42 float y_fft[2][PART_LEN1]) { | 42 [kExtendedNumPartitions * PART_LEN1], |
| | 43 float y_fft[2][PART_LEN1]) { |
| 43 int i; | 44 int i; |
| 44 for (i = 0; i < num_partitions; i++) { | 45 for (i = 0; i < num_partitions; i++) { |
| 45 int j; | 46 int j; |
| 46 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; | 47 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; |
| 47 int pos = i * PART_LEN1; | 48 int pos = i * PART_LEN1; |
| 48 // Check for wrap | 49 // Check for wrap |
| 49 if (i + x_fft_buf_block_pos >= num_partitions) { | 50 if (i + x_fft_buf_block_pos >= num_partitions) { |
| 50 xPos -= num_partitions * PART_LEN1; | 51 xPos -= num_partitions * PART_LEN1; |
| 51 } | 52 } |
| 52 | 53 |
| 53 // vectorized code (four at once) | 54 // vectorized code (four at once) |
| 54 for (j = 0; j + 3 < PART_LEN1; j += 4) { | 55 for (j = 0; j + 3 < PART_LEN1; j += 4) { |
| 55 const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]); | 56 const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]); |
| 56 const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]); | 57 const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]); |
| 57 const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]); | 58 const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]); |
| 58 const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]); | 59 const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]); |
| 59 const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]); | 60 const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]); |
| 60 const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]); | 61 const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]); |
| 61 const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re); | 62 const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re); |
| 62 const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im); | 63 const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im); |
| 63 const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im); | 64 const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im); |
| 64 const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re); | 65 const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re); |
| 65 const float32x4_t g = vaddq_f32(y_fft_re, e); | 66 const float32x4_t g = vaddq_f32(y_fft_re, e); |
| 66 const float32x4_t h = vaddq_f32(y_fft_im, f); | 67 const float32x4_t h = vaddq_f32(y_fft_im, f); |
| 67 vst1q_f32(&y_fft[0][j], g); | 68 vst1q_f32(&y_fft[0][j], g); |
| 68 vst1q_f32(&y_fft[1][j], h); | 69 vst1q_f32(&y_fft[1][j], h); |
| 69 } | 70 } |
| 70 // scalar code for the remaining items. | 71 // scalar code for the remaining items. |
| 71 for (; j < PART_LEN1; j++) { | 72 for (; j < PART_LEN1; j++) { |
| 72 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], | 73 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], |
| 73 x_fft_buf[1][xPos + j], | 74 h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); |
| 74 h_fft_buf[0][pos + j], | 75 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], |
| 75 h_fft_buf[1][pos + j]); | 76 h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); |
| 76 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], | |
| 77 x_fft_buf[1][xPos + j], | |
| 78 h_fft_buf[0][pos + j], | |
| 79 h_fft_buf[1][pos + j]); | |
| 80 } | 77 } |
| 81 } | 78 } |
| 82 } | 79 } |
| 83 | 80 |
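
The vector loop in FilterFarNEON maps MulRe/MulIm onto NEON fused multiply-add/subtract: vmlaq_f32(acc, x, y) computes acc + x*y and vmlsq_f32(acc, x, y) computes acc - x*y. A minimal sketch of four complex multiplies at once under those semantics (hypothetical helper; requires an ARM target and <arm_neon.h>):

```c
#include <arm_neon.h>

/* out_re = a_re*b_re - a_im*b_im, out_im = a_re*b_im + a_im*b_re,
 * four lanes at a time, mirroring the a/e/c/f temporaries above. */
static void ComplexMul4(float32x4_t a_re, float32x4_t a_im,
                        float32x4_t b_re, float32x4_t b_im,
                        float32x4_t* out_re, float32x4_t* out_im) {
  *out_re = vmlsq_f32(vmulq_f32(a_re, b_re), a_im, b_im);
  *out_im = vmlaq_f32(vmulq_f32(a_re, b_im), a_im, b_re);
}
```
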
| 84 // ARM64's arm_neon.h already defines vdivq_f32 and vsqrtq_f32. | 81 // ARM64's arm_neon.h already defines vdivq_f32 and vsqrtq_f32. |
| 85 #if !defined (WEBRTC_ARCH_ARM64) | 82 #if !defined(WEBRTC_ARCH_ARM64) |
| 86 static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) { | 83 static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) { |
| 87 int i; | 84 int i; |
| 88 float32x4_t x = vrecpeq_f32(b); | 85 float32x4_t x = vrecpeq_f32(b); |
| 89 // from arm documentation | 86 // from arm documentation |
| 90 // The Newton-Raphson iteration: | 87 // The Newton-Raphson iteration: |
| 91 // x[n+1] = x[n] * (2 - d * x[n]) | 88 // x[n+1] = x[n] * (2 - d * x[n]) |
| 92 // converges to (1/d) if x0 is the result of VRECPE applied to d. | 89 // converges to (1/d) if x0 is the result of VRECPE applied to d. |
| 93 // | 90 // |
| 94 // Note: The precision did not improve after 2 iterations. | 91 // Note: The precision did not improve after 2 iterations. |
| 95 for (i = 0; i < 2; i++) { | 92 for (i = 0; i < 2; i++) { |
| 96 x = vmulq_f32(vrecpsq_f32(b, x), x); | 93 x = vmulq_f32(vrecpsq_f32(b, x), x); |
| 97 } | 94 } |
| 98 // a/b = a*(1/b) | 95 // a/b = a*(1/b) |
| 99 return vmulq_f32(a, x); | 96 return vmulq_f32(a, x); |
| 100 } | 97 } |
| 101 | 98 |
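
Per the ARM documentation, vrecpeq_f32 yields a coarse (roughly 8-bit) estimate of 1/b, and vrecpsq_f32(b, x) returns 2 - b*x, so each x = x * vrecps(b, x) is one Newton-Raphson step whose error roughly squares. A scalar model of the two iterations above (illustrative only; the 0.33 seed stands in for the VRECPE estimate):

```c
#include <stdio.h>

int main(void) {
  const float d = 3.0f;
  float x = 0.33f;            /* stand-in for the coarse VRECPE estimate */
  for (int i = 0; i < 2; i++) {
    x = x * (2.0f - d * x);   /* one Newton-Raphson step toward 1/d */
  }
  printf("1/%g ~= %.9g (true %.9g)\n", d, x, 1.0f / d);
  return 0;
}
```
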
| 102 static float32x4_t vsqrtq_f32(float32x4_t s) { | 99 static float32x4_t vsqrtq_f32(float32x4_t s) { |
| 103 int i; | 100 int i; |
| 104 float32x4_t x = vrsqrteq_f32(s); | 101 float32x4_t x = vrsqrteq_f32(s); |
| 105 | 102 |
| 106 // Code to handle sqrt(0). | 103 // Code to handle sqrt(0). |
| 107 // If the input to sqrtf() is zero, a zero will be returned. | 104 // If the input to sqrtf() is zero, a zero will be returned. |
| 108 // If the input to vrsqrteq_f32() is zero, positive infinity is returned. | 105 // If the input to vrsqrteq_f32() is zero, positive infinity is returned. |
| 109 const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000); | 106 const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000); |
| 110 // check for divide by zero | 107 // check for divide by zero |
| 111 const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x)); | 108 const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x)); |
| 112 // zero out the positive infinity results | 109 // zero out the positive infinity results |
| 113 x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero), | 110 x = vreinterpretq_f32_u32( |
| 114 vreinterpretq_u32_f32(x))); | 111 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(x))); |
| 115 // from arm documentation | 112 // from arm documentation |
| 116 // The Newton-Raphson iteration: | 113 // The Newton-Raphson iteration: |
| 117 // x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2 | 114 // x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2 |
| 118 // converges to (1/√d) if x0 is the result of VRSQRTE applied to d. | 115 // converges to (1/√d) if x0 is the result of VRSQRTE applied to d. |
| 119 // | 116 // |
| 120 // Note: The precision did not improve after 2 iterations. | 117 // Note: The precision did not improve after 2 iterations. |
| 121 for (i = 0; i < 2; i++) { | 118 for (i = 0; i < 2; i++) { |
| 122 x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x); | 119 x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x); |
| 123 } | 120 } |
| 124 // sqrt(s) = s * 1/sqrt(s) | 121 // sqrt(s) = s * 1/sqrt(s) |
| 125 return vmulq_f32(s, x);; | 122 return vmulq_f32(s, x); |
| | 123 ; |
| 126 } | 124 } |
| 127 #endif // WEBRTC_ARCH_ARM64 | 125 #endif // WEBRTC_ARCH_ARM64 |
| 128 | 126 |
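
Likewise, vrsqrteq_f32 seeds an estimate of 1/sqrt(s), and vrsqrtsq_f32(a, b) returns (3 - a*b) / 2, so x = x * vrsqrts(x*x, s) is one Newton-Raphson step for the reciprocal square root; the final multiply by s turns it into sqrt(s). A scalar model (zero masking omitted; the NEON version above clears the +inf estimate that vrsqrteq_f32 produces for s == 0):

```c
#include <stdio.h>

int main(void) {
  const float s = 2.0f;
  float x = 0.7f;                          /* stand-in for VRSQRTE */
  for (int i = 0; i < 2; i++) {
    x = x * (3.0f - s * x * x) * 0.5f;     /* step toward 1/sqrt(s) */
  }
  printf("sqrt(%g) ~= %.9g\n", s, s * x);  /* ~1.41421354 */
  return 0;
}
```
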
| 129 static void ScaleErrorSignalNEON(int extended_filter_enabled, | 127 static void ScaleErrorSignalNEON(int extended_filter_enabled, |
| 130 float normal_mu, | 128 float normal_mu, |
| 131 float normal_error_threshold, | 129 float normal_error_threshold, |
| 132 float x_pow[PART_LEN1], | 130 float x_pow[PART_LEN1], |
| 133 float ef[2][PART_LEN1]) { | 131 float ef[2][PART_LEN1]) { |
| 134 const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; | 132 const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; |
| 135 const float error_threshold = extended_filter_enabled ? | 133 const float error_threshold = extended_filter_enabled |
| 136 kExtendedErrorThreshold : normal_error_threshold; | 134 ? kExtendedErrorThreshold |
| | 135 : normal_error_threshold; |
| 137 const float32x4_t k1e_10f = vdupq_n_f32(1e-10f); | 136 const float32x4_t k1e_10f = vdupq_n_f32(1e-10f); |
| 138 const float32x4_t kMu = vmovq_n_f32(mu); | 137 const float32x4_t kMu = vmovq_n_f32(mu); |
| 139 const float32x4_t kThresh = vmovq_n_f32(error_threshold); | 138 const float32x4_t kThresh = vmovq_n_f32(error_threshold); |
| 140 int i; | 139 int i; |
| 141 // vectorized code (four at once) | 140 // vectorized code (four at once) |
| 142 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 141 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
| 143 const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]); | 142 const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]); |
| 144 const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]); | 143 const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]); |
| 145 const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]); | 144 const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]); |
| 146 const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f); | 145 const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f); |
| 147 float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus); | 146 float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus); |
| 148 float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus); | 147 float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus); |
| 149 const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re); | 148 const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re); |
| 150 const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im); | 149 const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im); |
| 151 const float32x4_t absEf = vsqrtq_f32(ef_sum2); | 150 const float32x4_t absEf = vsqrtq_f32(ef_sum2); |
| 152 const uint32x4_t bigger = vcgtq_f32(absEf, kThresh); | 151 const uint32x4_t bigger = vcgtq_f32(absEf, kThresh); |
| 153 const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f); | 152 const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f); |
| 154 const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus); | 153 const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus); |
| 155 uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv)); | 154 uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv)); |
| 156 uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv)); | 155 uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv)); |
| 157 uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger), | 156 uint32x4_t ef_re_u32 = |
| 158 vreinterpretq_u32_f32(ef_re)); | 157 vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re)); |
| 159 uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger), | 158 uint32x4_t ef_im_u32 = |
| 160 vreinterpretq_u32_f32(ef_im)); | 159 vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im)); |
| 161 ef_re_if = vandq_u32(bigger, ef_re_if); | 160 ef_re_if = vandq_u32(bigger, ef_re_if); |
| 162 ef_im_if = vandq_u32(bigger, ef_im_if); | 161 ef_im_if = vandq_u32(bigger, ef_im_if); |
| 163 ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if); | 162 ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if); |
| 164 ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if); | 163 ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if); |
| 165 ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu); | 164 ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu); |
| 166 ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu); | 165 ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu); |
| 167 vst1q_f32(&ef[0][i], ef_re); | 166 vst1q_f32(&ef[0][i], ef_re); |
| 168 vst1q_f32(&ef[1][i], ef_im); | 167 vst1q_f32(&ef[1][i], ef_im); |
| 169 } | 168 } |
| 170 // scalar code for the remaining items. | 169 // scalar code for the remaining items. |
| (...skipping 46 matching lines...) |
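
The bigger/vmvnq/vandq/vorrq sequence in ScaleErrorSignalNEON above is the standard branchless select: the comparison yields an all-ones or all-zeros mask per lane, and result = (mask & a) | (~mask & b) keeps a where the condition held. A minimal sketch of the pattern (hypothetical helper):

```c
#include <arm_neon.h>

/* Per-lane select: a where mask is all-ones, b where it is all-zeros. */
static float32x4_t Select(uint32x4_t mask, float32x4_t a, float32x4_t b) {
  const uint32x4_t a_bits = vandq_u32(mask, vreinterpretq_u32_f32(a));
  const uint32x4_t b_bits =
      vandq_u32(vmvnq_u32(mask), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(vorrq_u32(a_bits, b_bits));
}
```

NEON also provides vbslq_f32(mask, a, b), which performs the same select in a single instruction.
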
| 217 const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im); | 216 const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im); |
| 218 const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im); | 217 const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im); |
| 219 const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re); | 218 const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re); |
| 220 // Interleave real and imaginary parts. | 219 // Interleave real and imaginary parts. |
| 221 const float32x4x2_t g_n_h = vzipq_f32(e, f); | 220 const float32x4x2_t g_n_h = vzipq_f32(e, f); |
| 222 // Store | 221 // Store |
| 223 vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]); | 222 vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]); |
| 224 vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]); | 223 vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]); |
| 225 } | 224 } |
| 226 // ... and fixup the first imaginary entry. | 225 // ... and fixup the first imaginary entry. |
| 227 fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], | 226 fft[1] = |
| 228 -x_fft_buf[1][xPos + PART_LEN], | 227 MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN], |
| 229 e_fft[0][PART_LEN], | 228 e_fft[0][PART_LEN], e_fft[1][PART_LEN]); |
| 230 e_fft[1][PART_LEN]); | |
| 231 | 229 |
| 232 aec_rdft_inverse_128(fft); | 230 aec_rdft_inverse_128(fft); |
| 233 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); | 231 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); |
| 234 | 232 |
| 235 // fft scaling | 233 // fft scaling |
| 236 { | 234 { |
| 237 const float scale = 2.0f / PART_LEN2; | 235 const float scale = 2.0f / PART_LEN2; |
| 238 const float32x4_t scale_ps = vmovq_n_f32(scale); | 236 const float32x4_t scale_ps = vmovq_n_f32(scale); |
| 239 for (j = 0; j < PART_LEN; j += 4) { | 237 for (j = 0; j < PART_LEN; j += 4) { |
| 240 const float32x4_t fft_ps = vld1q_f32(&fft[j]); | 238 const float32x4_t fft_ps = vld1q_f32(&fft[j]); |
| (...skipping 44 matching lines...) |
| 285 | 283 |
| 286 // Compute n. | 284 // Compute n. |
| 287 // This is done by masking the exponent, shifting it into the top bit of | 285 // This is done by masking the exponent, shifting it into the top bit of |
| 288 // the mantissa, putting eight into the biased exponent (to shift/ | 286 // the mantissa, putting eight into the biased exponent (to shift/ |
| 289 // compensate the fact that the exponent has been shifted in the top/ | 287 // compensate the fact that the exponent has been shifted in the top/ |
| 290 // fractional part) and finally getting rid of the implicit leading one | 288 // fractional part) and finally getting rid of the implicit leading one |
| 291 // from the mantissa by subtracting it out. | 289 // from the mantissa by subtracting it out. |
| 292 const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000); | 290 const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000); |
| 293 const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000); | 291 const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000); |
| 294 const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000); | 292 const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000); |
| 295 const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a), | 293 const uint32x4_t two_n = |
| 296 vec_float_exponent_mask); | 294 vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask); |
| 297 const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa); | 295 const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa); |
| 298 const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent); | 296 const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent); |
| 299 const float32x4_t n = | 297 const float32x4_t n = |
| 300 vsubq_f32(vreinterpretq_f32_u32(n_0), | 298 vsubq_f32(vreinterpretq_f32_u32(n_0), |
| 301 vreinterpretq_f32_u32(vec_implicit_leading_one)); | 299 vreinterpretq_f32_u32(vec_implicit_leading_one)); |
| 302 // Compute y. | 300 // Compute y. |
| 303 const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF); | 301 const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF); |
| 304 const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000); | 302 const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000); |
| 305 const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a), | 303 const uint32x4_t mantissa = |
| 306 vec_mantissa_mask); | 304 vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask); |
| 307 const float32x4_t y = | 305 const float32x4_t y = vreinterpretq_f32_u32( |
| 308 vreinterpretq_f32_u32(vorrq_u32(mantissa, | 306 vorrq_u32(mantissa, vec_zero_biased_exponent_is_one)); |
| 309 vec_zero_biased_exponent_is_one)); | |
| 310 // Approximate log2(y) ~= (y - 1) * pol5(y). | 307 // Approximate log2(y) ~= (y - 1) * pol5(y). |
| 311 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 | 308 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 |
| 312 const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f); | 309 const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f); |
| 313 const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f); | 310 const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f); |
| 314 const float32x4_t C3 = vdupq_n_f32(-1.2315303f); | 311 const float32x4_t C3 = vdupq_n_f32(-1.2315303f); |
| 315 const float32x4_t C2 = vdupq_n_f32(2.5988452f); | 312 const float32x4_t C2 = vdupq_n_f32(2.5988452f); |
| 316 const float32x4_t C1 = vdupq_n_f32(-3.3241990f); | 313 const float32x4_t C1 = vdupq_n_f32(-3.3241990f); |
| 317 const float32x4_t C0 = vdupq_n_f32(3.1157899f); | 314 const float32x4_t C0 = vdupq_n_f32(3.1157899f); |
| 318 float32x4_t pol5_y = C5; | 315 float32x4_t pol5_y = C5; |
| 319 pol5_y = vmlaq_f32(C4, y, pol5_y); | 316 pol5_y = vmlaq_f32(C4, y, pol5_y); |
| (...skipping 68 matching lines...) |
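
A scalar model of the log2 decomposition above: split a = 2^n * y with y in [1, 2) via the exponent and mantissa masks, then approximate log2(y) as (y - 1) * pol5(y). The recombination to n + log2(y) sits in the elided lines and is assumed here; this sketch also computes n directly with integer arithmetic instead of the float-domain bit trick used by the NEON code (illustrative only):

```c
#include <stdint.h>
#include <string.h>

/* Bit masks mirror the NEON constants: 0x7F800000 = exponent field,
 * 0x007FFFFF = mantissa field, 0x3F800000 = biased exponent of 1.0f. */
static float Log2Approx(float a) {
  uint32_t bits;
  memcpy(&bits, &a, sizeof(bits));
  const int n = (int)((bits & 0x7F800000u) >> 23) - 127; /* unbiased exp. */
  uint32_t y_bits = (bits & 0x007FFFFFu) | 0x3F800000u;  /* y in [1, 2) */
  float y;
  memcpy(&y, &y_bits, sizeof(y));
  /* pol5(y) with the same coefficients as above, in Horner form. */
  float p = -3.4436006e-2f;
  p = p * y + 3.1821337e-1f;
  p = p * y + -1.2315303f;
  p = p * y + 2.5988452f;
  p = p * y + -3.3241990f;
  p = p * y + 3.1157899f;
  return (float)n + (y - 1.0f) * p;
}
```
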
| 388 const float32x4_t vec_one = vdupq_n_f32(1.0f); | 385 const float32x4_t vec_one = vdupq_n_f32(1.0f); |
| 389 const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f); | 386 const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f); |
| 390 const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm); | 387 const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm); |
| 391 | 388 |
| 392 // vectorized code (four at once) | 389 // vectorized code (four at once) |
| 393 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 390 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
| 394 // Weight subbands | 391 // Weight subbands |
| 395 float32x4_t vec_hNl = vld1q_f32(&hNl[i]); | 392 float32x4_t vec_hNl = vld1q_f32(&hNl[i]); |
| 396 const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]); | 393 const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]); |
| 397 const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb); | 394 const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb); |
| 398 const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve, | 395 const float32x4_t vec_weightCurve_hNlFb = |
| 399 vec_hNlFb); | 396 vmulq_f32(vec_weightCurve, vec_hNlFb); |
| 400 const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve); | 397 const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve); |
| 401 const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve, | 398 const float32x4_t vec_one_weightCurve_hNl = |
| 402 vec_hNl); | 399 vmulq_f32(vec_one_weightCurve, vec_hNl); |
| 403 const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger), | 400 const uint32x4_t vec_if0 = |
| 404 vreinterpretq_u32_f32(vec_hNl)); | 401 vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(vec_hNl)); |
| 405 const float32x4_t vec_one_weightCurve_add = | 402 const float32x4_t vec_one_weightCurve_add = |
| 406 vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl); | 403 vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl); |
| 407 const uint32x4_t vec_if1 = | 404 const uint32x4_t vec_if1 = |
| 408 vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add)); | 405 vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add)); |
| 409 | 406 |
| 410 vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1)); | 407 vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1)); |
| 411 | 408 |
| 412 { | 409 { |
| 413 const float32x4_t vec_overDriveCurve = | 410 const float32x4_t vec_overDriveCurve = |
| 414 vld1q_f32(&WebRtcAec_overDriveCurve[i]); | 411 vld1q_f32(&WebRtcAec_overDriveCurve[i]); |
| (...skipping 91 matching lines...) |
| 506 // - sxd : cross-PSD of near-end and far-end | 503 // - sxd : cross-PSD of near-end and far-end |
| 507 // | 504 // |
| 508 // In addition to updating the PSDs, the filter divergence state is | 505 // In addition to updating the PSDs, the filter divergence state is |
| 509 // determined, upon which actions are taken. | 506 // determined, upon which actions are taken. |
| 510 static void SmoothedPSD(AecCore* aec, | 507 static void SmoothedPSD(AecCore* aec, |
| 511 float efw[2][PART_LEN1], | 508 float efw[2][PART_LEN1], |
| 512 float dfw[2][PART_LEN1], | 509 float dfw[2][PART_LEN1], |
| 513 float xfw[2][PART_LEN1], | 510 float xfw[2][PART_LEN1], |
| 514 int* extreme_filter_divergence) { | 511 int* extreme_filter_divergence) { |
| 515 // Power estimate smoothing coefficients. | 512 // Power estimate smoothing coefficients. |
| 516 const float* ptrGCoh = aec->extended_filter_enabled | 513 const float* ptrGCoh = |
| 517 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] | 514 aec->extended_filter_enabled |
| 518 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; | 515 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] |
| | 516 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; |
| 519 int i; | 517 int i; |
| 520 float sdSum = 0, seSum = 0; | 518 float sdSum = 0, seSum = 0; |
| 521 const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD); | 519 const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD); |
| 522 float32x4_t vec_sdSum = vdupq_n_f32(0.0f); | 520 float32x4_t vec_sdSum = vdupq_n_f32(0.0f); |
| 523 float32x4_t vec_seSum = vdupq_n_f32(0.0f); | 521 float32x4_t vec_seSum = vdupq_n_f32(0.0f); |
| 524 | 522 |
| 525 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 523 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
| 526 const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]); | 524 const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]); |
| 527 const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]); | 525 const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]); |
| 528 const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]); | 526 const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]); |
| 529 const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]); | 527 const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]); |
| 530 const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]); | 528 const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]); |
| 531 const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]); | 529 const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]); |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 574 vst2q_f32(&aec->sxd[i][0], vec_sxd); | 572 vst2q_f32(&aec->sxd[i][0], vec_sxd); |
| 575 } | 573 } |
| 576 | 574 |
| 577 vec_sdSum = vaddq_f32(vec_sdSum, vec_sd); | 575 vec_sdSum = vaddq_f32(vec_sdSum, vec_sd); |
| 578 vec_seSum = vaddq_f32(vec_seSum, vec_se); | 576 vec_seSum = vaddq_f32(vec_seSum, vec_se); |
| 579 } | 577 } |
| 580 { | 578 { |
| 581 float32x2_t vec_sdSum_total; | 579 float32x2_t vec_sdSum_total; |
| 582 float32x2_t vec_seSum_total; | 580 float32x2_t vec_seSum_total; |
| 583 // A B C D | 581 // A B C D |
| 584 vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum), | 582 vec_sdSum_total = |
| 585 vget_high_f32(vec_sdSum)); | 583 vpadd_f32(vget_low_f32(vec_sdSum), vget_high_f32(vec_sdSum)); |
| 586 vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum), | 584 vec_seSum_total = |
| 587 vget_high_f32(vec_seSum)); | 585 vpadd_f32(vget_low_f32(vec_seSum), vget_high_f32(vec_seSum)); |
| 588 // A+B C+D | 586 // A+B C+D |
| 589 vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total); | 587 vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total); |
| 590 vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total); | 588 vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total); |
| 591 // A+B+C+D A+B+C+D | 589 // A+B+C+D A+B+C+D |
| 592 sdSum = vget_lane_f32(vec_sdSum_total, 0); | 590 sdSum = vget_lane_f32(vec_sdSum_total, 0); |
| 593 seSum = vget_lane_f32(vec_seSum_total, 0); | 591 seSum = vget_lane_f32(vec_seSum_total, 0); |
| 594 } | 592 } |
| 595 | 593 |
| 596 // scalar code for the remaining items. | 594 // scalar code for the remaining items. |
| 597 for (; i < PART_LEN1; i++) { | 595 for (; i < PART_LEN1; i++) { |
| 598 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + | 596 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + |
| 599 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); | 597 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); |
| 600 aec->se[i] = ptrGCoh[0] * aec->se[i] + | 598 aec->se[i] = ptrGCoh[0] * aec->se[i] + |
| 601 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); | 599 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); |
| 602 // We threshold here to protect against the ill-effects of a zero farend. | 600 // We threshold here to protect against the ill-effects of a zero farend. |
| 603 // The threshold is not arbitrarily chosen, but balances protection and | 601 // The threshold is not arbitrarily chosen, but balances protection and |
| 604 // adverse interaction with the algorithm's tuning. | 602 // adverse interaction with the algorithm's tuning. |
| 605 // TODO(bjornv): investigate further why this is so sensitive. | 603 // TODO(bjornv): investigate further why this is so sensitive. |
| 606 aec->sx[i] = | 604 aec->sx[i] = ptrGCoh[0] * aec->sx[i] + |
| 607 ptrGCoh[0] * aec->sx[i] + | 605 ptrGCoh[1] * WEBRTC_SPL_MAX( |
| 608 ptrGCoh[1] * WEBRTC_SPL_MAX( | 606 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], |
| 609 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], | 607 WebRtcAec_kMinFarendPSD); |
| 610 WebRtcAec_kMinFarendPSD); | |
| 611 | 608 |
| 612 aec->sde[i][0] = | 609 aec->sde[i][0] = |
| 613 ptrGCoh[0] * aec->sde[i][0] + | 610 ptrGCoh[0] * aec->sde[i][0] + |
| 614 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); | 611 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); |
| 615 aec->sde[i][1] = | 612 aec->sde[i][1] = |
| 616 ptrGCoh[0] * aec->sde[i][1] + | 613 ptrGCoh[0] * aec->sde[i][1] + |
| 617 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); | 614 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); |
| 618 | 615 |
| 619 aec->sxd[i][0] = | 616 aec->sxd[i][0] = |
| 620 ptrGCoh[0] * aec->sxd[i][0] + | 617 ptrGCoh[0] * aec->sxd[i][0] + |
| (...skipping 24 matching lines...) |
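
Each PSD update in SmoothedPSD is a one-pole smoother, s_new = g0*s_old + g1*p, with (g0, g1) taken from the extended or normal coefficient tables and p the current block's (cross-)power; the far-end PSD additionally floors p at WebRtcAec_kMinFarendPSD, as the scalar tail above shows. A sketch with hypothetical coefficient values:

```c
/* One smoothing step for a single bin; g0/g1 are illustrative stand-ins
 * for ptrGCoh[0]/ptrGCoh[1], and (re, im) is the current block's bin. */
static float SmoothPower(float s, float re, float im) {
  const float g0 = 0.9f; /* assumption: example values only */
  const float g1 = 0.1f;
  return g0 * s + g1 * (re * re + im * im);
}
```
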
| 645 // A B C D | 642 // A B C D |
| 646 float32x4_t vec_sqrtHanning_rev = | 643 float32x4_t vec_sqrtHanning_rev = |
| 647 vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); | 644 vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); |
| 648 // B A D C | 645 // B A D C |
| 649 vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev); | 646 vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev); |
| 650 // D C B A | 647 // D C B A |
| 651 vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev), | 648 vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev), |
| 652 vget_low_f32(vec_sqrtHanning_rev)); | 649 vget_low_f32(vec_sqrtHanning_rev)); |
| 653 vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning)); | 650 vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning)); |
| 654 vst1q_f32(&x_windowed[PART_LEN + i], | 651 vst1q_f32(&x_windowed[PART_LEN + i], |
| 655 vmulq_f32(vec_Buf2, vec_sqrtHanning_rev)); | 652 vmulq_f32(vec_Buf2, vec_sqrtHanning_rev)); |
| 656 } | 653 } |
| 657 } | 654 } |
| 658 | 655 |
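
The reversal in WindowDataNEON works in two steps: vrev64q_f32 swaps lanes within each 64-bit half (A B C D -> B A D C), then vcombine_f32 with the halves exchanged completes the full reversal (D C B A), letting the loop read the square-root Hanning window backwards for the second half-window. As a standalone helper (hypothetical name):

```c
#include <arm_neon.h>

/* Reverse the four lanes of a float32x4_t: {A,B,C,D} -> {D,C,B,A}. */
static float32x4_t Reverse4(float32x4_t v) {
  const float32x4_t half_swapped = vrev64q_f32(v); /* {B,A,D,C} */
  return vcombine_f32(vget_high_f32(half_swapped),
                      vget_low_f32(half_swapped)); /* {D,C,B,A} */
}
```
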
| 659 // Puts fft output data into a complex-valued array. | 656 // Puts fft output data into a complex-valued array. |
| 660 static void StoreAsComplexNEON(const float* data, | 657 static void StoreAsComplexNEON(const float* data, |
| 661 float data_complex[2][PART_LEN1]) { | 658 float data_complex[2][PART_LEN1]) { |
| 662 int i; | 659 int i; |
| 663 for (i = 0; i < PART_LEN; i += 4) { | 660 for (i = 0; i < PART_LEN; i += 4) { |
| 664 const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]); | 661 const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]); |
| 665 vst1q_f32(&data_complex[0][i], vec_data.val[0]); | 662 vst1q_f32(&data_complex[0][i], vec_data.val[0]); |
| (...skipping 12 matching lines...) |
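
vld2q_f32, as used by StoreAsComplexNEON above, deinterleaves while loading: from {re0, im0, re1, im1, ...} it returns val[0] = {re0..re3} and val[1] = {im0..im3}, which is what lets the FFT's interleaved output be split into the [2][PART_LEN1] layout without extra shuffles. A minimal sketch (hypothetical helper):

```c
#include <arm_neon.h>

/* Split 8 interleaved floats into separate real and imaginary arrays. */
static void Deinterleave4(const float* interleaved /* 8 floats */,
                          float re[4], float im[4]) {
  const float32x4x2_t v = vld2q_f32(interleaved);
  vst1q_f32(re, v.val[0]);
  vst1q_f32(im, v.val[1]);
}
```
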
| 678 float xfw[2][PART_LEN1], | 675 float xfw[2][PART_LEN1], |
| 679 float* fft, | 676 float* fft, |
| 680 float* cohde, | 677 float* cohde, |
| 681 float* cohxd, | 678 float* cohxd, |
| 682 int* extreme_filter_divergence) { | 679 int* extreme_filter_divergence) { |
| 683 int i; | 680 int i; |
| 684 | 681 |
| 685 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); | 682 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); |
| 686 | 683 |
| 687 { | 684 { |
| 688 const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f); | 685 const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f); |
| 689 | 686 |
| 690 // Subband coherence | 687 // Subband coherence |
| 691 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 688 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
| 692 const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]); | 689 const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]); |
| 693 const float32x4_t vec_se = vld1q_f32(&aec->se[i]); | 690 const float32x4_t vec_se = vld1q_f32(&aec->se[i]); |
| 694 const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]); | 691 const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]); |
| 695 const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se); | 692 const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se); |
| 696 const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx); | 693 const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx); |
| 697 float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); | 694 float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); |
| 698 float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); | 695 float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); |
| (...skipping 22 matching lines...) |
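
In scalar form, the subband coherence assembled above is the magnitude-squared coherence |S_de|^2 / (S_d * S_e) and |S_xd|^2 / (S_d * S_x), with the 1e-10 bias from vec_1eminus10 guarding the divisions. The division itself sits in the elided lines, so the sketch below is an assumption consistent with the visible vec_sdse/vec_sdsx setup:

```c
/* Coherence for one bin; sde/sxd are {real, imag} cross-PSDs. */
static void Coherence1(float sd, float se, float sx,
                       const float sde[2], const float sxd[2],
                       float* cohde, float* cohxd) {
  *cohde = (sde[0] * sde[0] + sde[1] * sde[1]) / (sd * se + 1e-10f);
  *cohxd = (sxd[0] * sxd[0] + sxd[1] * sxd[1]) / (sd * sx + 1e-10f);
}
```
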
| 721 void WebRtcAec_InitAec_neon(void) { | 718 void WebRtcAec_InitAec_neon(void) { |
| 722 WebRtcAec_FilterFar = FilterFarNEON; | 719 WebRtcAec_FilterFar = FilterFarNEON; |
| 723 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON; | 720 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON; |
| 724 WebRtcAec_FilterAdaptation = FilterAdaptationNEON; | 721 WebRtcAec_FilterAdaptation = FilterAdaptationNEON; |
| 725 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON; | 722 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON; |
| 726 WebRtcAec_SubbandCoherence = SubbandCoherenceNEON; | 723 WebRtcAec_SubbandCoherence = SubbandCoherenceNEON; |
| 727 WebRtcAec_StoreAsComplex = StoreAsComplexNEON; | 724 WebRtcAec_StoreAsComplex = StoreAsComplexNEON; |
| 728 WebRtcAec_PartitionDelay = PartitionDelayNEON; | 725 WebRtcAec_PartitionDelay = PartitionDelayNEON; |
| 729 WebRtcAec_WindowData = WindowDataNEON; | 726 WebRtcAec_WindowData = WindowDataNEON; |
| 730 } | 727 } |
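
WebRtcAec_InitAec_neon rebinds the module's function pointers so the generic AEC code picks up the NEON kernels at runtime. A self-contained sketch of that dispatch pattern (all names hypothetical):

```c
/* Default C binding that an architecture-specific init swaps at startup. */
typedef void (*FilterFarFn)(void);

static void FilterFarC(void) { /* portable fallback */ }
static void FilterFarNeonImpl(void) { /* NEON-accelerated version */ }

static FilterFarFn g_filter_far = FilterFarC;

/* Called once when runtime CPU detection reports NEON support. */
static void InitNeonSketch(void) {
  g_filter_far = FilterFarNeonImpl;
}
```
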