OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 16 matching lines...) |
27 enum { kFloatExponentShift = 23 }; | 27 enum { kFloatExponentShift = 23 }; |
28 | 28 |
29 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { | 29 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { |
30 return aRe * bRe - aIm * bIm; | 30 return aRe * bRe - aIm * bIm; |
31 } | 31 } |
32 | 32 |
33 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { | 33 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { |
34 return aRe * bIm + aIm * bRe; | 34 return aRe * bIm + aIm * bRe; |
35 } | 35 } |
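For reference, MulRe and MulIm return the real and imaginary parts of the complex product (aRe + i*aIm) * (bRe + i*bIm). A minimal usage sketch (the ComplexMul helper is illustrative only, not part of the file):

/* Multiply two complex numbers stored as split real/imaginary parts. */
static void ComplexMul(float aRe, float aIm, float bRe, float bIm,
                       float* outRe, float* outIm) {
  *outRe = MulRe(aRe, aIm, bRe, bIm);  /* aRe * bRe - aIm * bIm */
  *outIm = MulIm(aRe, aIm, bRe, bIm);  /* aRe * bIm + aIm * bRe */
}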
36 | 36 |
37 static void FilterFarNEON( | 37 static void FilterFarNEON(int num_partitions, |
38 int num_partitions, | 38 int x_fft_buf_block_pos, |
39 int x_fft_buf_block_pos, | 39 float x_fft_buf[2] |
40 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], | 40 [kExtendedNumPartitions * PART_LEN1], |
41 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], | 41 float h_fft_buf[2] |
42 float y_fft[2][PART_LEN1]) { | 42 [kExtendedNumPartitions * PART_LEN1], |
| 43 float y_fft[2][PART_LEN1]) { |
43 int i; | 44 int i; |
44 for (i = 0; i < num_partitions; i++) { | 45 for (i = 0; i < num_partitions; i++) { |
45 int j; | 46 int j; |
46 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; | 47 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; |
47 int pos = i * PART_LEN1; | 48 int pos = i * PART_LEN1; |
48 // Check for wrap | 49 // Check for wrap |
49 if (i + x_fft_buf_block_pos >= num_partitions) { | 50 if (i + x_fft_buf_block_pos >= num_partitions) { |
50 xPos -= num_partitions * PART_LEN1; | 51 xPos -= num_partitions * PART_LEN1; |
51 } | 52 } |
52 | 53 |
53 // vectorized code (four at once) | 54 // vectorized code (four at once) |
54 for (j = 0; j + 3 < PART_LEN1; j += 4) { | 55 for (j = 0; j + 3 < PART_LEN1; j += 4) { |
55 const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]); | 56 const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]); |
56 const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]); | 57 const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]); |
57 const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]); | 58 const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]); |
58 const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]); | 59 const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]); |
59 const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]); | 60 const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]); |
60 const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]); | 61 const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]); |
61 const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re); | 62 const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re); |
62 const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im); | 63 const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im); |
63 const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im); | 64 const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im); |
64 const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re); | 65 const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re); |
65 const float32x4_t g = vaddq_f32(y_fft_re, e); | 66 const float32x4_t g = vaddq_f32(y_fft_re, e); |
66 const float32x4_t h = vaddq_f32(y_fft_im, f); | 67 const float32x4_t h = vaddq_f32(y_fft_im, f); |
67 vst1q_f32(&y_fft[0][j], g); | 68 vst1q_f32(&y_fft[0][j], g); |
68 vst1q_f32(&y_fft[1][j], h); | 69 vst1q_f32(&y_fft[1][j], h); |
69 } | 70 } |
70 // scalar code for the remaining items. | 71 // scalar code for the remaining items. |
71 for (; j < PART_LEN1; j++) { | 72 for (; j < PART_LEN1; j++) { |
72 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], | 73 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], |
73 x_fft_buf[1][xPos + j], | 74 h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); |
74 h_fft_buf[0][pos + j], | 75 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], |
75 h_fft_buf[1][pos + j]); | 76 h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); |
76 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], | |
77 x_fft_buf[1][xPos + j], | |
78 h_fft_buf[0][pos + j], | |
79 h_fft_buf[1][pos + j]); | |
80 } | 77 } |
81 } | 78 } |
82 } | 79 } |
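For the wrap check inside FilterFarNEON, a small worked example (values chosen only for illustration): with num_partitions = 12 and x_fft_buf_block_pos = 10, partition i = 3 would start at (3 + 10) * PART_LEN1, which lies past the end of the circular far-end buffer, so 12 * PART_LEN1 is subtracted and the partition is read from buffer slot 1 instead.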
83 | 80 |
84 // ARM64's arm_neon.h has already defined vdivq_f32 vsqrtq_f32. | 81 // ARM64's arm_neon.h has already defined vdivq_f32 vsqrtq_f32. |
85 #if !defined (WEBRTC_ARCH_ARM64) | 82 #if !defined(WEBRTC_ARCH_ARM64) |
86 static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) { | 83 static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) { |
87 int i; | 84 int i; |
88 float32x4_t x = vrecpeq_f32(b); | 85 float32x4_t x = vrecpeq_f32(b); |
89 // from arm documentation | 86 // from arm documentation |
90 // The Newton-Raphson iteration: | 87 // The Newton-Raphson iteration: |
91 // x[n+1] = x[n] * (2 - d * x[n]) | 88 // x[n+1] = x[n] * (2 - d * x[n]) |
92 // converges to (1/d) if x0 is the result of VRECPE applied to d. | 89 // converges to (1/d) if x0 is the result of VRECPE applied to d. |
93 // | 90 // |
94 // Note: The precision did not improve after 2 iterations. | 91 // Note: The precision did not improve after 2 iterations. |
95 for (i = 0; i < 2; i++) { | 92 for (i = 0; i < 2; i++) { |
96 x = vmulq_f32(vrecpsq_f32(b, x), x); | 93 x = vmulq_f32(vrecpsq_f32(b, x), x); |
97 } | 94 } |
98 // a/b = a*(1/b) | 95 // a/b = a*(1/b) |
99 return vmulq_f32(a, x); | 96 return vmulq_f32(a, x); |
100 } | 97 } |
101 | 98 |
102 static float32x4_t vsqrtq_f32(float32x4_t s) { | 99 static float32x4_t vsqrtq_f32(float32x4_t s) { |
103 int i; | 100 int i; |
104 float32x4_t x = vrsqrteq_f32(s); | 101 float32x4_t x = vrsqrteq_f32(s); |
105 | 102 |
106 // Code to handle sqrt(0). | 103 // Code to handle sqrt(0). |
107 // If the input to sqrtf() is zero, a zero will be returned. | 104 // If the input to sqrtf() is zero, a zero will be returned. |
108 // If the input to vrsqrteq_f32() is zero, positive infinity is returned. | 105 // If the input to vrsqrteq_f32() is zero, positive infinity is returned. |
109 const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000); | 106 const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000); |
110 // check for divide by zero | 107 // check for divide by zero |
111 const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x)); | 108 const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x)); |
112 // zero out the positive infinity results | 109 // zero out the positive infinity results |
113 x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero), | 110 x = vreinterpretq_f32_u32( |
114 vreinterpretq_u32_f32(x))); | 111 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(x))); |
115 // from arm documentation | 112 // from arm documentation |
116 // The Newton-Raphson iteration: | 113 // The Newton-Raphson iteration: |
117 // x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2 | 114 // x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2 |
118 // converges to (1/√d) if x0 is the result of VRSQRTE applied to d. | 115 // converges to (1/√d) if x0 is the result of VRSQRTE applied to d. |
119 // | 116 // |
120 // Note: The precision did not improve after 2 iterations. | 117 // Note: The precision did not improve after 2 iterations. |
121 for (i = 0; i < 2; i++) { | 118 for (i = 0; i < 2; i++) { |
122 x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x); | 119 x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x); |
123 } | 120 } |
124 // sqrt(s) = s * 1/sqrt(s) | 121 // sqrt(s) = s * 1/sqrt(s) |
125 return vmulq_f32(s, x);; | 122 return vmulq_f32(s, x); |
| 123 ; |
126 } | 124 } |
127 #endif // WEBRTC_ARCH_ARM64 | 125 #endif // WEBRTC_ARCH_ARM64 |
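The two helpers above rely on the same refinement scheme; a minimal scalar sketch of those Newton-Raphson iterations (the functions below are illustrative only, and x0 is assumed to be the rough hardware estimate that vrecpeq_f32 / vrsqrteq_f32 would produce):

/* Refine an estimate x0 of 1/d: x[n+1] = x[n] * (2 - d * x[n]). */
static float ReciprocalNR(float d, float x0) {
  float x = x0;
  int i;
  for (i = 0; i < 2; i++) {
    x = x * (2.0f - d * x);  /* vrecpsq_f32(d, x) computes (2 - d * x) */
  }
  return x;
}

/* Refine an estimate x0 of 1/sqrt(s): x[n+1] = x[n] * (3 - s * x[n]^2) / 2. */
static float RsqrtNR(float s, float x0) {
  float x = x0;
  int i;
  for (i = 0; i < 2; i++) {
    x = x * (3.0f - s * x * x) * 0.5f;  /* vrsqrtsq_f32 computes (3 - s*x*x)/2 */
  }
  return x;
}

Two iterations take the roughly 8-bit hardware estimate close to single-precision accuracy, which is consistent with the note above that a third iteration did not improve precision.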
128 | 126 |
129 static void ScaleErrorSignalNEON(int extended_filter_enabled, | 127 static void ScaleErrorSignalNEON(int extended_filter_enabled, |
130 float normal_mu, | 128 float normal_mu, |
131 float normal_error_threshold, | 129 float normal_error_threshold, |
132 float x_pow[PART_LEN1], | 130 float x_pow[PART_LEN1], |
133 float ef[2][PART_LEN1]) { | 131 float ef[2][PART_LEN1]) { |
134 const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; | 132 const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; |
135 const float error_threshold = extended_filter_enabled ? | 133 const float error_threshold = extended_filter_enabled |
136 kExtendedErrorThreshold : normal_error_threshold; | 134 ? kExtendedErrorThreshold |
| 135 : normal_error_threshold; |
137 const float32x4_t k1e_10f = vdupq_n_f32(1e-10f); | 136 const float32x4_t k1e_10f = vdupq_n_f32(1e-10f); |
138 const float32x4_t kMu = vmovq_n_f32(mu); | 137 const float32x4_t kMu = vmovq_n_f32(mu); |
139 const float32x4_t kThresh = vmovq_n_f32(error_threshold); | 138 const float32x4_t kThresh = vmovq_n_f32(error_threshold); |
140 int i; | 139 int i; |
141 // vectorized code (four at once) | 140 // vectorized code (four at once) |
142 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 141 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
143 const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]); | 142 const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]); |
144 const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]); | 143 const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]); |
145 const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]); | 144 const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]); |
146 const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f); | 145 const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f); |
147 float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus); | 146 float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus); |
148 float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus); | 147 float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus); |
149 const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re); | 148 const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re); |
150 const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im); | 149 const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im); |
151 const float32x4_t absEf = vsqrtq_f32(ef_sum2); | 150 const float32x4_t absEf = vsqrtq_f32(ef_sum2); |
152 const uint32x4_t bigger = vcgtq_f32(absEf, kThresh); | 151 const uint32x4_t bigger = vcgtq_f32(absEf, kThresh); |
153 const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f); | 152 const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f); |
154 const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus); | 153 const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus); |
155 uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv)); | 154 uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv)); |
156 uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv)); | 155 uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv)); |
157 uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger), | 156 uint32x4_t ef_re_u32 = |
158 vreinterpretq_u32_f32(ef_re)); | 157 vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re)); |
159 uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger), | 158 uint32x4_t ef_im_u32 = |
160 vreinterpretq_u32_f32(ef_im)); | 159 vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im)); |
161 ef_re_if = vandq_u32(bigger, ef_re_if); | 160 ef_re_if = vandq_u32(bigger, ef_re_if); |
162 ef_im_if = vandq_u32(bigger, ef_im_if); | 161 ef_im_if = vandq_u32(bigger, ef_im_if); |
163 ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if); | 162 ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if); |
164 ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if); | 163 ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if); |
165 ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu); | 164 ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu); |
166 ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu); | 165 ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu); |
167 vst1q_f32(&ef[0][i], ef_re); | 166 vst1q_f32(&ef[0][i], ef_re); |
168 vst1q_f32(&ef[1][i], ef_im); | 167 vst1q_f32(&ef[1][i], ef_im); |
169 } | 168 } |
170 // scalar code for the remaining items. | 169 // scalar code for the remaining items. |
(...skipping 46 matching lines...) |
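ScaleErrorSignalNEON above clamps the error per lane without branching: a comparison produces an all-ones/all-zeros mask, and the two candidate results are blended with bitwise ops. A standalone sketch of that select pattern (the SelectGreater helper is illustrative, not part of the file; vbslq_f32 would be an equivalent single-instruction alternative):

/* Per lane: return a where x > thresh, else b. */
static float32x4_t SelectGreater(float32x4_t x, float32x4_t thresh,
                                 float32x4_t a, float32x4_t b) {
  const uint32x4_t mask = vcgtq_f32(x, thresh);  /* all ones where x > thresh */
  const uint32x4_t sel_a = vandq_u32(mask, vreinterpretq_u32_f32(a));
  const uint32x4_t sel_b = vandq_u32(vmvnq_u32(mask), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(vorrq_u32(sel_a, sel_b));
}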
217 const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im); | 216 const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im); |
218 const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im); | 217 const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im); |
219 const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re); | 218 const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re); |
220 // Interleave real and imaginary parts. | 219 // Interleave real and imaginary parts. |
221 const float32x4x2_t g_n_h = vzipq_f32(e, f); | 220 const float32x4x2_t g_n_h = vzipq_f32(e, f); |
222 // Store | 221 // Store |
223 vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]); | 222 vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]); |
224 vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]); | 223 vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]); |
225 } | 224 } |
226 // ... and fixup the first imaginary entry. | 225 // ... and fixup the first imaginary entry. |
227 fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], | 226 fft[1] = |
228 -x_fft_buf[1][xPos + PART_LEN], | 227 MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN], |
229 e_fft[0][PART_LEN], | 228 e_fft[0][PART_LEN], e_fft[1][PART_LEN]); |
230 e_fft[1][PART_LEN]); | |
231 | 229 |
232 aec_rdft_inverse_128(fft); | 230 aec_rdft_inverse_128(fft); |
233 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); | 231 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); |
234 | 232 |
235 // fft scaling | 233 // fft scaling |
236 { | 234 { |
237 const float scale = 2.0f / PART_LEN2; | 235 const float scale = 2.0f / PART_LEN2; |
238 const float32x4_t scale_ps = vmovq_n_f32(scale); | 236 const float32x4_t scale_ps = vmovq_n_f32(scale); |
239 for (j = 0; j < PART_LEN; j += 4) { | 237 for (j = 0; j < PART_LEN; j += 4) { |
240 const float32x4_t fft_ps = vld1q_f32(&fft[j]); | 238 const float32x4_t fft_ps = vld1q_f32(&fft[j]); |
(...skipping 44 matching lines...) |
285 | 283 |
286 // Compute n. | 284 // Compute n. |
287 // This is done by masking the exponent, shifting it into the top bit of | 285 // This is done by masking the exponent, shifting it into the top bit of |
288 // the mantissa, putting eight into the biased exponent (to shift/ | 286 // the mantissa, putting eight into the biased exponent (to shift/ |
289 // compensate the fact that the exponent has been shifted in the top/ | 287 // compensate the fact that the exponent has been shifted in the top/ |
290 // fractional part and finally getting rid of the implicit leading one | 288 // fractional part and finally getting rid of the implicit leading one |
291 // from the mantissa by subtracting it out. | 289 // from the mantissa by subtracting it out. |
292 const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000); | 290 const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000); |
293 const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000); | 291 const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000); |
294 const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000); | 292 const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000); |
295 const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a), | 293 const uint32x4_t two_n = |
296 vec_float_exponent_mask); | 294 vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask); |
297 const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa); | 295 const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa); |
298 const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent); | 296 const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent); |
299 const float32x4_t n = | 297 const float32x4_t n = |
300 vsubq_f32(vreinterpretq_f32_u32(n_0), | 298 vsubq_f32(vreinterpretq_f32_u32(n_0), |
301 vreinterpretq_f32_u32(vec_implicit_leading_one)); | 299 vreinterpretq_f32_u32(vec_implicit_leading_one)); |
302 // Compute y. | 300 // Compute y. |
303 const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF); | 301 const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF); |
304 const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000); | 302 const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000); |
305 const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a), | 303 const uint32x4_t mantissa = |
306 vec_mantissa_mask); | 304 vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask); |
307 const float32x4_t y = | 305 const float32x4_t y = vreinterpretq_f32_u32( |
308 vreinterpretq_f32_u32(vorrq_u32(mantissa, | 306 vorrq_u32(mantissa, vec_zero_biased_exponent_is_one)); |
309 vec_zero_biased_exponent_is_one)); | |
310 // Approximate log2(y) ~= (y - 1) * pol5(y). | 307 // Approximate log2(y) ~= (y - 1) * pol5(y). |
311 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 | 308 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 |
312 const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f); | 309 const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f); |
313 const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f); | 310 const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f); |
314 const float32x4_t C3 = vdupq_n_f32(-1.2315303f); | 311 const float32x4_t C3 = vdupq_n_f32(-1.2315303f); |
315 const float32x4_t C2 = vdupq_n_f32(2.5988452f); | 312 const float32x4_t C2 = vdupq_n_f32(2.5988452f); |
316 const float32x4_t C1 = vdupq_n_f32(-3.3241990f); | 313 const float32x4_t C1 = vdupq_n_f32(-3.3241990f); |
317 const float32x4_t C0 = vdupq_n_f32(3.1157899f); | 314 const float32x4_t C0 = vdupq_n_f32(3.1157899f); |
318 float32x4_t pol5_y = C5; | 315 float32x4_t pol5_y = C5; |
319 pol5_y = vmlaq_f32(C4, y, pol5_y); | 316 pol5_y = vmlaq_f32(C4, y, pol5_y); |
(...skipping 68 matching lines...) |
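The exponent/mantissa manipulation above splits a into a = y * 2^n with y in [1, 2), so that log2(a) = n + log2(y) and only log2(y) needs the polynomial. A scalar sketch of the same decomposition (illustrative only; it recovers n via an integer conversion instead of the pure bit trick the vector code uses, and assumes IEEE-754 single precision):

static float Log2Approx(float a) {
  union { float f; uint32_t u; } bits;
  bits.f = a;
  /* n: unbiased exponent of a. */
  const float n = (float)((int)((bits.u & 0x7F800000u) >> 23) - 127);
  /* y: mantissa of a remapped into [1, 2) by forcing a zero (biased) exponent. */
  bits.u = (bits.u & 0x007FFFFFu) | 0x3F800000u;
  const float y = bits.f;
  /* log2(y) ~= (y - 1) * pol5(y), same coefficients as the vector code above. */
  float pol5_y = -3.4436006e-2f;
  pol5_y = 3.1821337e-1f + y * pol5_y;
  pol5_y = -1.2315303f + y * pol5_y;
  pol5_y = 2.5988452f + y * pol5_y;
  pol5_y = -3.3241990f + y * pol5_y;
  pol5_y = 3.1157899f + y * pol5_y;
  return n + (y - 1.0f) * pol5_y;
}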
388 const float32x4_t vec_one = vdupq_n_f32(1.0f); | 385 const float32x4_t vec_one = vdupq_n_f32(1.0f); |
389 const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f); | 386 const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f); |
390 const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm); | 387 const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm); |
391 | 388 |
392 // vectorized code (four at once) | 389 // vectorized code (four at once) |
393 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 390 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
394 // Weight subbands | 391 // Weight subbands |
395 float32x4_t vec_hNl = vld1q_f32(&hNl[i]); | 392 float32x4_t vec_hNl = vld1q_f32(&hNl[i]); |
396 const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]); | 393 const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]); |
397 const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb); | 394 const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb); |
398 const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve, | 395 const float32x4_t vec_weightCurve_hNlFb = |
399 vec_hNlFb); | 396 vmulq_f32(vec_weightCurve, vec_hNlFb); |
400 const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve); | 397 const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve); |
401 const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve, | 398 const float32x4_t vec_one_weightCurve_hNl = |
402 vec_hNl); | 399 vmulq_f32(vec_one_weightCurve, vec_hNl); |
403 const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger), | 400 const uint32x4_t vec_if0 = |
404 vreinterpretq_u32_f32(vec_hNl)); | 401 vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(vec_hNl)); |
405 const float32x4_t vec_one_weightCurve_add = | 402 const float32x4_t vec_one_weightCurve_add = |
406 vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl); | 403 vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl); |
407 const uint32x4_t vec_if1 = | 404 const uint32x4_t vec_if1 = |
408 vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add)); | 405 vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add)); |
409 | 406 |
410 vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1)); | 407 vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1)); |
411 | 408 |
412 { | 409 { |
413 const float32x4_t vec_overDriveCurve = | 410 const float32x4_t vec_overDriveCurve = |
414 vld1q_f32(&WebRtcAec_overDriveCurve[i]); | 411 vld1q_f32(&WebRtcAec_overDriveCurve[i]); |
(...skipping 91 matching lines...) |
506 // - sxd : cross-PSD of near-end and far-end | 503 // - sxd : cross-PSD of near-end and far-end |
507 // | 504 // |
508 // In addition to updating the PSDs, the filter diverge state is determined, | 505 // In addition to updating the PSDs, the filter diverge state is determined, |
509 // upon which actions are taken. | 506 // upon which actions are taken. |
510 static void SmoothedPSD(AecCore* aec, | 507 static void SmoothedPSD(AecCore* aec, |
511 float efw[2][PART_LEN1], | 508 float efw[2][PART_LEN1], |
512 float dfw[2][PART_LEN1], | 509 float dfw[2][PART_LEN1], |
513 float xfw[2][PART_LEN1], | 510 float xfw[2][PART_LEN1], |
514 int* extreme_filter_divergence) { | 511 int* extreme_filter_divergence) { |
515 // Power estimate smoothing coefficients. | 512 // Power estimate smoothing coefficients. |
516 const float* ptrGCoh = aec->extended_filter_enabled | 513 const float* ptrGCoh = |
517 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] | 514 aec->extended_filter_enabled |
518 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; | 515 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] |
| 516 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; |
519 int i; | 517 int i; |
520 float sdSum = 0, seSum = 0; | 518 float sdSum = 0, seSum = 0; |
521 const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD); | 519 const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD); |
522 float32x4_t vec_sdSum = vdupq_n_f32(0.0f); | 520 float32x4_t vec_sdSum = vdupq_n_f32(0.0f); |
523 float32x4_t vec_seSum = vdupq_n_f32(0.0f); | 521 float32x4_t vec_seSum = vdupq_n_f32(0.0f); |
524 | 522 |
525 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 523 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
526 const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]); | 524 const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]); |
527 const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]); | 525 const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]); |
528 const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]); | 526 const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]); |
529 const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]); | 527 const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]); |
530 const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]); | 528 const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]); |
531 const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]); | 529 const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]); |
(...skipping 42 matching lines...) |
574 vst2q_f32(&aec->sxd[i][0], vec_sxd); | 572 vst2q_f32(&aec->sxd[i][0], vec_sxd); |
575 } | 573 } |
576 | 574 |
577 vec_sdSum = vaddq_f32(vec_sdSum, vec_sd); | 575 vec_sdSum = vaddq_f32(vec_sdSum, vec_sd); |
578 vec_seSum = vaddq_f32(vec_seSum, vec_se); | 576 vec_seSum = vaddq_f32(vec_seSum, vec_se); |
579 } | 577 } |
580 { | 578 { |
581 float32x2_t vec_sdSum_total; | 579 float32x2_t vec_sdSum_total; |
582 float32x2_t vec_seSum_total; | 580 float32x2_t vec_seSum_total; |
583 // A B C D | 581 // A B C D |
584 vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum), | 582 vec_sdSum_total = |
585 vget_high_f32(vec_sdSum)); | 583 vpadd_f32(vget_low_f32(vec_sdSum), vget_high_f32(vec_sdSum)); |
586 vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum), | 584 vec_seSum_total = |
587 vget_high_f32(vec_seSum)); | 585 vpadd_f32(vget_low_f32(vec_seSum), vget_high_f32(vec_seSum)); |
588 // A+B C+D | 586 // A+B C+D |
589 vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total); | 587 vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total); |
590 vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total); | 588 vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total); |
591 // A+B+C+D A+B+C+D | 589 // A+B+C+D A+B+C+D |
592 sdSum = vget_lane_f32(vec_sdSum_total, 0); | 590 sdSum = vget_lane_f32(vec_sdSum_total, 0); |
593 seSum = vget_lane_f32(vec_seSum_total, 0); | 591 seSum = vget_lane_f32(vec_seSum_total, 0); |
594 } | 592 } |
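The block above reduces the four partial sums in each vector to a single float with two pairwise adds, as the A/B/C/D comments trace. The same pattern as a small standalone helper (illustrative only, not part of the file):

/* Reduce {A, B, C, D} to A + B + C + D. */
static float HorizontalSumNEON(float32x4_t v) {
  /* {A, B} and {C, D} -> {A+B, C+D} */
  float32x2_t sum = vpadd_f32(vget_low_f32(v), vget_high_f32(v));
  /* {A+B, C+D} -> {A+B+C+D, A+B+C+D} */
  sum = vpadd_f32(sum, sum);
  return vget_lane_f32(sum, 0);
}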
595 | 593 |
596 // scalar code for the remaining items. | 594 // scalar code for the remaining items. |
597 for (; i < PART_LEN1; i++) { | 595 for (; i < PART_LEN1; i++) { |
598 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + | 596 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + |
599 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); | 597 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); |
600 aec->se[i] = ptrGCoh[0] * aec->se[i] + | 598 aec->se[i] = ptrGCoh[0] * aec->se[i] + |
601 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); | 599 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); |
602 // We threshold here to protect against the ill-effects of a zero farend. | 600 // We threshold here to protect against the ill-effects of a zero farend. |
603 // The threshold is not arbitrarily chosen, but balances protection and | 601 // The threshold is not arbitrarily chosen, but balances protection and |
604 // adverse interaction with the algorithm's tuning. | 602 // adverse interaction with the algorithm's tuning. |
605 // TODO(bjornv): investigate further why this is so sensitive. | 603 // TODO(bjornv): investigate further why this is so sensitive. |
606 aec->sx[i] = | 604 aec->sx[i] = ptrGCoh[0] * aec->sx[i] + |
607 ptrGCoh[0] * aec->sx[i] + | 605 ptrGCoh[1] * WEBRTC_SPL_MAX( |
608 ptrGCoh[1] * WEBRTC_SPL_MAX( | 606 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], |
609 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], | 607 WebRtcAec_kMinFarendPSD); |
610 WebRtcAec_kMinFarendPSD); | |
611 | 608 |
612 aec->sde[i][0] = | 609 aec->sde[i][0] = |
613 ptrGCoh[0] * aec->sde[i][0] + | 610 ptrGCoh[0] * aec->sde[i][0] + |
614 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); | 611 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); |
615 aec->sde[i][1] = | 612 aec->sde[i][1] = |
616 ptrGCoh[0] * aec->sde[i][1] + | 613 ptrGCoh[0] * aec->sde[i][1] + |
617 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); | 614 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); |
618 | 615 |
619 aec->sxd[i][0] = | 616 aec->sxd[i][0] = |
620 ptrGCoh[0] * aec->sxd[i][0] + | 617 ptrGCoh[0] * aec->sxd[i][0] + |
(...skipping 24 matching lines...) |
645 // A B C D | 642 // A B C D |
646 float32x4_t vec_sqrtHanning_rev = | 643 float32x4_t vec_sqrtHanning_rev = |
647 vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); | 644 vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); |
648 // B A D C | 645 // B A D C |
649 vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev); | 646 vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev); |
650 // D C B A | 647 // D C B A |
651 vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev), | 648 vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev), |
652 vget_low_f32(vec_sqrtHanning_rev)); | 649 vget_low_f32(vec_sqrtHanning_rev)); |
653 vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning)); | 650 vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning)); |
654 vst1q_f32(&x_windowed[PART_LEN + i], | 651 vst1q_f32(&x_windowed[PART_LEN + i], |
655 vmulq_f32(vec_Buf2, vec_sqrtHanning_rev)); | 652 vmulq_f32(vec_Buf2, vec_sqrtHanning_rev)); |
656 } | 653 } |
657 } | 654 } |
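The window code above reverses the four lanes of vec_sqrtHanning_rev so the second half of the frame is weighted with the sqrtHanning taps read backwards. A minimal sketch of that lane reversal (the Reverse4 helper is illustrative, not part of the file):

/* {A, B, C, D} -> {D, C, B, A} */
static float32x4_t Reverse4(float32x4_t v) {
  const float32x4_t swapped = vrev64q_f32(v);  /* {B, A, D, C} */
  return vcombine_f32(vget_high_f32(swapped),  /* {D, C, ...   */
                      vget_low_f32(swapped));  /*  ..., B, A}  */
}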
658 | 655 |
659 // Puts fft output data into a complex valued array. | 656 // Puts fft output data into a complex valued array. |
660 static void StoreAsComplexNEON(const float* data, | 657 static void StoreAsComplexNEON(const float* data, |
661 float data_complex[2][PART_LEN1]) { | 658 float data_complex[2][PART_LEN1]) { |
662 int i; | 659 int i; |
663 for (i = 0; i < PART_LEN; i += 4) { | 660 for (i = 0; i < PART_LEN; i += 4) { |
664 const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]); | 661 const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]); |
665 vst1q_f32(&data_complex[0][i], vec_data.val[0]); | 662 vst1q_f32(&data_complex[0][i], vec_data.val[0]); |
(...skipping 12 matching lines...) |
678 float xfw[2][PART_LEN1], | 675 float xfw[2][PART_LEN1], |
679 float* fft, | 676 float* fft, |
680 float* cohde, | 677 float* cohde, |
681 float* cohxd, | 678 float* cohxd, |
682 int* extreme_filter_divergence) { | 679 int* extreme_filter_divergence) { |
683 int i; | 680 int i; |
684 | 681 |
685 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); | 682 SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); |
686 | 683 |
687 { | 684 { |
688 const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f); | 685 const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f); |
689 | 686 |
690 // Subband coherence | 687 // Subband coherence |
691 for (i = 0; i + 3 < PART_LEN1; i += 4) { | 688 for (i = 0; i + 3 < PART_LEN1; i += 4) { |
692 const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]); | 689 const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]); |
693 const float32x4_t vec_se = vld1q_f32(&aec->se[i]); | 690 const float32x4_t vec_se = vld1q_f32(&aec->se[i]); |
694 const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]); | 691 const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]); |
695 const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se); | 692 const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se); |
696 const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx); | 693 const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx); |
697 float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); | 694 float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); |
698 float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); | 695 float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); |
(...skipping 22 matching lines...) |
721 void WebRtcAec_InitAec_neon(void) { | 718 void WebRtcAec_InitAec_neon(void) { |
722 WebRtcAec_FilterFar = FilterFarNEON; | 719 WebRtcAec_FilterFar = FilterFarNEON; |
723 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON; | 720 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON; |
724 WebRtcAec_FilterAdaptation = FilterAdaptationNEON; | 721 WebRtcAec_FilterAdaptation = FilterAdaptationNEON; |
725 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON; | 722 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON; |
726 WebRtcAec_SubbandCoherence = SubbandCoherenceNEON; | 723 WebRtcAec_SubbandCoherence = SubbandCoherenceNEON; |
727 WebRtcAec_StoreAsComplex = StoreAsComplexNEON; | 724 WebRtcAec_StoreAsComplex = StoreAsComplexNEON; |
728 WebRtcAec_PartitionDelay = PartitionDelayNEON; | 725 WebRtcAec_PartitionDelay = PartitionDelayNEON; |
729 WebRtcAec_WindowData = WindowDataNEON; | 726 WebRtcAec_WindowData = WindowDataNEON; |
730 } | 727 } |
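For context, WebRtcAec_InitAec_neon only rebinds module-level function pointers to the NEON variants defined in this file; a hedged sketch of how a caller might select it (the WEBRTC_HAS_NEON guard and the call site are assumptions about the surrounding core init, not part of this file):

#if defined(WEBRTC_HAS_NEON)
  WebRtcAec_InitAec_neon();  /* swap in the NEON variants defined above */
#endif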