Chromium Code Reviews

Side by Side Diff: webrtc/modules/audio_processing/aec/aec_core_neon.c

Issue 1639283002: Clang format changes (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master
Patch Set: Created 4 years, 10 months ago
/*
 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

(...skipping 16 matching lines...)
enum { kFloatExponentShift = 23 };

__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bRe - aIm * bIm;
}

__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bIm + aIm * bRe;
}

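// Reviewer note: MulRe and MulIm are the real and imaginary parts of the
// complex product
//   (aRe + i*aIm) * (bRe + i*bIm) =
//       (aRe * bRe - aIm * bIm) + i * (aRe * bIm + aIm * bRe),
// the identity that every vectorized loop below re-implements four lanes at
// a time.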
static void FilterFarNEON(int num_partitions,
                          int x_fft_buf_block_pos,
                          float x_fft_buf[2]
                                         [kExtendedNumPartitions * PART_LEN1],
                          float h_fft_buf[2]
                                         [kExtendedNumPartitions * PART_LEN1],
                          float y_fft[2][PART_LEN1]) {
  int i;
  for (i = 0; i < num_partitions; i++) {
    int j;
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
      const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
      const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]);
      const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]);
      const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re);
      const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im);
      const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re);
      const float32x4_t g = vaddq_f32(y_fft_re, e);
      const float32x4_t h = vaddq_f32(y_fft_im, f);
      vst1q_f32(&y_fft[0][j], g);
      vst1q_f32(&y_fft[1][j], h);
    }
    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
      y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
    }
  }
}
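
// Illustrative sketch (not part of this CL; the helper name is hypothetical):
// what one lane of the vector loop above computes. vmlsq_f32(a, b, c) is
// a - b * c and vmlaq_f32(a, b, c) is a + b * c, so e/f are exactly the
// MulRe/MulIm pair applied to the loaded lanes.
static void FilterFarOneBin(float x_re, float x_im,
                            float h_re, float h_im,
                            float* y_re, float* y_im) {
  *y_re += MulRe(x_re, x_im, h_re, h_im);  // x_re * h_re - x_im * h_im
  *y_im += MulIm(x_re, x_im, h_re, h_im);  // x_re * h_im + x_im * h_re
}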

// ARM64's arm_neon.h has already defined vdivq_f32 vsqrtq_f32.
#if !defined(WEBRTC_ARCH_ARM64)
static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) {
  int i;
  float32x4_t x = vrecpeq_f32(b);
  // from arm documentation
  // The Newton-Raphson iteration:
  // x[n+1] = x[n] * (2 - d * x[n])
  // converges to (1/d) if x0 is the result of VRECPE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrecpsq_f32(b, x), x);
  }
  // a/b = a*(1/b)
  return vmulq_f32(a, x);
}
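
// Scalar model of the refinement above (a sketch, not part of this CL),
// assuming a seed estimate x0 of 1/d such as vrecpeq_f32 supplies (roughly
// 8 good bits); each Newton-Raphson step about doubles the correct bits.
// vrecpsq_f32(d, x) computes (2 - d * x).
static float RefineReciprocal(float d, float x0) {
  float x = x0;
  int i;
  for (i = 0; i < 2; i++) {
    x = x * (2.0f - d * x);
  }
  return x;
}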

static float32x4_t vsqrtq_f32(float32x4_t s) {
  int i;
  float32x4_t x = vrsqrteq_f32(s);

  // Code to handle sqrt(0).
  // If the input to sqrtf() is zero, a zero will be returned.
  // If the input to vrsqrteq_f32() is zero, positive infinity is returned.
  const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
  // check for divide by zero
  const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
  // zero out the positive infinity results
  x = vreinterpretq_f32_u32(
      vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(x)));
  // from arm documentation
  // The Newton-Raphson iteration:
  // x[n+1] = x[n] * (3 - d * (x[n] * x[n]))/2
  // converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
  }
  // sqrt(s) = s * 1/sqrt(s)
  return vmulq_f32(s, x);
}
#endif  // WEBRTC_ARCH_ARM64
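
// Scalar model of vsqrtq_f32 for one lane (illustrative sketch, not part of
// this CL), given a seed reciprocal-sqrt estimate as vrsqrteq_f32 supplies.
// It shows why the zero fixup matters: for s == 0 the estimate is +inf, and
// the final multiply would yield 0 * inf == NaN instead of 0.
// vrsqrtsq_f32(x * x, s) computes (3 - s * x * x) / 2.
static float SqrtViaRsqrt(float s, float rsqrt_estimate) {
  float x = (s == 0.0f) ? 0.0f : rsqrt_estimate;
  int i;
  for (i = 0; i < 2; i++) {
    x = x * (3.0f - s * x * x) * 0.5f;
  }
  return s * x;  // sqrt(s) = s * (1 / sqrt(s))
}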

static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled
                                    ? kExtendedErrorThreshold
                                    : normal_error_threshold;
  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
  const float32x4_t kMu = vmovq_n_f32(mu);
  const float32x4_t kThresh = vmovq_n_f32(error_threshold);
  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]);
    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
    const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f);
    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
    const float32x4_t absEf = vsqrtq_f32(ef_sum2);
    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
    uint32x4_t ef_re_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re));
    uint32x4_t ef_im_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im));
    ef_re_if = vandq_u32(bigger, ef_re_if);
    ef_im_if = vandq_u32(bigger, ef_im_if);
    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
    ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
    vst1q_f32(&ef[0][i], ef_re);
    vst1q_f32(&ef[1][i], ef_im);
  }
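  // Annotation (not part of this CL): the and/not/or sequence above is
  // NEON's branchless select. 'bigger' is an all-ones lane mask where |ef|
  // exceeds the threshold, so each result lane resolves to
  //   (bigger & clipped) | (~bigger & unclipped),
  // i.e. the clipped value exactly where the threshold was exceeded.
  // vbslq_f32(bigger, clipped, unclipped) would express the same selection
  // in a single intrinsic.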
  // scalar code for the remaining items.
(...skipping 46 matching lines...)
      const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im);
      const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re);
      // Interleave real and imaginary parts.
      const float32x4x2_t g_n_h = vzipq_f32(e, f);
      // Store
      vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
      vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
    }
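    // Note on the interleave above (annotation, not part of this CL):
    // vzipq_f32(e, f) zips lanes pairwise, so val[0] = {e0, f0, e1, f1} and
    // val[1] = {e2, f2, e3, f3}; storing them back to back recreates the
    // re/im-interleaved spectrum layout that aec_rdft_inverse_128() expects.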
    // ... and fixup the first imaginary entry.
    fft[1] =
        MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN],
              e_fft[0][PART_LEN], e_fft[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
    {
      const float scale = 2.0f / PART_LEN2;
      const float32x4_t scale_ps = vmovq_n_f32(scale);
      for (j = 0; j < PART_LEN; j += 4) {
        const float32x4_t fft_ps = vld1q_f32(&fft[j]);
(...skipping 44 matching lines...)

    // Compute n.
    // This is done by masking the exponent, shifting it into the top bit of
    // the mantissa, putting eight into the biased exponent (to shift/
    // compensate the fact that the exponent has been shifted in the top/
    // fractional part) and finally getting rid of the implicit leading one
    // from the mantissa by subtracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
    const uint32x4_t two_n =
        vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));
    // Compute y.
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa =
        vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask);
    const float32x4_t y = vreinterpretq_f32_u32(
        vorrq_u32(mantissa, vec_zero_biased_exponent_is_one));
    // Approximate log2(y) ~= (y - 1) * pol5(y).
    // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
(...skipping 68 matching lines...)
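    // Scalar sketch of the decomposition above (annotation, not part of this
    // CL), assuming a > 0: split a = 2^n * y with y in [1, 2), so that
    // log2(a) = n + log2(y) and only log2(y) needs the polynomial. The NEON
    // code keeps n in float registers via the biased-exponent trick instead
    // of an int-to-float conversion; the straightforward scalar path is:
    //   uint32_t bits, y_bits;
    //   memcpy(&bits, &a, sizeof(bits));
    //   n = (float)((int)((bits & 0x7F800000u) >> 23) - 127);  // exponent
    //   y_bits = (bits & 0x007FFFFFu) | 0x3F800000u;  // mantissa, exp = 0
    //   memcpy(&y, &y_bits, sizeof(y));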
  const float32x4_t vec_one = vdupq_n_f32(1.0f);
  const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
  const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);

  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands
    float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
    const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
    const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
    const float32x4_t vec_weightCurve_hNlFb =
        vmulq_f32(vec_weightCurve, vec_hNlFb);
    const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
    const float32x4_t vec_one_weightCurve_hNl =
        vmulq_f32(vec_one_weightCurve, vec_hNl);
    const uint32x4_t vec_if0 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(vec_hNl));
    const float32x4_t vec_one_weightCurve_add =
        vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
    const uint32x4_t vec_if1 =
        vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));

    vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));

    {
      const float32x4_t vec_overDriveCurve =
          vld1q_f32(&WebRtcAec_overDriveCurve[i]);
(...skipping 91 matching lines...)
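    // Scalar reading of the subband weighting above (a sketch, not part of
    // this CL): for each bin,
    //   if (hNl[i] > hNlFb) {
    //     hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
    //              (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    //   }
    // i.e. gains above the feedback level hNlFb are blended toward it.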
// - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, the filter divergence state is
// determined, upon which actions are taken.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh =
      aec->extended_filter_enabled
          ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
          : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
(...skipping 42 matching lines...)
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }
  {
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total =
        vpadd_f32(vget_low_f32(vec_sdSum), vget_high_f32(vec_sdSum));
    vec_seSum_total =
        vpadd_f32(vget_low_f32(vec_seSum), vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }
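  // Annotation (not part of this CL): the vpadd_f32 steps above are a
  // horizontal reduction; pairwise adds fold {A, B, C, D} into A+B+C+D.
  // On ARM64 the same reduction is a single intrinsic (not used here since
  // this file also targets 32-bit ARM): sdSum = vaddvq_f32(vec_sdSum);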

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] = ptrGCoh[0] * aec->sx[i] +
                 ptrGCoh[1] * WEBRTC_SPL_MAX(
                                  xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
                                  WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] =
        ptrGCoh[0] * aec->sde[i][0] +
        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] =
        ptrGCoh[0] * aec->sde[i][1] +
        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] =
        ptrGCoh[0] * aec->sxd[i][0] +
(...skipping 24 matching lines...)
    // A B C D
    float32x4_t vec_sqrtHanning_rev =
        vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
    // B A D C
    vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
    // D C B A
    vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
                                       vget_low_f32(vec_sqrtHanning_rev));
    vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
    vst1q_f32(&x_windowed[PART_LEN + i],
              vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
  }
}
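
// Reviewer note: ARMv7 NEON has no single full-vector reverse, so the
// window taps are reversed with the standard two-step idiom annotated above:
// vrev64q_f32 swaps within each 64-bit half (A B C D -> B A D C) and
// vcombine_f32 of the high/low halves then swaps the halves (-> D C B A),
// yielding the time-reversed sqrtHanning taps for the second half-window.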

// Puts fft output data into a complex valued array.
static void StoreAsComplexNEON(const float* data,
                               float data_complex[2][PART_LEN1]) {
  int i;
  for (i = 0; i < PART_LEN; i += 4) {
    const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]);
    vst1q_f32(&data_complex[0][i], vec_data.val[0]);
(...skipping 12 matching lines...)
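// Reviewer note: vld2q_f32 deinterleaves as it loads. From memory laid out
// {re0, im0, re1, im1, ...} it fills val[0] = {re0, re1, re2, re3} and
// val[1] = {im0, im1, im2, im3}, splitting the rdft's interleaved output
// into separate real/imaginary planes in one load per eight floats.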
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  int i;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  {
    const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f);

    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
      const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
      const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
      const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
      const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
(...skipping 22 matching lines...)
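      // Annotation (not part of this CL): with vec_sdse = sd * se + 1e-10
      // and vec_sdsx = sd * sx + 1e-10 as denominators, the elided remainder
      // of this loop presumably forms the usual magnitude-squared coherence
      // per bin:
      //   cohde[i] = (sde_re^2 + sde_im^2) / (sd[i] * se[i] + 1e-10)
      //   cohxd[i] = (sxd_re^2 + sxd_im^2) / (sd[i] * sx[i] + 1e-10)
      // where the 1e-10 term guards against division by zero.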
void WebRtcAec_InitAec_neon(void) {
  WebRtcAec_FilterFar = FilterFarNEON;
  WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON;
  WebRtcAec_FilterAdaptation = FilterAdaptationNEON;
  WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
  WebRtcAec_SubbandCoherence = SubbandCoherenceNEON;
  WebRtcAec_StoreAsComplex = StoreAsComplexNEON;
  WebRtcAec_PartitionDelay = PartitionDelayNEON;
  WebRtcAec_WindowData = WindowDataNEON;
}
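
// Reviewer note: this init hook is the file's external entry point. The
// generic AEC core declares WebRtcAec_FilterFar and the other symbols above
// as function pointers bound to plain-C implementations; calling
// WebRtcAec_InitAec_neon() repoints them at the NEON variants, so runtime
// architecture detection happens once and all later calls dispatch through
// the same pointers.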
