OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 /* | 11 /* |
12 * The rdft AEC algorithm, neon version of speed-critical functions. | 12 * The rdft AEC algorithm, neon version of speed-critical functions. |
13 * | 13 * |
14 * Based on the sse2 version. | 14 * Based on the sse2 version. |
15 */ | 15 */ |
16 | 16 |
17 | 17 #include "webrtc/modules/audio_processing/utility/ooura_fft.h" |
18 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | |
19 | 18 |
20 #include <arm_neon.h> | 19 #include <arm_neon.h> |
21 | 20 |
22 static const ALIGN16_BEG float ALIGN16_END | 21 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h" |
23 k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f}; | 22 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h" |
24 | 23 |
25 static void cft1st_128_neon(float* a) { | 24 namespace webrtc { |
| 25 |
| 26 #if defined(WEBRTC_HAS_NEON) |
| 27 void cft1st_128_neon(float* a) { |
26 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); | 28 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); |
27 int j, k2; | 29 int j, k2; |
28 | 30 |
29 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { | 31 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { |
30 float32x4_t a00v = vld1q_f32(&a[j + 0]); | 32 float32x4_t a00v = vld1q_f32(&a[j + 0]); |
31 float32x4_t a04v = vld1q_f32(&a[j + 4]); | 33 float32x4_t a04v = vld1q_f32(&a[j + 4]); |
32 float32x4_t a08v = vld1q_f32(&a[j + 8]); | 34 float32x4_t a08v = vld1q_f32(&a[j + 8]); |
33 float32x4_t a12v = vld1q_f32(&a[j + 12]); | 35 float32x4_t a12v = vld1q_f32(&a[j + 12]); |
34 float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v)); | 36 float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v)); |
35 float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v)); | 37 float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v)); |
(...skipping 28 matching lines...) Expand all Loading... |
64 a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v)); | 66 a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v)); |
65 a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v)); | 67 a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v)); |
66 a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v)); | 68 a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v)); |
67 vst1q_f32(&a[j + 0], a00v); | 69 vst1q_f32(&a[j + 0], a00v); |
68 vst1q_f32(&a[j + 4], a04v); | 70 vst1q_f32(&a[j + 4], a04v); |
69 vst1q_f32(&a[j + 8], a08v); | 71 vst1q_f32(&a[j + 8], a08v); |
70 vst1q_f32(&a[j + 12], a12v); | 72 vst1q_f32(&a[j + 12], a12v); |
71 } | 73 } |
72 } | 74 } |
73 | 75 |
74 static void cftmdl_128_neon(float* a) { | 76 void cftmdl_128_neon(float* a) { |
75 int j; | 77 int j; |
76 const int l = 8; | 78 const int l = 8; |
77 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); | 79 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); |
78 float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r); | 80 float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r); |
79 | 81 |
80 for (j = 0; j < l; j += 2) { | 82 for (j = 0; j < l; j += 2) { |
81 const float32x2_t a_00 = vld1_f32(&a[j + 0]); | 83 const float32x2_t a_00 = vld1_f32(&a[j + 0]); |
82 const float32x2_t a_08 = vld1_f32(&a[j + 8]); | 84 const float32x2_t a_08 = vld1_f32(&a[j + 8]); |
83 const float32x2_t a_32 = vld1_f32(&a[j + 32]); | 85 const float32x2_t a_32 = vld1_f32(&a[j + 32]); |
84 const float32x2_t a_40 = vld1_f32(&a[j + 40]); | 86 const float32x2_t a_40 = vld1_f32(&a[j + 40]); |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
178 } | 180 } |
179 } | 181 } |
180 | 182 |
181 __inline static float32x4_t reverse_order_f32x4(float32x4_t in) { | 183 __inline static float32x4_t reverse_order_f32x4(float32x4_t in) { |
182 // A B C D -> C D A B | 184 // A B C D -> C D A B |
183 const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in)); | 185 const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in)); |
184 // C D A B -> D C B A | 186 // C D A B -> D C B A |
185 return vrev64q_f32(rev); | 187 return vrev64q_f32(rev); |
186 } | 188 } |
187 | 189 |
188 static void rftfsub_128_neon(float* a) { | 190 void rftfsub_128_neon(float* a) { |
189 const float* c = rdft_w + 32; | 191 const float* c = rdft_w + 32; |
190 int j1, j2; | 192 int j1, j2; |
191 const float32x4_t mm_half = vdupq_n_f32(0.5f); | 193 const float32x4_t mm_half = vdupq_n_f32(0.5f); |
192 | 194 |
193 // Vectorized code (four at once). | 195 // Vectorized code (four at once). |
194 // Note: commented number are indexes for the first iteration of the loop. | 196 // Note: commented number are indexes for the first iteration of the loop. |
195 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 197 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { |
196 // Load 'wk'. | 198 // Load 'wk'. |
197 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4, | 199 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4, |
198 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31, | 200 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31, |
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
257 const float xi = a[j2 + 1] + a[k2 + 1]; | 259 const float xi = a[j2 + 1] + a[k2 + 1]; |
258 const float yr = wkr * xr - wki * xi; | 260 const float yr = wkr * xr - wki * xi; |
259 const float yi = wkr * xi + wki * xr; | 261 const float yi = wkr * xi + wki * xr; |
260 a[j2 + 0] -= yr; | 262 a[j2 + 0] -= yr; |
261 a[j2 + 1] -= yi; | 263 a[j2 + 1] -= yi; |
262 a[k2 + 0] += yr; | 264 a[k2 + 0] += yr; |
263 a[k2 + 1] -= yi; | 265 a[k2 + 1] -= yi; |
264 } | 266 } |
265 } | 267 } |
266 | 268 |
267 static void rftbsub_128_neon(float* a) { | 269 void rftbsub_128_neon(float* a) { |
268 const float* c = rdft_w + 32; | 270 const float* c = rdft_w + 32; |
269 int j1, j2; | 271 int j1, j2; |
270 const float32x4_t mm_half = vdupq_n_f32(0.5f); | 272 const float32x4_t mm_half = vdupq_n_f32(0.5f); |
271 | 273 |
272 a[1] = -a[1]; | 274 a[1] = -a[1]; |
273 // Vectorized code (four at once). | 275 // Vectorized code (four at once). |
274 // Note: commented number are indexes for the first iteration of the loop. | 276 // Note: commented number are indexes for the first iteration of the loop. |
275 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 277 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { |
276 // Load 'wk'. | 278 // Load 'wk'. |
277 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4, | 279 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4, |
278 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31, | 280 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31, |
279 const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31, | 281 const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31, |
280 const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28, | 282 const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28, |
281 const float32x4_t wki_ = c_j1; // 1, 2, 3, 4, | 283 const float32x4_t wki_ = c_j1; // 1, 2, 3, 4, |
282 // Load and shuffle 'a'. | 284 // Load and shuffle 'a'. |
283 // 2, 4, 6, 8, 3, 5, 7, 9 | 285 // 2, 4, 6, 8, 3, 5, 7, 9 |
284 float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]); | 286 float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]); |
285 // 120, 122, 124, 126, 121, 123, 125, 127, | 287 // 120, 122, 124, 126, 121, 123, 125, 127, |
286 const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]); | 288 const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]); |
287 // 126, 124, 122, 120 | 289 // 126, 124, 122, 120 |
288 const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]); | 290 const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]); |
289 // 127, 125, 123, 121 | 291 // 127, 125, 123, 121 |
290 const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]); | 292 const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]); |
291 // Calculate 'x'. | 293 // Calculate 'x'. |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
338 const float xi = a[j2 + 1] + a[k2 + 1]; | 340 const float xi = a[j2 + 1] + a[k2 + 1]; |
339 const float yr = wkr * xr + wki * xi; | 341 const float yr = wkr * xr + wki * xi; |
340 const float yi = wkr * xi - wki * xr; | 342 const float yi = wkr * xi - wki * xr; |
341 a[j2 + 0] = a[j2 + 0] - yr; | 343 a[j2 + 0] = a[j2 + 0] - yr; |
342 a[j2 + 1] = yi - a[j2 + 1]; | 344 a[j2 + 1] = yi - a[j2 + 1]; |
343 a[k2 + 0] = yr + a[k2 + 0]; | 345 a[k2 + 0] = yr + a[k2 + 0]; |
344 a[k2 + 1] = yi - a[k2 + 1]; | 346 a[k2 + 1] = yi - a[k2 + 1]; |
345 } | 347 } |
346 a[65] = -a[65]; | 348 a[65] = -a[65]; |
347 } | 349 } |
| 350 #endif |
348 | 351 |
349 void aec_rdft_init_neon(void) { | 352 } // namespace webrtc |
350 cft1st_128 = cft1st_128_neon; | |
351 cftmdl_128 = cftmdl_128_neon; | |
352 rftfsub_128 = rftfsub_128_neon; | |
353 rftbsub_128 = rftbsub_128_neon; | |
354 } | |
355 | |
OLD | NEW |