Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(384)

Side by Side Diff: webrtc/modules/audio_processing/utility/ooura_fft_neon.cc

Issue 2348213002: Move the aec_rdft* files to a more proper place beneath APM and make them thread-safe. (Closed)
Patch Set: Rebase Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 /* 11 /*
12 * The rdft AEC algorithm, neon version of speed-critical functions. 12 * The rdft AEC algorithm, neon version of speed-critical functions.
13 * 13 *
14 * Based on the sse2 version. 14 * Based on the sse2 version.
15 */ 15 */
16 16
17 17 #include "webrtc/modules/audio_processing/utility/ooura_fft.h"
18 #include "webrtc/modules/audio_processing/aec/aec_rdft.h"
19 18
20 #include <arm_neon.h> 19 #include <arm_neon.h>
21 20
22 static const ALIGN16_BEG float ALIGN16_END 21 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h"
23 k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f}; 22 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h"
24 23
25 static void cft1st_128_neon(float* a) { 24 namespace webrtc {
25
26 #if defined(WEBRTC_HAS_NEON)
27 void cft1st_128_neon(float* a) {
26 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); 28 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
27 int j, k2; 29 int j, k2;
28 30
29 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { 31 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
30 float32x4_t a00v = vld1q_f32(&a[j + 0]); 32 float32x4_t a00v = vld1q_f32(&a[j + 0]);
31 float32x4_t a04v = vld1q_f32(&a[j + 4]); 33 float32x4_t a04v = vld1q_f32(&a[j + 4]);
32 float32x4_t a08v = vld1q_f32(&a[j + 8]); 34 float32x4_t a08v = vld1q_f32(&a[j + 8]);
33 float32x4_t a12v = vld1q_f32(&a[j + 12]); 35 float32x4_t a12v = vld1q_f32(&a[j + 12]);
34 float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v)); 36 float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
35 float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v)); 37 float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
(...skipping 28 matching lines...) Expand all
64 a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v)); 66 a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
65 a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v)); 67 a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
66 a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v)); 68 a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
67 vst1q_f32(&a[j + 0], a00v); 69 vst1q_f32(&a[j + 0], a00v);
68 vst1q_f32(&a[j + 4], a04v); 70 vst1q_f32(&a[j + 4], a04v);
69 vst1q_f32(&a[j + 8], a08v); 71 vst1q_f32(&a[j + 8], a08v);
70 vst1q_f32(&a[j + 12], a12v); 72 vst1q_f32(&a[j + 12], a12v);
71 } 73 }
72 } 74 }
73 75
74 static void cftmdl_128_neon(float* a) { 76 void cftmdl_128_neon(float* a) {
75 int j; 77 int j;
76 const int l = 8; 78 const int l = 8;
77 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); 79 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
78 float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r); 80 float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
79 81
80 for (j = 0; j < l; j += 2) { 82 for (j = 0; j < l; j += 2) {
81 const float32x2_t a_00 = vld1_f32(&a[j + 0]); 83 const float32x2_t a_00 = vld1_f32(&a[j + 0]);
82 const float32x2_t a_08 = vld1_f32(&a[j + 8]); 84 const float32x2_t a_08 = vld1_f32(&a[j + 8]);
83 const float32x2_t a_32 = vld1_f32(&a[j + 32]); 85 const float32x2_t a_32 = vld1_f32(&a[j + 32]);
84 const float32x2_t a_40 = vld1_f32(&a[j + 40]); 86 const float32x2_t a_40 = vld1_f32(&a[j + 40]);
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after
178 } 180 }
179 } 181 }
180 182
181 __inline static float32x4_t reverse_order_f32x4(float32x4_t in) { 183 __inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
182 // A B C D -> C D A B 184 // A B C D -> C D A B
183 const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in)); 185 const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));
184 // C D A B -> D C B A 186 // C D A B -> D C B A
185 return vrev64q_f32(rev); 187 return vrev64q_f32(rev);
186 } 188 }
187 189
188 static void rftfsub_128_neon(float* a) { 190 void rftfsub_128_neon(float* a) {
189 const float* c = rdft_w + 32; 191 const float* c = rdft_w + 32;
190 int j1, j2; 192 int j1, j2;
191 const float32x4_t mm_half = vdupq_n_f32(0.5f); 193 const float32x4_t mm_half = vdupq_n_f32(0.5f);
192 194
193 // Vectorized code (four at once). 195 // Vectorized code (four at once).
194 // Note: commented number are indexes for the first iteration of the loop. 196 // Note: commented number are indexes for the first iteration of the loop.
195 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { 197 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
196 // Load 'wk'. 198 // Load 'wk'.
197 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4, 199 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
198 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31, 200 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after
257 const float xi = a[j2 + 1] + a[k2 + 1]; 259 const float xi = a[j2 + 1] + a[k2 + 1];
258 const float yr = wkr * xr - wki * xi; 260 const float yr = wkr * xr - wki * xi;
259 const float yi = wkr * xi + wki * xr; 261 const float yi = wkr * xi + wki * xr;
260 a[j2 + 0] -= yr; 262 a[j2 + 0] -= yr;
261 a[j2 + 1] -= yi; 263 a[j2 + 1] -= yi;
262 a[k2 + 0] += yr; 264 a[k2 + 0] += yr;
263 a[k2 + 1] -= yi; 265 a[k2 + 1] -= yi;
264 } 266 }
265 } 267 }
266 268
267 static void rftbsub_128_neon(float* a) { 269 void rftbsub_128_neon(float* a) {
268 const float* c = rdft_w + 32; 270 const float* c = rdft_w + 32;
269 int j1, j2; 271 int j1, j2;
270 const float32x4_t mm_half = vdupq_n_f32(0.5f); 272 const float32x4_t mm_half = vdupq_n_f32(0.5f);
271 273
272 a[1] = -a[1]; 274 a[1] = -a[1];
273 // Vectorized code (four at once). 275 // Vectorized code (four at once).
274 // Note: commented number are indexes for the first iteration of the loop. 276 // Note: commented number are indexes for the first iteration of the loop.
275 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { 277 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
276 // Load 'wk'. 278 // Load 'wk'.
277 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4, 279 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
278 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31, 280 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
279 const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31, 281 const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,
280 const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28, 282 const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
281 const float32x4_t wki_ = c_j1; // 1, 2, 3, 4, 283 const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,
282 // Load and shuffle 'a'. 284 // Load and shuffle 'a'.
283 // 2, 4, 6, 8, 3, 5, 7, 9 285 // 2, 4, 6, 8, 3, 5, 7, 9
284 float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]); 286 float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
285 // 120, 122, 124, 126, 121, 123, 125, 127, 287 // 120, 122, 124, 126, 121, 123, 125, 127,
286 const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]); 288 const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
287 // 126, 124, 122, 120 289 // 126, 124, 122, 120
288 const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]); 290 const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
289 // 127, 125, 123, 121 291 // 127, 125, 123, 121
290 const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]); 292 const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
291 // Calculate 'x'. 293 // Calculate 'x'.
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
338 const float xi = a[j2 + 1] + a[k2 + 1]; 340 const float xi = a[j2 + 1] + a[k2 + 1];
339 const float yr = wkr * xr + wki * xi; 341 const float yr = wkr * xr + wki * xi;
340 const float yi = wkr * xi - wki * xr; 342 const float yi = wkr * xi - wki * xr;
341 a[j2 + 0] = a[j2 + 0] - yr; 343 a[j2 + 0] = a[j2 + 0] - yr;
342 a[j2 + 1] = yi - a[j2 + 1]; 344 a[j2 + 1] = yi - a[j2 + 1];
343 a[k2 + 0] = yr + a[k2 + 0]; 345 a[k2 + 0] = yr + a[k2 + 0];
344 a[k2 + 1] = yi - a[k2 + 1]; 346 a[k2 + 1] = yi - a[k2 + 1];
345 } 347 }
346 a[65] = -a[65]; 348 a[65] = -a[65];
347 } 349 }
350 #endif
348 351
349 void aec_rdft_init_neon(void) { 352 } // namespace webrtc
350 cft1st_128 = cft1st_128_neon;
351 cftmdl_128 = cftmdl_128_neon;
352 rftfsub_128 = rftfsub_128_neon;
353 rftbsub_128 = rftbsub_128_neon;
354 }
355
OLDNEW
« no previous file with comments | « webrtc/modules/audio_processing/utility/ooura_fft_mips.cc ('k') | webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698