webrtc/modules/audio_processing/utility/ooura_fft_neon.cc - Issue 2348213002: Move the aec_rdft* files to a more proper place beneath APM and make them thread-safe.

Side by Side Diff: webrtc/modules/audio_processing/utility/ooura_fft_neon.cc

Issue 2348213002: Move the aec_rdft* files to a more proper place beneath APM and make them thread-safe. (Closed)

Patch Set: Rebase Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 /*	11 /*

12 * The rdft AEC algorithm, neon version of speed-critical functions.	12 * The rdft AEC algorithm, neon version of speed-critical functions.

13 *	13 *

14 * Based on the sse2 version.	14 * Based on the sse2 version.

15 */	15 */

16	16

17	17 #include "webrtc/modules/audio_processing/utility/ooura_fft.h"

18 #include "webrtc/modules/audio_processing/aec/aec_rdft.h"

19	18

20 #include <arm_neon.h>	19 #include <arm_neon.h>

21	20

22 static const ALIGN16_BEG float ALIGN16_END	21 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h"

23 k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};	22 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h"

24	23

25 static void cft1st_128_neon(float* a) {	24 namespace webrtc {

	25

	26 #if defined(WEBRTC_HAS_NEON)

	27 void cft1st_128_neon(float* a) {

26 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);	28 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);

27 int j, k2;	29 int j, k2;

28	30

29 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {	31 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {

30 float32x4_t a00v = vld1q_f32(&a[j + 0]);	32 float32x4_t a00v = vld1q_f32(&a[j + 0]);

31 float32x4_t a04v = vld1q_f32(&a[j + 4]);	33 float32x4_t a04v = vld1q_f32(&a[j + 4]);

32 float32x4_t a08v = vld1q_f32(&a[j + 8]);	34 float32x4_t a08v = vld1q_f32(&a[j + 8]);

33 float32x4_t a12v = vld1q_f32(&a[j + 12]);	35 float32x4_t a12v = vld1q_f32(&a[j + 12]);

34 float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));	36 float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));

35 float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));	37 float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));

(...skipping 28 matching lines...) Expand all Loading...
64 a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));	66 a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));

65 a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));	67 a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));

66 a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));	68 a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));

67 vst1q_f32(&a[j + 0], a00v);	69 vst1q_f32(&a[j + 0], a00v);

68 vst1q_f32(&a[j + 4], a04v);	70 vst1q_f32(&a[j + 4], a04v);

69 vst1q_f32(&a[j + 8], a08v);	71 vst1q_f32(&a[j + 8], a08v);

70 vst1q_f32(&a[j + 12], a12v);	72 vst1q_f32(&a[j + 12], a12v);

71 }	73 }

72 }	74 }

73	75

74 static void cftmdl_128_neon(float* a) {	76 void cftmdl_128_neon(float* a) {

75 int j;	77 int j;

76 const int l = 8;	78 const int l = 8;

77 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);	79 const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);

78 float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);	80 float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);

79	81

80 for (j = 0; j < l; j += 2) {	82 for (j = 0; j < l; j += 2) {

81 const float32x2_t a_00 = vld1_f32(&a[j + 0]);	83 const float32x2_t a_00 = vld1_f32(&a[j + 0]);

82 const float32x2_t a_08 = vld1_f32(&a[j + 8]);	84 const float32x2_t a_08 = vld1_f32(&a[j + 8]);

83 const float32x2_t a_32 = vld1_f32(&a[j + 32]);	85 const float32x2_t a_32 = vld1_f32(&a[j + 32]);

84 const float32x2_t a_40 = vld1_f32(&a[j + 40]);	86 const float32x2_t a_40 = vld1_f32(&a[j + 40]);

(...skipping 93 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
178 }	180 }

179 }	181 }

180	182

181 __inline static float32x4_t reverse_order_f32x4(float32x4_t in) {	183 __inline static float32x4_t reverse_order_f32x4(float32x4_t in) {

182 // A B C D -> C D A B	184 // A B C D -> C D A B

183 const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));	185 const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));

184 // C D A B -> D C B A	186 // C D A B -> D C B A

185 return vrev64q_f32(rev);	187 return vrev64q_f32(rev);

186 }	188 }

187	189

188 static void rftfsub_128_neon(float* a) {	190 void rftfsub_128_neon(float* a) {

189 const float* c = rdft_w + 32;	191 const float* c = rdft_w + 32;

190 int j1, j2;	192 int j1, j2;

191 const float32x4_t mm_half = vdupq_n_f32(0.5f);	193 const float32x4_t mm_half = vdupq_n_f32(0.5f);

192	194

193 // Vectorized code (four at once).	195 // Vectorized code (four at once).

194 // Note: commented number are indexes for the first iteration of the loop.	196 // Note: commented number are indexes for the first iteration of the loop.

195 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {	197 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {

196 // Load 'wk'.	198 // Load 'wk'.

197 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,	199 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,

198 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,	200 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,

(...skipping 58 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
257 const float xi = a[j2 + 1] + a[k2 + 1];	259 const float xi = a[j2 + 1] + a[k2 + 1];

258 const float yr = wkr * xr - wki * xi;	260 const float yr = wkr * xr - wki * xi;

259 const float yi = wkr * xi + wki * xr;	261 const float yi = wkr * xi + wki * xr;

260 a[j2 + 0] -= yr;	262 a[j2 + 0] -= yr;

261 a[j2 + 1] -= yi;	263 a[j2 + 1] -= yi;

262 a[k2 + 0] += yr;	264 a[k2 + 0] += yr;

263 a[k2 + 1] -= yi;	265 a[k2 + 1] -= yi;

264 }	266 }

265 }	267 }

266	268

267 static void rftbsub_128_neon(float* a) {	269 void rftbsub_128_neon(float* a) {

268 const float* c = rdft_w + 32;	270 const float* c = rdft_w + 32;

269 int j1, j2;	271 int j1, j2;

270 const float32x4_t mm_half = vdupq_n_f32(0.5f);	272 const float32x4_t mm_half = vdupq_n_f32(0.5f);

271	273

272 a[1] = -a[1];	274 a[1] = -a[1];

273 // Vectorized code (four at once).	275 // Vectorized code (four at once).

274 // Note: commented number are indexes for the first iteration of the loop.	276 // Note: commented number are indexes for the first iteration of the loop.

275 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {	277 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {

276 // Load 'wk'.	278 // Load 'wk'.

277 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,	279 const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,

278 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,	280 const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,

279 const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,	281 const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,

280 const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,	282 const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,

281 const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,	283 const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,

282 // Load and shuffle 'a'.	284 // Load and shuffle 'a'.

283 // 2, 4, 6, 8, 3, 5, 7, 9	285 // 2, 4, 6, 8, 3, 5, 7, 9

284 float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);	286 float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);

285 // 120, 122, 124, 126, 121, 123, 125, 127,	287 // 120, 122, 124, 126, 121, 123, 125, 127,

286 const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);	288 const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);

287 // 126, 124, 122, 120	289 // 126, 124, 122, 120

288 const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);	290 const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);

289 // 127, 125, 123, 121	291 // 127, 125, 123, 121

290 const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);	292 const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);

291 // Calculate 'x'.	293 // Calculate 'x'.

(...skipping 46 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
338 const float xi = a[j2 + 1] + a[k2 + 1];	340 const float xi = a[j2 + 1] + a[k2 + 1];

339 const float yr = wkr * xr + wki * xi;	341 const float yr = wkr * xr + wki * xi;

340 const float yi = wkr * xi - wki * xr;	342 const float yi = wkr * xi - wki * xr;

341 a[j2 + 0] = a[j2 + 0] - yr;	343 a[j2 + 0] = a[j2 + 0] - yr;

342 a[j2 + 1] = yi - a[j2 + 1];	344 a[j2 + 1] = yi - a[j2 + 1];

343 a[k2 + 0] = yr + a[k2 + 0];	345 a[k2 + 0] = yr + a[k2 + 0];

344 a[k2 + 1] = yi - a[k2 + 1];	346 a[k2 + 1] = yi - a[k2 + 1];

345 }	347 }

346 a[65] = -a[65];	348 a[65] = -a[65];

347 }	349 }

	350 #endif

348	351

349 void aec_rdft_init_neon(void) {	352 } // namespace webrtc

350 cft1st_128 = cft1st_128_neon;

351 cftmdl_128 = cftmdl_128_neon;

352 rftfsub_128 = rftfsub_128_neon;

353 rftbsub_128 = rftbsub_128_neon;

354 }

355

OLD	NEW