webrtc/modules/audio_processing/aec/aec_core_neon.cc - Issue 2348213002: Move the aec_rdft* files to a more proper place beneath APM and make them thread-safe.

Side by Side Diff: webrtc/modules/audio_processing/aec/aec_core_neon.cc

Issue 2348213002: Move the aec_rdft* files to a more proper place beneath APM and make them thread-safe. (Closed)

Patch Set: Rebase Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 /*	11 /*

12 * The core AEC algorithm, neon version of speed-critical functions.	12 * The core AEC algorithm, neon version of speed-critical functions.

13 *	13 *

14 * Based on aec_core_sse2.c.	14 * Based on aec_core_sse2.c.

15 */	15 */

16	16

17 #include <arm_neon.h>	17 #include <arm_neon.h>

18 #include <math.h>	18 #include <math.h>

19 #include <string.h> // memset	19 #include <string.h> // memset

20	20

21 extern "C" {	21 extern "C" {

22 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"	22 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"

23 }	23 }

24 #include "webrtc/modules/audio_processing/aec/aec_common.h"	24 #include "webrtc/modules/audio_processing/aec/aec_common.h"

25 #include "webrtc/modules/audio_processing/aec/aec_core_optimized_methods.h"	25 #include "webrtc/modules/audio_processing/aec/aec_core_optimized_methods.h"

26 #include "webrtc/modules/audio_processing/aec/aec_rdft.h"	26 #include "webrtc/modules/audio_processing/utility/ooura_fft.h"

27	27

28 namespace webrtc {	28 namespace webrtc {

29	29

30 enum { kShiftExponentIntoTopMantissa = 8 };	30 enum { kShiftExponentIntoTopMantissa = 8 };

31 enum { kFloatExponentShift = 23 };	31 enum { kFloatExponentShift = 23 };

32	32

33 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {	33 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {

34 return aRe * bRe - aIm * bIm;	34 return aRe * bRe - aIm * bIm;

35 }	35 }

36	36

(...skipping 140 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
177 ef[1][i] *= abs_ef;	177 ef[1][i] *= abs_ef;

178 }	178 }

179	179

180 // Stepsize factor	180 // Stepsize factor

181 ef[0][i] *= mu;	181 ef[0][i] *= mu;

182 ef[1][i] *= mu;	182 ef[1][i] *= mu;

183 }	183 }

184 }	184 }

185	185

186 static void FilterAdaptationNEON(	186 static void FilterAdaptationNEON(

	187 const OouraFft& ooura_fft,

187 int num_partitions,	188 int num_partitions,

188 int x_fft_buf_block_pos,	189 int x_fft_buf_block_pos,

189 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],	190 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],

190 float e_fft[2][PART_LEN1],	191 float e_fft[2][PART_LEN1],

191 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {	192 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {

192 float fft[PART_LEN2];	193 float fft[PART_LEN2];

193 int i;	194 int i;

194 for (i = 0; i < num_partitions; i++) {	195 for (i = 0; i < num_partitions; i++) {

195 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;	196 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;

196 int pos = i * PART_LEN1;	197 int pos = i * PART_LEN1;

(...skipping 21 matching lines...) Expand all Loading...
218 const float32x4x2_t g_n_h = vzipq_f32(e, f);	219 const float32x4x2_t g_n_h = vzipq_f32(e, f);

219 // Store	220 // Store

220 vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);	221 vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);

221 vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);	222 vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);

222 }	223 }

223 // ... and fixup the first imaginary entry.	224 // ... and fixup the first imaginary entry.

224 fft[1] =	225 fft[1] =

225 MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN],	226 MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN],

226 e_fft[0][PART_LEN], e_fft[1][PART_LEN]);	227 e_fft[0][PART_LEN], e_fft[1][PART_LEN]);

227	228

228 aec_rdft_inverse_128(fft);	229 ooura_fft.InverseFft(fft);

229 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);	230 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

230	231

231 // fft scaling	232 // fft scaling

232 {	233 {

233 const float scale = 2.0f / PART_LEN2;	234 const float scale = 2.0f / PART_LEN2;

234 const float32x4_t scale_ps = vmovq_n_f32(scale);	235 const float32x4_t scale_ps = vmovq_n_f32(scale);

235 for (j = 0; j < PART_LEN; j += 4) {	236 for (j = 0; j < PART_LEN; j += 4) {

236 const float32x4_t fft_ps = vld1q_f32(&fft[j]);	237 const float32x4_t fft_ps = vld1q_f32(&fft[j]);

237 const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);	238 const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);

238 vst1q_f32(&fft[j], fft_scale);	239 vst1q_f32(&fft[j], fft_scale);

239 }	240 }

240 }	241 }

241 aec_rdft_forward_128(fft);	242 ooura_fft.Fft(fft);

242	243

243 {	244 {

244 const float wt1 = h_fft_buf[1][pos];	245 const float wt1 = h_fft_buf[1][pos];

245 h_fft_buf[0][pos + PART_LEN] += fft[1];	246 h_fft_buf[0][pos + PART_LEN] += fft[1];

246 for (j = 0; j < PART_LEN; j += 4) {	247 for (j = 0; j < PART_LEN; j += 4) {

247 float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]);	248 float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]);

248 float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]);	249 float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]);

249 const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);	250 const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);

250 const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);	251 const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);

251 const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);	252 const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);

(...skipping 475 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
727 WebRtcAec_FilterAdaptation = FilterAdaptationNEON;	728 WebRtcAec_FilterAdaptation = FilterAdaptationNEON;

728 WebRtcAec_Overdrive = OverdriveNEON;	729 WebRtcAec_Overdrive = OverdriveNEON;

729 WebRtcAec_Suppress = SuppressNEON;	730 WebRtcAec_Suppress = SuppressNEON;

730 WebRtcAec_ComputeCoherence = ComputeCoherenceNEON;	731 WebRtcAec_ComputeCoherence = ComputeCoherenceNEON;

731 WebRtcAec_UpdateCoherenceSpectra = UpdateCoherenceSpectraNEON;	732 WebRtcAec_UpdateCoherenceSpectra = UpdateCoherenceSpectraNEON;

732 WebRtcAec_StoreAsComplex = StoreAsComplexNEON;	733 WebRtcAec_StoreAsComplex = StoreAsComplexNEON;

733 WebRtcAec_PartitionDelay = PartitionDelayNEON;	734 WebRtcAec_PartitionDelay = PartitionDelayNEON;

734 WebRtcAec_WindowData = WindowDataNEON;	735 WebRtcAec_WindowData = WindowDataNEON;

735 }	736 }

736 } // namespace webrtc	737 } // namespace webrtc

OLD	NEW