| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 /* | 11 /* |
| 12 * The core AEC algorithm, neon version of speed-critical functions. | 12 * The core AEC algorithm, neon version of speed-critical functions. |
| 13 * | 13 * |
| 14 * Based on aec_core_sse2.c. | 14 * Based on aec_core_sse2.c. |
| 15 */ | 15 */ |
| 16 | 16 |
| 17 #include <arm_neon.h> | 17 #include <arm_neon.h> |
| 18 #include <math.h> | 18 #include <math.h> |
| 19 #include <string.h> // memset | 19 #include <string.h> // memset |
| 20 | 20 |
| 21 extern "C" { | 21 extern "C" { |
| 22 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" | 22 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" |
| 23 } | 23 } |
| 24 #include "webrtc/modules/audio_processing/aec/aec_common.h" | 24 #include "webrtc/modules/audio_processing/aec/aec_common.h" |
| 25 #include "webrtc/modules/audio_processing/aec/aec_core_optimized_methods.h" | 25 #include "webrtc/modules/audio_processing/aec/aec_core_optimized_methods.h" |
| 26 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 26 #include "webrtc/modules/audio_processing/utility/ooura_fft.h" |
| 27 | 27 |
| 28 namespace webrtc { | 28 namespace webrtc { |
| 29 | 29 |
| 30 enum { kShiftExponentIntoTopMantissa = 8 }; | 30 enum { kShiftExponentIntoTopMantissa = 8 }; |
| 31 enum { kFloatExponentShift = 23 }; | 31 enum { kFloatExponentShift = 23 }; |
| 32 | 32 |
| 33 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { | 33 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { |
| 34 return aRe * bRe - aIm * bIm; | 34 return aRe * bRe - aIm * bIm; |
| 35 } | 35 } |
| 36 | 36 |
| (...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 177 ef[1][i] *= abs_ef; | 177 ef[1][i] *= abs_ef; |
| 178 } | 178 } |
| 179 | 179 |
| 180 // Stepsize factor | 180 // Stepsize factor |
| 181 ef[0][i] *= mu; | 181 ef[0][i] *= mu; |
| 182 ef[1][i] *= mu; | 182 ef[1][i] *= mu; |
| 183 } | 183 } |
| 184 } | 184 } |
| 185 | 185 |
| 186 static void FilterAdaptationNEON( | 186 static void FilterAdaptationNEON( |
| 187 const OouraFft& ooura_fft, |
| 187 int num_partitions, | 188 int num_partitions, |
| 188 int x_fft_buf_block_pos, | 189 int x_fft_buf_block_pos, |
| 189 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], | 190 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], |
| 190 float e_fft[2][PART_LEN1], | 191 float e_fft[2][PART_LEN1], |
| 191 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { | 192 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { |
| 192 float fft[PART_LEN2]; | 193 float fft[PART_LEN2]; |
| 193 int i; | 194 int i; |
| 194 for (i = 0; i < num_partitions; i++) { | 195 for (i = 0; i < num_partitions; i++) { |
| 195 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; | 196 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; |
| 196 int pos = i * PART_LEN1; | 197 int pos = i * PART_LEN1; |
| (...skipping 21 matching lines...) Expand all Loading... |
| 218 const float32x4x2_t g_n_h = vzipq_f32(e, f); | 219 const float32x4x2_t g_n_h = vzipq_f32(e, f); |
| 219 // Store | 220 // Store |
| 220 vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]); | 221 vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]); |
| 221 vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]); | 222 vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]); |
| 222 } | 223 } |
| 223 // ... and fixup the first imaginary entry. | 224 // ... and fixup the first imaginary entry. |
| 224 fft[1] = | 225 fft[1] = |
| 225 MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN], | 226 MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN], |
| 226 e_fft[0][PART_LEN], e_fft[1][PART_LEN]); | 227 e_fft[0][PART_LEN], e_fft[1][PART_LEN]); |
| 227 | 228 |
| 228 aec_rdft_inverse_128(fft); | 229 ooura_fft.InverseFft(fft); |
| 229 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); | 230 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); |
| 230 | 231 |
| 231 // fft scaling | 232 // fft scaling |
| 232 { | 233 { |
| 233 const float scale = 2.0f / PART_LEN2; | 234 const float scale = 2.0f / PART_LEN2; |
| 234 const float32x4_t scale_ps = vmovq_n_f32(scale); | 235 const float32x4_t scale_ps = vmovq_n_f32(scale); |
| 235 for (j = 0; j < PART_LEN; j += 4) { | 236 for (j = 0; j < PART_LEN; j += 4) { |
| 236 const float32x4_t fft_ps = vld1q_f32(&fft[j]); | 237 const float32x4_t fft_ps = vld1q_f32(&fft[j]); |
| 237 const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps); | 238 const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps); |
| 238 vst1q_f32(&fft[j], fft_scale); | 239 vst1q_f32(&fft[j], fft_scale); |
| 239 } | 240 } |
| 240 } | 241 } |
| 241 aec_rdft_forward_128(fft); | 242 ooura_fft.Fft(fft); |
| 242 | 243 |
| 243 { | 244 { |
| 244 const float wt1 = h_fft_buf[1][pos]; | 245 const float wt1 = h_fft_buf[1][pos]; |
| 245 h_fft_buf[0][pos + PART_LEN] += fft[1]; | 246 h_fft_buf[0][pos + PART_LEN] += fft[1]; |
| 246 for (j = 0; j < PART_LEN; j += 4) { | 247 for (j = 0; j < PART_LEN; j += 4) { |
| 247 float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]); | 248 float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]); |
| 248 float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]); | 249 float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]); |
| 249 const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]); | 250 const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]); |
| 250 const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]); | 251 const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]); |
| 251 const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4); | 252 const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4); |
| (...skipping 475 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 727 WebRtcAec_FilterAdaptation = FilterAdaptationNEON; | 728 WebRtcAec_FilterAdaptation = FilterAdaptationNEON; |
| 728 WebRtcAec_Overdrive = OverdriveNEON; | 729 WebRtcAec_Overdrive = OverdriveNEON; |
| 729 WebRtcAec_Suppress = SuppressNEON; | 730 WebRtcAec_Suppress = SuppressNEON; |
| 730 WebRtcAec_ComputeCoherence = ComputeCoherenceNEON; | 731 WebRtcAec_ComputeCoherence = ComputeCoherenceNEON; |
| 731 WebRtcAec_UpdateCoherenceSpectra = UpdateCoherenceSpectraNEON; | 732 WebRtcAec_UpdateCoherenceSpectra = UpdateCoherenceSpectraNEON; |
| 732 WebRtcAec_StoreAsComplex = StoreAsComplexNEON; | 733 WebRtcAec_StoreAsComplex = StoreAsComplexNEON; |
| 733 WebRtcAec_PartitionDelay = PartitionDelayNEON; | 734 WebRtcAec_PartitionDelay = PartitionDelayNEON; |
| 734 WebRtcAec_WindowData = WindowDataNEON; | 735 WebRtcAec_WindowData = WindowDataNEON; |
| 735 } | 736 } |
| 736 } // namespace webrtc | 737 } // namespace webrtc |
| OLD | NEW |