OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" | 11 #include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" |
12 | 12 |
13 #include <math.h> | 13 #include <math.h> |
14 #include <stdio.h> | 14 #include <stdio.h> |
15 | 15 |
| 16 #include "webrtc/base/checks.h" |
16 #include "webrtc/common_audio/fft4g.h" | 17 #include "webrtc/common_audio/fft4g.h" |
17 #include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h" | 18 #include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h" |
18 #include "webrtc/modules/audio_processing/vad/pitch_internal.h" | 19 #include "webrtc/modules/audio_processing/vad/pitch_internal.h" |
19 #include "webrtc/modules/audio_processing/vad/pole_zero_filter.h" | 20 #include "webrtc/modules/audio_processing/vad/pole_zero_filter.h" |
20 extern "C" { | 21 extern "C" { |
21 #include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h" | 22 #include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h" |
22 #include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h" | 23 #include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h" |
23 #include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h" | 24 #include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h" |
24 #include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h" | 25 #include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h" |
25 } | 26 } |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
88 // classification. | 89 // classification. |
89 if (high_pass_filter_->Filter(frame, kNumSubframeSamples, | 90 if (high_pass_filter_->Filter(frame, kNumSubframeSamples, |
90 &audio_buffer_[num_buffer_samples_]) != 0) { | 91 &audio_buffer_[num_buffer_samples_]) != 0) { |
91 return -1; | 92 return -1; |
92 } | 93 } |
93 | 94 |
94 num_buffer_samples_ += kNumSubframeSamples; | 95 num_buffer_samples_ += kNumSubframeSamples; |
95 if (num_buffer_samples_ < kBufferLength) { | 96 if (num_buffer_samples_ < kBufferLength) { |
96 return 0; | 97 return 0; |
97 } | 98 } |
98 assert(num_buffer_samples_ == kBufferLength); | 99 RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength); |
99 features->num_frames = kNum10msSubframes; | 100 features->num_frames = kNum10msSubframes; |
100 features->silence = false; | 101 features->silence = false; |
101 | 102 |
102 Rms(features->rms, kMaxNumFrames); | 103 Rms(features->rms, kMaxNumFrames); |
103 for (size_t i = 0; i < kNum10msSubframes; ++i) { | 104 for (size_t i = 0; i < kNum10msSubframes; ++i) { |
104 if (features->rms[i] < kSilenceRms) { | 105 if (features->rms[i] < kSilenceRms) { |
105 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence. | 106 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence. |
106 // Bail out here instead. | 107 // Bail out here instead. |
107 features->silence = true; | 108 features->silence = true; |
108 ResetBuffer(); | 109 ResetBuffer(); |
109 return 0; | 110 return 0; |
110 } | 111 } |
111 } | 112 } |
112 | 113 |
113 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz, | 114 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz, |
114 kMaxNumFrames); | 115 kMaxNumFrames); |
115 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames); | 116 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames); |
116 ResetBuffer(); | 117 ResetBuffer(); |
117 return 0; | 118 return 0; |
118 } | 119 } |
119 | 120 |
120 // Computes |kLpcOrder + 1| correlation coefficients. | 121 // Computes |kLpcOrder + 1| correlation coefficients. |
121 void VadAudioProc::SubframeCorrelation(double* corr, | 122 void VadAudioProc::SubframeCorrelation(double* corr, |
122 size_t length_corr, | 123 size_t length_corr, |
123 size_t subframe_index) { | 124 size_t subframe_index) { |
124 assert(length_corr >= kLpcOrder + 1); | 125 RTC_DCHECK_GE(length_corr, kLpcOrder + 1); |
125 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples]; | 126 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples]; |
126 size_t buffer_index = subframe_index * kNumSubframeSamples; | 127 size_t buffer_index = subframe_index * kNumSubframeSamples; |
127 | 128 |
128 for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++) | 129 for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++) |
129 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n]; | 130 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n]; |
130 | 131 |
131 WebRtcIsac_AutoCorr(corr, windowed_audio, | 132 WebRtcIsac_AutoCorr(corr, windowed_audio, |
132 kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder); | 133 kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder); |
133 } | 134 } |
134 | 135 |
135 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input. | 136 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input. |
136 // The analysis window is 15 ms long and it is centered on the first half of | 137 // The analysis window is 15 ms long and it is centered on the first half of |
137 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the | 138 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the |
138 // first half of each 10 ms subframe. | 139 // first half of each 10 ms subframe. |
139 void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) { | 140 void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) { |
140 assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1)); | 141 RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1)); |
141 double corr[kLpcOrder + 1]; | 142 double corr[kLpcOrder + 1]; |
142 double reflec_coeff[kLpcOrder]; | 143 double reflec_coeff[kLpcOrder]; |
143 for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes; | 144 for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes; |
144 i++, offset_lpc += kLpcOrder + 1) { | 145 i++, offset_lpc += kLpcOrder + 1) { |
145 SubframeCorrelation(corr, kLpcOrder + 1, i); | 146 SubframeCorrelation(corr, kLpcOrder + 1, i); |
146 corr[0] *= 1.0001; | 147 corr[0] *= 1.0001; |
147 // This makes Lev-Durb a bit more stable. | 148 // This makes Lev-Durb a bit more stable. |
148 for (size_t k = 0; k < kLpcOrder + 1; k++) { | 149 for (size_t k = 0; k < kLpcOrder + 1; k++) { |
149 corr[k] *= kCorrWeight[k]; | 150 corr[k] *= kCorrWeight[k]; |
150 } | 151 } |
151 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder); | 152 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder); |
152 } | 153 } |
153 } | 154 } |
154 | 155 |
155 // Fit a second order curve to these 3 points and find the location of the | 156 // Fit a second order curve to these 3 points and find the location of the |
156 // extremum. The points are inverted before curve fitting. | 157 // extremum. The points are inverted before curve fitting. |
157 static float QuadraticInterpolation(float prev_val, | 158 static float QuadraticInterpolation(float prev_val, |
158 float curr_val, | 159 float curr_val, |
159 float next_val) { | 160 float next_val) { |
160 // Doing the interpolation in |1 / A(z)|^2. | 161 // Doing the interpolation in |1 / A(z)|^2. |
161 float fractional_index = 0; | 162 float fractional_index = 0; |
162 next_val = 1.0f / next_val; | 163 next_val = 1.0f / next_val; |
163 prev_val = 1.0f / prev_val; | 164 prev_val = 1.0f / prev_val; |
164 curr_val = 1.0f / curr_val; | 165 curr_val = 1.0f / curr_val; |
165 | 166 |
166 fractional_index = | 167 fractional_index = |
167 -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val); | 168 -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val); |
168 assert(fabs(fractional_index) < 1); | 169 RTC_DCHECK_LT(fabs(fractional_index), 1); |
169 return fractional_index; | 170 return fractional_index; |
170 } | 171 } |
171 | 172 |
172 // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope | 173 // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope |
173 // of the input signal. The local maximum of the spectral envelope corresponds | 174 // of the input signal. The local maximum of the spectral envelope corresponds |
174 // with the local minimum of A(z). It saves complexity, as we save one | 175 // with the local minimum of A(z). It saves complexity, as we save one |
175 // inversion. Furthermore, we find the first local maximum of magnitude squared, | 176 // inversion. Furthermore, we find the first local maximum of magnitude squared, |
176 // to save on one square root. | 177 // to save on one square root. |
177 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, | 178 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, |
178 size_t length_f_peak) { | 179 size_t length_f_peak) { |
179 assert(length_f_peak >= kNum10msSubframes); | 180 RTC_DCHECK_GE(length_f_peak, kNum10msSubframes); |
180 double lpc[kNum10msSubframes * (kLpcOrder + 1)]; | 181 double lpc[kNum10msSubframes * (kLpcOrder + 1)]; |
181 // For all sub-frames. | 182 // For all sub-frames. |
182 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1)); | 183 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1)); |
183 | 184 |
184 const size_t kNumDftCoefficients = kDftSize / 2 + 1; | 185 const size_t kNumDftCoefficients = kDftSize / 2 + 1; |
185 float data[kDftSize]; | 186 float data[kDftSize]; |
186 | 187 |
187 for (size_t i = 0; i < kNum10msSubframes; i++) { | 188 for (size_t i = 0; i < kNum10msSubframes; i++) { |
188 // Convert to float with zero pad. | 189 // Convert to float with zero pad. |
189 memset(data, 0, sizeof(data)); | 190 memset(data, 0, sizeof(data)); |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
225 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution; | 226 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution; |
226 } | 227 } |
227 } | 228 } |
228 | 229 |
// Using iSAC functions to estimate pitch gains & lags.
void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
                                 double* pitch_lags_hz,
                                 size_t length) {
  // TODO(turajs): This can be "imported" from iSAC & and the next two
  // constants.
  RTC_DCHECK_GE(length, kNum10msSubframes);
  const int kNumPitchSubframes = 4;
  double gains[kNumPitchSubframes];
  double lags[kNumPitchSubframes];

  // Buffer sizes expected by the iSAC split filter; presumably 240 samples
  // covers one sub-band frame plus its 24-sample lookahead — confirm against
  // WebRtcIsac_SplitAndFilterFloat.
  const int kNumSubbandFrameSamples = 240;
  const int kNumLookaheadSamples = 24;

  float lower[kNumSubbandFrameSamples];
  float upper[kNumSubbandFrameSamples];
  double lower_lookahead[kNumSubbandFrameSamples];
  double upper_lookahead[kNumSubbandFrameSamples];
  double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
                                    kNumLookaheadSamples];

  // Split signal to lower and upper bands
  WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
                                 upper, lower_lookahead, upper_lookahead,
                                 pre_filter_handle_.get());
  // Pitch estimation runs on the lower band only; |pitch_analysis_handle_|
  // carries state across calls.
  WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
                           pitch_analysis_handle_.get(), lags, gains);

  // Lags are computed on lower-band signal with sampling rate half of the
  // input signal.
  GetSubframesPitchParameters(
      kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
      &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
}
263 | 264 |
264 void VadAudioProc::Rms(double* rms, size_t length_rms) { | 265 void VadAudioProc::Rms(double* rms, size_t length_rms) { |
265 assert(length_rms >= kNum10msSubframes); | 266 RTC_DCHECK_GE(length_rms, kNum10msSubframes); |
266 size_t offset = kNumPastSignalSamples; | 267 size_t offset = kNumPastSignalSamples; |
267 for (size_t i = 0; i < kNum10msSubframes; i++) { | 268 for (size_t i = 0; i < kNum10msSubframes; i++) { |
268 rms[i] = 0; | 269 rms[i] = 0; |
269 for (size_t n = 0; n < kNumSubframeSamples; n++, offset++) | 270 for (size_t n = 0; n < kNumSubframeSamples; n++, offset++) |
270 rms[i] += audio_buffer_[offset] * audio_buffer_[offset]; | 271 rms[i] += audio_buffer_[offset] * audio_buffer_[offset]; |
271 rms[i] = sqrt(rms[i] / kNumSubframeSamples); | 272 rms[i] = sqrt(rms[i] / kNumSubframeSamples); |
272 } | 273 } |
273 } | 274 } |
274 | 275 |
275 } // namespace webrtc | 276 } // namespace webrtc |
OLD | NEW |