OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 #include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" | |
12 | |
13 #include <math.h> | |
14 #include <stdio.h> | |
15 | |
16 #include "webrtc/common_audio/fft4g.h" | |
17 #include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h" | |
18 #include "webrtc/modules/audio_processing/vad/pitch_internal.h" | |
19 #include "webrtc/modules/audio_processing/vad/pole_zero_filter.h" | |
20 extern "C" { | |
21 #include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h" | |
22 #include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h" | |
23 #include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h" | |
24 #include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h" | |
25 } | |
26 #include "webrtc/modules/interface/module_common_types.h" | |
27 | |
28 namespace webrtc { | |
29 | |
30 // The following structures are declared anonymous in iSAC's structs.h. To | |
31 // forward declare them, we use this derived class trick. | |
32 struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {}; | |
33 struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {}; | |
34 | |
35 static const float kFrequencyResolution = | |
36 kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize); | |
37 static const int kSilenceRms = 5; | |
38 | |
39 // TODO(turajs): Make a Create or Init for VadAudioProc. | |
40 VadAudioProc::VadAudioProc() | |
41 : audio_buffer_(), | |
42 num_buffer_samples_(kNumPastSignalSamples), | |
43 log_old_gain_(-2), | |
44 old_lag_(50), // Arbitrary but valid as pitch-lag (in samples). | |
45 pitch_analysis_handle_(new PitchAnalysisStruct), | |
46 pre_filter_handle_(new PreFiltBankstr), | |
47 high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator, | |
48 kFilterOrder, | |
49 kCoeffDenominator, | |
50 kFilterOrder)) { | |
51 static_assert(kNumPastSignalSamples + kNumSubframeSamples == | |
52 sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]), | |
53 "lpc analysis window incorrect size"); | |
54 static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]), | |
55 "correlation weight incorrect size"); | |
56 | |
57 // TODO(turajs): Are we doing too much in the constructor? | |
58 float data[kDftSize]; | |
59 // Make FFT to initialize. | |
60 ip_[0] = 0; | |
61 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); | |
62 // TODO(turajs): Need to initialize high-pass filter. | |
63 | |
64 // Initialize iSAC components. | |
65 WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get()); | |
66 WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get()); | |
67 } | |
68 | |
69 VadAudioProc::~VadAudioProc() { | |
70 } | |
71 | |
72 void VadAudioProc::ResetBuffer() { | |
73 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess], | |
74 sizeof(audio_buffer_[0]) * kNumPastSignalSamples); | |
75 num_buffer_samples_ = kNumPastSignalSamples; | |
76 } | |
77 | |
78 int VadAudioProc::ExtractFeatures(const int16_t* frame, | |
79 int length, | |
80 AudioFeatures* features) { | |
81 features->num_frames = 0; | |
82 if (length != kNumSubframeSamples) { | |
83 return -1; | |
84 } | |
85 | |
86 // High-pass filter to remove the DC component and very low frequency content. | |
87 // We have experienced that this high-pass filtering improves voice/non-voiced | |
88 // classification. | |
89 if (high_pass_filter_->Filter(frame, kNumSubframeSamples, | |
90 &audio_buffer_[num_buffer_samples_]) != 0) { | |
91 return -1; | |
92 } | |
93 | |
94 num_buffer_samples_ += kNumSubframeSamples; | |
95 if (num_buffer_samples_ < kBufferLength) { | |
96 return 0; | |
97 } | |
98 assert(num_buffer_samples_ == kBufferLength); | |
99 features->num_frames = kNum10msSubframes; | |
100 features->silence = false; | |
101 | |
102 Rms(features->rms, kMaxNumFrames); | |
103 for (int i = 0; i < kNum10msSubframes; ++i) { | |
104 if (features->rms[i] < kSilenceRms) { | |
105 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence. | |
106 // Bail out here instead. | |
107 features->silence = true; | |
108 ResetBuffer(); | |
109 return 0; | |
110 } | |
111 } | |
112 | |
113 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz, | |
114 kMaxNumFrames); | |
115 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames); | |
116 ResetBuffer(); | |
117 return 0; | |
118 } | |
119 | |
120 // Computes |kLpcOrder + 1| correlation coefficients. | |
121 void VadAudioProc::SubframeCorrelation(double* corr, | |
122 int length_corr, | |
123 int subframe_index) { | |
124 assert(length_corr >= kLpcOrder + 1); | |
125 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples]; | |
126 int buffer_index = subframe_index * kNumSubframeSamples; | |
127 | |
128 for (int n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++) | |
129 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n]; | |
130 | |
131 WebRtcIsac_AutoCorr(corr, windowed_audio, | |
132 kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder); | |
133 } | |
134 | |
135 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input. | |
136 // The analysis window is 15 ms long and it is centered on the first half of | |
137 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the | |
138 // first half of each 10 ms subframe. | |
139 void VadAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) { | |
140 assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1)); | |
141 double corr[kLpcOrder + 1]; | |
142 double reflec_coeff[kLpcOrder]; | |
143 for (int i = 0, offset_lpc = 0; i < kNum10msSubframes; | |
144 i++, offset_lpc += kLpcOrder + 1) { | |
145 SubframeCorrelation(corr, kLpcOrder + 1, i); | |
146 corr[0] *= 1.0001; | |
147 // This makes Lev-Durb a bit more stable. | |
148 for (int k = 0; k < kLpcOrder + 1; k++) { | |
149 corr[k] *= kCorrWeight[k]; | |
150 } | |
151 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder); | |
152 } | |
153 } | |
154 | |
// Fits a parabola through three equally spaced points and returns where its
// extremum lies relative to the middle point, as a fraction of the spacing.
// The three values are inverted first, so the fit is done on |1 / A(z)|^2
// rather than on the values passed in.
static float QuadraticInterpolation(float prev_val,
                                    float curr_val,
                                    float next_val) {
  const float inv_prev = 1.0f / prev_val;
  const float inv_curr = 1.0f / curr_val;
  const float inv_next = 1.0f / next_val;

  const float half_slope = -(inv_next - inv_prev) * 0.5f;
  const float curvature = inv_next + inv_prev - 2.f * inv_curr;
  const float fractional_index = half_slope / curvature;

  // The extremum must fall strictly between the two outer points.
  assert(fabs(fractional_index) < 1);
  return fractional_index;
}
171 | |
172 // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope | |
173 // of the input signal. The local maximum of the spectral envelope corresponds | |
174 // with the local minimum of A(z). It saves complexity, as we save one | |
175 // inversion. Furthermore, we find the first local maximum of magnitude squared, | |
176 // to save on one square root. | |
177 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) { | |
178 assert(length_f_peak >= kNum10msSubframes); | |
179 double lpc[kNum10msSubframes * (kLpcOrder + 1)]; | |
180 // For all sub-frames. | |
181 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1)); | |
182 | |
183 const int kNumDftCoefficients = kDftSize / 2 + 1; | |
184 float data[kDftSize]; | |
185 | |
186 for (int i = 0; i < kNum10msSubframes; i++) { | |
187 // Convert to float with zero pad. | |
188 memset(data, 0, sizeof(data)); | |
189 for (int n = 0; n < kLpcOrder + 1; n++) { | |
190 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]); | |
191 } | |
192 // Transform to frequency domain. | |
193 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); | |
194 | |
195 int index_peak = 0; | |
196 float prev_magn_sqr = data[0] * data[0]; | |
197 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3]; | |
198 float next_magn_sqr; | |
199 bool found_peak = false; | |
200 for (int n = 2; n < kNumDftCoefficients - 1; n++) { | |
201 next_magn_sqr = | |
202 data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1]; | |
203 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { | |
204 found_peak = true; | |
205 index_peak = n - 1; | |
206 break; | |
207 } | |
208 prev_magn_sqr = curr_magn_sqr; | |
209 curr_magn_sqr = next_magn_sqr; | |
210 } | |
211 float fractional_index = 0; | |
212 if (!found_peak) { | |
213 // Checking if |kNumDftCoefficients - 1| is the local minimum. | |
214 next_magn_sqr = data[1] * data[1]; | |
215 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { | |
216 index_peak = kNumDftCoefficients - 1; | |
217 } | |
218 } else { | |
219 // A peak is found, do a simple quadratic interpolation to get a more | |
220 // accurate estimate of the peak location. | |
221 fractional_index = | |
222 QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr); | |
223 } | |
224 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution; | |
225 } | |
226 } | |
227 | |
228 // Using iSAC functions to estimate pitch gains & lags. | |
229 void VadAudioProc::PitchAnalysis(double* log_pitch_gains, | |
230 double* pitch_lags_hz, | |
231 int length) { | |
232 // TODO(turajs): This can be "imported" from iSAC & and the next two | |
233 // constants. | |
234 assert(length >= kNum10msSubframes); | |
235 const int kNumPitchSubframes = 4; | |
236 double gains[kNumPitchSubframes]; | |
237 double lags[kNumPitchSubframes]; | |
238 | |
239 const int kNumSubbandFrameSamples = 240; | |
240 const int kNumLookaheadSamples = 24; | |
241 | |
242 float lower[kNumSubbandFrameSamples]; | |
243 float upper[kNumSubbandFrameSamples]; | |
244 double lower_lookahead[kNumSubbandFrameSamples]; | |
245 double upper_lookahead[kNumSubbandFrameSamples]; | |
246 double lower_lookahead_pre_filter[kNumSubbandFrameSamples + | |
247 kNumLookaheadSamples]; | |
248 | |
249 // Split signal to lower and upper bands | |
250 WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower, | |
251 upper, lower_lookahead, upper_lookahead, | |
252 pre_filter_handle_.get()); | |
253 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter, | |
254 pitch_analysis_handle_.get(), lags, gains); | |
255 | |
256 // Lags are computed on lower-band signal with sampling rate half of the | |
257 // input signal. | |
258 GetSubframesPitchParameters( | |
259 kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes, | |
260 &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz); | |
261 } | |
262 | |
263 void VadAudioProc::Rms(double* rms, int length_rms) { | |
264 assert(length_rms >= kNum10msSubframes); | |
265 int offset = kNumPastSignalSamples; | |
266 for (int i = 0; i < kNum10msSubframes; i++) { | |
267 rms[i] = 0; | |
268 for (int n = 0; n < kNumSubframeSamples; n++, offset++) | |
269 rms[i] += audio_buffer_[offset] * audio_buffer_[offset]; | |
270 rms[i] = sqrt(rms[i] / kNumSubframeSamples); | |
271 } | |
272 } | |
273 | |
274 } // namespace webrtc | |
OLD | NEW |