Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(273)

Side by Side Diff: webrtc/modules/audio_processing/agc/agc_audio_proc.cc

Issue 1212543002: Pull the Voice Activity Detector out from the AGC (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master
Patch Set: Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "webrtc/modules/audio_processing/agc/agc_audio_proc.h"
12
13 #include <math.h>
14 #include <stdio.h>
15
16 #include "webrtc/common_audio/fft4g.h"
17 #include "webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h"
18 #include "webrtc/modules/audio_processing/agc/pitch_internal.h"
19 #include "webrtc/modules/audio_processing/agc/pole_zero_filter.h"
20 extern "C" {
21 #include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
22 #include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
23 #include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
24 #include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
25 }
26 #include "webrtc/modules/interface/module_common_types.h"
27
28 namespace webrtc {
29
30 // The following structures are declared anonymous in iSAC's structs.h. To
31 // forward declare them, we use this derived class trick.
32 struct AgcAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
33 struct AgcAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
34
35 static const float kFrequencyResolution = kSampleRateHz /
36 static_cast<float>(AgcAudioProc::kDftSize);
37 static const int kSilenceRms = 5;
38
39 // TODO(turajs): Make a Create or Init for AgcAudioProc.
40 AgcAudioProc::AgcAudioProc()
41 : audio_buffer_(),
42 num_buffer_samples_(kNumPastSignalSamples),
43 log_old_gain_(-2),
44 old_lag_(50), // Arbitrary but valid as pitch-lag (in samples).
45 pitch_analysis_handle_(new PitchAnalysisStruct),
46 pre_filter_handle_(new PreFiltBankstr),
47 high_pass_filter_(PoleZeroFilter::Create(
48 kCoeffNumerator, kFilterOrder, kCoeffDenominator, kFilterOrder)) {
49 static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
50 sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
51 "lpc analysis window incorrect size");
52 static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
53 "correlation weight incorrect size");
54
55 // TODO(turajs): Are we doing too much in the constructor?
56 float data[kDftSize];
57 // Make FFT to initialize.
58 ip_[0] = 0;
59 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
60 // TODO(turajs): Need to initialize high-pass filter.
61
62 // Initialize iSAC components.
63 WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
64 WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
65 }
66
67 AgcAudioProc::~AgcAudioProc() {}
68
69 void AgcAudioProc::ResetBuffer() {
70 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
71 sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
72 num_buffer_samples_ = kNumPastSignalSamples;
73 }
74
75 int AgcAudioProc::ExtractFeatures(const int16_t* frame,
76 int length,
77 AudioFeatures* features) {
78 features->num_frames = 0;
79 if (length != kNumSubframeSamples) {
80 return -1;
81 }
82
83 // High-pass filter to remove the DC component and very low frequency content.
84 // We have experienced that this high-pass filtering improves voice/non-voiced
85 // classification.
86 if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
87 &audio_buffer_[num_buffer_samples_]) != 0) {
88 return -1;
89 }
90
91 num_buffer_samples_ += kNumSubframeSamples;
92 if (num_buffer_samples_ < kBufferLength) {
93 return 0;
94 }
95 assert(num_buffer_samples_ == kBufferLength);
96 features->num_frames = kNum10msSubframes;
97 features->silence = false;
98
99 Rms(features->rms, kMaxNumFrames);
100 for (int i = 0; i < kNum10msSubframes; ++i) {
101 if (features->rms[i] < kSilenceRms) {
102 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
103 // Bail out here instead.
104 features->silence = true;
105 ResetBuffer();
106 return 0;
107 }
108 }
109
110 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
111 kMaxNumFrames);
112 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
113 ResetBuffer();
114 return 0;
115 }
116
117 // Computes |kLpcOrder + 1| correlation coefficients.
118 void AgcAudioProc::SubframeCorrelation(double* corr, int length_corr,
119 int subframe_index) {
120 assert(length_corr >= kLpcOrder + 1);
121 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
122 int buffer_index = subframe_index * kNumSubframeSamples;
123
124 for (int n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
125 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
126
127 WebRtcIsac_AutoCorr(corr, windowed_audio, kNumSubframeSamples +
128 kNumPastSignalSamples, kLpcOrder);
129 }
130
131 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
132 // The analysis window is 15 ms long and it is centered on the first half of
133 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
134 // first half of each 10 ms subframe.
135 void AgcAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) {
136 assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));
137 double corr[kLpcOrder + 1];
138 double reflec_coeff[kLpcOrder];
139 for (int i = 0, offset_lpc = 0; i < kNum10msSubframes;
140 i++, offset_lpc += kLpcOrder + 1) {
141 SubframeCorrelation(corr, kLpcOrder + 1, i);
142 corr[0] *= 1.0001;
143 // This makes Lev-Durb a bit more stable.
144 for (int k = 0; k < kLpcOrder + 1; k++) {
145 corr[k] *= kCorrWeight[k];
146 }
147 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
148 }
149 }
150
// Fits a second order curve through three equally spaced points and returns
// the offset of its extremum relative to the middle point. The three values
// are inverted before the fit, i.e. the interpolation is done in |1 / A(z)|^2.
static float QuadraticInterpolation(float prev_val, float curr_val,
                                    float next_val) {
  const float inv_prev = 1.0f / prev_val;
  const float inv_curr = 1.0f / curr_val;
  const float inv_next = 1.0f / next_val;

  const float numerator = -(inv_next - inv_prev) * 0.5f;
  const float denominator = inv_next + inv_prev - 2.f * inv_curr;
  const float fractional_index = numerator / denominator;

  // The extremum is expected to lie strictly between the two neighbors.
  assert(fabs(fractional_index) < 1);
  return fractional_index;
}
166
167 // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope
168 // of the input signal. The local maximum of the spectral envelope corresponds
169 // with the local minimum of A(z). It saves complexity, as we save one
170 // inversion. Furthermore, we find the first local maximum of magnitude squared,
171 // to save on one square root.
172 void AgcAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) {
173 assert(length_f_peak >= kNum10msSubframes);
174 double lpc[kNum10msSubframes * (kLpcOrder + 1)];
175 // For all sub-frames.
176 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
177
178 const int kNumDftCoefficients = kDftSize / 2 + 1;
179 float data[kDftSize];
180
181 for (int i = 0; i < kNum10msSubframes; i++) {
182 // Convert to float with zero pad.
183 memset(data, 0, sizeof(data));
184 for (int n = 0; n < kLpcOrder + 1; n++) {
185 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
186 }
187 // Transform to frequency domain.
188 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
189
190 int index_peak = 0;
191 float prev_magn_sqr = data[0] * data[0];
192 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
193 float next_magn_sqr;
194 bool found_peak = false;
195 for (int n = 2; n < kNumDftCoefficients - 1; n++) {
196 next_magn_sqr = data[2 * n] * data[2 * n] +
197 data[2 * n + 1] * data[2 * n + 1];
198 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
199 found_peak = true;
200 index_peak = n - 1;
201 break;
202 }
203 prev_magn_sqr = curr_magn_sqr;
204 curr_magn_sqr = next_magn_sqr;
205 }
206 float fractional_index = 0;
207 if (!found_peak) {
208 // Checking if |kNumDftCoefficients - 1| is the local minimum.
209 next_magn_sqr = data[1] * data[1];
210 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
211 index_peak = kNumDftCoefficients - 1;
212 }
213 } else {
214 // A peak is found, do a simple quadratic interpolation to get a more
215 // accurate estimate of the peak location.
216 fractional_index = QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr,
217 next_magn_sqr);
218 }
219 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
220 }
221 }
222
223 // Using iSAC functions to estimate pitch gains & lags.
224 void AgcAudioProc::PitchAnalysis(double* log_pitch_gains, double* pitch_lags_hz,
225 int length) {
226 // TODO(turajs): This can be "imported" from iSAC & and the next two
227 // constants.
228 assert(length >= kNum10msSubframes);
229 const int kNumPitchSubframes = 4;
230 double gains[kNumPitchSubframes];
231 double lags[kNumPitchSubframes];
232
233 const int kNumSubbandFrameSamples = 240;
234 const int kNumLookaheadSamples = 24;
235
236 float lower[kNumSubbandFrameSamples];
237 float upper[kNumSubbandFrameSamples];
238 double lower_lookahead[kNumSubbandFrameSamples];
239 double upper_lookahead[kNumSubbandFrameSamples];
240 double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
241 kNumLookaheadSamples];
242
243 // Split signal to lower and upper bands
244 WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples],
245 lower, upper, lower_lookahead, upper_lookahead,
246 pre_filter_handle_.get());
247 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
248 pitch_analysis_handle_.get(), lags, gains);
249
250 // Lags are computed on lower-band signal with sampling rate half of the
251 // input signal.
252 GetSubframesPitchParameters(kSampleRateHz / 2, gains, lags,
253 kNumPitchSubframes, kNum10msSubframes,
254 &log_old_gain_, &old_lag_,
255 log_pitch_gains, pitch_lags_hz);
256 }
257
258 void AgcAudioProc::Rms(double* rms, int length_rms) {
259 assert(length_rms >= kNum10msSubframes);
260 int offset = kNumPastSignalSamples;
261 for (int i = 0; i < kNum10msSubframes; i++) {
262 rms[i] = 0;
263 for (int n = 0; n < kNumSubframeSamples; n++, offset++)
264 rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
265 rms[i] = sqrt(rms[i] / kNumSubframeSamples);
266 }
267 }
268
269 } // namespace webrtc
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698