webrtc/modules/audio_processing/agc/agc_audio_proc.cc - Issue 1212543002: Pull the Voice Activity Detector out from the AGC

Side by Side Diff: webrtc/modules/audio_processing/agc/agc_audio_proc.cc

Issue 1212543002: Pull the Voice Activity Detector out from the AGC (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/agc/agc_audio_proc.h ('k') | webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h » ('j') | webrtc/modules/audio_processing/vad/voice_activity_detector.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 /*

2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.

3 *

4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.

9 */

10

11 #include "webrtc/modules/audio_processing/agc/agc_audio_proc.h"

12

13 #include <math.h>

14 #include <stdio.h>

15

16 #include "webrtc/common_audio/fft4g.h"

17 #include "webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h"

18 #include "webrtc/modules/audio_processing/agc/pitch_internal.h"

19 #include "webrtc/modules/audio_processing/agc/pole_zero_filter.h"

20 extern "C" {

21 #include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"

22 #include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"

23 #include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"

24 #include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"

25 }

26 #include "webrtc/modules/interface/module_common_types.h"

27

28 namespace webrtc {

29

30 // The following structures are declared anonymous in iSAC's structs.h. To

31 // forward declare them, we use this derived class trick.

32 struct AgcAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};

33 struct AgcAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};

34

35 static const float kFrequencyResolution = kSampleRateHz /

36 static_cast<float>(AgcAudioProc::kDftSize);

37 static const int kSilenceRms = 5;

38

39 // TODO(turajs): Make a Create or Init for AgcAudioProc.

40 AgcAudioProc::AgcAudioProc()

41 : audio_buffer_(),

42 num_buffer_samples_(kNumPastSignalSamples),

43 log_old_gain_(-2),

44 old_lag_(50), // Arbitrary but valid as pitch-lag (in samples).

45 pitch_analysis_handle_(new PitchAnalysisStruct),

46 pre_filter_handle_(new PreFiltBankstr),

47 high_pass_filter_(PoleZeroFilter::Create(

48 kCoeffNumerator, kFilterOrder, kCoeffDenominator, kFilterOrder)) {

49 static_assert(kNumPastSignalSamples + kNumSubframeSamples ==

50 sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),

51 "lpc analysis window incorrect size");

52 static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),

53 "correlation weight incorrect size");

54

55 // TODO(turajs): Are we doing too much in the constructor?

56 float data[kDftSize];

57 // Make FFT to initialize.

58 ip_[0] = 0;

59 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);

60 // TODO(turajs): Need to initialize high-pass filter.

61

62 // Initialize iSAC components.

63 WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());

64 WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());

65 }

66

67 AgcAudioProc::~AgcAudioProc() {}

68

69 void AgcAudioProc::ResetBuffer() {

70 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],

71 sizeof(audio_buffer_[0]) * kNumPastSignalSamples);

72 num_buffer_samples_ = kNumPastSignalSamples;

73 }

74

75 int AgcAudioProc::ExtractFeatures(const int16_t* frame,

76 int length,

77 AudioFeatures* features) {

78 features->num_frames = 0;

79 if (length != kNumSubframeSamples) {

80 return -1;

81 }

82

83 // High-pass filter to remove the DC component and very low frequency content.

84 // We have experienced that this high-pass filtering improves voice/non-voiced

85 // classification.

86 if (high_pass_filter_->Filter(frame, kNumSubframeSamples,

87 &audio_buffer_[num_buffer_samples_]) != 0) {

88 return -1;

89 }

90

91 num_buffer_samples_ += kNumSubframeSamples;

92 if (num_buffer_samples_ < kBufferLength) {

93 return 0;

94 }

95 assert(num_buffer_samples_ == kBufferLength);

96 features->num_frames = kNum10msSubframes;

97 features->silence = false;

98

99 Rms(features->rms, kMaxNumFrames);

100 for (int i = 0; i < kNum10msSubframes; ++i) {

101 if (features->rms[i] < kSilenceRms) {

102 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.

103 // Bail out here instead.

104 features->silence = true;

105 ResetBuffer();

106 return 0;

107 }

108 }

109

110 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,

111 kMaxNumFrames);

112 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);

113 ResetBuffer();

114 return 0;

115 }

116

117 // Computes \|kLpcOrder + 1\| correlation coefficients.

118 void AgcAudioProc::SubframeCorrelation(double* corr, int length_corr,

119 int subframe_index) {

120 assert(length_corr >= kLpcOrder + 1);

121 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];

122 int buffer_index = subframe_index * kNumSubframeSamples;

123

124 for (int n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)

125 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];

126

127 WebRtcIsac_AutoCorr(corr, windowed_audio, kNumSubframeSamples +

128 kNumPastSignalSamples, kLpcOrder);

129 }

130

131 // Compute \|kNum10msSubframes\| sets of LPC coefficients, one per 10 ms input.

132 // The analysis window is 15 ms long and it is centered on the first half of

133 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the

134 // first half of each 10 ms subframe.

135 void AgcAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) {

136 assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));

137 double corr[kLpcOrder + 1];

138 double reflec_coeff[kLpcOrder];

139 for (int i = 0, offset_lpc = 0; i < kNum10msSubframes;

140 i++, offset_lpc += kLpcOrder + 1) {

141 SubframeCorrelation(corr, kLpcOrder + 1, i);

142 corr[0] *= 1.0001;

143 // This makes Lev-Durb a bit more stable.

144 for (int k = 0; k < kLpcOrder + 1; k++) {

145 corr[k] *= kCorrWeight[k];

146 }

147 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);

148 }

149 }

150

// Fits a second order curve to three equally spaced points and returns the
// location of its extremum as an offset from the middle point, in units of
// the point spacing. The points are inverted before curve fitting, i.e. the
// interpolation is done in |1 / A(z)|^2.
//
// The caller is expected to pass a strict local minimum (|curr_val| smaller
// than both neighbours), for which the offset lies in (-1, 1); this is
// asserted in debug builds.
//
// Degenerate inputs carry no usable curvature and would otherwise divide by
// zero and return NaN once the assert is compiled out. For those (any value
// equal to zero, or three inverted points on a straight line) the function
// returns 0, i.e. no refinement of the integer location.
static float QuadraticInterpolation(float prev_val, float curr_val,
                                    float next_val) {
  if (prev_val == 0.f || curr_val == 0.f || next_val == 0.f) {
    return 0.f;
  }

  // Doing the interpolation in |1 / A(z)|^2.
  const float inv_next = 1.0f / next_val;
  const float inv_prev = 1.0f / prev_val;
  const float inv_curr = 1.0f / curr_val;

  const float curvature = inv_next + inv_prev - 2.f * inv_curr;
  if (curvature == 0.f) {
    return 0.f;
  }

  const float fractional_index = -(inv_next - inv_prev) * 0.5f / curvature;
  assert(fabs(fractional_index) < 1);
  return fractional_index;
}

166

167 // 1 / A(z), where A(z) is defined by \|lpc\| is a model of the spectral envelope

168 // of the input signal. The local maximum of the spectral envelope corresponds

169 // with the local minimum of A(z). It saves complexity, as we save one

170 // inversion. Furthermore, we find the first local maximum of magnitude squared,

171 // to save on one square root.

172 void AgcAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) {

173 assert(length_f_peak >= kNum10msSubframes);

174 double lpc[kNum10msSubframes * (kLpcOrder + 1)];

175 // For all sub-frames.

176 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));

177

178 const int kNumDftCoefficients = kDftSize / 2 + 1;

179 float data[kDftSize];

180

181 for (int i = 0; i < kNum10msSubframes; i++) {

182 // Convert to float with zero pad.

183 memset(data, 0, sizeof(data));

184 for (int n = 0; n < kLpcOrder + 1; n++) {

185 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);

186 }

187 // Transform to frequency domain.

188 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);

189

190 int index_peak = 0;

191 float prev_magn_sqr = data[0] * data[0];

192 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];

193 float next_magn_sqr;

194 bool found_peak = false;

195 for (int n = 2; n < kNumDftCoefficients - 1; n++) {

196 next_magn_sqr = data[2 * n] * data[2 * n] +

197 data[2 * n + 1] * data[2 * n + 1];

198 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {

199 found_peak = true;

200 index_peak = n - 1;

201 break;

202 }

203 prev_magn_sqr = curr_magn_sqr;

204 curr_magn_sqr = next_magn_sqr;

205 }

206 float fractional_index = 0;

207 if (!found_peak) {

208 // Checking if \|kNumDftCoefficients - 1\| is the local minimum.

209 next_magn_sqr = data[1] * data[1];

210 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {

211 index_peak = kNumDftCoefficients - 1;

212 }

213 } else {

214 // A peak is found, do a simple quadratic interpolation to get a more

215 // accurate estimate of the peak location.

216 fractional_index = QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr,

217 next_magn_sqr);

218 }

219 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;

220 }

221 }

222

223 // Using iSAC functions to estimate pitch gains & lags.

224 void AgcAudioProc::PitchAnalysis(double* log_pitch_gains, double* pitch_lags_hz,

225 int length) {

226 // TODO(turajs): This can be "imported" from iSAC & and the next two

227 // constants.

228 assert(length >= kNum10msSubframes);

229 const int kNumPitchSubframes = 4;

230 double gains[kNumPitchSubframes];

231 double lags[kNumPitchSubframes];

232

233 const int kNumSubbandFrameSamples = 240;

234 const int kNumLookaheadSamples = 24;

235

236 float lower[kNumSubbandFrameSamples];

237 float upper[kNumSubbandFrameSamples];

238 double lower_lookahead[kNumSubbandFrameSamples];

239 double upper_lookahead[kNumSubbandFrameSamples];

240 double lower_lookahead_pre_filter[kNumSubbandFrameSamples +

241 kNumLookaheadSamples];

242

243 // Split signal to lower and upper bands

244 WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples],

245 lower, upper, lower_lookahead, upper_lookahead,

246 pre_filter_handle_.get());

247 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,

248 pitch_analysis_handle_.get(), lags, gains);

249

250 // Lags are computed on lower-band signal with sampling rate half of the

251 // input signal.

252 GetSubframesPitchParameters(kSampleRateHz / 2, gains, lags,

253 kNumPitchSubframes, kNum10msSubframes,

254 &log_old_gain_, &old_lag_,

255 log_pitch_gains, pitch_lags_hz);

256 }

257

258 void AgcAudioProc::Rms(double* rms, int length_rms) {

259 assert(length_rms >= kNum10msSubframes);

260 int offset = kNumPastSignalSamples;

261 for (int i = 0; i < kNum10msSubframes; i++) {

262 rms[i] = 0;

263 for (int n = 0; n < kNumSubframeSamples; n++, offset++)

264 rms[i] += audio_buffer_[offset] * audio_buffer_[offset];

265 rms[i] = sqrt(rms[i] / kNumSubframeSamples);

266 }

267 }

268

269 } // namespace webrtc

OLD	NEW