/*
 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"

#include <math.h>
#include <stdio.h>
#include <string.h>  // For memcpy() and memset().

#include "webrtc/common_audio/fft4g.h"
#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
extern "C" {
#include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
}
#include "webrtc/modules/interface/module_common_types.h"

namespace webrtc {

// The following structures are declared anonymous in iSAC's structs.h. To
// forward declare them, we use this derived class trick.
struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
static const float kFrequencyResolution =
    kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
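// Subframes whose RMS (in 16-bit sample units) falls below this value are
// treated as silence.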
static const int kSilenceRms = 5;

// TODO(turajs): Make a Create or Init for VadAudioProc.
VadAudioProc::VadAudioProc()
    : audio_buffer_(),
      num_buffer_samples_(kNumPastSignalSamples),
      log_old_gain_(-2),
      old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples).
      pitch_analysis_handle_(new PitchAnalysisStruct),
      pre_filter_handle_(new PreFiltBankstr),
      high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
                                               kFilterOrder,
                                               kCoeffDenominator,
                                               kFilterOrder)) {
  static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
                    sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
                "lpc analysis window incorrect size");
  static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
                "correlation weight incorrect size");

  // TODO(turajs): Are we doing too much in the constructor?
  float data[kDftSize] = {0.f};
  // Run the FFT once, only to initialize the work tables |ip_| and |w_fft_|;
  // the content of |data| is irrelevant here.
  ip_[0] = 0;
  WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
  // TODO(turajs): Need to initialize high-pass filter.

  // Initialize iSAC components.
  WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
  WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
}

VadAudioProc::~VadAudioProc() {
}

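// Keep the most recent |kNumPastSignalSamples| samples at the front of
// |audio_buffer_| as history for the next block of input.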
void VadAudioProc::ResetBuffer() {
  memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
         sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
  num_buffer_samples_ = kNumPastSignalSamples;
}

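// Features become available only once the internal buffer holds a full
// analysis window, so a return value of 0 with |features->num_frames| == 0
// simply means that more input is needed; callers should also check
// |features->silence| before using the pitch and spectral features.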
int VadAudioProc::ExtractFeatures(const int16_t* frame,
                                  int length,
                                  AudioFeatures* features) {
  features->num_frames = 0;
  if (length != kNumSubframeSamples) {
    return -1;
  }

  // High-pass filter to remove the DC component and very low-frequency
  // content. We have found that this high-pass filtering improves
  // voiced/non-voiced classification.
  if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
                                &audio_buffer_[num_buffer_samples_]) != 0) {
    return -1;
  }

  num_buffer_samples_ += kNumSubframeSamples;
  if (num_buffer_samples_ < kBufferLength) {
    return 0;
  }
  assert(num_buffer_samples_ == kBufferLength);
  features->num_frames = kNum10msSubframes;
  features->silence = false;

  Rms(features->rms, kMaxNumFrames);
  for (int i = 0; i < kNum10msSubframes; ++i) {
    if (features->rms[i] < kSilenceRms) {
      // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
      // Bail out here instead.
      features->silence = true;
      ResetBuffer();
      return 0;
    }
  }

  PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
                kMaxNumFrames);
  FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
  ResetBuffer();
  return 0;
}

// Computes |kLpcOrder + 1| correlation coefficients.
void VadAudioProc::SubframeCorrelation(double* corr,
                                       int length_corr,
                                       int subframe_index) {
  assert(length_corr >= kLpcOrder + 1);
  double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
  int buffer_index = subframe_index * kNumSubframeSamples;

  for (int n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
    windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];

  WebRtcIsac_AutoCorr(corr, windowed_audio,
                      kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
}

// Computes |kNum10msSubframes| sets of LPC coefficients, one per 10 ms of
// input. The analysis window is 15 ms long and is centered on the first half
// of each 10 ms sub-frame, which is equivalent to computing LPC coefficients
// for the first half of each 10 ms subframe.
void VadAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) {
  assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));
  double corr[kLpcOrder + 1];
  double reflec_coeff[kLpcOrder];
  for (int i = 0, offset_lpc = 0; i < kNum10msSubframes;
       i++, offset_lpc += kLpcOrder + 1) {
    SubframeCorrelation(corr, kLpcOrder + 1, i);
    // Scaling the zero-lag term slightly up acts as a small noise floor and
    // makes the Levinson-Durbin recursion a bit more stable.
    corr[0] *= 1.0001;
    for (int k = 0; k < kLpcOrder + 1; k++) {
      corr[k] *= kCorrWeight[k];
    }
    WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
  }
}

// Fits a second-order curve to these three points and finds the location of
// the extremum. The points are inverted before curve fitting.
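// Fitting y(x) = a * x^2 + b * x + c through three equally spaced points
// (-1, y_prev), (0, y_curr) and (1, y_next) gives
// a = (y_prev + y_next - 2 * y_curr) / 2 and b = (y_next - y_prev) / 2, so
// the extremum lies at x = -b / (2 * a), which is the expression evaluated
// below on the inverted values.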
static float QuadraticInterpolation(float prev_val,
                                    float curr_val,
                                    float next_val) {
  // Doing the interpolation in |1 / A(z)|^2.
  float fractional_index = 0;
  next_val = 1.0f / next_val;
  prev_val = 1.0f / prev_val;
  curr_val = 1.0f / curr_val;

  fractional_index =
      -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
  assert(fabs(fractional_index) < 1);
  return fractional_index;
}

// 1 / A(z), where A(z) is defined by |lpc|, is a model of the spectral
// envelope of the input signal. A local maximum of the spectral envelope
// corresponds to a local minimum of |A(z)|, so searching |A(z)| directly
// saves one inversion. Furthermore, we operate on the magnitude squared,
// which saves a square root.
void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) {
  assert(length_f_peak >= kNum10msSubframes);
  double lpc[kNum10msSubframes * (kLpcOrder + 1)];
  // For all sub-frames.
  GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));

  const int kNumDftCoefficients = kDftSize / 2 + 1;
  float data[kDftSize];

  for (int i = 0; i < kNum10msSubframes; i++) {
    // Convert to float with zero pad.
    memset(data, 0, sizeof(data));
    for (int n = 0; n < kLpcOrder + 1; n++) {
      data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
    }
    // Transform to frequency domain.
    WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);

    int index_peak = 0;
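    // WebRtc_rdft packs the real spectrum as data[0] = DC (real only),
    // data[1] = Nyquist (real only), and data[2 * n], data[2 * n + 1] =
    // real and imaginary parts of bin n, for 0 < n < kDftSize / 2.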
    float prev_magn_sqr = data[0] * data[0];
    float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
    float next_magn_sqr;
    bool found_peak = false;
    for (int n = 2; n < kNumDftCoefficients - 1; n++) {
      next_magn_sqr =
          data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
        found_peak = true;
        index_peak = n - 1;
        break;
      }
      prev_magn_sqr = curr_magn_sqr;
      curr_magn_sqr = next_magn_sqr;
    }
    float fractional_index = 0;
    if (!found_peak) {
      // Check whether |kNumDftCoefficients - 1| is the local minimum.
      next_magn_sqr = data[1] * data[1];
      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
        index_peak = kNumDftCoefficients - 1;
      }
    } else {
      // A peak was found; do a simple quadratic interpolation to get a more
      // accurate estimate of its location.
      fractional_index =
          QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
    }
    f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
  }
}

// Using iSAC functions to estimate pitch gains & lags.
void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
                                 double* pitch_lags_hz,
                                 int length) {
  // TODO(turajs): This and the next two constants can be "imported" from
  // iSAC.
  assert(length >= kNum10msSubframes);
  const int kNumPitchSubframes = 4;
  double gains[kNumPitchSubframes];
  double lags[kNumPitchSubframes];

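  // Frame and lookahead sizes used by iSAC's split filter bank; the lower
  // band runs at half the input sample rate.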
  const int kNumSubbandFrameSamples = 240;
  const int kNumLookaheadSamples = 24;

  float lower[kNumSubbandFrameSamples];
  float upper[kNumSubbandFrameSamples];
  double lower_lookahead[kNumSubbandFrameSamples];
  double upper_lookahead[kNumSubbandFrameSamples];
  double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
                                    kNumLookaheadSamples];

  // Split the signal into lower and upper bands.
  WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
                                 upper, lower_lookahead, upper_lookahead,
                                 pre_filter_handle_.get());
  WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
                           pitch_analysis_handle_.get(), lags, gains);

  // Lags are computed on the lower-band signal, whose sampling rate is half
  // that of the input signal.
  GetSubframesPitchParameters(
      kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
      &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
}

void VadAudioProc::Rms(double* rms, int length_rms) {
  assert(length_rms >= kNum10msSubframes);
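  // Skip the past-signal history; the RMS is computed only over the newly
  // buffered samples.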
  int offset = kNumPastSignalSamples;
  for (int i = 0; i < kNum10msSubframes; i++) {
    rms[i] = 0;
    for (int n = 0; n < kNumSubframeSamples; n++, offset++)
      rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
    rms[i] = sqrt(rms[i] / kNumSubframeSamples);
  }
}

}  // namespace webrtc