webrtc/modules/audio_processing/vad/vad_audio_proc.cc - Issue 1230503003: Update a ton of audio code to use size_t more correctly and in general reduce

Side by Side Diff: webrtc/modules/audio_processing/vad/vad_audio_proc.cc

Issue 1230503003: Update a ton of audio code to use size_t more correctly and in general reduce (Closed) Base URL: https://chromium.googlesource.com/external/webrtc@master

Patch Set: Resync Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
69 VadAudioProc::~VadAudioProc() {	69 VadAudioProc::~VadAudioProc() {

70 }	70 }

71	71

72 void VadAudioProc::ResetBuffer() {	72 void VadAudioProc::ResetBuffer() {

73 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],	73 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],

74 sizeof(audio_buffer_[0]) * kNumPastSignalSamples);	74 sizeof(audio_buffer_[0]) * kNumPastSignalSamples);

75 num_buffer_samples_ = kNumPastSignalSamples;	75 num_buffer_samples_ = kNumPastSignalSamples;

76 }	76 }

77	77

78 int VadAudioProc::ExtractFeatures(const int16_t* frame,	78 int VadAudioProc::ExtractFeatures(const int16_t* frame,

79 int length,	79 size_t length,

80 AudioFeatures* features) {	80 AudioFeatures* features) {

81 features->num_frames = 0;	81 features->num_frames = 0;

82 if (length != kNumSubframeSamples) {	82 if (length != kNumSubframeSamples) {

83 return -1;	83 return -1;

84 }	84 }

85	85

86 // High-pass filter to remove the DC component and very low frequency content.	86 // High-pass filter to remove the DC component and very low frequency content.

87 // We have experienced that this high-pass filtering improves voice/non-voiced	87 // We have experienced that this high-pass filtering improves voice/non-voiced

88 // classification.	88 // classification.

89 if (high_pass_filter_->Filter(frame, kNumSubframeSamples,	89 if (high_pass_filter_->Filter(frame, kNumSubframeSamples,

90 &audio_buffer_[num_buffer_samples_]) != 0) {	90 &audio_buffer_[num_buffer_samples_]) != 0) {

91 return -1;	91 return -1;

92 }	92 }

93	93

94 num_buffer_samples_ += kNumSubframeSamples;	94 num_buffer_samples_ += kNumSubframeSamples;

95 if (num_buffer_samples_ < kBufferLength) {	95 if (num_buffer_samples_ < kBufferLength) {

96 return 0;	96 return 0;

97 }	97 }

98 assert(num_buffer_samples_ == kBufferLength);	98 assert(num_buffer_samples_ == kBufferLength);

99 features->num_frames = kNum10msSubframes;	99 features->num_frames = kNum10msSubframes;

100 features->silence = false;	100 features->silence = false;

101	101

102 Rms(features->rms, kMaxNumFrames);	102 Rms(features->rms, kMaxNumFrames);

103 for (int i = 0; i < kNum10msSubframes; ++i) {	103 for (size_t i = 0; i < kNum10msSubframes; ++i) {

104 if (features->rms[i] < kSilenceRms) {	104 if (features->rms[i] < kSilenceRms) {

105 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.	105 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.

106 // Bail out here instead.	106 // Bail out here instead.

107 features->silence = true;	107 features->silence = true;

108 ResetBuffer();	108 ResetBuffer();

109 return 0;	109 return 0;

110 }	110 }

111 }	111 }

112	112

113 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,	113 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,

114 kMaxNumFrames);	114 kMaxNumFrames);

115 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);	115 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);

116 ResetBuffer();	116 ResetBuffer();

117 return 0;	117 return 0;

118 }	118 }

119	119

120 // Computes |kLpcOrder + 1| correlation coefficients.	120 // Computes |kLpcOrder + 1| correlation coefficients.

121 void VadAudioProc::SubframeCorrelation(double* corr,	121 void VadAudioProc::SubframeCorrelation(double* corr,

122 int length_corr,	122 size_t length_corr,

123 int subframe_index) {	123 size_t subframe_index) {

124 assert(length_corr >= kLpcOrder + 1);	124 assert(length_corr >= kLpcOrder + 1);

125 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];	125 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];

126 int buffer_index = subframe_index * kNumSubframeSamples;	126 size_t buffer_index = subframe_index * kNumSubframeSamples;

127	127

128 for (int n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)	128 for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)

129 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];	129 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];

130	130

131 WebRtcIsac_AutoCorr(corr, windowed_audio,	131 WebRtcIsac_AutoCorr(corr, windowed_audio,

132 kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);	132 kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);

133 }	133 }

134	134

135 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.	135 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.

136 // The analysis window is 15 ms long and it is centered on the first half of	136 // The analysis window is 15 ms long and it is centered on the first half of

137 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the	137 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the

138 // first half of each 10 ms subframe.	138 // first half of each 10 ms subframe.

139 void VadAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) {	139 void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {

140 assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));	140 assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));

141 double corr[kLpcOrder + 1];	141 double corr[kLpcOrder + 1];

142 double reflec_coeff[kLpcOrder];	142 double reflec_coeff[kLpcOrder];

143 for (int i = 0, offset_lpc = 0; i < kNum10msSubframes;	143 for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;

144 i++, offset_lpc += kLpcOrder + 1) {	144 i++, offset_lpc += kLpcOrder + 1) {

145 SubframeCorrelation(corr, kLpcOrder + 1, i);	145 SubframeCorrelation(corr, kLpcOrder + 1, i);

146 corr[0] *= 1.0001;	146 corr[0] *= 1.0001;

147 // This makes Lev-Durb a bit more stable.	147 // This makes Lev-Durb a bit more stable.

148 for (int k = 0; k < kLpcOrder + 1; k++) {	148 for (size_t k = 0; k < kLpcOrder + 1; k++) {

149 corr[k] *= kCorrWeight[k];	149 corr[k] *= kCorrWeight[k];

150 }	150 }

151 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);	151 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);

152 }	152 }

153 }	153 }

154	154

155 // Fit a second order curve to these 3 points and find the location of the	155 // Fit a second order curve to these 3 points and find the location of the

156 // extremum. The points are inverted before curve fitting.	156 // extremum. The points are inverted before curve fitting.

157 static float QuadraticInterpolation(float prev_val,	157 static float QuadraticInterpolation(float prev_val,

158 float curr_val,	158 float curr_val,

159 float next_val) {	159 float next_val) {

160 // Doing the interpolation in |1 / A(z)|^2.	160 // Doing the interpolation in |1 / A(z)|^2.

161 float fractional_index = 0;	161 float fractional_index = 0;

162 next_val = 1.0f / next_val;	162 next_val = 1.0f / next_val;

163 prev_val = 1.0f / prev_val;	163 prev_val = 1.0f / prev_val;

164 curr_val = 1.0f / curr_val;	164 curr_val = 1.0f / curr_val;

165	165

166 fractional_index =	166 fractional_index =

167 -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);	167 -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);

168 assert(fabs(fractional_index) < 1);	168 assert(fabs(fractional_index) < 1);

169 return fractional_index;	169 return fractional_index;

170 }	170 }

171	171

172 // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope	172 // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope

173 // of the input signal. The local maximum of the spectral envelope corresponds	173 // of the input signal. The local maximum of the spectral envelope corresponds

174 // with the local minimum of A(z). It saves complexity, as we save one	174 // with the local minimum of A(z). It saves complexity, as we save one

175 // inversion. Furthermore, we find the first local maximum of magnitude squared,	175 // inversion. Furthermore, we find the first local maximum of magnitude squared,

176 // to save on one square root.	176 // to save on one square root.

177 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) {	177 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,

	178 size_t length_f_peak) {

178 assert(length_f_peak >= kNum10msSubframes);	179 assert(length_f_peak >= kNum10msSubframes);

179 double lpc[kNum10msSubframes * (kLpcOrder + 1)];	180 double lpc[kNum10msSubframes * (kLpcOrder + 1)];

180 // For all sub-frames.	181 // For all sub-frames.

181 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));	182 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));

182	183

183 const int kNumDftCoefficients = kDftSize / 2 + 1;	184 const size_t kNumDftCoefficients = kDftSize / 2 + 1;

184 float data[kDftSize];	185 float data[kDftSize];

185	186

186 for (int i = 0; i < kNum10msSubframes; i++) {	187 for (size_t i = 0; i < kNum10msSubframes; i++) {

187 // Convert to float with zero pad.	188 // Convert to float with zero pad.

188 memset(data, 0, sizeof(data));	189 memset(data, 0, sizeof(data));

189 for (int n = 0; n < kLpcOrder + 1; n++) {	190 for (size_t n = 0; n < kLpcOrder + 1; n++) {

190 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);	191 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);

191 }	192 }

192 // Transform to frequency domain.	193 // Transform to frequency domain.

193 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);	194 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);

194	195

195 int index_peak = 0;	196 size_t index_peak = 0;

196 float prev_magn_sqr = data[0] * data[0];	197 float prev_magn_sqr = data[0] * data[0];

197 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];	198 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];

198 float next_magn_sqr;	199 float next_magn_sqr;

199 bool found_peak = false;	200 bool found_peak = false;

200 for (int n = 2; n < kNumDftCoefficients - 1; n++) {	201 for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {

201 next_magn_sqr =	202 next_magn_sqr =

202 data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];	203 data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];

203 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {	204 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {

204 found_peak = true;	205 found_peak = true;

205 index_peak = n - 1;	206 index_peak = n - 1;

206 break;	207 break;

207 }	208 }

208 prev_magn_sqr = curr_magn_sqr;	209 prev_magn_sqr = curr_magn_sqr;

209 curr_magn_sqr = next_magn_sqr;	210 curr_magn_sqr = next_magn_sqr;

210 }	211 }

(...skipping 10 matching lines...) Expand all Loading...
221 fractional_index =	222 fractional_index =

222 QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);	223 QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);

223 }	224 }

224 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;	225 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;

225 }	226 }

226 }	227 }

227	228

228 // Using iSAC functions to estimate pitch gains & lags.	229 // Using iSAC functions to estimate pitch gains & lags.

229 void VadAudioProc::PitchAnalysis(double* log_pitch_gains,	230 void VadAudioProc::PitchAnalysis(double* log_pitch_gains,

230 double* pitch_lags_hz,	231 double* pitch_lags_hz,

231 int length) {	232 size_t length) {

232 // TODO(turajs): This can be "imported" from iSAC & and the next two	233 // TODO(turajs): This can be "imported" from iSAC & and the next two

233 // constants.	234 // constants.

234 assert(length >= kNum10msSubframes);	235 assert(length >= kNum10msSubframes);

235 const int kNumPitchSubframes = 4;	236 const int kNumPitchSubframes = 4;

236 double gains[kNumPitchSubframes];	237 double gains[kNumPitchSubframes];

237 double lags[kNumPitchSubframes];	238 double lags[kNumPitchSubframes];

238	239

239 const int kNumSubbandFrameSamples = 240;	240 const int kNumSubbandFrameSamples = 240;

240 const int kNumLookaheadSamples = 24;	241 const int kNumLookaheadSamples = 24;

241	242

(...skipping 11 matching lines...) Expand all Loading...
253 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,	254 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,

254 pitch_analysis_handle_.get(), lags, gains);	255 pitch_analysis_handle_.get(), lags, gains);

255	256

256 // Lags are computed on lower-band signal with sampling rate half of the	257 // Lags are computed on lower-band signal with sampling rate half of the

257 // input signal.	258 // input signal.

258 GetSubframesPitchParameters(	259 GetSubframesPitchParameters(

259 kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,	260 kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,

260 &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);	261 &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);

261 }	262 }

262	263

263 void VadAudioProc::Rms(double* rms, int length_rms) {	264 void VadAudioProc::Rms(double* rms, size_t length_rms) {

264 assert(length_rms >= kNum10msSubframes);	265 assert(length_rms >= kNum10msSubframes);

265 int offset = kNumPastSignalSamples;	266 size_t offset = kNumPastSignalSamples;

266 for (int i = 0; i < kNum10msSubframes; i++) {	267 for (size_t i = 0; i < kNum10msSubframes; i++) {

267 rms[i] = 0;	268 rms[i] = 0;

268 for (int n = 0; n < kNumSubframeSamples; n++, offset++)	269 for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)

269 rms[i] += audio_buffer_[offset] * audio_buffer_[offset];	270 rms[i] += audio_buffer_[offset] * audio_buffer_[offset];

270 rms[i] = sqrt(rms[i] / kNumSubframeSamples);	271 rms[i] = sqrt(rms[i] / kNumSubframeSamples);

271 }	272 }

272 }	273 }

273	274

274 } // namespace webrtc	275 } // namespace webrtc

OLD	NEW