OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 58 matching lines...) Loading... |
69 VadAudioProc::~VadAudioProc() { | 69 VadAudioProc::~VadAudioProc() { |
70 } | 70 } |
71 | 71 |
72 void VadAudioProc::ResetBuffer() { | 72 void VadAudioProc::ResetBuffer() { |
73 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess], | 73 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess], |
74 sizeof(audio_buffer_[0]) * kNumPastSignalSamples); | 74 sizeof(audio_buffer_[0]) * kNumPastSignalSamples); |
75 num_buffer_samples_ = kNumPastSignalSamples; | 75 num_buffer_samples_ = kNumPastSignalSamples; |
76 } | 76 } |
77 | 77 |
78 int VadAudioProc::ExtractFeatures(const int16_t* frame, | 78 int VadAudioProc::ExtractFeatures(const int16_t* frame, |
79 int length, | 79 size_t length, |
80 AudioFeatures* features) { | 80 AudioFeatures* features) { |
81 features->num_frames = 0; | 81 features->num_frames = 0; |
82 if (length != kNumSubframeSamples) { | 82 if (length != kNumSubframeSamples) { |
83 return -1; | 83 return -1; |
84 } | 84 } |
85 | 85 |
86 // High-pass filter to remove the DC component and very low frequency content. | 86 // High-pass filter to remove the DC component and very low frequency content. |
87 // We have experienced that this high-pass filtering improves voice/non-voiced | 87 // We have experienced that this high-pass filtering improves voice/non-voiced |
88 // classification. | 88 // classification. |
89 if (high_pass_filter_->Filter(frame, kNumSubframeSamples, | 89 if (high_pass_filter_->Filter(frame, kNumSubframeSamples, |
90 &audio_buffer_[num_buffer_samples_]) != 0) { | 90 &audio_buffer_[num_buffer_samples_]) != 0) { |
91 return -1; | 91 return -1; |
92 } | 92 } |
93 | 93 |
94 num_buffer_samples_ += kNumSubframeSamples; | 94 num_buffer_samples_ += kNumSubframeSamples; |
95 if (num_buffer_samples_ < kBufferLength) { | 95 if (num_buffer_samples_ < kBufferLength) { |
96 return 0; | 96 return 0; |
97 } | 97 } |
98 assert(num_buffer_samples_ == kBufferLength); | 98 assert(num_buffer_samples_ == kBufferLength); |
99 features->num_frames = kNum10msSubframes; | 99 features->num_frames = kNum10msSubframes; |
100 features->silence = false; | 100 features->silence = false; |
101 | 101 |
102 Rms(features->rms, kMaxNumFrames); | 102 Rms(features->rms, kMaxNumFrames); |
103 for (int i = 0; i < kNum10msSubframes; ++i) { | 103 for (size_t i = 0; i < kNum10msSubframes; ++i) { |
104 if (features->rms[i] < kSilenceRms) { | 104 if (features->rms[i] < kSilenceRms) { |
105 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence. | 105 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence. |
106 // Bail out here instead. | 106 // Bail out here instead. |
107 features->silence = true; | 107 features->silence = true; |
108 ResetBuffer(); | 108 ResetBuffer(); |
109 return 0; | 109 return 0; |
110 } | 110 } |
111 } | 111 } |
112 | 112 |
113 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz, | 113 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz, |
114 kMaxNumFrames); | 114 kMaxNumFrames); |
115 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames); | 115 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames); |
116 ResetBuffer(); | 116 ResetBuffer(); |
117 return 0; | 117 return 0; |
118 } | 118 } |
119 | 119 |
120 // Computes |kLpcOrder + 1| correlation coefficients. | 120 // Computes |kLpcOrder + 1| correlation coefficients. |
121 void VadAudioProc::SubframeCorrelation(double* corr, | 121 void VadAudioProc::SubframeCorrelation(double* corr, |
122 int length_corr, | 122 size_t length_corr, |
123 int subframe_index) { | 123 size_t subframe_index) { |
124 assert(length_corr >= kLpcOrder + 1); | 124 assert(length_corr >= kLpcOrder + 1); |
125 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples]; | 125 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples]; |
126 int buffer_index = subframe_index * kNumSubframeSamples; | 126 size_t buffer_index = subframe_index * kNumSubframeSamples; |
127 | 127 |
128 for (int n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++) | 128 for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++) |
129 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n]; | 129 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n]; |
130 | 130 |
131 WebRtcIsac_AutoCorr(corr, windowed_audio, | 131 WebRtcIsac_AutoCorr(corr, windowed_audio, |
132 kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder); | 132 kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder); |
133 } | 133 } |
134 | 134 |
135 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input. | 135 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input. |
136 // The analysis window is 15 ms long and it is centered on the first half of | 136 // The analysis window is 15 ms long and it is centered on the first half of |
137 // each 10 ms subframe. This is equivalent to computing LPC coefficients for the | 137 // each 10 ms subframe. This is equivalent to computing LPC coefficients for the |
138 // first half of each 10 ms subframe. | 138 // first half of each 10 ms subframe. |
139 void VadAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) { | 139 void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) { |
140 assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1)); | 140 assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1)); |
141 double corr[kLpcOrder + 1]; | 141 double corr[kLpcOrder + 1]; |
142 double reflec_coeff[kLpcOrder]; | 142 double reflec_coeff[kLpcOrder]; |
143 for (int i = 0, offset_lpc = 0; i < kNum10msSubframes; | 143 for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes; |
144 i++, offset_lpc += kLpcOrder + 1) { | 144 i++, offset_lpc += kLpcOrder + 1) { |
145 SubframeCorrelation(corr, kLpcOrder + 1, i); | 145 SubframeCorrelation(corr, kLpcOrder + 1, i); |
146 corr[0] *= 1.0001; | 146 corr[0] *= 1.0001; |
147 // This makes Lev-Durb a bit more stable. | 147 // This makes Lev-Durb a bit more stable. |
148 for (int k = 0; k < kLpcOrder + 1; k++) { | 148 for (size_t k = 0; k < kLpcOrder + 1; k++) { |
149 corr[k] *= kCorrWeight[k]; | 149 corr[k] *= kCorrWeight[k]; |
150 } | 150 } |
151 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder); | 151 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder); |
152 } | 152 } |
153 } | 153 } |
154 | 154 |
155 // Fit a second order curve to these 3 points and find the location of the | 155 // Fit a second order curve to these 3 points and find the location of the |
156 // extremum. The points are inverted before curve fitting. | 156 // extremum. The points are inverted before curve fitting. |
157 static float QuadraticInterpolation(float prev_val, | 157 static float QuadraticInterpolation(float prev_val, |
158 float curr_val, | 158 float curr_val, |
159 float next_val) { | 159 float next_val) { |
160 // Doing the interpolation in |1 / A(z)|^2. | 160 // Doing the interpolation in |1 / A(z)|^2. |
161 float fractional_index = 0; | 161 float fractional_index = 0; |
162 next_val = 1.0f / next_val; | 162 next_val = 1.0f / next_val; |
163 prev_val = 1.0f / prev_val; | 163 prev_val = 1.0f / prev_val; |
164 curr_val = 1.0f / curr_val; | 164 curr_val = 1.0f / curr_val; |
165 | 165 |
166 fractional_index = | 166 fractional_index = |
167 -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val); | 167 -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val); |
168 assert(fabs(fractional_index) < 1); | 168 assert(fabs(fractional_index) < 1); |
169 return fractional_index; | 169 return fractional_index; |
170 } | 170 } |
171 | 171 |
172 // 1 / A(z), where A(z) is defined by |lpc|, is a model of the spectral envelope | 172 // 1 / A(z), where A(z) is defined by |lpc|, is a model of the spectral envelope |
173 // of the input signal. A local maximum of the spectral envelope corresponds | 173 // of the input signal. A local maximum of the spectral envelope corresponds |
174 // to a local minimum of A(z), so searching A(z) directly saves complexity by | 174 // to a local minimum of A(z), so searching A(z) directly saves complexity by |
175 // avoiding one inversion. Furthermore, we find the first local minimum of the | 175 // avoiding one inversion. Furthermore, we find the first local minimum of the |
176 // magnitude squared of A(z), to save on one square root. | 176 // magnitude squared of A(z), to save on one square root. |
177 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) { | 177 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, |
| 178 size_t length_f_peak) { |
178 assert(length_f_peak >= kNum10msSubframes); | 179 assert(length_f_peak >= kNum10msSubframes); |
179 double lpc[kNum10msSubframes * (kLpcOrder + 1)]; | 180 double lpc[kNum10msSubframes * (kLpcOrder + 1)]; |
180 // For all sub-frames. | 181 // For all sub-frames. |
181 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1)); | 182 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1)); |
182 | 183 |
183 const int kNumDftCoefficients = kDftSize / 2 + 1; | 184 const size_t kNumDftCoefficients = kDftSize / 2 + 1; |
184 float data[kDftSize]; | 185 float data[kDftSize]; |
185 | 186 |
186 for (int i = 0; i < kNum10msSubframes; i++) { | 187 for (size_t i = 0; i < kNum10msSubframes; i++) { |
187 // Convert to float with zero pad. | 188 // Convert to float with zero pad. |
188 memset(data, 0, sizeof(data)); | 189 memset(data, 0, sizeof(data)); |
189 for (int n = 0; n < kLpcOrder + 1; n++) { | 190 for (size_t n = 0; n < kLpcOrder + 1; n++) { |
190 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]); | 191 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]); |
191 } | 192 } |
192 // Transform to frequency domain. | 193 // Transform to frequency domain. |
193 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); | 194 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); |
194 | 195 |
195 int index_peak = 0; | 196 size_t index_peak = 0; |
196 float prev_magn_sqr = data[0] * data[0]; | 197 float prev_magn_sqr = data[0] * data[0]; |
197 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3]; | 198 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3]; |
198 float next_magn_sqr; | 199 float next_magn_sqr; |
199 bool found_peak = false; | 200 bool found_peak = false; |
200 for (int n = 2; n < kNumDftCoefficients - 1; n++) { | 201 for (size_t n = 2; n < kNumDftCoefficients - 1; n++) { |
201 next_magn_sqr = | 202 next_magn_sqr = |
202 data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1]; | 203 data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1]; |
203 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { | 204 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { |
204 found_peak = true; | 205 found_peak = true; |
205 index_peak = n - 1; | 206 index_peak = n - 1; |
206 break; | 207 break; |
207 } | 208 } |
208 prev_magn_sqr = curr_magn_sqr; | 209 prev_magn_sqr = curr_magn_sqr; |
209 curr_magn_sqr = next_magn_sqr; | 210 curr_magn_sqr = next_magn_sqr; |
210 } | 211 } |
(...skipping 10 matching lines...) Loading... |
221 fractional_index = | 222 fractional_index = |
222 QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr); | 223 QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr); |
223 } | 224 } |
224 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution; | 225 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution; |
225 } | 226 } |
226 } | 227 } |
227 | 228 |
228 // Using iSAC functions to estimate pitch gains & lags. | 229 // Using iSAC functions to estimate pitch gains & lags. |
229 void VadAudioProc::PitchAnalysis(double* log_pitch_gains, | 230 void VadAudioProc::PitchAnalysis(double* log_pitch_gains, |
230 double* pitch_lags_hz, | 231 double* pitch_lags_hz, |
231 int length) { | 232 size_t length) { |
232 // TODO(turajs): This can be "imported" from iSAC, along with the next two | 233 // TODO(turajs): This can be "imported" from iSAC, along with the next two |
233 // constants. | 234 // constants. |
234 assert(length >= kNum10msSubframes); | 235 assert(length >= kNum10msSubframes); |
235 const int kNumPitchSubframes = 4; | 236 const int kNumPitchSubframes = 4; |
236 double gains[kNumPitchSubframes]; | 237 double gains[kNumPitchSubframes]; |
237 double lags[kNumPitchSubframes]; | 238 double lags[kNumPitchSubframes]; |
238 | 239 |
239 const int kNumSubbandFrameSamples = 240; | 240 const int kNumSubbandFrameSamples = 240; |
240 const int kNumLookaheadSamples = 24; | 241 const int kNumLookaheadSamples = 24; |
241 | 242 |
(...skipping 11 matching lines...) Loading... |
253 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter, | 254 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter, |
254 pitch_analysis_handle_.get(), lags, gains); | 255 pitch_analysis_handle_.get(), lags, gains); |
255 | 256 |
256 // Lags are computed on lower-band signal with sampling rate half of the | 257 // Lags are computed on lower-band signal with sampling rate half of the |
257 // input signal. | 258 // input signal. |
258 GetSubframesPitchParameters( | 259 GetSubframesPitchParameters( |
259 kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes, | 260 kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes, |
260 &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz); | 261 &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz); |
261 } | 262 } |
262 | 263 |
263 void VadAudioProc::Rms(double* rms, int length_rms) { | 264 void VadAudioProc::Rms(double* rms, size_t length_rms) { |
264 assert(length_rms >= kNum10msSubframes); | 265 assert(length_rms >= kNum10msSubframes); |
265 int offset = kNumPastSignalSamples; | 266 size_t offset = kNumPastSignalSamples; |
266 for (int i = 0; i < kNum10msSubframes; i++) { | 267 for (size_t i = 0; i < kNum10msSubframes; i++) { |
267 rms[i] = 0; | 268 rms[i] = 0; |
268 for (int n = 0; n < kNumSubframeSamples; n++, offset++) | 269 for (size_t n = 0; n < kNumSubframeSamples; n++, offset++) |
269 rms[i] += audio_buffer_[offset] * audio_buffer_[offset]; | 270 rms[i] += audio_buffer_[offset] * audio_buffer_[offset]; |
270 rms[i] = sqrt(rms[i] / kNumSubframeSamples); | 271 rms[i] = sqrt(rms[i] / kNumSubframeSamples); |
271 } | 272 } |
272 } | 273 } |
273 | 274 |
274 } // namespace webrtc | 275 } // namespace webrtc |
OLD | NEW |