| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license | |
| 5 * that can be found in the LICENSE file in the root of the source | |
| 6 * tree. An additional intellectual property rights grant can be found | |
| 7 * in the file PATENTS. All contributing project authors may | |
| 8 * be found in the AUTHORS file in the root of the source tree. | |
| 9 */ | |
| 10 | |
| 11 #include "webrtc/modules/audio_processing/agc/agc_audio_proc.h" | |
| 12 | |
| 13 #include <math.h> | |
| 14 #include <stdio.h> | |
| 15 | |
| 16 #include "webrtc/common_audio/fft4g.h" | |
| 17 #include "webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h" | |
| 18 #include "webrtc/modules/audio_processing/agc/pitch_internal.h" | |
| 19 #include "webrtc/modules/audio_processing/agc/pole_zero_filter.h" | |
| 20 extern "C" { | |
| 21 #include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h" | |
| 22 #include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h" | |
| 23 #include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h" | |
| 24 #include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h" | |
| 25 } | |
| 26 #include "webrtc/modules/interface/module_common_types.h" | |
| 27 | |
| 28 namespace webrtc { | |
| 29 | |
// The following structures are declared anonymous in iSAC's structs.h. To
// forward declare them, we use this derived class trick: each nested struct
// adds no members of its own and simply inherits from the global iSAC struct
// of the same name, so a pointer to it can be handed directly to the iSAC C
// functions that expect the base type.
struct AgcAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
struct AgcAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
| 34 | |
// Width of one DFT bin: the sample rate divided by the DFT size. Used to
// convert a (fractional) DFT bin index into a frequency.
static const float kFrequencyResolution = kSampleRateHz /
    static_cast<float>(AgcAudioProc::kDftSize);
// Sub-frames whose RMS falls below this value are treated as silence and are
// not passed on to pitch analysis.
static const int kSilenceRms = 5;
| 38 | |
| 39 // TODO(turajs): Make a Create or Init for AgcAudioProc. | |
| 40 AgcAudioProc::AgcAudioProc() | |
| 41 : audio_buffer_(), | |
| 42 num_buffer_samples_(kNumPastSignalSamples), | |
| 43 log_old_gain_(-2), | |
| 44 old_lag_(50), // Arbitrary but valid as pitch-lag (in samples). | |
| 45 pitch_analysis_handle_(new PitchAnalysisStruct), | |
| 46 pre_filter_handle_(new PreFiltBankstr), | |
| 47 high_pass_filter_(PoleZeroFilter::Create( | |
| 48 kCoeffNumerator, kFilterOrder, kCoeffDenominator, kFilterOrder)) { | |
| 49 static_assert(kNumPastSignalSamples + kNumSubframeSamples == | |
| 50 sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]), | |
| 51 "lpc analysis window incorrect size"); | |
| 52 static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]), | |
| 53 "correlation weight incorrect size"); | |
| 54 | |
| 55 // TODO(turajs): Are we doing too much in the constructor? | |
| 56 float data[kDftSize]; | |
| 57 // Make FFT to initialize. | |
| 58 ip_[0] = 0; | |
| 59 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); | |
| 60 // TODO(turajs): Need to initialize high-pass filter. | |
| 61 | |
| 62 // Initialize iSAC components. | |
| 63 WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get()); | |
| 64 WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get()); | |
| 65 } | |
| 66 | |
// Nothing to release explicitly here.
// NOTE(review): the handles and the filter are presumably held by owning
// smart-pointer members (they are accessed via .get() / ->) - confirm in the
// header.
AgcAudioProc::~AgcAudioProc() {}
| 68 | |
| 69 void AgcAudioProc::ResetBuffer() { | |
| 70 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess], | |
| 71 sizeof(audio_buffer_[0]) * kNumPastSignalSamples); | |
| 72 num_buffer_samples_ = kNumPastSignalSamples; | |
| 73 } | |
| 74 | |
| 75 int AgcAudioProc::ExtractFeatures(const int16_t* frame, | |
| 76 int length, | |
| 77 AudioFeatures* features) { | |
| 78 features->num_frames = 0; | |
| 79 if (length != kNumSubframeSamples) { | |
| 80 return -1; | |
| 81 } | |
| 82 | |
| 83 // High-pass filter to remove the DC component and very low frequency content. | |
| 84 // We have experienced that this high-pass filtering improves voice/non-voiced | |
| 85 // classification. | |
| 86 if (high_pass_filter_->Filter(frame, kNumSubframeSamples, | |
| 87 &audio_buffer_[num_buffer_samples_]) != 0) { | |
| 88 return -1; | |
| 89 } | |
| 90 | |
| 91 num_buffer_samples_ += kNumSubframeSamples; | |
| 92 if (num_buffer_samples_ < kBufferLength) { | |
| 93 return 0; | |
| 94 } | |
| 95 assert(num_buffer_samples_ == kBufferLength); | |
| 96 features->num_frames = kNum10msSubframes; | |
| 97 features->silence = false; | |
| 98 | |
| 99 Rms(features->rms, kMaxNumFrames); | |
| 100 for (int i = 0; i < kNum10msSubframes; ++i) { | |
| 101 if (features->rms[i] < kSilenceRms) { | |
| 102 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence. | |
| 103 // Bail out here instead. | |
| 104 features->silence = true; | |
| 105 ResetBuffer(); | |
| 106 return 0; | |
| 107 } | |
| 108 } | |
| 109 | |
| 110 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz, | |
| 111 kMaxNumFrames); | |
| 112 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames); | |
| 113 ResetBuffer(); | |
| 114 return 0; | |
| 115 } | |
| 116 | |
| 117 // Computes |kLpcOrder + 1| correlation coefficients. | |
| 118 void AgcAudioProc::SubframeCorrelation(double* corr, int length_corr, | |
| 119 int subframe_index) { | |
| 120 assert(length_corr >= kLpcOrder + 1); | |
| 121 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples]; | |
| 122 int buffer_index = subframe_index * kNumSubframeSamples; | |
| 123 | |
| 124 for (int n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++) | |
| 125 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n]; | |
| 126 | |
| 127 WebRtcIsac_AutoCorr(corr, windowed_audio, kNumSubframeSamples + | |
| 128 kNumPastSignalSamples, kLpcOrder); | |
| 129 } | |
| 130 | |
| 131 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input. | |
| 132 // The analysis window is 15 ms long and it is centered on the first half of | |
| 133 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the | |
| 134 // first half of each 10 ms subframe. | |
| 135 void AgcAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) { | |
| 136 assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1)); | |
| 137 double corr[kLpcOrder + 1]; | |
| 138 double reflec_coeff[kLpcOrder]; | |
| 139 for (int i = 0, offset_lpc = 0; i < kNum10msSubframes; | |
| 140 i++, offset_lpc += kLpcOrder + 1) { | |
| 141 SubframeCorrelation(corr, kLpcOrder + 1, i); | |
| 142 corr[0] *= 1.0001; | |
| 143 // This makes Lev-Durb a bit more stable. | |
| 144 for (int k = 0; k < kLpcOrder + 1; k++) { | |
| 145 corr[k] *= kCorrWeight[k]; | |
| 146 } | |
| 147 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder); | |
| 148 } | |
| 149 } | |
| 150 | |
// Fits a parabola through three equally spaced points and returns the position
// of its extremum as an offset relative to the middle point. The three values
// are inverted first, i.e. the fit is carried out on |1 / A(z)|^2 rather than
// on the values passed in. The result is expected to lie strictly between -1
// and 1.
static float QuadraticInterpolation(float prev_val, float curr_val,
                                    float next_val) {
  const float inv_next = 1.0f / next_val;
  const float inv_prev = 1.0f / prev_val;
  const float inv_curr = 1.0f / curr_val;

  const float offset = -(inv_next - inv_prev) * 0.5f /
                       (inv_next + inv_prev - 2.f * inv_curr);
  assert(fabs(offset) < 1);
  return offset;
}
| 166 | |
| 167 // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope | |
| 168 // of the input signal. The local maximum of the spectral envelope corresponds | |
| 169 // with the local minimum of A(z). It saves complexity, as we save one | |
| 170 // inversion. Furthermore, we find the first local maximum of magnitude squared, | |
| 171 // to save on one square root. | |
| 172 void AgcAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) { | |
| 173 assert(length_f_peak >= kNum10msSubframes); | |
| 174 double lpc[kNum10msSubframes * (kLpcOrder + 1)]; | |
| 175 // For all sub-frames. | |
| 176 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1)); | |
| 177 | |
| 178 const int kNumDftCoefficients = kDftSize / 2 + 1; | |
| 179 float data[kDftSize]; | |
| 180 | |
| 181 for (int i = 0; i < kNum10msSubframes; i++) { | |
| 182 // Convert to float with zero pad. | |
| 183 memset(data, 0, sizeof(data)); | |
| 184 for (int n = 0; n < kLpcOrder + 1; n++) { | |
| 185 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]); | |
| 186 } | |
| 187 // Transform to frequency domain. | |
| 188 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); | |
| 189 | |
| 190 int index_peak = 0; | |
| 191 float prev_magn_sqr = data[0] * data[0]; | |
| 192 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3]; | |
| 193 float next_magn_sqr; | |
| 194 bool found_peak = false; | |
| 195 for (int n = 2; n < kNumDftCoefficients - 1; n++) { | |
| 196 next_magn_sqr = data[2 * n] * data[2 * n] + | |
| 197 data[2 * n + 1] * data[2 * n + 1]; | |
| 198 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { | |
| 199 found_peak = true; | |
| 200 index_peak = n - 1; | |
| 201 break; | |
| 202 } | |
| 203 prev_magn_sqr = curr_magn_sqr; | |
| 204 curr_magn_sqr = next_magn_sqr; | |
| 205 } | |
| 206 float fractional_index = 0; | |
| 207 if (!found_peak) { | |
| 208 // Checking if |kNumDftCoefficients - 1| is the local minimum. | |
| 209 next_magn_sqr = data[1] * data[1]; | |
| 210 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { | |
| 211 index_peak = kNumDftCoefficients - 1; | |
| 212 } | |
| 213 } else { | |
| 214 // A peak is found, do a simple quadratic interpolation to get a more | |
| 215 // accurate estimate of the peak location. | |
| 216 fractional_index = QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, | |
| 217 next_magn_sqr); | |
| 218 } | |
| 219 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution; | |
| 220 } | |
| 221 } | |
| 222 | |
| 223 // Using iSAC functions to estimate pitch gains & lags. | |
| 224 void AgcAudioProc::PitchAnalysis(double* log_pitch_gains, double* pitch_lags_hz, | |
| 225 int length) { | |
| 226 // TODO(turajs): This can be "imported" from iSAC & and the next two | |
| 227 // constants. | |
| 228 assert(length >= kNum10msSubframes); | |
| 229 const int kNumPitchSubframes = 4; | |
| 230 double gains[kNumPitchSubframes]; | |
| 231 double lags[kNumPitchSubframes]; | |
| 232 | |
| 233 const int kNumSubbandFrameSamples = 240; | |
| 234 const int kNumLookaheadSamples = 24; | |
| 235 | |
| 236 float lower[kNumSubbandFrameSamples]; | |
| 237 float upper[kNumSubbandFrameSamples]; | |
| 238 double lower_lookahead[kNumSubbandFrameSamples]; | |
| 239 double upper_lookahead[kNumSubbandFrameSamples]; | |
| 240 double lower_lookahead_pre_filter[kNumSubbandFrameSamples + | |
| 241 kNumLookaheadSamples]; | |
| 242 | |
| 243 // Split signal to lower and upper bands | |
| 244 WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], | |
| 245 lower, upper, lower_lookahead, upper_lookahead, | |
| 246 pre_filter_handle_.get()); | |
| 247 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter, | |
| 248 pitch_analysis_handle_.get(), lags, gains); | |
| 249 | |
| 250 // Lags are computed on lower-band signal with sampling rate half of the | |
| 251 // input signal. | |
| 252 GetSubframesPitchParameters(kSampleRateHz / 2, gains, lags, | |
| 253 kNumPitchSubframes, kNum10msSubframes, | |
| 254 &log_old_gain_, &old_lag_, | |
| 255 log_pitch_gains, pitch_lags_hz); | |
| 256 } | |
| 257 | |
| 258 void AgcAudioProc::Rms(double* rms, int length_rms) { | |
| 259 assert(length_rms >= kNum10msSubframes); | |
| 260 int offset = kNumPastSignalSamples; | |
| 261 for (int i = 0; i < kNum10msSubframes; i++) { | |
| 262 rms[i] = 0; | |
| 263 for (int n = 0; n < kNumSubframeSamples; n++, offset++) | |
| 264 rms[i] += audio_buffer_[offset] * audio_buffer_[offset]; | |
| 265 rms[i] = sqrt(rms[i] / kNumSubframeSamples); | |
| 266 } | |
| 267 } | |
| 268 | |
| 269 } // namespace webrtc | |
| OLD | NEW |