Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 20 matching lines...) | |
| 31 const float kKbdAlpha = 1.5f; | 31 const float kKbdAlpha = 1.5f; |
| 32 const float kLambdaBot = -1.f; // Extreme values in bisection | 32 const float kLambdaBot = -1.f; // Extreme values in bisection |
| 33 const float kLambdaTop = -1e-5f; // search for lambda. | 33 const float kLambdaTop = -1e-5f; // search for lambda. |
| 34 const float kVoiceProbabilityThreshold = 0.02f; | 34 const float kVoiceProbabilityThreshold = 0.02f; |
| 35 // Number of chunks after voice activity which is still considered speech. | 35 // Number of chunks after voice activity which is still considered speech. |
| 36 const size_t kSpeechOffsetDelay = 80; | 36 const size_t kSpeechOffsetDelay = 80; |
| 37 const float kDecayRate = 0.994f; // Power estimation decay rate. | 37 const float kDecayRate = 0.994f; // Power estimation decay rate. |
| 38 const float kMaxRelativeGainChange = 0.006f; | 38 const float kMaxRelativeGainChange = 0.006f; |
| 39 const float kRho = 0.0004f; // Default production and interpretation SNR. | 39 const float kRho = 0.0004f; // Default production and interpretation SNR. |
| 40 const float kPowerNormalizationFactor = 1.f / (1 << 30); | 40 const float kPowerNormalizationFactor = 1.f / (1 << 30); |
| 41 const float kMaxActiveSNR = 128.f; // 21dB | |
| 42 const float kMinInactiveSNR = 32.f; // 15dB | |
| 41 | 43 |
| 42 // Returns dot product of vectors |a| and |b| with size |length|. | 44 // Returns dot product of vectors |a| and |b| with size |length|. |
| 43 float DotProduct(const float* a, const float* b, size_t length) { | 45 float DotProduct(const float* a, const float* b, size_t length) { |
| 44 float ret = 0.f; | 46 float ret = 0.f; |
| 45 for (size_t i = 0; i < length; ++i) { | 47 for (size_t i = 0; i < length; ++i) { |
| 46 ret += a[i] * b[i]; | 48 ret += a[i] * b[i]; |
| 47 } | 49 } |
| 48 return ret; | 50 return ret; |
| 49 } | 51 } |
| 50 | 52 |
| (...skipping 26 matching lines...) | |
| 77 filtered_clear_pow_(bank_size_, 0.f), | 79 filtered_clear_pow_(bank_size_, 0.f), |
| 78 filtered_noise_pow_(num_noise_bins, 0.f), | 80 filtered_noise_pow_(num_noise_bins, 0.f), |
| 79 center_freqs_(bank_size_), | 81 center_freqs_(bank_size_), |
| 80 capture_filter_bank_(CreateErbBank(num_noise_bins)), | 82 capture_filter_bank_(CreateErbBank(num_noise_bins)), |
| 81 render_filter_bank_(CreateErbBank(freqs_)), | 83 render_filter_bank_(CreateErbBank(freqs_)), |
| 82 gains_eq_(bank_size_), | 84 gains_eq_(bank_size_), |
| 83 gain_applier_(freqs_, kMaxRelativeGainChange), | 85 gain_applier_(freqs_, kMaxRelativeGainChange), |
| 84 audio_s16_(chunk_length_), | 86 audio_s16_(chunk_length_), |
| 85 chunks_since_voice_(kSpeechOffsetDelay), | 87 chunks_since_voice_(kSpeechOffsetDelay), |
| 86 is_speech_(false), | 88 is_speech_(false), |
| 89 snr_(kMaxActiveSNR), | |
| 90 is_active_(false), | |
| 87 noise_estimation_buffer_(num_noise_bins), | 91 noise_estimation_buffer_(num_noise_bins), |
| 88 noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer, | 92 noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer, |
| 89 std::vector<float>(num_noise_bins), | 93 std::vector<float>(num_noise_bins), |
| 90 RenderQueueItemVerifier<float>(num_noise_bins)) { | 94 RenderQueueItemVerifier<float>(num_noise_bins)) { |
| 91 RTC_DCHECK_LE(kRho, 1.f); | 95 RTC_DCHECK_LE(kRho, 1.f); |
| 92 | 96 |
| 93 const size_t erb_index = static_cast<size_t>( | 97 const size_t erb_index = static_cast<size_t>( |
| 94 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + | 98 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + |
| 95 43.f)); | 99 43.f)); |
| 96 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); | 100 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); |
| (...skipping 31 matching lines...) | |
| 128 void IntelligibilityEnhancer::ProcessAudioBlock( | 132 void IntelligibilityEnhancer::ProcessAudioBlock( |
| 129 const std::complex<float>* const* in_block, | 133 const std::complex<float>* const* in_block, |
| 130 size_t in_channels, | 134 size_t in_channels, |
| 131 size_t frames, | 135 size_t frames, |
| 132 size_t /* out_channels */, | 136 size_t /* out_channels */, |
| 133 std::complex<float>* const* out_block) { | 137 std::complex<float>* const* out_block) { |
| 134 RTC_DCHECK_EQ(freqs_, frames); | 138 RTC_DCHECK_EQ(freqs_, frames); |
| 135 if (is_speech_) { | 139 if (is_speech_) { |
| 136 clear_power_estimator_.Step(in_block[0]); | 140 clear_power_estimator_.Step(in_block[0]); |
| 137 } | 141 } |
| 138 const std::vector<float>& clear_power = clear_power_estimator_.power(); | 142 UpdateActivity(); |
| 139 const std::vector<float>& noise_power = noise_power_estimator_.power(); | 143 if (is_active_) { |
| 140 MapToErbBands(clear_power.data(), render_filter_bank_, | 144 MapToErbBands(clear_power_estimator_.power().data(), render_filter_bank_, |
| 141 filtered_clear_pow_.data()); | 145 filtered_clear_pow_.data()); |
| 142 MapToErbBands(noise_power.data(), capture_filter_bank_, | 146 MapToErbBands(noise_power_estimator_.power().data(), capture_filter_bank_, |
| 143 filtered_noise_pow_.data()); | 147 filtered_noise_pow_.data()); |
| 144 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data()); | 148 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data()); |
| 145 const float power_target = std::accumulate( | 149 const float power_target = std::accumulate( |
| 146 filtered_clear_pow_.data(), filtered_clear_pow_.data() + bank_size_, 0.f); | 150 filtered_clear_pow_.data(), |
| 147 const float power_top = | 151 filtered_clear_pow_.data() + bank_size_, |
| 148 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); | 152 0.f); |
| 149 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data()); | 153 const float power_top = |
| 150 const float power_bot = | 154 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); |
| 151 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); | 155 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data()); |
| 152 if (power_target >= power_bot && power_target <= power_top) { | 156 const float power_bot = |
| 153 SolveForLambda(power_target); | 157 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); |
| 154 UpdateErbGains(); | 158 if (power_target >= power_bot && power_target <= power_top) { |
| 155 } // Else experiencing power underflow, so do nothing. | 159 SolveForLambda(power_target); |
| 160 UpdateErbGains(); | |
| 161 } // Else experiencing power underflow, so do nothing. | |
| 162 } | |
| 156 for (size_t i = 0; i < in_channels; ++i) { | 163 for (size_t i = 0; i < in_channels; ++i) { |
| 157 gain_applier_.Apply(in_block[i], out_block[i]); | 164 gain_applier_.Apply(in_block[i], out_block[i]); |
| 158 } | 165 } |
| 159 } | 166 } |
| 160 | 167 |
| 168 void IntelligibilityEnhancer::UpdateActivity() { | |
|
peah-webrtc
2016/04/12 13:39:21
What you are updating here is the is_active flag a
aluebs-webrtc
2016/04/12 18:34:28
Yes, your understanding is completely right. And I
| |
| 169 const float* clear_psd = clear_power_estimator_.power().data(); | |
| 170 const float* noise_psd = noise_power_estimator_.power().data(); | |
| 171 const float clear_power = | |
| 172 std::accumulate(clear_psd, clear_psd + freqs_, 0.f); | |
| 173 const float noise_power = | |
| 174 std::accumulate(noise_psd, noise_psd + freqs_, 0.f); | |
| 175 snr_ = kDecayRate * snr_ + (1.f - kDecayRate) * clear_power / noise_power; | |
|
peah-webrtc
2016/04/12 13:39:21
This SNR estimate is an average of the instantaneo
peah-webrtc
2016/04/12 13:39:21
This SNR estimate is assuming that the ratio of th
aluebs-webrtc
2016/04/12 18:34:28
That is an interesting point. Because the PSDs are
aluebs-webrtc
2016/04/12 18:34:28
As discussed offline at the beginning of this proj
| |
| 176 if (is_active_) { | |
| 177 if (snr_ > kMaxActiveSNR) { | |
| 178 is_active_ = false; | |
| 179 // Set the target gains to unity. | |
| 180 float* gains = gain_applier_.target(); | |
| 181 for (size_t i = 0; i < freqs_; ++i) { | |
| 182 gains[i] = 1.f; | |
| 183 } | |
| 184 } | |
| 185 } else { | |
| 186 is_active_ = snr_ < kMinInactiveSNR; | |
| 187 } | |
| 188 } | |
| 189 | |
| 161 void IntelligibilityEnhancer::SolveForLambda(float power_target) { | 190 void IntelligibilityEnhancer::SolveForLambda(float power_target) { |
| 162 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values | 191 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values |
| 163 const int kMaxIters = 100; // for these, based on experiments. | 192 const int kMaxIters = 100; // for these, based on experiments. |
| 164 | 193 |
| 165 const float reciprocal_power_target = | 194 const float reciprocal_power_target = |
| 166 1.f / (power_target + std::numeric_limits<float>::epsilon()); | 195 1.f / (power_target + std::numeric_limits<float>::epsilon()); |
| 167 float lambda_bot = kLambdaBot; | 196 float lambda_bot = kLambdaBot; |
| 168 float lambda_top = kLambdaTop; | 197 float lambda_top = kLambdaTop; |
| 169 float power_ratio = 2.f; // Ratio of achieved power to target power. | 198 float power_ratio = 2.f; // Ratio of achieved power to target power. |
| 170 int iters = 0; | 199 int iters = 0; |
| (...skipping 137 matching lines...) | |
| 308 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_); | 337 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_); |
| 309 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { | 338 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { |
| 310 chunks_since_voice_ = 0; | 339 chunks_since_voice_ = 0; |
| 311 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { | 340 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { |
| 312 ++chunks_since_voice_; | 341 ++chunks_since_voice_; |
| 313 } | 342 } |
| 314 return chunks_since_voice_ < kSpeechOffsetDelay; | 343 return chunks_since_voice_ < kSpeechOffsetDelay; |
| 315 } | 344 } |
| 316 | 345 |
| 317 } // namespace webrtc | 346 } // namespace webrtc |
| OLD | NEW |