Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(282)

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1878133002: Disable Intelligibility Enhancer for high SNRs (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master
Patch Set: Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD NEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 20 matching lines...) Expand all
31 const float kKbdAlpha = 1.5f; 31 const float kKbdAlpha = 1.5f;
32 const float kLambdaBot = -1.f; // Extreme values in bisection 32 const float kLambdaBot = -1.f; // Extreme values in bisection
33 const float kLambdaTop = -1e-5f; // search for lambda. 33 const float kLambdaTop = -1e-5f; // search for lambda.
34 const float kVoiceProbabilityThreshold = 0.02f; 34 const float kVoiceProbabilityThreshold = 0.02f;
35 // Number of chunks after voice activity that are still considered speech. 35 // Number of chunks after voice activity that are still considered speech.
36 const size_t kSpeechOffsetDelay = 80; 36 const size_t kSpeechOffsetDelay = 80;
37 const float kDecayRate = 0.994f; // Power estimation decay rate. 37 const float kDecayRate = 0.994f; // Power estimation decay rate.
38 const float kMaxRelativeGainChange = 0.006f; 38 const float kMaxRelativeGainChange = 0.006f;
39 const float kRho = 0.0004f; // Default production and interpretation SNR. 39 const float kRho = 0.0004f; // Default production and interpretation SNR.
40 const float kPowerNormalizationFactor = 1.f / (1 << 30); 40 const float kPowerNormalizationFactor = 1.f / (1 << 30);
41 const float kMaxActiveSNR = 128.f; // 21dB
42 const float kMinInactiveSNR = 32.f; // 15dB
41 43
42 // Returns dot product of vectors |a| and |b| with size |length|. 44 // Returns dot product of vectors |a| and |b| with size |length|.
43 float DotProduct(const float* a, const float* b, size_t length) { 45 float DotProduct(const float* a, const float* b, size_t length) {
44 float ret = 0.f; 46 float ret = 0.f;
45 for (size_t i = 0; i < length; ++i) { 47 for (size_t i = 0; i < length; ++i) {
46 ret += a[i] * b[i]; 48 ret += a[i] * b[i];
47 } 49 }
48 return ret; 50 return ret;
49 } 51 }
50 52
(...skipping 26 matching lines...) Expand all
77 filtered_clear_pow_(bank_size_, 0.f), 79 filtered_clear_pow_(bank_size_, 0.f),
78 filtered_noise_pow_(num_noise_bins, 0.f), 80 filtered_noise_pow_(num_noise_bins, 0.f),
79 center_freqs_(bank_size_), 81 center_freqs_(bank_size_),
80 capture_filter_bank_(CreateErbBank(num_noise_bins)), 82 capture_filter_bank_(CreateErbBank(num_noise_bins)),
81 render_filter_bank_(CreateErbBank(freqs_)), 83 render_filter_bank_(CreateErbBank(freqs_)),
82 gains_eq_(bank_size_), 84 gains_eq_(bank_size_),
83 gain_applier_(freqs_, kMaxRelativeGainChange), 85 gain_applier_(freqs_, kMaxRelativeGainChange),
84 audio_s16_(chunk_length_), 86 audio_s16_(chunk_length_),
85 chunks_since_voice_(kSpeechOffsetDelay), 87 chunks_since_voice_(kSpeechOffsetDelay),
86 is_speech_(false), 88 is_speech_(false),
89 snr_(kMaxActiveSNR),
90 is_active_(false),
87 noise_estimation_buffer_(num_noise_bins), 91 noise_estimation_buffer_(num_noise_bins),
88 noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer, 92 noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer,
89 std::vector<float>(num_noise_bins), 93 std::vector<float>(num_noise_bins),
90 RenderQueueItemVerifier<float>(num_noise_bins)) { 94 RenderQueueItemVerifier<float>(num_noise_bins)) {
91 RTC_DCHECK_LE(kRho, 1.f); 95 RTC_DCHECK_LE(kRho, 1.f);
92 96
93 const size_t erb_index = static_cast<size_t>( 97 const size_t erb_index = static_cast<size_t>(
94 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + 98 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +
95 43.f)); 99 43.f));
96 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); 100 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
128 void IntelligibilityEnhancer::ProcessAudioBlock( 132 void IntelligibilityEnhancer::ProcessAudioBlock(
129 const std::complex<float>* const* in_block, 133 const std::complex<float>* const* in_block,
130 size_t in_channels, 134 size_t in_channels,
131 size_t frames, 135 size_t frames,
132 size_t /* out_channels */, 136 size_t /* out_channels */,
133 std::complex<float>* const* out_block) { 137 std::complex<float>* const* out_block) {
134 RTC_DCHECK_EQ(freqs_, frames); 138 RTC_DCHECK_EQ(freqs_, frames);
135 if (is_speech_) { 139 if (is_speech_) {
136 clear_power_estimator_.Step(in_block[0]); 140 clear_power_estimator_.Step(in_block[0]);
137 } 141 }
138 const std::vector<float>& clear_power = clear_power_estimator_.power(); 142 UpdateActivity();
139 const std::vector<float>& noise_power = noise_power_estimator_.power(); 143 if (is_active_) {
140 MapToErbBands(clear_power.data(), render_filter_bank_, 144 MapToErbBands(clear_power_estimator_.power().data(), render_filter_bank_,
141 filtered_clear_pow_.data()); 145 filtered_clear_pow_.data());
142 MapToErbBands(noise_power.data(), capture_filter_bank_, 146 MapToErbBands(noise_power_estimator_.power().data(), capture_filter_bank_,
143 filtered_noise_pow_.data()); 147 filtered_noise_pow_.data());
144 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data()); 148 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());
145 const float power_target = std::accumulate( 149 const float power_target = std::accumulate(
146 filtered_clear_pow_.data(), filtered_clear_pow_.data() + bank_size_, 0.f); 150 filtered_clear_pow_.data(),
147 const float power_top = 151 filtered_clear_pow_.data() + bank_size_,
148 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); 152 0.f);
149 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data()); 153 const float power_top =
150 const float power_bot = 154 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
151 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); 155 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());
152 if (power_target >= power_bot && power_target <= power_top) { 156 const float power_bot =
153 SolveForLambda(power_target); 157 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
154 UpdateErbGains(); 158 if (power_target >= power_bot && power_target <= power_top) {
155 } // Else experiencing power underflow, so do nothing. 159 SolveForLambda(power_target);
160 UpdateErbGains();
161 } // Else experiencing power underflow, so do nothing.
162 }
156 for (size_t i = 0; i < in_channels; ++i) { 163 for (size_t i = 0; i < in_channels; ++i) {
157 gain_applier_.Apply(in_block[i], out_block[i]); 164 gain_applier_.Apply(in_block[i], out_block[i]);
158 } 165 }
159 } 166 }
160 167
168 void IntelligibilityEnhancer::UpdateActivity() {
peah-webrtc 2016/04/12 13:39:21 What you are updating here is the is_active flag a
aluebs-webrtc 2016/04/12 18:34:28 Yes, your understanding is completely right. And I
169 const float* clear_psd = clear_power_estimator_.power().data();
170 const float* noise_psd = noise_power_estimator_.power().data();
171 const float clear_power =
172 std::accumulate(clear_psd, clear_psd + freqs_, 0.f);
173 const float noise_power =
174 std::accumulate(noise_psd, noise_psd + freqs_, 0.f);
175 snr_ = kDecayRate * snr_ + (1.f - kDecayRate) * clear_power / noise_power;
peah-webrtc 2016/04/12 13:39:21 This SNR estimate is an average of the instantaneo
peah-webrtc 2016/04/12 13:39:21 This SNR estimate is assuming that the ratio of th
aluebs-webrtc 2016/04/12 18:34:28 That is an interesting point. Because the PSDs are
aluebs-webrtc 2016/04/12 18:34:28 As discussed offline at the beginning of this proj
176 if (is_active_) {
177 if (snr_ > kMaxActiveSNR) {
178 is_active_ = false;
179 // Set the target gains to unity.
180 float* gains = gain_applier_.target();
181 for (size_t i = 0; i < freqs_; ++i) {
182 gains[i] = 1.f;
183 }
184 }
185 } else {
186 is_active_ = snr_ < kMinInactiveSNR;
187 }
188 }
189
161 void IntelligibilityEnhancer::SolveForLambda(float power_target) { 190 void IntelligibilityEnhancer::SolveForLambda(float power_target) {
162 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values 191 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values
163 const int kMaxIters = 100; // for these, based on experiments. 192 const int kMaxIters = 100; // for these, based on experiments.
164 193
165 const float reciprocal_power_target = 194 const float reciprocal_power_target =
166 1.f / (power_target + std::numeric_limits<float>::epsilon()); 195 1.f / (power_target + std::numeric_limits<float>::epsilon());
167 float lambda_bot = kLambdaBot; 196 float lambda_bot = kLambdaBot;
168 float lambda_top = kLambdaTop; 197 float lambda_top = kLambdaTop;
169 float power_ratio = 2.f; // Ratio of achieved power to target power. 198 float power_ratio = 2.f; // Ratio of achieved power to target power.
170 int iters = 0; 199 int iters = 0;
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after
308 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_); 337 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);
309 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { 338 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {
310 chunks_since_voice_ = 0; 339 chunks_since_voice_ = 0;
311 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { 340 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {
312 ++chunks_since_voice_; 341 ++chunks_since_voice_;
313 } 342 }
314 return chunks_since_voice_ < kSpeechOffsetDelay; 343 return chunks_since_voice_ < kSpeechOffsetDelay;
315 } 344 }
316 345
317 } // namespace webrtc 346 } // namespace webrtc
OLD NEW
« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698