OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h" | 11 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h" |
12 | 12 |
13 #include <math.h> | 13 #include <math.h> |
14 #include <stdlib.h> | 14 #include <stdlib.h> |
15 #include <algorithm> | 15 #include <algorithm> |
16 #include <limits> | 16 #include <limits> |
17 #include <numeric> | 17 #include <numeric> |
18 | 18 |
19 #include "webrtc/base/checks.h" | 19 #include "webrtc/base/checks.h" |
20 #include "webrtc/base/logging.h" | |
20 #include "webrtc/common_audio/include/audio_util.h" | 21 #include "webrtc/common_audio/include/audio_util.h" |
21 #include "webrtc/common_audio/window_generator.h" | 22 #include "webrtc/common_audio/window_generator.h" |
22 | 23 |
23 namespace webrtc { | 24 namespace webrtc { |
24 | 25 |
25 namespace { | 26 namespace { |
26 | 27 |
27 const size_t kErbResolution = 2; | 28 const size_t kErbResolution = 2; |
28 const int kWindowSizeMs = 16; | 29 const int kWindowSizeMs = 16; |
29 const int kChunkSizeMs = 10; // Size provided by APM. | 30 const int kChunkSizeMs = 10; // Size provided by APM. |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
83 capture_filter_bank_(CreateErbBank(num_noise_bins)), | 84 capture_filter_bank_(CreateErbBank(num_noise_bins)), |
84 render_filter_bank_(CreateErbBank(freqs_)), | 85 render_filter_bank_(CreateErbBank(freqs_)), |
85 gains_eq_(bank_size_), | 86 gains_eq_(bank_size_), |
86 gain_applier_(freqs_, kMaxRelativeGainChange), | 87 gain_applier_(freqs_, kMaxRelativeGainChange), |
87 audio_s16_(chunk_length_), | 88 audio_s16_(chunk_length_), |
88 chunks_since_voice_(kSpeechOffsetDelay), | 89 chunks_since_voice_(kSpeechOffsetDelay), |
89 is_speech_(false), | 90 is_speech_(false), |
90 snr_(kMaxActiveSNR), | 91 snr_(kMaxActiveSNR), |
91 is_active_(false), | 92 is_active_(false), |
92 num_chunks_(0u), | 93 num_chunks_(0u), |
94 num_active_chunks_(0u), | |
93 noise_estimation_buffer_(num_noise_bins), | 95 noise_estimation_buffer_(num_noise_bins), |
94 noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer, | 96 noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer, |
95 std::vector<float>(num_noise_bins), | 97 std::vector<float>(num_noise_bins), |
96 RenderQueueItemVerifier<float>(num_noise_bins)) { | 98 RenderQueueItemVerifier<float>(num_noise_bins)) { |
97 RTC_DCHECK_LE(kRho, 1.f); | 99 RTC_DCHECK_LE(kRho, 1.f); |
98 | 100 |
99 const size_t erb_index = static_cast<size_t>( | 101 const size_t erb_index = static_cast<size_t>( |
100 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + | 102 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + |
101 43.f)); | 103 43.f)); |
102 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); | 104 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); |
103 | 105 |
104 size_t window_size = static_cast<size_t>(1) << RealFourier::FftOrder(freqs_); | 106 size_t window_size = static_cast<size_t>(1) << RealFourier::FftOrder(freqs_); |
105 std::vector<float> kbd_window(window_size); | 107 std::vector<float> kbd_window(window_size); |
106 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, | 108 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, |
107 kbd_window.data()); | 109 kbd_window.data()); |
108 render_mangler_.reset(new LappedTransform( | 110 render_mangler_.reset(new LappedTransform( |
109 num_render_channels_, num_render_channels_, chunk_length_, | 111 num_render_channels_, num_render_channels_, chunk_length_, |
110 kbd_window.data(), window_size, window_size / 2, this)); | 112 kbd_window.data(), window_size, window_size / 2, this)); |
111 } | 113 } |
112 | 114 |
115 IntelligibilityEnhancer::~IntelligibilityEnhancer() { | |
116 // Don't rely on this log, since the destructor isn't called when the app/tab | |
117 // is killed. | |
118 LOG(LS_INFO) << "Intelligibility Enhancer was active for " | |
119 << static_cast<float>(num_active_chunks_) / num_chunks_ | |
120 << "% of the call."; | |
121 } | |
122 | |
113 void IntelligibilityEnhancer::SetCaptureNoiseEstimate( | 123 void IntelligibilityEnhancer::SetCaptureNoiseEstimate( |
114 std::vector<float> noise, int gain_db) { | 124 std::vector<float> noise, int gain_db) { |
115 RTC_DCHECK_EQ(noise.size(), num_noise_bins_); | 125 RTC_DCHECK_EQ(noise.size(), num_noise_bins_); |
116 const float gain = std::pow(10.f, gain_db / 20.f); | 126 const float gain = std::pow(10.f, gain_db / 20.f); |
117 for (auto& bin : noise) { | 127 for (auto& bin : noise) { |
118 bin *= gain; | 128 bin *= gain; |
119 } | 129 } |
120 // Disregarding return value since buffer overflow is acceptable, because it | 130 // Disregarding return value since buffer overflow is acceptable, because it |
121 // is not critical to get each noise estimate. | 131 // is not critical to get each noise estimate. |
122 if (noise_estimation_queue_.Insert(&noise)) { | 132 if (noise_estimation_queue_.Insert(&noise)) { |
(...skipping 16 matching lines...) Expand all Loading... | |
139 const std::complex<float>* const* in_block, | 149 const std::complex<float>* const* in_block, |
140 size_t in_channels, | 150 size_t in_channels, |
141 size_t frames, | 151 size_t frames, |
142 size_t /* out_channels */, | 152 size_t /* out_channels */, |
143 std::complex<float>* const* out_block) { | 153 std::complex<float>* const* out_block) { |
144 RTC_DCHECK_EQ(freqs_, frames); | 154 RTC_DCHECK_EQ(freqs_, frames); |
145 if (is_speech_) { | 155 if (is_speech_) { |
146 clear_power_estimator_.Step(in_block[0]); | 156 clear_power_estimator_.Step(in_block[0]); |
147 } | 157 } |
148 SnrBasedEffectActivation(); | 158 SnrBasedEffectActivation(); |
149 if (is_active_ && num_chunks_++ % kGainUpdatePeriod == 0) { | 159 ++num_chunks_; |
150 MapToErbBands(clear_power_estimator_.power().data(), render_filter_bank_, | 160 if (is_active_) { |
151 filtered_clear_pow_.data()); | 161 ++num_active_chunks_; |
152 MapToErbBands(noise_power_estimator_.power().data(), capture_filter_bank_, | 162 if (num_chunks_ % kGainUpdatePeriod == 0) { |
153 filtered_noise_pow_.data()); | 163 MapToErbBands(clear_power_estimator_.power().data(), render_filter_bank_, |
154 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data()); | 164 filtered_clear_pow_.data()); |
155 const float power_target = std::accumulate( | 165 MapToErbBands(noise_power_estimator_.power().data(), capture_filter_bank_, |
156 filtered_clear_pow_.data(), | 166 filtered_noise_pow_.data()); |
157 filtered_clear_pow_.data() + bank_size_, | 167 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data()); |
158 0.f); | 168 const float power_target = std::accumulate( |
159 const float power_top = | 169 filtered_clear_pow_.data(), |
160 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); | 170 filtered_clear_pow_.data() + bank_size_, |
161 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data()); | 171 0.f); |
162 const float power_bot = | 172 const float power_top = |
163 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); | 173 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); |
164 if (power_target >= power_bot && power_target <= power_top) { | 174 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data()); |
165 SolveForLambda(power_target); | 175 const float power_bot = |
166 UpdateErbGains(); | 176 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); |
167 } // Else experiencing power underflow, so do nothing. | 177 if (power_target >= power_bot && power_target <= power_top) { |
178 SolveForLambda(power_target); | |
179 UpdateErbGains(); | |
180 } // Else experiencing power underflow, so do nothing. | |
181 } | |
168 } | 182 } |
169 for (size_t i = 0; i < in_channels; ++i) { | 183 for (size_t i = 0; i < in_channels; ++i) { |
170 gain_applier_.Apply(in_block[i], out_block[i]); | 184 gain_applier_.Apply(in_block[i], out_block[i]); |
171 } | 185 } |
172 } | 186 } |
173 | 187 |
174 void IntelligibilityEnhancer::SnrBasedEffectActivation() { | 188 void IntelligibilityEnhancer::SnrBasedEffectActivation() { |
175 const float* clear_psd = clear_power_estimator_.power().data(); | 189 const float* clear_psd = clear_power_estimator_.power().data(); |
176 const float* noise_psd = noise_power_estimator_.power().data(); | 190 const float* noise_psd = noise_power_estimator_.power().data(); |
177 const float clear_power = | 191 const float clear_power = |
178 std::accumulate(clear_psd, clear_psd + freqs_, 0.f); | 192 std::accumulate(clear_psd, clear_psd + freqs_, 0.f); |
179 const float noise_power = | 193 const float noise_power = |
180 std::accumulate(noise_psd, noise_psd + freqs_, 0.f); | 194 std::accumulate(noise_psd, noise_psd + freqs_, 0.f); |
181 snr_ = kDecayRate * snr_ + (1.f - kDecayRate) * clear_power / | 195 snr_ = kDecayRate * snr_ + (1.f - kDecayRate) * clear_power / |
182 (noise_power + std::numeric_limits<float>::epsilon()); | 196 (noise_power + std::numeric_limits<float>::epsilon()); |
183 if (is_active_) { | 197 if (is_active_) { |
184 if (snr_ > kMaxActiveSNR) { | 198 if (snr_ > kMaxActiveSNR) { |
199 LOG(LS_INFO) << "Intelligibility Enhancer was activated at chunk " | |
ivoc
2016/07/01 08:04:30
This should probably say "deactivated".
aluebs-webrtc
2016/07/01 22:11:13
Good catch. Done.
| |
200 << num_chunks_; | |
185 is_active_ = false; | 201 is_active_ = false; |
186 // Set the target gains to unity. | 202 // Set the target gains to unity. |
187 float* gains = gain_applier_.target(); | 203 float* gains = gain_applier_.target(); |
188 for (size_t i = 0; i < freqs_; ++i) { | 204 for (size_t i = 0; i < freqs_; ++i) { |
189 gains[i] = 1.f; | 205 gains[i] = 1.f; |
190 } | 206 } |
191 } | 207 } |
192 } else { | 208 } else { |
193 is_active_ = snr_ < kMinInactiveSNR; | 209 if (snr_ < kMinInactiveSNR) { |
210 LOG(LS_INFO) << "Intelligibility Enhancer was activated at chunk " | |
211 << num_chunks_; | |
212 is_active_ = true; | |
213 } | |
194 } | 214 } |
195 } | 215 } |
196 | 216 |
197 void IntelligibilityEnhancer::SolveForLambda(float power_target) { | 217 void IntelligibilityEnhancer::SolveForLambda(float power_target) { |
198 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values | 218 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values |
199 const int kMaxIters = 100; // for these, based on experiments. | 219 const int kMaxIters = 100; // for these, based on experiments. |
200 | 220 |
201 const float reciprocal_power_target = | 221 const float reciprocal_power_target = |
202 1.f / (power_target + std::numeric_limits<float>::epsilon()); | 222 1.f / (power_target + std::numeric_limits<float>::epsilon()); |
203 float lambda_bot = kLambdaBot; | 223 float lambda_bot = kLambdaBot; |
(...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
344 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_); | 364 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_); |
345 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { | 365 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { |
346 chunks_since_voice_ = 0; | 366 chunks_since_voice_ = 0; |
347 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { | 367 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { |
348 ++chunks_since_voice_; | 368 ++chunks_since_voice_; |
349 } | 369 } |
350 return chunks_since_voice_ < kSpeechOffsetDelay; | 370 return chunks_since_voice_ < kSpeechOffsetDelay; |
351 } | 371 } |
352 | 372 |
353 } // namespace webrtc | 373 } // namespace webrtc |
OLD | NEW |