Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(238)

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1729753003: Fix the stereo support in IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@gains2
Patch Set: Rebasing. Created 4 years, 9 months ago.
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
47 return ret; 47 return ret;
48 } 48 }
49 49
50 // Computes the power across ERB bands from the power spectral density |pow|. 50 // Computes the power across ERB bands from the power spectral density |pow|.
51 // Stores it in |result|. 51 // Stores it in |result|.
52 void MapToErbBands(const float* pow, 52 void MapToErbBands(const float* pow,
53 const std::vector<std::vector<float>>& filter_bank, 53 const std::vector<std::vector<float>>& filter_bank,
54 float* result) { 54 float* result) {
55 for (size_t i = 0; i < filter_bank.size(); ++i) { 55 for (size_t i = 0; i < filter_bank.size(); ++i) {
56 RTC_DCHECK_GT(filter_bank[i].size(), 0u); 56 RTC_DCHECK_GT(filter_bank[i].size(), 0u);
57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size()); 57 result[i] = DotProduct(filter_bank[i].data(), pow, filter_bank[i].size());
58 } 58 }
59 } 59 }
60 60
61 } // namespace 61 } // namespace
62 62
63 IntelligibilityEnhancer::TransformCallback::TransformCallback(
64 IntelligibilityEnhancer* parent)
65 : parent_(parent) {
66 }
67
68 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(
69 const std::complex<float>* const* in_block,
70 size_t in_channels,
71 size_t frames,
72 size_t /* out_channels */,
73 std::complex<float>* const* out_block) {
74 RTC_DCHECK_EQ(parent_->freqs_, frames);
75 for (size_t i = 0; i < in_channels; ++i) {
76 parent_->ProcessClearBlock(in_block[i], out_block[i]);
77 }
78 }
79
80 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, 63 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
81 size_t num_render_channels) 64 size_t num_render_channels)
82 : freqs_(RealFourier::ComplexLength( 65 : freqs_(RealFourier::ComplexLength(
83 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), 66 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
84 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)), 67 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),
85 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)), 68 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),
86 sample_rate_hz_(sample_rate_hz), 69 sample_rate_hz_(sample_rate_hz),
87 num_render_channels_(num_render_channels), 70 num_render_channels_(num_render_channels),
88 clear_power_estimator_(freqs_, kDecayRate), 71 clear_power_estimator_(freqs_, kDecayRate),
89 noise_power_estimator_( 72 noise_power_estimator_(
90 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)), 73 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),
91 filtered_clear_pow_(new float[bank_size_]), 74 filtered_clear_pow_(bank_size_, 0.f),
92 filtered_noise_pow_(new float[bank_size_]), 75 filtered_noise_pow_(bank_size_, 0.f),
93 center_freqs_(new float[bank_size_]), 76 center_freqs_(bank_size_),
94 render_filter_bank_(CreateErbBank(freqs_)), 77 render_filter_bank_(CreateErbBank(freqs_)),
95 gains_eq_(new float[bank_size_]), 78 gains_eq_(bank_size_),
96 gain_applier_(freqs_, kMaxRelativeGainChange), 79 gain_applier_(freqs_, kMaxRelativeGainChange),
97 temp_render_out_buffer_(chunk_length_, num_render_channels_),
98 render_callback_(this),
99 audio_s16_(chunk_length_), 80 audio_s16_(chunk_length_),
100 chunks_since_voice_(kSpeechOffsetDelay), 81 chunks_since_voice_(kSpeechOffsetDelay),
101 is_speech_(false) { 82 is_speech_(false) {
102 RTC_DCHECK_LE(kRho, 1.f); 83 RTC_DCHECK_LE(kRho, 1.f);
103 84
104 memset(filtered_clear_pow_.get(), 0,
105 bank_size_ * sizeof(filtered_clear_pow_[0]));
106 memset(filtered_noise_pow_.get(), 0,
107 bank_size_ * sizeof(filtered_noise_pow_[0]));
108
109 const size_t erb_index = static_cast<size_t>( 85 const size_t erb_index = static_cast<size_t>(
110 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + 86 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +
111 43.f)); 87 43.f));
112 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); 88 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);
113 89
114 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_)); 90 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));
115 std::vector<float> kbd_window(window_size); 91 std::vector<float> kbd_window(window_size);
116 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]); 92 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size,
93 kbd_window.data());
117 render_mangler_.reset(new LappedTransform( 94 render_mangler_.reset(new LappedTransform(
118 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0], 95 num_render_channels_, num_render_channels_, chunk_length_,
119 window_size, window_size / 2, &render_callback_)); 96 kbd_window.data(), window_size, window_size / 2, this));
120 } 97 }
121 98
122 void IntelligibilityEnhancer::SetCaptureNoiseEstimate( 99 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
123 std::vector<float> noise) { 100 std::vector<float> noise) {
124 if (capture_filter_bank_.size() != bank_size_ || 101 if (capture_filter_bank_.size() != bank_size_ ||
125 capture_filter_bank_[0].size() != noise.size()) { 102 capture_filter_bank_[0].size() != noise.size()) {
126 capture_filter_bank_ = CreateErbBank(noise.size()); 103 capture_filter_bank_ = CreateErbBank(noise.size());
127 noise_power_estimator_.reset( 104 noise_power_estimator_.reset(
128 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate)); 105 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));
129 } 106 }
130 noise_power_estimator_->Step(&noise[0]); 107 noise_power_estimator_->Step(noise.data());
131 } 108 }
132 109
133 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, 110 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
134 int sample_rate_hz, 111 int sample_rate_hz,
135 size_t num_channels) { 112 size_t num_channels) {
136 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz); 113 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
137 RTC_CHECK_EQ(num_render_channels_, num_channels); 114 RTC_CHECK_EQ(num_render_channels_, num_channels);
138 is_speech_ = IsSpeech(audio[0]); 115 is_speech_ = IsSpeech(audio[0]);
139 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels()); 116 render_mangler_->ProcessChunk(audio, audio);
140 for (size_t i = 0; i < num_render_channels_; ++i) {
141 memcpy(audio[i], temp_render_out_buffer_.channels()[i],
142 chunk_length_ * sizeof(**audio));
143 }
144 } 117 }
145 118
146 void IntelligibilityEnhancer::ProcessClearBlock( 119 void IntelligibilityEnhancer::ProcessAudioBlock(
147 const std::complex<float>* in_block, 120 const std::complex<float>* const* in_block,
148 std::complex<float>* out_block) { 121 size_t in_channels,
122 size_t frames,
123 size_t /* out_channels */,
124 std::complex<float>* const* out_block) {
125 RTC_DCHECK_EQ(freqs_, frames);
149 if (is_speech_) { 126 if (is_speech_) {
150 clear_power_estimator_.Step(in_block); 127 clear_power_estimator_.Step(in_block[0]);
151 } 128 }
152 const std::vector<float>& clear_power = clear_power_estimator_.power(); 129 const std::vector<float>& clear_power = clear_power_estimator_.power();
153 const std::vector<float>& noise_power = noise_power_estimator_->power(); 130 const std::vector<float>& noise_power = noise_power_estimator_->power();
154 MapToErbBands(&clear_power[0], render_filter_bank_, 131 MapToErbBands(clear_power.data(), render_filter_bank_,
155 filtered_clear_pow_.get()); 132 filtered_clear_pow_.data());
156 MapToErbBands(&noise_power[0], capture_filter_bank_, 133 MapToErbBands(noise_power.data(), capture_filter_bank_,
157 filtered_noise_pow_.get()); 134 filtered_noise_pow_.data());
158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get()); 135 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());
159 const float power_target = 136 const float power_target =
160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f); 137 std::accumulate(clear_power.data(), clear_power.data() + freqs_, 0.f);
161 const float power_top = 138 const float power_top =
162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); 139 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get()); 140 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());
164 const float power_bot = 141 const float power_bot =
165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); 142 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
166 if (power_target >= power_bot && power_target <= power_top) { 143 if (power_target >= power_bot && power_target <= power_top) {
167 SolveForLambda(power_target); 144 SolveForLambda(power_target);
168 UpdateErbGains(); 145 UpdateErbGains();
169 } // Else experiencing power underflow, so do nothing. 146 } // Else experiencing power underflow, so do nothing.
170 gain_applier_.Apply(in_block, out_block); 147 for (size_t i = 0; i < in_channels; ++i) {
148 gain_applier_.Apply(in_block[i], out_block[i]);
149 }
171 } 150 }
172 151
173 void IntelligibilityEnhancer::SolveForLambda(float power_target) { 152 void IntelligibilityEnhancer::SolveForLambda(float power_target) {
174 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values 153 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values
175 const int kMaxIters = 100; // for these, based on experiments. 154 const int kMaxIters = 100; // for these, based on experiments.
176 155
177 const float reciprocal_power_target = 156 const float reciprocal_power_target =
178 1.f / (power_target + std::numeric_limits<float>::epsilon()); 157 1.f / (power_target + std::numeric_limits<float>::epsilon());
179 float lambda_bot = kLambdaBot; 158 float lambda_bot = kLambdaBot;
180 float lambda_top = kLambdaTop; 159 float lambda_top = kLambdaTop;
181 float power_ratio = 2.f; // Ratio of achieved power to target power. 160 float power_ratio = 2.f; // Ratio of achieved power to target power.
182 int iters = 0; 161 int iters = 0;
183 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) { 162 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {
184 const float lambda = (lambda_bot + lambda_top) / 2.f; 163 const float lambda = (lambda_bot + lambda_top) / 2.f;
185 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get()); 164 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.data());
186 const float power = 165 const float power =
187 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); 166 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
188 if (power < power_target) { 167 if (power < power_target) {
189 lambda_bot = lambda; 168 lambda_bot = lambda;
190 } else { 169 } else {
191 lambda_top = lambda; 170 lambda_top = lambda;
192 } 171 }
193 power_ratio = std::fabs(power * reciprocal_power_target); 172 power_ratio = std::fabs(power * reciprocal_power_target);
194 ++iters; 173 ++iters;
195 } 174 }
196 } 175 }
197 176
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
279 } 258 }
280 } 259 }
281 return filter_bank; 260 return filter_bank;
282 } 261 }
283 262
284 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, 263 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
285 size_t start_freq, 264 size_t start_freq,
286 float* sols) { 265 float* sols) {
287 const float kMinPower = 1e-5f; 266 const float kMinPower = 1e-5f;
288 267
289 const float* pow_x0 = filtered_clear_pow_.get(); 268 const float* pow_x0 = filtered_clear_pow_.data();
290 const float* pow_n0 = filtered_noise_pow_.get(); 269 const float* pow_n0 = filtered_noise_pow_.data();
291 270
292 for (size_t n = 0; n < start_freq; ++n) { 271 for (size_t n = 0; n < start_freq; ++n) {
293 sols[n] = 1.f; 272 sols[n] = 1.f;
294 } 273 }
295 274
296 // Analytic solution for optimal gains. See paper for derivation. 275 // Analytic solution for optimal gains. See paper for derivation.
297 for (size_t n = start_freq; n < bank_size_; ++n) { 276 for (size_t n = start_freq; n < bank_size_; ++n) {
298 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) { 277 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) {
299 sols[n] = 1.f; 278 sols[n] = 1.f;
300 } else { 279 } else {
301 const float gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] + 280 const float gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +
302 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n]; 281 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];
303 const float beta0 = 282 const float beta0 =
304 lambda * pow_x0[n] * (2.f - kRho) * pow_x0[n] * pow_n0[n]; 283 lambda * pow_x0[n] * (2.f - kRho) * pow_x0[n] * pow_n0[n];
305 const float alpha0 = 284 const float alpha0 =
306 lambda * pow_x0[n] * (1.f - kRho) * pow_x0[n] * pow_x0[n]; 285 lambda * pow_x0[n] * (1.f - kRho) * pow_x0[n] * pow_x0[n];
307 RTC_DCHECK_LT(alpha0, 0.f); 286 RTC_DCHECK_LT(alpha0, 0.f);
308 // The quadratic equation should always have real roots, but to guard 287 // The quadratic equation should always have real roots, but to guard
309 // against numerical errors we limit it to a minimum of zero. 288 // against numerical errors we limit it to a minimum of zero.
310 sols[n] = std::max( 289 sols[n] = std::max(
311 0.f, (-beta0 - std::sqrt(std::max( 290 0.f, (-beta0 - std::sqrt(std::max(
312 0.f, beta0 * beta0 - 4.f * alpha0 * gamma0))) / 291 0.f, beta0 * beta0 - 4.f * alpha0 * gamma0))) /
313 (2.f * alpha0)); 292 (2.f * alpha0));
314 } 293 }
315 } 294 }
316 } 295 }
317 296
318 bool IntelligibilityEnhancer::IsSpeech(const float* audio) { 297 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {
319 FloatToS16(audio, chunk_length_, &audio_s16_[0]); 298 FloatToS16(audio, chunk_length_, audio_s16_.data());
320 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_); 299 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);
321 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { 300 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {
322 chunks_since_voice_ = 0; 301 chunks_since_voice_ = 0;
323 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { 302 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {
324 ++chunks_since_voice_; 303 ++chunks_since_voice_;
325 } 304 }
326 return chunks_since_voice_ < kSpeechOffsetDelay; 305 return chunks_since_voice_ < kSpeechOffsetDelay;
327 } 306 }
328 307
329 } // namespace webrtc 308 } // namespace webrtc
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698