Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(10)

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1729753003: Fix the stereo support in IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@gains2
Patch Set: Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
53 const std::vector<std::vector<float>>& filter_bank, 53 const std::vector<std::vector<float>>& filter_bank,
54 float* result) { 54 float* result) {
55 for (size_t i = 0; i < filter_bank.size(); ++i) { 55 for (size_t i = 0; i < filter_bank.size(); ++i) {
56 RTC_DCHECK_GT(filter_bank[i].size(), 0u); 56 RTC_DCHECK_GT(filter_bank[i].size(), 0u);
57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size()); 57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size());
58 } 58 }
59 } 59 }
60 60
61 } // namespace 61 } // namespace
62 62
63 IntelligibilityEnhancer::TransformCallback::TransformCallback(
64 IntelligibilityEnhancer* parent)
65 : parent_(parent) {
66 }
67
68 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(
69 const std::complex<float>* const* in_block,
70 size_t in_channels,
71 size_t frames,
72 size_t /* out_channels */,
73 std::complex<float>* const* out_block) {
74 RTC_DCHECK_EQ(parent_->freqs_, frames);
75 for (size_t i = 0; i < in_channels; ++i) {
76 parent_->ProcessClearBlock(in_block[i], out_block[i]);
77 }
78 }
79
80 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, 63 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
81 size_t num_render_channels) 64 size_t num_render_channels)
82 : freqs_(RealFourier::ComplexLength( 65 : freqs_(RealFourier::ComplexLength(
83 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), 66 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
84 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)), 67 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),
85 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)), 68 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),
86 sample_rate_hz_(sample_rate_hz), 69 sample_rate_hz_(sample_rate_hz),
87 num_render_channels_(num_render_channels), 70 num_render_channels_(num_render_channels),
88 clear_power_estimator_(freqs_, kDecayRate), 71 clear_power_estimator_(freqs_, kDecayRate),
89 noise_power_estimator_( 72 noise_power_estimator_(
90 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)), 73 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),
91 filtered_clear_pow_(new float[bank_size_]), 74 filtered_clear_pow_(bank_size_, 0.f),
92 filtered_noise_pow_(new float[bank_size_]), 75 filtered_noise_pow_(bank_size_, 0.f),
93 center_freqs_(new float[bank_size_]), 76 center_freqs_(bank_size_),
94 render_filter_bank_(CreateErbBank(freqs_)), 77 render_filter_bank_(CreateErbBank(freqs_)),
95 gains_eq_(new float[bank_size_]), 78 gains_eq_(bank_size_),
96 gain_applier_(freqs_, kMaxRelativeGainChange), 79 gain_applier_(freqs_, kMaxRelativeGainChange),
97 temp_render_out_buffer_(chunk_length_, num_render_channels_),
98 render_callback_(this),
99 audio_s16_(chunk_length_), 80 audio_s16_(chunk_length_),
100 chunks_since_voice_(kSpeechOffsetDelay), 81 chunks_since_voice_(kSpeechOffsetDelay),
101 is_speech_(false) { 82 is_speech_(false) {
102 RTC_DCHECK_LE(kRho, 1.f); 83 RTC_DCHECK_LE(kRho, 1.f);
103 84
104 memset(filtered_clear_pow_.get(), 0,
105 bank_size_ * sizeof(filtered_clear_pow_[0]));
106 memset(filtered_noise_pow_.get(), 0,
107 bank_size_ * sizeof(filtered_noise_pow_[0]));
108
109 const size_t erb_index = static_cast<size_t>( 85 const size_t erb_index = static_cast<size_t>(
110 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + 86 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +
111 43.f)); 87 43.f));
112 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); 88 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);
113 89
114 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_)); 90 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));
115 std::vector<float> kbd_window(window_size); 91 std::vector<float> kbd_window(window_size);
116 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]); 92 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);
117 render_mangler_.reset(new LappedTransform( 93 render_mangler_.reset(new LappedTransform(
118 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0], 94 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],
119 window_size, window_size / 2, &render_callback_)); 95 window_size, window_size / 2, this));
120 } 96 }
121 97
122 void IntelligibilityEnhancer::SetCaptureNoiseEstimate( 98 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
123 std::vector<float> noise) { 99 std::vector<float> noise) {
124 if (capture_filter_bank_.size() != bank_size_ || 100 if (capture_filter_bank_.size() != bank_size_ ||
125 capture_filter_bank_[0].size() != noise.size()) { 101 capture_filter_bank_[0].size() != noise.size()) {
126 capture_filter_bank_ = CreateErbBank(noise.size()); 102 capture_filter_bank_ = CreateErbBank(noise.size());
127 noise_power_estimator_.reset( 103 noise_power_estimator_.reset(
128 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate)); 104 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));
129 } 105 }
130 noise_power_estimator_->Step(&noise[0]); 106 noise_power_estimator_->Step(&noise[0]);
131 } 107 }
132 108
133 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, 109 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
134 int sample_rate_hz, 110 int sample_rate_hz,
135 size_t num_channels) { 111 size_t num_channels) {
136 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz); 112 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
137 RTC_CHECK_EQ(num_render_channels_, num_channels); 113 RTC_CHECK_EQ(num_render_channels_, num_channels);
138 is_speech_ = IsSpeech(audio[0]); 114 is_speech_ = IsSpeech(audio[0]);
139 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels()); 115 render_mangler_->ProcessChunk(audio, audio);
140 for (size_t i = 0; i < num_render_channels_; ++i) {
141 memcpy(audio[i], temp_render_out_buffer_.channels()[i],
142 chunk_length_ * sizeof(**audio));
143 }
144 } 116 }
145 117
146 void IntelligibilityEnhancer::ProcessClearBlock( 118 void IntelligibilityEnhancer::ProcessAudioBlock(
147 const std::complex<float>* in_block, 119 const std::complex<float>* const* in_block,
148 std::complex<float>* out_block) { 120 size_t in_channels,
121 size_t frames,
122 size_t /* out_channels */,
123 std::complex<float>* const* out_block) {
124 RTC_DCHECK_EQ(freqs_, frames);
149 if (is_speech_) { 125 if (is_speech_) {
150 clear_power_estimator_.Step(in_block); 126 clear_power_estimator_.Step(in_block[0]);
turaj 2016/02/24 16:00:17 I suppose this change and changes in lines 146-148 [... comment truncated]
aluebs-webrtc 2016/02/25 00:18:37 Yes, as you point out, the main change is removing [... comment truncated]
151 } 127 }
152 const std::vector<float>& clear_power = clear_power_estimator_.power(); 128 const std::vector<float>& clear_power = clear_power_estimator_.power();
153 const std::vector<float>& noise_power = noise_power_estimator_->power(); 129 const std::vector<float>& noise_power = noise_power_estimator_->power();
154 MapToErbBands(&clear_power[0], render_filter_bank_, 130 MapToErbBands(&clear_power[0], render_filter_bank_,
155 filtered_clear_pow_.get()); 131 &filtered_clear_pow_[0]);
hlundin-webrtc 2016/02/24 10:17:03 I suggest you use .data() instead of &...[0]. See [... comment truncated]
aluebs-webrtc 2016/02/25 00:18:37 I am aware of that, but I thought it was still not [... comment truncated]
156 MapToErbBands(&noise_power[0], capture_filter_bank_, 132 MapToErbBands(&noise_power[0], capture_filter_bank_,
157 filtered_noise_pow_.get()); 133 &filtered_noise_pow_[0]);
158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get()); 134 SolveForGainsGivenLambda(kLambdaTop, start_freq_, &gains_eq_[0]);
159 const float power_target = 135 const float power_target =
160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f); 136 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);
161 const float power_top = 137 const float power_top =
162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); 138 DotProduct(&gains_eq_[0], &filtered_clear_pow_[0], bank_size_);
163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get()); 139 SolveForGainsGivenLambda(kLambdaBot, start_freq_, &gains_eq_[0]);
164 const float power_bot = 140 const float power_bot =
165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); 141 DotProduct(&gains_eq_[0], &filtered_clear_pow_[0], bank_size_);
166 if (power_target >= power_bot && power_target <= power_top) { 142 if (power_target >= power_bot && power_target <= power_top) {
167 SolveForLambda(power_target); 143 SolveForLambda(power_target);
168 UpdateErbGains(); 144 UpdateErbGains();
169 } // Else experiencing power underflow, so do nothing. 145 } // Else experiencing power underflow, so do nothing.
170 gain_applier_.Apply(in_block, out_block); 146 for (size_t i = 0; i < in_channels; ++i) {
147 gain_applier_.Apply(in_block[i], out_block[i]);
148 }
171 } 149 }
172 150
173 void IntelligibilityEnhancer::SolveForLambda(float power_target) { 151 void IntelligibilityEnhancer::SolveForLambda(float power_target) {
174 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values 152 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values
175 const int kMaxIters = 100; // for these, based on experiments. 153 const int kMaxIters = 100; // for these, based on experiments.
176 154
177 const float reciprocal_power_target = 155 const float reciprocal_power_target =
178 1.f / (power_target + std::numeric_limits<float>::epsilon()); 156 1.f / (power_target + std::numeric_limits<float>::epsilon());
179 float lambda_bot = kLambdaBot; 157 float lambda_bot = kLambdaBot;
180 float lambda_top = kLambdaTop; 158 float lambda_top = kLambdaTop;
181 float power_ratio = 2.f; // Ratio of achieved power to target power. 159 float power_ratio = 2.f; // Ratio of achieved power to target power.
182 int iters = 0; 160 int iters = 0;
183 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) { 161 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {
184 const float lambda = (lambda_bot + lambda_top) / 2.f; 162 const float lambda = (lambda_bot + lambda_top) / 2.f;
185 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get()); 163 SolveForGainsGivenLambda(lambda, start_freq_, &gains_eq_[0]);
186 const float power = 164 const float power =
187 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); 165 DotProduct(&gains_eq_[0], &filtered_clear_pow_[0], bank_size_);
188 if (power < power_target) { 166 if (power < power_target) {
189 lambda_bot = lambda; 167 lambda_bot = lambda;
190 } else { 168 } else {
191 lambda_top = lambda; 169 lambda_top = lambda;
192 } 170 }
193 power_ratio = std::fabs(power * reciprocal_power_target); 171 power_ratio = std::fabs(power * reciprocal_power_target);
194 ++iters; 172 ++iters;
195 } 173 }
196 } 174 }
197 175
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
279 } 257 }
280 } 258 }
281 return filter_bank; 259 return filter_bank;
282 } 260 }
283 261
284 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, 262 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
285 size_t start_freq, 263 size_t start_freq,
286 float* sols) { 264 float* sols) {
287 const float kMinPower = 1e-5; 265 const float kMinPower = 1e-5;
288 266
289 const float* pow_x0 = filtered_clear_pow_.get(); 267 const float* pow_x0 = &filtered_clear_pow_[0];
290 const float* pow_n0 = filtered_noise_pow_.get(); 268 const float* pow_n0 = &filtered_noise_pow_[0];
291 269
292 for (size_t n = 0; n < start_freq; ++n) { 270 for (size_t n = 0; n < start_freq; ++n) {
293 sols[n] = 1.f; 271 sols[n] = 1.f;
294 } 272 }
295 273
296 // Analytic solution for optimal gains. See paper for derivation. 274 // Analytic solution for optimal gains. See paper for derivation.
297 for (size_t n = start_freq; n < bank_size_; ++n) { 275 for (size_t n = start_freq; n < bank_size_; ++n) {
298 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) { 276 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) {
299 sols[n] = 1.f; 277 sols[n] = 1.f;
300 } else { 278 } else {
(...skipping 19 matching lines...) Expand all
320 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_); 298 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);
321 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { 299 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {
322 chunks_since_voice_ = 0; 300 chunks_since_voice_ = 0;
323 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { 301 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {
324 ++chunks_since_voice_; 302 ++chunks_since_voice_;
325 } 303 }
326 return chunks_since_voice_ < kSpeechOffsetDelay; 304 return chunks_since_voice_ < kSpeechOffsetDelay;
327 } 305 }
328 306
329 } // namespace webrtc 307 } // namespace webrtc
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698