OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
47 return ret; | 47 return ret; |
48 } | 48 } |
49 | 49 |
50 // Computes the power across ERB bands from the power spectral density |pow|. | 50 // Computes the power across ERB bands from the power spectral density |pow|. |
51 // Stores it in |result|. | 51 // Stores it in |result|. |
52 void MapToErbBands(const float* pow, | 52 void MapToErbBands(const float* pow, |
53 const std::vector<std::vector<float>>& filter_bank, | 53 const std::vector<std::vector<float>>& filter_bank, |
54 float* result) { | 54 float* result) { |
55 for (size_t i = 0; i < filter_bank.size(); ++i) { | 55 for (size_t i = 0; i < filter_bank.size(); ++i) { |
56 RTC_DCHECK_GT(filter_bank[i].size(), 0u); | 56 RTC_DCHECK_GT(filter_bank[i].size(), 0u); |
57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size()); | 57 result[i] = DotProduct(filter_bank[i].data(), pow, filter_bank[i].size()); |
58 } | 58 } |
59 } | 59 } |
60 | 60 |
61 } // namespace | 61 } // namespace |
62 | 62 |
63 IntelligibilityEnhancer::TransformCallback::TransformCallback( | |
64 IntelligibilityEnhancer* parent) | |
65 : parent_(parent) { | |
66 } | |
67 | |
68 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock( | |
69 const std::complex<float>* const* in_block, | |
70 size_t in_channels, | |
71 size_t frames, | |
72 size_t /* out_channels */, | |
73 std::complex<float>* const* out_block) { | |
74 RTC_DCHECK_EQ(parent_->freqs_, frames); | |
75 for (size_t i = 0; i < in_channels; ++i) { | |
76 parent_->ProcessClearBlock(in_block[i], out_block[i]); | |
77 } | |
78 } | |
79 | |
80 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, | 63 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, |
81 size_t num_render_channels) | 64 size_t num_render_channels) |
82 : freqs_(RealFourier::ComplexLength( | 65 : freqs_(RealFourier::ComplexLength( |
83 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), | 66 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), |
84 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)), | 67 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)), |
85 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)), | 68 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)), |
86 sample_rate_hz_(sample_rate_hz), | 69 sample_rate_hz_(sample_rate_hz), |
87 num_render_channels_(num_render_channels), | 70 num_render_channels_(num_render_channels), |
88 clear_power_estimator_(freqs_, kDecayRate), | 71 clear_power_estimator_(freqs_, kDecayRate), |
89 noise_power_estimator_( | 72 noise_power_estimator_( |
90 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)), | 73 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)), |
91 filtered_clear_pow_(new float[bank_size_]), | 74 filtered_clear_pow_(bank_size_, 0.f), |
92 filtered_noise_pow_(new float[bank_size_]), | 75 filtered_noise_pow_(bank_size_, 0.f), |
93 center_freqs_(new float[bank_size_]), | 76 center_freqs_(bank_size_), |
94 render_filter_bank_(CreateErbBank(freqs_)), | 77 render_filter_bank_(CreateErbBank(freqs_)), |
95 gains_eq_(new float[bank_size_]), | 78 gains_eq_(bank_size_), |
96 gain_applier_(freqs_, kMaxRelativeGainChange), | 79 gain_applier_(freqs_, kMaxRelativeGainChange), |
97 temp_render_out_buffer_(chunk_length_, num_render_channels_), | |
98 render_callback_(this), | |
99 audio_s16_(chunk_length_), | 80 audio_s16_(chunk_length_), |
100 chunks_since_voice_(kSpeechOffsetDelay), | 81 chunks_since_voice_(kSpeechOffsetDelay), |
101 is_speech_(false) { | 82 is_speech_(false) { |
102 RTC_DCHECK_LE(kRho, 1.f); | 83 RTC_DCHECK_LE(kRho, 1.f); |
103 | 84 |
104 memset(filtered_clear_pow_.get(), 0, | |
105 bank_size_ * sizeof(filtered_clear_pow_[0])); | |
106 memset(filtered_noise_pow_.get(), 0, | |
107 bank_size_ * sizeof(filtered_noise_pow_[0])); | |
108 | |
109 const size_t erb_index = static_cast<size_t>( | 85 const size_t erb_index = static_cast<size_t>( |
110 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + | 86 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + |
111 43.f)); | 87 43.f)); |
112 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); | 88 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); |
113 | 89 |
114 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_)); | 90 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_)); |
115 std::vector<float> kbd_window(window_size); | 91 std::vector<float> kbd_window(window_size); |
116 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]); | 92 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, |
| 93 kbd_window.data()); |
117 render_mangler_.reset(new LappedTransform( | 94 render_mangler_.reset(new LappedTransform( |
118 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0], | 95 num_render_channels_, num_render_channels_, chunk_length_, |
119 window_size, window_size / 2, &render_callback_)); | 96 kbd_window.data(), window_size, window_size / 2, this)); |
120 } | 97 } |
121 | 98 |
122 void IntelligibilityEnhancer::SetCaptureNoiseEstimate( | 99 void IntelligibilityEnhancer::SetCaptureNoiseEstimate( |
123 std::vector<float> noise) { | 100 std::vector<float> noise) { |
124 if (capture_filter_bank_.size() != bank_size_ || | 101 if (capture_filter_bank_.size() != bank_size_ || |
125 capture_filter_bank_[0].size() != noise.size()) { | 102 capture_filter_bank_[0].size() != noise.size()) { |
126 capture_filter_bank_ = CreateErbBank(noise.size()); | 103 capture_filter_bank_ = CreateErbBank(noise.size()); |
127 noise_power_estimator_.reset( | 104 noise_power_estimator_.reset( |
128 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate)); | 105 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate)); |
129 } | 106 } |
130 noise_power_estimator_->Step(&noise[0]); | 107 noise_power_estimator_->Step(noise.data()); |
131 } | 108 } |
132 | 109 |
133 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, | 110 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, |
134 int sample_rate_hz, | 111 int sample_rate_hz, |
135 size_t num_channels) { | 112 size_t num_channels) { |
136 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz); | 113 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz); |
137 RTC_CHECK_EQ(num_render_channels_, num_channels); | 114 RTC_CHECK_EQ(num_render_channels_, num_channels); |
138 is_speech_ = IsSpeech(audio[0]); | 115 is_speech_ = IsSpeech(audio[0]); |
139 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels()); | 116 render_mangler_->ProcessChunk(audio, audio); |
140 for (size_t i = 0; i < num_render_channels_; ++i) { | |
141 memcpy(audio[i], temp_render_out_buffer_.channels()[i], | |
142 chunk_length_ * sizeof(**audio)); | |
143 } | |
144 } | 117 } |
145 | 118 |
146 void IntelligibilityEnhancer::ProcessClearBlock( | 119 void IntelligibilityEnhancer::ProcessAudioBlock( |
147 const std::complex<float>* in_block, | 120 const std::complex<float>* const* in_block, |
148 std::complex<float>* out_block) { | 121 size_t in_channels, |
| 122 size_t frames, |
| 123 size_t /* out_channels */, |
| 124 std::complex<float>* const* out_block) { |
| 125 RTC_DCHECK_EQ(freqs_, frames); |
149 if (is_speech_) { | 126 if (is_speech_) { |
150 clear_power_estimator_.Step(in_block); | 127 clear_power_estimator_.Step(in_block[0]); |
151 } | 128 } |
152 const std::vector<float>& clear_power = clear_power_estimator_.power(); | 129 const std::vector<float>& clear_power = clear_power_estimator_.power(); |
153 const std::vector<float>& noise_power = noise_power_estimator_->power(); | 130 const std::vector<float>& noise_power = noise_power_estimator_->power(); |
154 MapToErbBands(&clear_power[0], render_filter_bank_, | 131 MapToErbBands(clear_power.data(), render_filter_bank_, |
155 filtered_clear_pow_.get()); | 132 filtered_clear_pow_.data()); |
156 MapToErbBands(&noise_power[0], capture_filter_bank_, | 133 MapToErbBands(noise_power.data(), capture_filter_bank_, |
157 filtered_noise_pow_.get()); | 134 filtered_noise_pow_.data()); |
158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get()); | 135 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data()); |
159 const float power_target = | 136 const float power_target = |
160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f); | 137 std::accumulate(clear_power.data(), clear_power.data() + freqs_, 0.f); |
161 const float power_top = | 138 const float power_top = |
162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); | 139 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); |
163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get()); | 140 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data()); |
164 const float power_bot = | 141 const float power_bot = |
165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); | 142 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); |
166 if (power_target >= power_bot && power_target <= power_top) { | 143 if (power_target >= power_bot && power_target <= power_top) { |
167 SolveForLambda(power_target); | 144 SolveForLambda(power_target); |
168 UpdateErbGains(); | 145 UpdateErbGains(); |
169 } // Else experiencing power underflow, so do nothing. | 146 } // Else experiencing power underflow, so do nothing. |
170 gain_applier_.Apply(in_block, out_block); | 147 for (size_t i = 0; i < in_channels; ++i) { |
| 148 gain_applier_.Apply(in_block[i], out_block[i]); |
| 149 } |
171 } | 150 } |
172 | 151 |
173 void IntelligibilityEnhancer::SolveForLambda(float power_target) { | 152 void IntelligibilityEnhancer::SolveForLambda(float power_target) { |
174 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values | 153 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values |
175 const int kMaxIters = 100; // for these, based on experiments. | 154 const int kMaxIters = 100; // for these, based on experiments. |
176 | 155 |
177 const float reciprocal_power_target = | 156 const float reciprocal_power_target = |
178 1.f / (power_target + std::numeric_limits<float>::epsilon()); | 157 1.f / (power_target + std::numeric_limits<float>::epsilon()); |
179 float lambda_bot = kLambdaBot; | 158 float lambda_bot = kLambdaBot; |
180 float lambda_top = kLambdaTop; | 159 float lambda_top = kLambdaTop; |
181 float power_ratio = 2.f; // Ratio of achieved power to target power. | 160 float power_ratio = 2.f; // Ratio of achieved power to target power. |
182 int iters = 0; | 161 int iters = 0; |
183 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) { | 162 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) { |
184 const float lambda = (lambda_bot + lambda_top) / 2.f; | 163 const float lambda = (lambda_bot + lambda_top) / 2.f; |
185 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get()); | 164 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.data()); |
186 const float power = | 165 const float power = |
187 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); | 166 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); |
188 if (power < power_target) { | 167 if (power < power_target) { |
189 lambda_bot = lambda; | 168 lambda_bot = lambda; |
190 } else { | 169 } else { |
191 lambda_top = lambda; | 170 lambda_top = lambda; |
192 } | 171 } |
193 power_ratio = std::fabs(power * reciprocal_power_target); | 172 power_ratio = std::fabs(power * reciprocal_power_target); |
194 ++iters; | 173 ++iters; |
195 } | 174 } |
196 } | 175 } |
197 | 176 |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
279 } | 258 } |
280 } | 259 } |
281 return filter_bank; | 260 return filter_bank; |
282 } | 261 } |
283 | 262 |
284 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, | 263 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, |
285 size_t start_freq, | 264 size_t start_freq, |
286 float* sols) { | 265 float* sols) { |
287 const float kMinPower = 1e-5f; | 266 const float kMinPower = 1e-5f; |
288 | 267 |
289 const float* pow_x0 = filtered_clear_pow_.get(); | 268 const float* pow_x0 = filtered_clear_pow_.data(); |
290 const float* pow_n0 = filtered_noise_pow_.get(); | 269 const float* pow_n0 = filtered_noise_pow_.data(); |
291 | 270 |
292 for (size_t n = 0; n < start_freq; ++n) { | 271 for (size_t n = 0; n < start_freq; ++n) { |
293 sols[n] = 1.f; | 272 sols[n] = 1.f; |
294 } | 273 } |
295 | 274 |
296 // Analytic solution for optimal gains. See paper for derivation. | 275 // Analytic solution for optimal gains. See paper for derivation. |
297 for (size_t n = start_freq; n < bank_size_; ++n) { | 276 for (size_t n = start_freq; n < bank_size_; ++n) { |
298 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) { | 277 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) { |
299 sols[n] = 1.f; | 278 sols[n] = 1.f; |
300 } else { | 279 } else { |
301 const float gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] + | 280 const float gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] + |
302 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n]; | 281 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n]; |
303 const float beta0 = | 282 const float beta0 = |
304 lambda * pow_x0[n] * (2.f - kRho) * pow_x0[n] * pow_n0[n]; | 283 lambda * pow_x0[n] * (2.f - kRho) * pow_x0[n] * pow_n0[n]; |
305 const float alpha0 = | 284 const float alpha0 = |
306 lambda * pow_x0[n] * (1.f - kRho) * pow_x0[n] * pow_x0[n]; | 285 lambda * pow_x0[n] * (1.f - kRho) * pow_x0[n] * pow_x0[n]; |
307 RTC_DCHECK_LT(alpha0, 0.f); | 286 RTC_DCHECK_LT(alpha0, 0.f); |
308 // The quadratic equation should always have real roots, but to guard | 287 // The quadratic equation should always have real roots, but to guard |
309 // against numerical errors we limit it to a minimum of zero. | 288 // against numerical errors we limit it to a minimum of zero. |
310 sols[n] = std::max( | 289 sols[n] = std::max( |
311 0.f, (-beta0 - std::sqrt(std::max( | 290 0.f, (-beta0 - std::sqrt(std::max( |
312 0.f, beta0 * beta0 - 4.f * alpha0 * gamma0))) / | 291 0.f, beta0 * beta0 - 4.f * alpha0 * gamma0))) / |
313 (2.f * alpha0)); | 292 (2.f * alpha0)); |
314 } | 293 } |
315 } | 294 } |
316 } | 295 } |
317 | 296 |
318 bool IntelligibilityEnhancer::IsSpeech(const float* audio) { | 297 bool IntelligibilityEnhancer::IsSpeech(const float* audio) { |
319 FloatToS16(audio, chunk_length_, &audio_s16_[0]); | 298 FloatToS16(audio, chunk_length_, audio_s16_.data()); |
320 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_); | 299 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_); |
321 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { | 300 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { |
322 chunks_since_voice_ = 0; | 301 chunks_since_voice_ = 0; |
323 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { | 302 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { |
324 ++chunks_since_voice_; | 303 ++chunks_since_voice_; |
325 } | 304 } |
326 return chunks_since_voice_ < kSpeechOffsetDelay; | 305 return chunks_since_voice_ < kSpeechOffsetDelay; |
327 } | 306 } |
328 | 307 |
329 } // namespace webrtc | 308 } // namespace webrtc |
OLD | NEW |