OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 42 matching lines...) | |
53 const std::vector<std::vector<float>>& filter_bank, | 53 const std::vector<std::vector<float>>& filter_bank, |
54 float* result) { | 54 float* result) { |
55 for (size_t i = 0; i < filter_bank.size(); ++i) { | 55 for (size_t i = 0; i < filter_bank.size(); ++i) { |
56 RTC_DCHECK_GT(filter_bank[i].size(), 0u); | 56 RTC_DCHECK_GT(filter_bank[i].size(), 0u); |
57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size()); | 57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size()); |
58 } | 58 } |
59 } | 59 } |
60 | 60 |
61 } // namespace | 61 } // namespace |
62 | 62 |
63 IntelligibilityEnhancer::TransformCallback::TransformCallback( | |
64 IntelligibilityEnhancer* parent) | |
65 : parent_(parent) { | |
66 } | |
67 | |
68 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock( | |
69 const std::complex<float>* const* in_block, | |
70 size_t in_channels, | |
71 size_t frames, | |
72 size_t /* out_channels */, | |
73 std::complex<float>* const* out_block) { | |
74 RTC_DCHECK_EQ(parent_->freqs_, frames); | |
75 for (size_t i = 0; i < in_channels; ++i) { | |
76 parent_->ProcessClearBlock(in_block[i], out_block[i]); | |
77 } | |
78 } | |
79 | |
80 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, | 63 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, |
81 size_t num_render_channels) | 64 size_t num_render_channels) |
82 : freqs_(RealFourier::ComplexLength( | 65 : freqs_(RealFourier::ComplexLength( |
83 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), | 66 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), |
84 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)), | 67 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)), |
85 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)), | 68 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)), |
86 sample_rate_hz_(sample_rate_hz), | 69 sample_rate_hz_(sample_rate_hz), |
87 num_render_channels_(num_render_channels), | 70 num_render_channels_(num_render_channels), |
88 clear_power_estimator_(freqs_, kDecayRate), | 71 clear_power_estimator_(freqs_, kDecayRate), |
89 noise_power_estimator_( | 72 noise_power_estimator_( |
90 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)), | 73 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)), |
91 filtered_clear_pow_(new float[bank_size_]), | 74 filtered_clear_pow_(bank_size_, 0.f), |
92 filtered_noise_pow_(new float[bank_size_]), | 75 filtered_noise_pow_(bank_size_, 0.f), |
93 center_freqs_(new float[bank_size_]), | 76 center_freqs_(bank_size_), |
94 render_filter_bank_(CreateErbBank(freqs_)), | 77 render_filter_bank_(CreateErbBank(freqs_)), |
95 gains_eq_(new float[bank_size_]), | 78 gains_eq_(bank_size_), |
96 gain_applier_(freqs_, kMaxRelativeGainChange), | 79 gain_applier_(freqs_, kMaxRelativeGainChange), |
97 temp_render_out_buffer_(chunk_length_, num_render_channels_), | |
98 render_callback_(this), | |
99 audio_s16_(chunk_length_), | 80 audio_s16_(chunk_length_), |
100 chunks_since_voice_(kSpeechOffsetDelay), | 81 chunks_since_voice_(kSpeechOffsetDelay), |
101 is_speech_(false) { | 82 is_speech_(false) { |
102 RTC_DCHECK_LE(kRho, 1.f); | 83 RTC_DCHECK_LE(kRho, 1.f); |
103 | 84 |
104 memset(filtered_clear_pow_.get(), 0, | |
105 bank_size_ * sizeof(filtered_clear_pow_[0])); | |
106 memset(filtered_noise_pow_.get(), 0, | |
107 bank_size_ * sizeof(filtered_noise_pow_[0])); | |
108 | |
109 const size_t erb_index = static_cast<size_t>( | 85 const size_t erb_index = static_cast<size_t>( |
110 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + | 86 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + |
111 43.f)); | 87 43.f)); |
112 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); | 88 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); |
113 | 89 |
114 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_)); | 90 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_)); |
115 std::vector<float> kbd_window(window_size); | 91 std::vector<float> kbd_window(window_size); |
116 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]); | 92 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]); |
117 render_mangler_.reset(new LappedTransform( | 93 render_mangler_.reset(new LappedTransform( |
118 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0], | 94 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0], |
119 window_size, window_size / 2, &render_callback_)); | 95 window_size, window_size / 2, this)); |
120 } | 96 } |
121 | 97 |
122 void IntelligibilityEnhancer::SetCaptureNoiseEstimate( | 98 void IntelligibilityEnhancer::SetCaptureNoiseEstimate( |
123 std::vector<float> noise) { | 99 std::vector<float> noise) { |
124 if (capture_filter_bank_.size() != bank_size_ || | 100 if (capture_filter_bank_.size() != bank_size_ || |
125 capture_filter_bank_[0].size() != noise.size()) { | 101 capture_filter_bank_[0].size() != noise.size()) { |
126 capture_filter_bank_ = CreateErbBank(noise.size()); | 102 capture_filter_bank_ = CreateErbBank(noise.size()); |
127 noise_power_estimator_.reset( | 103 noise_power_estimator_.reset( |
128 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate)); | 104 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate)); |
129 } | 105 } |
130 noise_power_estimator_->Step(&noise[0]); | 106 noise_power_estimator_->Step(&noise[0]); |
131 } | 107 } |
132 | 108 |
133 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, | 109 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, |
134 int sample_rate_hz, | 110 int sample_rate_hz, |
135 size_t num_channels) { | 111 size_t num_channels) { |
136 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz); | 112 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz); |
137 RTC_CHECK_EQ(num_render_channels_, num_channels); | 113 RTC_CHECK_EQ(num_render_channels_, num_channels); |
138 is_speech_ = IsSpeech(audio[0]); | 114 is_speech_ = IsSpeech(audio[0]); |
139 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels()); | 115 render_mangler_->ProcessChunk(audio, audio); |
140 for (size_t i = 0; i < num_render_channels_; ++i) { | |
141 memcpy(audio[i], temp_render_out_buffer_.channels()[i], | |
142 chunk_length_ * sizeof(**audio)); | |
143 } | |
144 } | 116 } |
145 | 117 |
146 void IntelligibilityEnhancer::ProcessClearBlock( | 118 void IntelligibilityEnhancer::ProcessAudioBlock( |
147 const std::complex<float>* in_block, | 119 const std::complex<float>* const* in_block, |
148 std::complex<float>* out_block) { | 120 size_t in_channels, |
121 size_t frames, | |
122 size_t /* out_channels */, | |
123 std::complex<float>* const* out_block) { | |
124 RTC_DCHECK_EQ(freqs_, frames); | |
149 if (is_speech_) { | 125 if (is_speech_) { |
150 clear_power_estimator_.Step(in_block); | 126 clear_power_estimator_.Step(in_block[0]); |
turaj
2016/02/24 16:00:17
I suppose this change and changes in lines 146-148 […]
aluebs-webrtc
2016/02/25 00:18:37
Yes, as you point out, the main change is removing […]
| |
151 } | 127 } |
152 const std::vector<float>& clear_power = clear_power_estimator_.power(); | 128 const std::vector<float>& clear_power = clear_power_estimator_.power(); |
153 const std::vector<float>& noise_power = noise_power_estimator_->power(); | 129 const std::vector<float>& noise_power = noise_power_estimator_->power(); |
154 MapToErbBands(&clear_power[0], render_filter_bank_, | 130 MapToErbBands(&clear_power[0], render_filter_bank_, |
155 filtered_clear_pow_.get()); | 131 &filtered_clear_pow_[0]); |
hlundin-webrtc
2016/02/24 10:17:03
I suggest you use .data() instead of &...[0]. See […]
aluebs-webrtc
2016/02/25 00:18:37
I am aware of that, but I thought it was still not […]
| |
156 MapToErbBands(&noise_power[0], capture_filter_bank_, | 132 MapToErbBands(&noise_power[0], capture_filter_bank_, |
157 filtered_noise_pow_.get()); | 133 &filtered_noise_pow_[0]); |
158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get()); | 134 SolveForGainsGivenLambda(kLambdaTop, start_freq_, &gains_eq_[0]); |
159 const float power_target = | 135 const float power_target = |
160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f); | 136 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f); |
161 const float power_top = | 137 const float power_top = |
162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); | 138 DotProduct(&gains_eq_[0], &filtered_clear_pow_[0], bank_size_); |
163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get()); | 139 SolveForGainsGivenLambda(kLambdaBot, start_freq_, &gains_eq_[0]); |
164 const float power_bot = | 140 const float power_bot = |
165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); | 141 DotProduct(&gains_eq_[0], &filtered_clear_pow_[0], bank_size_); |
166 if (power_target >= power_bot && power_target <= power_top) { | 142 if (power_target >= power_bot && power_target <= power_top) { |
167 SolveForLambda(power_target); | 143 SolveForLambda(power_target); |
168 UpdateErbGains(); | 144 UpdateErbGains(); |
169 } // Else experiencing power underflow, so do nothing. | 145 } // Else experiencing power underflow, so do nothing. |
170 gain_applier_.Apply(in_block, out_block); | 146 for (size_t i = 0; i < in_channels; ++i) { |
147 gain_applier_.Apply(in_block[i], out_block[i]); | |
148 } | |
171 } | 149 } |
172 | 150 |
173 void IntelligibilityEnhancer::SolveForLambda(float power_target) { | 151 void IntelligibilityEnhancer::SolveForLambda(float power_target) { |
174 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values | 152 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values |
175 const int kMaxIters = 100; // for these, based on experiments. | 153 const int kMaxIters = 100; // for these, based on experiments. |
176 | 154 |
177 const float reciprocal_power_target = | 155 const float reciprocal_power_target = |
178 1.f / (power_target + std::numeric_limits<float>::epsilon()); | 156 1.f / (power_target + std::numeric_limits<float>::epsilon()); |
179 float lambda_bot = kLambdaBot; | 157 float lambda_bot = kLambdaBot; |
180 float lambda_top = kLambdaTop; | 158 float lambda_top = kLambdaTop; |
181 float power_ratio = 2.f; // Ratio of achieved power to target power. | 159 float power_ratio = 2.f; // Ratio of achieved power to target power. |
182 int iters = 0; | 160 int iters = 0; |
183 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) { | 161 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) { |
184 const float lambda = (lambda_bot + lambda_top) / 2.f; | 162 const float lambda = (lambda_bot + lambda_top) / 2.f; |
185 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get()); | 163 SolveForGainsGivenLambda(lambda, start_freq_, &gains_eq_[0]); |
186 const float power = | 164 const float power = |
187 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); | 165 DotProduct(&gains_eq_[0], &filtered_clear_pow_[0], bank_size_); |
188 if (power < power_target) { | 166 if (power < power_target) { |
189 lambda_bot = lambda; | 167 lambda_bot = lambda; |
190 } else { | 168 } else { |
191 lambda_top = lambda; | 169 lambda_top = lambda; |
192 } | 170 } |
193 power_ratio = std::fabs(power * reciprocal_power_target); | 171 power_ratio = std::fabs(power * reciprocal_power_target); |
194 ++iters; | 172 ++iters; |
195 } | 173 } |
196 } | 174 } |
197 | 175 |
(...skipping 81 matching lines...) | |
279 } | 257 } |
280 } | 258 } |
281 return filter_bank; | 259 return filter_bank; |
282 } | 260 } |
283 | 261 |
284 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, | 262 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, |
285 size_t start_freq, | 263 size_t start_freq, |
286 float* sols) { | 264 float* sols) { |
287 const float kMinPower = 1e-5; | 265 const float kMinPower = 1e-5; |
288 | 266 |
289 const float* pow_x0 = filtered_clear_pow_.get(); | 267 const float* pow_x0 = &filtered_clear_pow_[0]; |
290 const float* pow_n0 = filtered_noise_pow_.get(); | 268 const float* pow_n0 = &filtered_noise_pow_[0]; |
291 | 269 |
292 for (size_t n = 0; n < start_freq; ++n) { | 270 for (size_t n = 0; n < start_freq; ++n) { |
293 sols[n] = 1.f; | 271 sols[n] = 1.f; |
294 } | 272 } |
295 | 273 |
296 // Analytic solution for optimal gains. See paper for derivation. | 274 // Analytic solution for optimal gains. See paper for derivation. |
297 for (size_t n = start_freq; n < bank_size_; ++n) { | 275 for (size_t n = start_freq; n < bank_size_; ++n) { |
298 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) { | 276 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) { |
299 sols[n] = 1.f; | 277 sols[n] = 1.f; |
300 } else { | 278 } else { |
(...skipping 19 matching lines...) | |
320 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_); | 298 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_); |
321 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { | 299 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { |
322 chunks_since_voice_ = 0; | 300 chunks_since_voice_ = 0; |
323 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { | 301 } else if (chunks_since_voice_ < kSpeechOffsetDelay) { |
324 ++chunks_since_voice_; | 302 ++chunks_since_voice_; |
325 } | 303 } |
326 return chunks_since_voice_ < kSpeechOffsetDelay; | 304 return chunks_since_voice_ < kSpeechOffsetDelay; |
327 } | 305 } |
328 | 306 |
329 } // namespace webrtc | 307 } // namespace webrtc |
OLD | NEW |