Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(263)

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@pow
Patch Set: Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 13 matching lines...) Expand all
24 namespace { 24 namespace {
25 25
26 const size_t kErbResolution = 2; 26 const size_t kErbResolution = 2;
27 const int kWindowSizeMs = 16; 27 const int kWindowSizeMs = 16;
28 const int kChunkSizeMs = 10; // Size provided by APM. 28 const int kChunkSizeMs = 10; // Size provided by APM.
29 const float kClipFreq = 200.0f; 29 const float kClipFreq = 200.0f;
30 const float kConfigRho = 0.02f; // Default production and interpretation SNR. 30 const float kConfigRho = 0.02f; // Default production and interpretation SNR.
31 const float kKbdAlpha = 1.5f; 31 const float kKbdAlpha = 1.5f;
32 const float kLambdaBot = -1.0f; // Extreme values in bisection 32 const float kLambdaBot = -1.0f; // Extreme values in bisection
33 const float kLambdaTop = -10e-18f; // search for lambda. 33 const float kLambdaTop = -10e-18f; // search for lambda.
34 const float kVoiceProbabilityThreshold = 0.02;
35 // Number of chunks after voice activity which is still considered speech.
36 const size_t kSpeechOffsetDelay = 80;
37 const float kDecayRate = 0.97f; // Power estimation decay rate.
hlundin-webrtc 2016/02/15 13:05:11 Two spaces before comment.
hlundin-webrtc 2016/02/15 13:05:11 You change this value from 0.9 to 0.97. Can you ex
aluebs-webrtc 2016/02/19 03:56:30 For this algorithm we care about the long-time psd
aluebs-webrtc 2016/02/19 03:56:31 Done.
38 const float kGainChangeLimit = 0.1f; // Maximum change in gain.
turaj 2016/02/13 00:09:42 Is kGainChangeLimit relative to current value, or
hlundin-webrtc 2016/02/15 13:05:11 Two spaces before comment.
aluebs-webrtc 2016/02/19 03:56:30 It was an absolute limit, but I agree that a relat
aluebs-webrtc 2016/02/19 03:56:31 Done.
39 const float kRho = 0.0004f;
hlundin-webrtc 2016/02/15 13:05:11 This value is also changed...
aluebs-webrtc 2016/02/19 03:56:30 It changed to be squared, which is the only way it
40
34 41
35 // Returns dot product of vectors |a| and |b| with size |length|. 42 // Returns dot product of vectors |a| and |b| with size |length|.
36 float DotProduct(const float* a, const float* b, size_t length) { 43 float DotProduct(const float* a, const float* b, size_t length) {
37 float ret = 0.f; 44 float ret = 0.f;
38 for (size_t i = 0; i < length; ++i) { 45 for (size_t i = 0; i < length; ++i) {
39 ret = fmaf(a[i], b[i], ret); 46 ret = fmaf(a[i], b[i], ret);
40 } 47 }
41 return ret; 48 return ret;
42 } 49 }
43 50
(...skipping 20 matching lines...) Expand all
64 size_t in_channels, 71 size_t in_channels,
65 size_t frames, 72 size_t frames,
66 size_t /* out_channels */, 73 size_t /* out_channels */,
67 std::complex<float>* const* out_block) { 74 std::complex<float>* const* out_block) {
68 RTC_DCHECK_EQ(parent_->freqs_, frames); 75 RTC_DCHECK_EQ(parent_->freqs_, frames);
69 for (size_t i = 0; i < in_channels; ++i) { 76 for (size_t i = 0; i < in_channels; ++i) {
70 parent_->ProcessClearBlock(in_block[i], out_block[i]); 77 parent_->ProcessClearBlock(in_block[i], out_block[i]);
71 } 78 }
72 } 79 }
73 80
74 IntelligibilityEnhancer::IntelligibilityEnhancer() 81 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
75 : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) { 82 size_t num_render_channels)
76 }
77
78 IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)
79 : freqs_(RealFourier::ComplexLength( 83 : freqs_(RealFourier::ComplexLength(
80 RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))), 84 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
81 window_size_(static_cast<size_t>(1 << RealFourier::FftOrder(freqs_))), 85 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),
82 chunk_length_( 86 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),
83 static_cast<size_t>(config.sample_rate_hz * kChunkSizeMs / 1000)), 87 sample_rate_hz_(sample_rate_hz),
84 bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)), 88 num_render_channels_(num_render_channels),
85 sample_rate_hz_(config.sample_rate_hz), 89 clear_power_estimator_(freqs_, kDecayRate),
86 erb_resolution_(kErbResolution),
87 num_capture_channels_(config.num_capture_channels),
88 num_render_channels_(config.num_render_channels),
89 analysis_rate_(config.analysis_rate),
90 active_(true),
91 clear_power_(freqs_, config.decay_rate),
92 noise_power_(freqs_, 0.f),
93 filtered_clear_pow_(new float[bank_size_]), 90 filtered_clear_pow_(new float[bank_size_]),
94 filtered_noise_pow_(new float[bank_size_]), 91 filtered_noise_pow_(new float[bank_size_]),
95 center_freqs_(new float[bank_size_]), 92 center_freqs_(new float[bank_size_]),
96 render_filter_bank_(CreateErbBank(freqs_)), 93 render_filter_bank_(CreateErbBank(freqs_)),
97 rho_(new float[bank_size_]),
98 gains_eq_(new float[bank_size_]), 94 gains_eq_(new float[bank_size_]),
99 gain_applier_(freqs_, config.gain_change_limit), 95 gain_applier_(freqs_, kGainChangeLimit),
100 temp_render_out_buffer_(chunk_length_, num_render_channels_), 96 temp_render_out_buffer_(chunk_length_, num_render_channels_),
101 kbd_window_(new float[window_size_]),
102 render_callback_(this), 97 render_callback_(this),
103 block_count_(0), 98 audio_s16_(chunk_length_),
104 analysis_step_(0) { 99 chunks_since_voice_(kSpeechOffsetDelay),
105 RTC_DCHECK_LE(config.rho, 1.0f); 100 is_speech_(false) {
101 RTC_DCHECK_LE(kRho, 1.f);
106 102
107 memset(filtered_clear_pow_.get(), 103 memset(filtered_clear_pow_.get(),
108 0, 104 0,
109 bank_size_ * sizeof(filtered_clear_pow_[0])); 105 bank_size_ * sizeof(filtered_clear_pow_[0]));
110 memset(filtered_noise_pow_.get(), 106 memset(filtered_noise_pow_.get(),
111 0, 107 0,
112 bank_size_ * sizeof(filtered_noise_pow_[0])); 108 bank_size_ * sizeof(filtered_noise_pow_[0]));
113 109
114 // Assumes all rho equal. 110 float freqs_khz = kClipFreq / 1000.f;
hlundin-webrtc 2016/02/15 13:05:11 This is const too. And it should probably be named
aluebs-webrtc 2016/02/19 03:56:30 I removed it and updated the constant directly.
115 for (size_t i = 0; i < bank_size_; ++i) { 111 size_t erb_index = static_cast<size_t>(ceilf(
hlundin-webrtc 2016/02/15 13:05:11 const
aluebs-webrtc 2016/02/19 03:56:30 Done.
116 rho_[i] = config.rho * config.rho; 112 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.f));
117 } 113 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);
118 114
119 float freqs_khz = kClipFreq / 1000.0f; 115 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));
120 size_t erb_index = static_cast<size_t>(ceilf( 116 std::vector<float> kbd_window(window_size);
121 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f)); 117 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);
122 start_freq_ = std::max(static_cast<size_t>(1), erb_index * erb_resolution_);
123
124 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,
125 kbd_window_.get());
126 render_mangler_.reset(new LappedTransform( 118 render_mangler_.reset(new LappedTransform(
127 num_render_channels_, num_render_channels_, chunk_length_, 119 num_render_channels_, num_render_channels_, chunk_length_,
128 kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_)); 120 &kbd_window[0], window_size, window_size / 2, &render_callback_));
129 } 121 }
130 122
131 void IntelligibilityEnhancer::SetCaptureNoiseEstimate( 123 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
132 std::vector<float> noise) { 124 std::vector<float> noise) {
133 if (capture_filter_bank_.size() != bank_size_ || 125 if (capture_filter_bank_.size() != bank_size_ ||
134 capture_filter_bank_[0].size() != noise.size()) { 126 capture_filter_bank_[0].size() != noise.size()) {
135 capture_filter_bank_ = CreateErbBank(noise.size()); 127 capture_filter_bank_ = CreateErbBank(noise.size());
128 noise_power_estimator_.reset(new PowerEstimator(noise.size(), kDecayRate));
136 } 129 }
137 if (noise.size() != noise_power_.size()) { 130 noise_power_estimator_->Step(&noise[0]);
138 noise_power_.resize(noise.size());
139 }
140 for (size_t i = 0; i < noise.size(); ++i) {
141 noise_power_[i] = noise[i] * noise[i];
142 }
143 } 131 }
144 132
145 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, 133 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
146 int sample_rate_hz, 134 int sample_rate_hz,
147 size_t num_channels) { 135 size_t num_channels) {
148 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz); 136 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
149 RTC_CHECK_EQ(num_render_channels_, num_channels); 137 RTC_CHECK_EQ(num_render_channels_, num_channels);
150 138 is_speech_ = IsSpeech(audio[0]);
151 if (active_) { 139 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());
152 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels()); 140 for (size_t i = 0; i < num_render_channels_; ++i) {
153 } 141 memcpy(audio[i], temp_render_out_buffer_.channels()[i],
154 142 chunk_length_ * sizeof(**audio));
155 if (active_) {
156 for (size_t i = 0; i < num_render_channels_; ++i) {
157 memcpy(audio[i], temp_render_out_buffer_.channels()[i],
158 chunk_length_ * sizeof(**audio));
159 }
160 } 143 }
161 } 144 }
162 145
163 void IntelligibilityEnhancer::ProcessClearBlock( 146 void IntelligibilityEnhancer::ProcessClearBlock(
164 const std::complex<float>* in_block, 147 const std::complex<float>* in_block,
165 std::complex<float>* out_block) { 148 std::complex<float>* out_block) {
166 if (block_count_ < 2) { 149 if (is_speech_) {
167 memset(out_block, 0, freqs_ * sizeof(*out_block)); 150 clear_power_estimator_.Step(in_block);
168 ++block_count_;
169 return;
170 } 151 }
171 152 MapToErbBands(clear_power_estimator_.power(),
172 // TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary.
173 if (true) {
174 clear_power_.Step(in_block);
175 if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {
176 AnalyzeClearBlock();
177 ++analysis_step_;
178 }
179 ++block_count_;
180 }
181
182 if (active_) {
183 gain_applier_.Apply(in_block, out_block);
184 }
185 }
186
187 void IntelligibilityEnhancer::AnalyzeClearBlock() {
188 const float* clear_power = clear_power_.Power();
189 MapToErbBands(clear_power,
190 render_filter_bank_, 153 render_filter_bank_,
191 filtered_clear_pow_.get()); 154 filtered_clear_pow_.get());
192 MapToErbBands(&noise_power_[0], 155 MapToErbBands(noise_power_estimator_->power(),
turaj 2016/02/13 00:09:42 I'm confused that why we are back to using PowerEs
aluebs-webrtc 2016/02/19 03:56:30 To be consistent with the PSD estimation from the
193 capture_filter_bank_, 156 capture_filter_bank_,
194 filtered_noise_pow_.get()); 157 filtered_noise_pow_.get());
195 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get()); 158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());
196 const float power_target = std::accumulate( 159 const float power_target = std::accumulate(
197 clear_power, clear_power + freqs_, 0.f); 160 clear_power_estimator_.power(),
161 clear_power_estimator_.power() + freqs_,
162 0.f);
198 const float power_top = 163 const float power_top =
199 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); 164 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
200 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get()); 165 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());
201 const float power_bot = 166 const float power_bot =
202 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); 167 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
203 if (power_target >= power_bot && power_target <= power_top) { 168 if (power_target >= power_bot && power_target <= power_top) {
204 SolveForLambda(power_target, power_bot, power_top); 169 SolveForLambda(power_target, power_bot, power_top);
205 UpdateErbGains(); 170 UpdateErbGains();
206 } // Else experiencing power underflow, so do nothing. 171 } // Else experiencing power underflow, so do nothing.
172 gain_applier_.Apply(in_block, out_block);
207 } 173 }
208 174
209 void IntelligibilityEnhancer::SolveForLambda(float power_target, 175 void IntelligibilityEnhancer::SolveForLambda(float power_target,
210 float power_bot, 176 float power_bot,
211 float power_top) { 177 float power_top) {
212 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values 178 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values
213 const int kMaxIters = 100; // for these, based on experiments. 179 const int kMaxIters = 100; // for these, based on experiments.
214 180
215 const float reciprocal_power_target = 1.f / power_target; 181 const float reciprocal_power_target = 1.f / power_target;
216 float lambda_bot = kLambdaBot; 182 float lambda_bot = kLambdaBot;
217 float lambda_top = kLambdaTop; 183 float lambda_top = kLambdaTop;
218 float power_ratio = 2.0f; // Ratio of achieved power to target power. 184 float power_ratio = 2.f; // Ratio of achieved power to target power.
219 int iters = 0; 185 int iters = 0;
220 while (std::fabs(power_ratio - 1.0f) > kConvergeThresh && 186 while (std::fabs(power_ratio - 1.f) > kConvergeThresh &&
221 iters <= kMaxIters) { 187 iters <= kMaxIters) {
222 const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.0f; 188 const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.f;
223 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get()); 189 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());
224 const float power = 190 const float power =
225 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); 191 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
226 if (power < power_target) { 192 if (power < power_target) {
227 lambda_bot = lambda; 193 lambda_bot = lambda;
228 } else { 194 } else {
229 lambda_top = lambda; 195 lambda_top = lambda;
230 } 196 }
231 power_ratio = std::fabs(power * reciprocal_power_target); 197 power_ratio = std::fabs(power * reciprocal_power_target);
232 ++iters; 198 ++iters;
233 } 199 }
234 } 200 }
235 201
236 void IntelligibilityEnhancer::UpdateErbGains() { 202 void IntelligibilityEnhancer::UpdateErbGains() {
237 // (ERB gain) = filterbank' * (freq gain) 203 // (ERB gain) = filterbank' * (freq gain)
238 float* gains = gain_applier_.target(); 204 float* gains = gain_applier_.target();
239 for (size_t i = 0; i < freqs_; ++i) { 205 for (size_t i = 0; i < freqs_; ++i) {
240 gains[i] = 0.0f; 206 gains[i] = 0.f;
241 for (size_t j = 0; j < bank_size_; ++j) { 207 for (size_t j = 0; j < bank_size_; ++j) {
242 gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]); 208 gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]);
243 } 209 }
244 } 210 }
245 } 211 }
246 212
247 size_t IntelligibilityEnhancer::GetBankSize(int sample_rate, 213 size_t IntelligibilityEnhancer::GetBankSize(int sample_rate,
248 size_t erb_resolution) { 214 size_t erb_resolution) {
249 float freq_limit = sample_rate / 2000.0f; 215 float freq_limit = sample_rate / 2000.f;
250 size_t erb_scale = static_cast<size_t>(ceilf( 216 size_t erb_scale = static_cast<size_t>(ceilf(
251 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f)); 217 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.f));
252 return erb_scale * erb_resolution; 218 return erb_scale * erb_resolution;
253 } 219 }
254 220
255 std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank( 221 std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(
256 size_t num_freqs) { 222 size_t num_freqs) {
257 std::vector<std::vector<float>> filter_bank(bank_size_); 223 std::vector<std::vector<float>> filter_bank(bank_size_);
258 size_t lf = 1, rf = 4; 224 size_t lf = 1, rf = 4;
259 225
260 for (size_t i = 0; i < bank_size_; ++i) { 226 for (size_t i = 0; i < bank_size_; ++i) {
261 float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_)); 227 float abs_temp = fabsf((i + 1.f) / static_cast<float>(kErbResolution));
262 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp)); 228 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));
263 center_freqs_[i] -= 14678.49f; 229 center_freqs_[i] -= 14678.49f;
264 } 230 }
265 float last_center_freq = center_freqs_[bank_size_ - 1]; 231 float last_center_freq = center_freqs_[bank_size_ - 1];
266 for (size_t i = 0; i < bank_size_; ++i) { 232 for (size_t i = 0; i < bank_size_; ++i) {
267 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq; 233 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq;
268 } 234 }
269 235
270 for (size_t i = 0; i < bank_size_; ++i) { 236 for (size_t i = 0; i < bank_size_; ++i) {
271 filter_bank[i].resize(num_freqs); 237 filter_bank[i].resize(num_freqs);
272 } 238 }
273 239
274 for (size_t i = 1; i <= bank_size_; ++i) { 240 for (size_t i = 1; i <= bank_size_; ++i) {
275 size_t lll, ll, rr, rrr; 241 size_t lll, ll, rr, rrr;
276 static const size_t kOne = 1; // Avoids repeated static_cast<>s below.
277 lll = static_cast<size_t>(round( 242 lll = static_cast<size_t>(round(
278 center_freqs_[std::max(kOne, i - lf) - 1] * num_freqs / 243 center_freqs_[std::max(1ul, i - lf) - 1] * num_freqs /
279 (0.5f * sample_rate_hz_))); 244 (0.5f * sample_rate_hz_)));
280 ll = static_cast<size_t>(round( 245 ll = static_cast<size_t>(round(
281 center_freqs_[std::max(kOne, i) - 1] * num_freqs / 246 center_freqs_[std::max(1ul, i) - 1] * num_freqs /
282 (0.5f * sample_rate_hz_))); 247 (0.5f * sample_rate_hz_)));
283 lll = std::min(num_freqs, std::max(lll, kOne)) - 1; 248 lll = std::min(num_freqs, std::max(lll, 1ul)) - 1;
284 ll = std::min(num_freqs, std::max(ll, kOne)) - 1; 249 ll = std::min(num_freqs, std::max(ll, 1ul)) - 1;
285 250
286 rrr = static_cast<size_t>(round( 251 rrr = static_cast<size_t>(round(
287 center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs / 252 center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /
288 (0.5f * sample_rate_hz_))); 253 (0.5f * sample_rate_hz_)));
289 rr = static_cast<size_t>(round( 254 rr = static_cast<size_t>(round(
290 center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs / 255 center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /
291 (0.5f * sample_rate_hz_))); 256 (0.5f * sample_rate_hz_)));
292 rrr = std::min(num_freqs, std::max(rrr, kOne)) - 1; 257 rrr = std::min(num_freqs, std::max(rrr, 1ul)) - 1;
293 rr = std::min(num_freqs, std::max(rr, kOne)) - 1; 258 rr = std::min(num_freqs, std::max(rr, 1ul)) - 1;
294 259
295 float step, element; 260 float step, element;
296 261
297 step = 1.0f / (ll - lll); 262 step = 1.f / (ll - lll);
298 element = 0.0f; 263 element = 0.f;
299 for (size_t j = lll; j <= ll; ++j) { 264 for (size_t j = lll; j <= ll; ++j) {
300 filter_bank[i - 1][j] = element; 265 filter_bank[i - 1][j] = element;
301 element += step; 266 element += step;
302 } 267 }
303 step = 1.0f / (rrr - rr); 268 step = 1.f / (rrr - rr);
304 element = 1.0f; 269 element = 1.f;
305 for (size_t j = rr; j <= rrr; ++j) { 270 for (size_t j = rr; j <= rrr; ++j) {
306 filter_bank[i - 1][j] = element; 271 filter_bank[i - 1][j] = element;
307 element -= step; 272 element -= step;
308 } 273 }
309 for (size_t j = ll; j <= rr; ++j) { 274 for (size_t j = ll; j <= rr; ++j) {
310 filter_bank[i - 1][j] = 1.0f; 275 filter_bank[i - 1][j] = 1.f;
311 } 276 }
312 } 277 }
313 278
314 float sum; 279 float sum;
315 for (size_t i = 0; i < num_freqs; ++i) { 280 for (size_t i = 0; i < num_freqs; ++i) {
316 sum = 0.0f; 281 sum = 0.f;
317 for (size_t j = 0; j < bank_size_; ++j) { 282 for (size_t j = 0; j < bank_size_; ++j) {
318 sum += filter_bank[j][i]; 283 sum += filter_bank[j][i];
319 } 284 }
320 for (size_t j = 0; j < bank_size_; ++j) { 285 for (size_t j = 0; j < bank_size_; ++j) {
321 filter_bank[j][i] /= sum; 286 filter_bank[j][i] /= sum;
322 } 287 }
323 } 288 }
324 return filter_bank; 289 return filter_bank;
325 } 290 }
326 291
327 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, 292 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
328 size_t start_freq, 293 size_t start_freq,
329 float* sols) { 294 float* sols) {
330 bool quadratic = (kConfigRho < 1.0f); 295 bool quadratic = (kConfigRho < 1.f);
331 const float* pow_x0 = filtered_clear_pow_.get(); 296 const float* pow_x0 = filtered_clear_pow_.get();
332 const float* pow_n0 = filtered_noise_pow_.get(); 297 const float* pow_n0 = filtered_noise_pow_.get();
333 298
334 for (size_t n = 0; n < start_freq; ++n) { 299 for (size_t n = 0; n < start_freq; ++n) {
335 sols[n] = 1.0f; 300 sols[n] = 1.f;
336 } 301 }
337 302
338 // Analytic solution for optimal gains. See paper for derivation. 303 // Analytic solution for optimal gains. See paper for derivation.
339 for (size_t n = start_freq - 1; n < bank_size_; ++n) { 304 for (size_t n = start_freq - 1; n < bank_size_; ++n) {
340 float alpha0, beta0, gamma0; 305 float alpha0, beta0, gamma0;
341 gamma0 = 0.5f * rho_[n] * pow_x0[n] * pow_n0[n] + 306 gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +
342 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n]; 307 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];
343 beta0 = lambda * pow_x0[n] * (2 - rho_[n]) * pow_x0[n] * pow_n0[n]; 308 beta0 = lambda * pow_x0[n] * (2 - kRho) * pow_x0[n] * pow_n0[n];
344 if (quadratic) { 309 if (quadratic) {
345 alpha0 = lambda * pow_x0[n] * (1 - rho_[n]) * pow_x0[n] * pow_x0[n]; 310 alpha0 = lambda * pow_x0[n] * (1 - kRho) * pow_x0[n] * pow_x0[n];
346 sols[n] = 311 sols[n] =
347 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) / (2 * alpha0); 312 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) / (2 * alpha0);
348 } else { 313 } else {
349 sols[n] = -gamma0 / beta0; 314 sols[n] = -gamma0 / beta0;
350 } 315 }
351 sols[n] = fmax(0, sols[n]); 316 sols[n] = fmax(0, sols[n]);
352 } 317 }
353 } 318 }
354 319
355 bool IntelligibilityEnhancer::active() const { 320 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {
356 return active_; 321 FloatToS16(audio, chunk_length_, &audio_s16_[0]);
322 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);
323 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {
turaj 2016/02/13 00:09:42 I thought we gonna use the energy-based VAD with h
aluebs-webrtc 2016/02/19 03:56:30 As discussed offline, having the pitch-based VAD w
324 chunks_since_voice_ = 0;
325 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {
turaj 2016/02/13 00:09:42 If energy-based VAD is used, do we still need this
aluebs-webrtc 2016/02/19 03:56:30 No, but I think we should use the pitch-based VAD.
326 ++chunks_since_voice_;
327 }
328 return chunks_since_voice_ < kSpeechOffsetDelay;
357 } 329 }
358 330
359 } // namespace webrtc 331 } // namespace webrtc
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698