OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 37 matching lines...) | |
48 | 48 |
49 IntelligibilityEnhancer::TransformCallback::TransformCallback( | 49 IntelligibilityEnhancer::TransformCallback::TransformCallback( |
50 IntelligibilityEnhancer* parent, | 50 IntelligibilityEnhancer* parent, |
51 IntelligibilityEnhancer::AudioSource source) | 51 IntelligibilityEnhancer::AudioSource source) |
52 : parent_(parent), source_(source) { | 52 : parent_(parent), source_(source) { |
53 } | 53 } |
54 | 54 |
55 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock( | 55 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock( |
56 const complex<float>* const* in_block, | 56 const complex<float>* const* in_block, |
57 int in_channels, | 57 int in_channels, |
58 int frames, | 58 size_t frames, |
59 int /* out_channels */, | 59 int /* out_channels */, |
60 complex<float>* const* out_block) { | 60 complex<float>* const* out_block) { |
61 DCHECK_EQ(parent_->freqs_, frames); | 61 DCHECK_EQ(parent_->freqs_, frames); |
62 for (int i = 0; i < in_channels; ++i) { | 62 for (int i = 0; i < in_channels; ++i) { |
63 parent_->DispatchAudio(source_, in_block[i], out_block[i]); | 63 parent_->DispatchAudio(source_, in_block[i], out_block[i]); |
64 } | 64 } |
65 } | 65 } |
66 | 66 |
67 IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution, | 67 IntelligibilityEnhancer::IntelligibilityEnhancer(size_t erb_resolution, |
68 int sample_rate_hz, | 68 int sample_rate_hz, |
69 int channels, | 69 int channels, |
70 int cv_type, | 70 int cv_type, |
71 float cv_alpha, | 71 float cv_alpha, |
72 int cv_win, | 72 size_t cv_win, |
73 int analysis_rate, | 73 int analysis_rate, |
74 int variance_rate, | 74 int variance_rate, |
75 float gain_limit) | 75 float gain_limit) |
76 : freqs_(RealFourier::ComplexLength( | 76 : freqs_(RealFourier::ComplexLength( |
77 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), | 77 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), |
78 window_size_(1 << RealFourier::FftOrder(freqs_)), | 78 window_size_(static_cast<size_t>(1 << RealFourier::FftOrder(freqs_))), |
79 chunk_length_(sample_rate_hz * kChunkSizeMs / 1000), | 79 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)), |
80 bank_size_(GetBankSize(sample_rate_hz, erb_resolution)), | 80 bank_size_(GetBankSize(sample_rate_hz, erb_resolution)), |
81 sample_rate_hz_(sample_rate_hz), | 81 sample_rate_hz_(sample_rate_hz), |
82 erb_resolution_(erb_resolution), | 82 erb_resolution_(erb_resolution), |
83 channels_(channels), | 83 channels_(channels), |
84 analysis_rate_(analysis_rate), | 84 analysis_rate_(analysis_rate), |
85 variance_rate_(variance_rate), | 85 variance_rate_(variance_rate), |
86 clear_variance_(freqs_, | 86 clear_variance_(freqs_, |
87 static_cast<VarianceType>(cv_type), | 87 static_cast<VarianceType>(cv_type), |
88 cv_win, | 88 cv_win, |
89 cv_alpha), | 89 cv_alpha), |
(...skipping 27 matching lines...) | |
117 temp_out_buffer_ = static_cast<float**>( | 117 temp_out_buffer_ = static_cast<float**>( |
118 malloc(sizeof(*temp_out_buffer_) * channels_ + | 118 malloc(sizeof(*temp_out_buffer_) * channels_ + |
119 sizeof(**temp_out_buffer_) * chunk_length_ * channels_)); | 119 sizeof(**temp_out_buffer_) * chunk_length_ * channels_)); |
120 for (int i = 0; i < channels_; ++i) { | 120 for (int i = 0; i < channels_; ++i) { |
121 temp_out_buffer_[i] = | 121 temp_out_buffer_[i] = |
122 reinterpret_cast<float*>(temp_out_buffer_ + channels_) + | 122 reinterpret_cast<float*>(temp_out_buffer_ + channels_) + |
123 chunk_length_ * i; | 123 chunk_length_ * i; |
124 } | 124 } |
125 | 125 |
126 // Assumes all rho equal. | 126 // Assumes all rho equal. |
127 for (int i = 0; i < bank_size_; ++i) { | 127 for (size_t i = 0; i < bank_size_; ++i) { |
128 rho_[i] = kConfigRho * kConfigRho; | 128 rho_[i] = kConfigRho * kConfigRho; |
129 } | 129 } |
130 | 130 |
131 float freqs_khz = kClipFreq / 1000.0f; | 131 float freqs_khz = kClipFreq / 1000.0f; |
132 int erb_index = static_cast<int>(ceilf( | 132 size_t erb_index = static_cast<size_t>(ceilf( |
133 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f)); | 133 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f)); |
134 start_freq_ = std::max(1, erb_index * erb_resolution); | 134 start_freq_ = std::max(static_cast<size_t>(1), erb_index * erb_resolution); |
135 | 135 |
136 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_, | 136 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_, |
137 kbd_window_.get()); | 137 kbd_window_.get()); |
138 render_mangler_.reset(new LappedTransform( | 138 render_mangler_.reset(new LappedTransform( |
139 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_, | 139 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_, |
140 window_size_ / 2, &render_callback_)); | 140 window_size_ / 2, &render_callback_)); |
141 capture_mangler_.reset(new LappedTransform( | 141 capture_mangler_.reset(new LappedTransform( |
142 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_, | 142 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_, |
143 window_size_ / 2, &capture_callback_)); | 143 window_size_ / 2, &capture_callback_)); |
144 } | 144 } |
145 | 145 |
146 IntelligibilityEnhancer::~IntelligibilityEnhancer() { | 146 IntelligibilityEnhancer::~IntelligibilityEnhancer() { |
147 WebRtcVad_Free(vad_low_); | 147 WebRtcVad_Free(vad_low_); |
148 WebRtcVad_Free(vad_high_); | 148 WebRtcVad_Free(vad_high_); |
149 free(temp_out_buffer_); | 149 free(temp_out_buffer_); |
150 } | 150 } |
151 | 151 |
152 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) { | 152 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) { |
153 for (int i = 0; i < chunk_length_; ++i) { | 153 for (size_t i = 0; i < chunk_length_; ++i) { |
154 vad_tmp_buffer_[i] = (int16_t)audio[0][i]; | 154 vad_tmp_buffer_[i] = (int16_t)audio[0][i]; |
155 } | 155 } |
156 has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_, | 156 has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_, |
157 vad_tmp_buffer_.get(), chunk_length_) == 1; | 157 vad_tmp_buffer_.get(), chunk_length_) == 1; |
158 | 158 |
159 // Process and enhance chunk of |audio| | 159 // Process and enhance chunk of |audio| |
160 render_mangler_->ProcessChunk(audio, temp_out_buffer_); | 160 render_mangler_->ProcessChunk(audio, temp_out_buffer_); |
161 | 161 |
162 for (int i = 0; i < channels_; ++i) { | 162 for (int i = 0; i < channels_; ++i) { |
163 memcpy(audio[i], temp_out_buffer_[i], | 163 memcpy(audio[i], temp_out_buffer_[i], |
164 chunk_length_ * sizeof(**temp_out_buffer_)); | 164 chunk_length_ * sizeof(**temp_out_buffer_)); |
165 } | 165 } |
166 } | 166 } |
167 | 167 |
168 void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) { | 168 void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) { |
169 for (int i = 0; i < chunk_length_; ++i) { | 169 for (size_t i = 0; i < chunk_length_; ++i) { |
170 vad_tmp_buffer_[i] = (int16_t)audio[0][i]; | 170 vad_tmp_buffer_[i] = (int16_t)audio[0][i]; |
171 } | 171 } |
172 // TODO(bercic): The VAD was always detecting voice in the noise stream, | 172 // TODO(bercic): The VAD was always detecting voice in the noise stream, |
173 // no matter what the aggressiveness, so it was temporarily disabled here. | 173 // no matter what the aggressiveness, so it was temporarily disabled here. |
174 | 174 |
175 #if 0 | 175 #if 0 |
176 if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(), | 176 if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(), |
177 chunk_length_) == 1) { | 177 chunk_length_) == 1) { |
178 printf("capture HAS speech\n"); | 178 printf("capture HAS speech\n"); |
179 return; | 179 return; |
(...skipping 89 matching lines...) | |
269 lambda_top = lambda; | 269 lambda_top = lambda; |
270 } | 270 } |
271 power_ratio = std::fabs(power * reciprocal_power_target); | 271 power_ratio = std::fabs(power * reciprocal_power_target); |
272 ++iters; | 272 ++iters; |
273 } | 273 } |
274 } | 274 } |
275 | 275 |
276 void IntelligibilityEnhancer::UpdateErbGains() { | 276 void IntelligibilityEnhancer::UpdateErbGains() { |
277 // (ERB gain) = filterbank' * (freq gain) | 277 // (ERB gain) = filterbank' * (freq gain) |
278 float* gains = gain_applier_.target(); | 278 float* gains = gain_applier_.target(); |
279 for (int i = 0; i < freqs_; ++i) { | 279 for (size_t i = 0; i < freqs_; ++i) { |
280 gains[i] = 0.0f; | 280 gains[i] = 0.0f; |
281 for (int j = 0; j < bank_size_; ++j) { | 281 for (size_t j = 0; j < bank_size_; ++j) { |
282 gains[i] = fmaf(filter_bank_[j][i], gains_eq_[j], gains[i]); | 282 gains[i] = fmaf(filter_bank_[j][i], gains_eq_[j], gains[i]); |
283 } | 283 } |
284 } | 284 } |
285 } | 285 } |
286 | 286 |
287 void IntelligibilityEnhancer::ProcessNoiseBlock(const complex<float>* in_block, | 287 void IntelligibilityEnhancer::ProcessNoiseBlock(const complex<float>* in_block, |
288 complex<float>* /*out_block*/) { | 288 complex<float>* /*out_block*/) { |
289 noise_variance_.Step(in_block); | 289 noise_variance_.Step(in_block); |
290 } | 290 } |
291 | 291 |
292 int IntelligibilityEnhancer::GetBankSize(int sample_rate, int erb_resolution) { | 292 size_t IntelligibilityEnhancer::GetBankSize(int sample_rate, |
293 size_t erb_resolution) { | |
293 float freq_limit = sample_rate / 2000.0f; | 294 float freq_limit = sample_rate / 2000.0f; |
294 int erb_scale = ceilf( | 295 size_t erb_scale = static_cast<size_t>(ceilf( |
295 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f); | 296 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f)); |
296 return erb_scale * erb_resolution; | 297 return erb_scale * erb_resolution; |
297 } | 298 } |
298 | 299 |
299 void IntelligibilityEnhancer::CreateErbBank() { | 300 void IntelligibilityEnhancer::CreateErbBank() { |
300 int lf = 1, rf = 4; | 301 size_t lf = 1, rf = 4; |
Andrew MacDonald
2015/07/24 04:01:43
Sorry to make you do this, but could you replace l… [comment truncated]
ekm
2015/07/24 06:29:16
Sorry about this. This function is a direct port o… [comment truncated]
Peter Kasting
2015/07/24 06:44:22
Based on this, I won't touch these.
| |
301 | 302 |
302 for (int i = 0; i < bank_size_; ++i) { | 303 for (size_t i = 0; i < bank_size_; ++i) { |
303 float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_)); | 304 float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_)); |
304 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp)); | 305 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp)); |
Andrew MacDonald
2015/07/24 04:01:44
Gah, what are these magic numbers? Elliot, please… [comment truncated]
ekm
2015/07/24 06:29:16
This is the magic formula for inverse ERBS from ps… [comment truncated]
| |
305 center_freqs_[i] -= 14678.49f; | 306 center_freqs_[i] -= 14678.49f; |
306 } | 307 } |
307 float last_center_freq = center_freqs_[bank_size_ - 1]; | 308 float last_center_freq = center_freqs_[bank_size_ - 1]; |
308 for (int i = 0; i < bank_size_; ++i) { | 309 for (size_t i = 0; i < bank_size_; ++i) { |
309 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq; | 310 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq; |
310 } | 311 } |
311 | 312 |
312 for (int i = 0; i < bank_size_; ++i) { | 313 for (size_t i = 0; i < bank_size_; ++i) { |
313 filter_bank_[i].resize(freqs_); | 314 filter_bank_[i].resize(freqs_); |
314 } | 315 } |
315 | 316 |
316 for (int i = 1; i <= bank_size_; ++i) { | 317 for (size_t i = 1; i <= bank_size_; ++i) { |
317 int lll, ll, rr, rrr; | 318 size_t lll, ll, rr, rrr; |
Andrew MacDonald
2015/07/24 04:01:44
This is horrific. Elliot, could you look into usin… [comment truncated]
ekm
2015/07/24 06:29:16
Agreed; will do. They correspond from left-to-righ… [comment truncated]
| |
318 lll = round(center_freqs_[max(1, i - lf) - 1] * freqs_ / | 319 static const size_t kOne = 1; // Avoids repeated static_cast<>s below. |
319 (0.5f * sample_rate_hz_)); | 320 lll = static_cast<size_t>(round( |
320 ll = | 321 center_freqs_[max(one, i - lf) - 1] * freqs_ / |
Andrew MacDonald
2015/07/24 04:01:44
kOne, but why not just use a literal? "1u"
Peter Kasting
2015/07/24 06:44:22
1u causes compile failures :( (i tried that first… [comment truncated]
Peter Kasting
2015/07/27 23:09:54
Correctly copied over the kOne usage from the full… [comment truncated]
| |
321 round(center_freqs_[max(1, i) - 1] * freqs_ / (0.5f * sample_rate_hz_)); | 322 (0.5f * sample_rate_hz_))); |
322 lll = min(freqs_, max(lll, 1)) - 1; | 323 ll = static_cast<size_t>(round( |
323 ll = min(freqs_, max(ll, 1)) - 1; | 324 center_freqs_[max(one, i) - 1] * freqs_ / (0.5f * sample_rate_hz_))); |
325 lll = min(freqs_, max(lll, one)) - 1; | |
326 ll = min(freqs_, max(ll, one)) - 1; | |
324 | 327 |
325 rrr = round(center_freqs_[min(bank_size_, i + rf) - 1] * freqs_ / | 328 rrr = static_cast<size_t>(round( |
326 (0.5f * sample_rate_hz_)); | 329 center_freqs_[min(bank_size_, i + rf) - 1] * freqs_ / |
327 rr = round(center_freqs_[min(bank_size_, i + 1) - 1] * freqs_ / | 330 (0.5f * sample_rate_hz_))); |
328 (0.5f * sample_rate_hz_)); | 331 rr = static_cast<size_t>(round( |
329 rrr = min(freqs_, max(rrr, 1)) - 1; | 332 center_freqs_[min(bank_size_, i + 1) - 1] * freqs_ / |
330 rr = min(freqs_, max(rr, 1)) - 1; | 333 (0.5f * sample_rate_hz_))); |
334 rrr = min(freqs_, max(rrr, one)) - 1; | |
335 rr = min(freqs_, max(rr, one)) - 1; | |
331 | 336 |
332 float step, element; | 337 float step, element; |
333 | 338 |
334 step = 1.0f / (ll - lll); | 339 step = 1.0f / (ll - lll); |
335 element = 0.0f; | 340 element = 0.0f; |
336 for (int j = lll; j <= ll; ++j) { | 341 for (size_t j = lll; j <= ll; ++j) { |
337 filter_bank_[i - 1][j] = element; | 342 filter_bank_[i - 1][j] = element; |
338 element += step; | 343 element += step; |
339 } | 344 } |
340 step = 1.0f / (rrr - rr); | 345 step = 1.0f / (rrr - rr); |
341 element = 1.0f; | 346 element = 1.0f; |
342 for (int j = rr; j <= rrr; ++j) { | 347 for (size_t j = rr; j <= rrr; ++j) { |
343 filter_bank_[i - 1][j] = element; | 348 filter_bank_[i - 1][j] = element; |
344 element -= step; | 349 element -= step; |
345 } | 350 } |
346 for (int j = ll; j <= rr; ++j) { | 351 for (size_t j = ll; j <= rr; ++j) { |
347 filter_bank_[i - 1][j] = 1.0f; | 352 filter_bank_[i - 1][j] = 1.0f; |
348 } | 353 } |
349 } | 354 } |
350 | 355 |
351 float sum; | 356 float sum; |
352 for (int i = 0; i < freqs_; ++i) { | 357 for (size_t i = 0; i < freqs_; ++i) { |
353 sum = 0.0f; | 358 sum = 0.0f; |
354 for (int j = 0; j < bank_size_; ++j) { | 359 for (size_t j = 0; j < bank_size_; ++j) { |
355 sum += filter_bank_[j][i]; | 360 sum += filter_bank_[j][i]; |
356 } | 361 } |
357 for (int j = 0; j < bank_size_; ++j) { | 362 for (size_t j = 0; j < bank_size_; ++j) { |
358 filter_bank_[j][i] /= sum; | 363 filter_bank_[j][i] /= sum; |
359 } | 364 } |
360 } | 365 } |
361 } | 366 } |
362 | 367 |
363 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, | 368 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, |
364 int start_freq, | 369 size_t start_freq, |
365 float* sols) { | 370 float* sols) { |
366 bool quadratic = (kConfigRho < 1.0f); | 371 bool quadratic = (kConfigRho < 1.0f); |
367 const float* var_x0 = filtered_clear_var_.get(); | 372 const float* var_x0 = filtered_clear_var_.get(); |
368 const float* var_n0 = filtered_noise_var_.get(); | 373 const float* var_n0 = filtered_noise_var_.get(); |
369 | 374 |
370 for (int n = 0; n < start_freq; ++n) { | 375 for (size_t n = 0; n < start_freq; ++n) { |
371 sols[n] = 1.0f; | 376 sols[n] = 1.0f; |
372 } | 377 } |
373 | 378 |
374 // Analytic solution for optimal gains. See paper for derivation. | 379 // Analytic solution for optimal gains. See paper for derivation. |
375 for (int n = start_freq - 1; n < bank_size_; ++n) { | 380 for (size_t n = start_freq - 1; n < bank_size_; ++n) { |
376 float alpha0, beta0, gamma0; | 381 float alpha0, beta0, gamma0; |
377 gamma0 = 0.5f * rho_[n] * var_x0[n] * var_n0[n] + | 382 gamma0 = 0.5f * rho_[n] * var_x0[n] * var_n0[n] + |
378 lambda * var_x0[n] * var_n0[n] * var_n0[n]; | 383 lambda * var_x0[n] * var_n0[n] * var_n0[n]; |
379 beta0 = lambda * var_x0[n] * (2 - rho_[n]) * var_x0[n] * var_n0[n]; | 384 beta0 = lambda * var_x0[n] * (2 - rho_[n]) * var_x0[n] * var_n0[n]; |
380 if (quadratic) { | 385 if (quadratic) { |
381 alpha0 = lambda * var_x0[n] * (1 - rho_[n]) * var_x0[n] * var_x0[n]; | 386 alpha0 = lambda * var_x0[n] * (1 - rho_[n]) * var_x0[n] * var_x0[n]; |
382 sols[n] = | 387 sols[n] = |
383 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) / (2 * alpha0); | 388 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) / (2 * alpha0); |
384 } else { | 389 } else { |
385 sols[n] = -gamma0 / beta0; | 390 sols[n] = -gamma0 / beta0; |
386 } | 391 } |
387 sols[n] = fmax(0, sols[n]); | 392 sols[n] = fmax(0, sols[n]); |
388 } | 393 } |
389 } | 394 } |
390 | 395 |
391 void IntelligibilityEnhancer::FilterVariance(const float* var, float* result) { | 396 void IntelligibilityEnhancer::FilterVariance(const float* var, float* result) { |
392 DCHECK_GT(freqs_, 0); | 397 DCHECK_GT(freqs_, 0u); |
393 for (int i = 0; i < bank_size_; ++i) { | 398 for (size_t i = 0; i < bank_size_; ++i) { |
394 result[i] = DotProduct(&filter_bank_[i][0], var, freqs_); | 399 result[i] = DotProduct(&filter_bank_[i][0], var, freqs_); |
395 } | 400 } |
396 } | 401 } |
397 | 402 |
398 float IntelligibilityEnhancer::DotProduct(const float* a, | 403 float IntelligibilityEnhancer::DotProduct(const float* a, |
399 const float* b, | 404 const float* b, |
400 int length) { | 405 size_t length) { |
401 float ret = 0.0f; | 406 float ret = 0.0f; |
402 | 407 |
403 for (int i = 0; i < length; ++i) { | 408 for (size_t i = 0; i < length; ++i) { |
404 ret = fmaf(a[i], b[i], ret); | 409 ret = fmaf(a[i], b[i], ret); |
405 } | 410 } |
406 return ret; | 411 return ret; |
407 } | 412 } |
408 | 413 |
409 } // namespace webrtc | 414 } // namespace webrtc |
OLD | NEW |