Chromium Code Reviews| Index: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc |
| diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc |
| index 1e766875caedc519004077e4a2ebfc1f993c9262..da2d86d9745886c93b553e2bc850b8ded0b0aa29 100644 |
| --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc |
| +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc |
| @@ -19,13 +19,12 @@ |
| #include <math.h> |
| #include <stdlib.h> |
| - |
| #include <algorithm> |
| #include <numeric> |
| #include "webrtc/base/checks.h" |
| -#include "webrtc/common_audio/vad/include/webrtc_vad.h" |
| #include "webrtc/common_audio/window_generator.h" |
| +#include "webrtc/common_audio/include/audio_util.h" |
| namespace webrtc { |
| @@ -39,6 +38,7 @@ const float kConfigRho = 0.02f; // Default production and interpretation SNR. |
| const float kKbdAlpha = 1.5f; |
| const float kLambdaBot = -1.0f; // Extreme values in bisection |
| const float kLambdaTop = -10e-18f; // search for lambda. |
| +const float kMinNoise = 10e-18f; |
| } // namespace |
| @@ -65,124 +65,133 @@ void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock( |
| } |
| } |
| -IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution, |
| - int sample_rate_hz, |
| - int channels, |
| - int cv_type, |
| - float cv_alpha, |
| - int cv_win, |
| - int analysis_rate, |
| - int variance_rate, |
| - float gain_limit) |
| +IntelligibilityEnhancer::IntelligibilityEnhancer() |
| + : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) { |
| +} |
| + |
| +IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config) |
| : freqs_(RealFourier::ComplexLength( |
| - RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), |
| + RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))), |
| window_size_(1 << RealFourier::FftOrder(freqs_)), |
| - chunk_length_(sample_rate_hz * kChunkSizeMs / 1000), |
| - bank_size_(GetBankSize(sample_rate_hz, erb_resolution)), |
| - sample_rate_hz_(sample_rate_hz), |
| - erb_resolution_(erb_resolution), |
| - channels_(channels), |
| - analysis_rate_(analysis_rate), |
| - variance_rate_(variance_rate), |
| + chunk_length_(config.sample_rate_hz * kChunkSizeMs / 1000), |
| + bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)), |
| + sample_rate_hz_(config.sample_rate_hz), |
| + erb_resolution_(kErbResolution), |
| + num_capture_channels_(config.num_capture_channels), |
| + num_render_channels_(config.num_render_channels), |
| + analysis_rate_(config.analysis_rate), |
| + capture_vad_thresh_(config.capture_vad_thresh), |
| + render_vad_thresh_(config.render_vad_thresh), |
| + activate_snr_thresh_(config.activate_snr_thresh), |
| + deactivate_snr_thresh_(config.deactivate_snr_thresh), |
| + active_(false), |
| + deactivating_(false), |
| clear_variance_(freqs_, |
| - static_cast<VarianceType>(cv_type), |
| - cv_win, |
| - cv_alpha), |
| - noise_variance_(freqs_, VarianceType::kStepInfinite, 475, 0.01f), |
| + config.var_type, |
| + config.var_window_size, |
| + config.var_decay_rate), |
| + noise_variance_(freqs_, |
| + config.var_type, |
| + config.var_window_size, |
| + config.var_decay_rate), |
| filtered_clear_var_(new float[bank_size_]), |
| filtered_noise_var_(new float[bank_size_]), |
| filter_bank_(bank_size_), |
| center_freqs_(new float[bank_size_]), |
| rho_(new float[bank_size_]), |
| gains_eq_(new float[bank_size_]), |
| - gain_applier_(freqs_, gain_limit), |
| - temp_out_buffer_(nullptr), |
| - input_audio_(new float* [channels]), |
| + gain_applier_(freqs_, config.gain_change_limit), |
| + temp_render_out_buffer_(chunk_length_, num_render_channels_), |
| + temp_capture_out_buffer_(chunk_length_, num_capture_channels_), |
| kbd_window_(new float[window_size_]), |
| render_callback_(this, AudioSource::kRenderStream), |
| capture_callback_(this, AudioSource::kCaptureStream), |
| block_count_(0), |
| analysis_step_(0), |
| - vad_high_(WebRtcVad_Create()), |
| - vad_low_(WebRtcVad_Create()), |
| + using_capture_vad_(true), |
| + using_render_vad_(true), |
| vad_tmp_buffer_(new int16_t[chunk_length_]) { |
| - DCHECK_LE(kConfigRho, 1.0f); |
| + DCHECK_LE(config.rho, 1.0f); |
| CreateErbBank(); |
| - WebRtcVad_Init(vad_high_); |
| - WebRtcVad_set_mode(vad_high_, 0); // High likelihood of speech. |
| - WebRtcVad_Init(vad_low_); |
| - WebRtcVad_set_mode(vad_low_, 3); // Low likelihood of speech. |
| - |
| - temp_out_buffer_ = static_cast<float**>( |
| - malloc(sizeof(*temp_out_buffer_) * channels_ + |
| - sizeof(**temp_out_buffer_) * chunk_length_ * channels_)); |
| - for (int i = 0; i < channels_; ++i) { |
| - temp_out_buffer_[i] = |
| - reinterpret_cast<float*>(temp_out_buffer_ + channels_) + |
| - chunk_length_ * i; |
| - } |
| - |
| // Assumes all rho equal. |
| for (int i = 0; i < bank_size_; ++i) { |
| - rho_[i] = kConfigRho * kConfigRho; |
| + rho_[i] = config.rho * config.rho; |
| } |
| float freqs_khz = kClipFreq / 1000.0f; |
| int erb_index = static_cast<int>(ceilf( |
| 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f)); |
| - start_freq_ = max(1, erb_index * kErbResolution); |
| + start_freq_ = max(1, erb_index * erb_resolution_); |
| WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_, |
| kbd_window_.get()); |
| render_mangler_.reset(new LappedTransform( |
| - channels_, channels_, chunk_length_, kbd_window_.get(), window_size_, |
| - window_size_ / 2, &render_callback_)); |
| + num_render_channels_, num_render_channels_, chunk_length_, |
| + kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_)); |
| capture_mangler_.reset(new LappedTransform( |
| - channels_, channels_, chunk_length_, kbd_window_.get(), window_size_, |
| - window_size_ / 2, &capture_callback_)); |
| + num_capture_channels_, num_capture_channels_, chunk_length_, |
| + kbd_window_.get(), window_size_, window_size_ / 2, &capture_callback_)); |
| } |
| -IntelligibilityEnhancer::~IntelligibilityEnhancer() { |
| - WebRtcVad_Free(vad_low_); |
| - WebRtcVad_Free(vad_high_); |
| - free(temp_out_buffer_); |
| +void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, |
| + int sample_rate_hz, |
| + int num_channels, |
| + float voice_probability) { |
| + render_voice_probability_ = voice_probability; |
| + using_render_vad_ = false; |
| + ProcessRenderAudio(audio, sample_rate_hz, num_channels); |
| } |
| -void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) { |
| - for (int i = 0; i < chunk_length_; ++i) { |
| - vad_tmp_buffer_[i] = (int16_t)audio[0][i]; |
| +void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, |
| + int sample_rate_hz, |
| + int num_channels) { |
| + CHECK_EQ(sample_rate_hz_, sample_rate_hz); |
| + CHECK_EQ(num_render_channels_, num_channels); |
| + |
| + if (using_render_vad_) { |
| + FloatToS16(audio[0], chunk_length_, vad_tmp_buffer_.get()); |
| + render_vad_.ProcessChunk(vad_tmp_buffer_.get(), chunk_length_, |
| + sample_rate_hz_); |
| + render_voice_probability_ = render_vad_.last_voice_probability(); |
| } |
| - has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_, |
| - vad_tmp_buffer_.get(), chunk_length_) == 1; |
| - // Process and enhance chunk of |audio| |
| - render_mangler_->ProcessChunk(audio, temp_out_buffer_); |
| + if (render_voice_probability_ >= render_vad_thresh_ || active_) { |
| + render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels()); |
| + } |
| - for (int i = 0; i < channels_; ++i) { |
| - memcpy(audio[i], temp_out_buffer_[i], |
| - chunk_length_ * sizeof(**temp_out_buffer_)); |
| + for (int i = 0; i < num_render_channels_; ++i) { |
| + memcpy(audio[i], temp_render_out_buffer_.channels()[i], |
| + chunk_length_ * sizeof(**audio)); |
| } |
| } |
| -void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) { |
| - for (int i = 0; i < chunk_length_; ++i) { |
| - vad_tmp_buffer_[i] = (int16_t)audio[0][i]; |
| +void IntelligibilityEnhancer::AnalyzeCaptureAudio(float* const* audio, |
| + int sample_rate_hz, |
| + int num_channels, |
| + float voice_probability) { |
| + capture_voice_probability_ = voice_probability; |
| + using_capture_vad_ = false; |
| + AnalyzeCaptureAudio(audio, sample_rate_hz, num_channels); |
| +} |
| + |
| +void IntelligibilityEnhancer::AnalyzeCaptureAudio(float* const* audio, |
| + int sample_rate_hz, |
| + int num_channels) { |
| + CHECK_EQ(sample_rate_hz_, sample_rate_hz); |
| + CHECK_EQ(num_capture_channels_, num_channels); |
| + |
| + if (using_capture_vad_) { |
| + FloatToS16(audio[0], chunk_length_, vad_tmp_buffer_.get()); |
| + capture_vad_.ProcessChunk(vad_tmp_buffer_.get(), chunk_length_, |
| + sample_rate_hz_); |
| + capture_voice_probability_ = capture_vad_.last_voice_probability(); |
| } |
| - // TODO(bercic): The VAD was always detecting voice in the noise stream, |
| - // no matter what the aggressiveness, so it was temporarily disabled here. |
| - |
| - #if 0 |
| - if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(), |
| - chunk_length_) == 1) { |
| - printf("capture HAS speech\n"); |
| - return; |
| - } |
| - printf("capture NO speech\n"); |
| - #endif |
| - capture_mangler_->ProcessChunk(audio, temp_out_buffer_); |
| + if (capture_voice_probability_ <= capture_vad_thresh_) { |
| + capture_mangler_->ProcessChunk(audio, temp_capture_out_buffer_.channels()); |
| + } |
| } |
| void IntelligibilityEnhancer::DispatchAudio( |
| @@ -207,28 +216,23 @@ void IntelligibilityEnhancer::ProcessClearBlock(const complex<float>* in_block, |
| return; |
| } |
| - // For now, always assumes enhancement is necessary. |
| - // TODO(ekmeyerson): Change to only enhance if necessary, |
| - // based on experiments with different cutoffs. |
| - if (has_voice_low_ || true) { |
| + if (render_voice_probability_ >= render_vad_thresh_) { |
| clear_variance_.Step(in_block, false); |
| - const float power_target = std::accumulate( |
| - clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.0f); |
| - |
| - if (block_count_ % analysis_rate_ == analysis_rate_ - 1) { |
| + if (active_ && !deactivating_ && |
| + block_count_ % analysis_rate_ == analysis_rate_ - 1) { |
| + const float power_target = std::accumulate( |
| + clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.f); |
| AnalyzeClearBlock(power_target); |
| ++analysis_step_; |
| - if (analysis_step_ == variance_rate_) { |
| - analysis_step_ = 0; |
| - clear_variance_.Clear(); |
| - noise_variance_.Clear(); |
| - } |
| } |
| ++block_count_; |
| } |
| - /* efidata(n,:) = sqrt(b(n)) * fidata(n,:) */ |
| - gain_applier_.Apply(in_block, out_block); |
| + UpdateActivity(); |
|
turaj
2015/07/27 20:01:05
If we are deactivated, then gains are all one, rig
|
| + if (active_) { |
| + // efidata(n,:) = sqrt(b(n)) * fidata(n,:) |
|
Andrew MacDonald
2015/07/24 23:50:40
Remove if you don't want this.
ekm
2015/07/29 00:37:19
Done.
|
| + gain_applier_.Apply(in_block, out_block); |
| + } |
| } |
| void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) { |
| @@ -395,6 +399,29 @@ void IntelligibilityEnhancer::FilterVariance(const float* var, float* result) { |
| } |
| } |
| +float IntelligibilityEnhancer::SNR() { |
| + float total_clear_var = std::accumulate( |
|
turaj
2015/07/27 20:01:05
These two variables can be const.
|
| + clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.f); |
| + float total_noise_var = |
| + std::accumulate(noise_variance_.variance(), |
| + noise_variance_.variance() + freqs_, kMinNoise); |
| + return total_clear_var / total_noise_var; |
| +} |
| + |
| +void IntelligibilityEnhancer::UpdateActivity() { |
| + const float snr = SNR(); |
| + if (snr <= activate_snr_thresh_) { |
| + active_ = true; |
| + deactivating_ = false; |
| + } else if (active_ && !deactivating_ && snr >= deactivate_snr_thresh_) { |
| + gain_applier_.Clear(); |
| + deactivating_ = true; |
| + } else if (deactivating_ && gain_applier_.IsIdentity()) { |
| + active_ = false; |
| + deactivating_ = false; |
| + } |
| +} |
| + |
| float IntelligibilityEnhancer::DotProduct(const float* a, |
| const float* b, |
| int length) { |