Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(406)

Unified Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master
Patch Set: Fix Mac Error (3) Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
index dbb7e638b2d063c52e876ec53df090893e2d4d4c..8eccde452c359cd49c0a29096a1cfc9628b261a7 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
@@ -19,18 +19,18 @@
#include <math.h>
#include <stdlib.h>
-
#include <algorithm>
#include <numeric>
#include "webrtc/base/checks.h"
-#include "webrtc/common_audio/vad/include/webrtc_vad.h"
+#include "webrtc/common_audio/include/audio_util.h"
#include "webrtc/common_audio/window_generator.h"
namespace webrtc {
namespace {
+const int kErbResolution = 2;
const int kWindowSizeMs = 2;
const int kChunkSizeMs = 10; // Size provided by APM.
const float kClipFreq = 200.0f;
@@ -64,124 +64,93 @@ void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(
}
}
-IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution,
- int sample_rate_hz,
- int channels,
- int cv_type,
- float cv_alpha,
- int cv_win,
- int analysis_rate,
- int variance_rate,
- float gain_limit)
+IntelligibilityEnhancer::IntelligibilityEnhancer()
+ : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) {
+}
+
+IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)
: freqs_(RealFourier::ComplexLength(
- RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
+ RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))),
window_size_(1 << RealFourier::FftOrder(freqs_)),
- chunk_length_(sample_rate_hz * kChunkSizeMs / 1000),
- bank_size_(GetBankSize(sample_rate_hz, erb_resolution)),
- sample_rate_hz_(sample_rate_hz),
- erb_resolution_(erb_resolution),
- channels_(channels),
- analysis_rate_(analysis_rate),
- variance_rate_(variance_rate),
+ chunk_length_(config.sample_rate_hz * kChunkSizeMs / 1000),
+ bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)),
+ sample_rate_hz_(config.sample_rate_hz),
+ erb_resolution_(kErbResolution),
+ num_capture_channels_(config.num_capture_channels),
+ num_render_channels_(config.num_render_channels),
+ analysis_rate_(config.analysis_rate),
+ active_(true),
clear_variance_(freqs_,
- static_cast<VarianceType>(cv_type),
- cv_win,
- cv_alpha),
- noise_variance_(freqs_, VarianceType::kStepInfinite, 475, 0.01f),
+ config.var_type,
+ config.var_window_size,
+ config.var_decay_rate),
+ noise_variance_(freqs_,
+ config.var_type,
+ config.var_window_size,
+ config.var_decay_rate),
filtered_clear_var_(new float[bank_size_]),
filtered_noise_var_(new float[bank_size_]),
filter_bank_(bank_size_),
center_freqs_(new float[bank_size_]),
rho_(new float[bank_size_]),
gains_eq_(new float[bank_size_]),
- gain_applier_(freqs_, gain_limit),
- temp_out_buffer_(nullptr),
- input_audio_(new float* [channels]),
+ gain_applier_(freqs_, config.gain_change_limit),
+ temp_render_out_buffer_(chunk_length_, num_render_channels_),
+ temp_capture_out_buffer_(chunk_length_, num_capture_channels_),
kbd_window_(new float[window_size_]),
render_callback_(this, AudioSource::kRenderStream),
capture_callback_(this, AudioSource::kCaptureStream),
block_count_(0),
- analysis_step_(0),
- vad_high_(WebRtcVad_Create()),
- vad_low_(WebRtcVad_Create()),
- vad_tmp_buffer_(new int16_t[chunk_length_]) {
- DCHECK_LE(kConfigRho, 1.0f);
+ analysis_step_(0) {
+ DCHECK_LE(config.rho, 1.0f);
CreateErbBank();
- WebRtcVad_Init(vad_high_);
- WebRtcVad_set_mode(vad_high_, 0); // High likelihood of speech.
- WebRtcVad_Init(vad_low_);
- WebRtcVad_set_mode(vad_low_, 3); // Low likelihood of speech.
-
- temp_out_buffer_ = static_cast<float**>(
- malloc(sizeof(*temp_out_buffer_) * channels_ +
- sizeof(**temp_out_buffer_) * chunk_length_ * channels_));
- for (int i = 0; i < channels_; ++i) {
- temp_out_buffer_[i] =
- reinterpret_cast<float*>(temp_out_buffer_ + channels_) +
- chunk_length_ * i;
- }
-
// Assumes all rho equal.
for (int i = 0; i < bank_size_; ++i) {
- rho_[i] = kConfigRho * kConfigRho;
+ rho_[i] = config.rho * config.rho;
}
float freqs_khz = kClipFreq / 1000.0f;
int erb_index = static_cast<int>(ceilf(
11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));
- start_freq_ = std::max(1, erb_index * erb_resolution);
+ start_freq_ = max(1, erb_index * erb_resolution_);
WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,
kbd_window_.get());
render_mangler_.reset(new LappedTransform(
- channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,
- window_size_ / 2, &render_callback_));
+ num_render_channels_, num_render_channels_, chunk_length_,
+ kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_));
capture_mangler_.reset(new LappedTransform(
- channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,
- window_size_ / 2, &capture_callback_));
+ num_capture_channels_, num_capture_channels_, chunk_length_,
+ kbd_window_.get(), window_size_, window_size_ / 2, &capture_callback_));
}
-IntelligibilityEnhancer::~IntelligibilityEnhancer() {
- WebRtcVad_Free(vad_low_);
- WebRtcVad_Free(vad_high_);
- free(temp_out_buffer_);
-}
+void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
+ int sample_rate_hz,
+ int num_channels) {
+ CHECK_EQ(sample_rate_hz_, sample_rate_hz);
+ CHECK_EQ(num_render_channels_, num_channels);
-void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {
- for (int i = 0; i < chunk_length_; ++i) {
- vad_tmp_buffer_[i] = (int16_t)audio[0][i];
+ if (active_) {
+ render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());
}
- has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_,
- vad_tmp_buffer_.get(), chunk_length_) == 1;
- // Process and enhance chunk of |audio|
- render_mangler_->ProcessChunk(audio, temp_out_buffer_);
-
- for (int i = 0; i < channels_; ++i) {
- memcpy(audio[i], temp_out_buffer_[i],
- chunk_length_ * sizeof(**temp_out_buffer_));
+ if (active_) {
+ for (int i = 0; i < num_render_channels_; ++i) {
+ memcpy(audio[i], temp_render_out_buffer_.channels()[i],
+ chunk_length_ * sizeof(**audio));
+ }
}
}
-void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) {
- for (int i = 0; i < chunk_length_; ++i) {
- vad_tmp_buffer_[i] = (int16_t)audio[0][i];
- }
- // TODO(bercic): The VAD was always detecting voice in the noise stream,
- // no matter what the aggressiveness, so it was temporarily disabled here.
-
- #if 0
- if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(),
- chunk_length_) == 1) {
- printf("capture HAS speech\n");
- return;
- }
- printf("capture NO speech\n");
- #endif
+void IntelligibilityEnhancer::AnalyzeCaptureAudio(float* const* audio,
+ int sample_rate_hz,
+ int num_channels) {
+ CHECK_EQ(sample_rate_hz_, sample_rate_hz);
+ CHECK_EQ(num_capture_channels_, num_channels);
- capture_mangler_->ProcessChunk(audio, temp_out_buffer_);
+ capture_mangler_->ProcessChunk(audio, temp_capture_out_buffer_.channels());
}
void IntelligibilityEnhancer::DispatchAudio(
@@ -206,28 +175,21 @@ void IntelligibilityEnhancer::ProcessClearBlock(const complex<float>* in_block,
return;
}
- // For now, always assumes enhancement is necessary.
- // TODO(ekmeyerson): Change to only enhance if necessary,
- // based on experiments with different cutoffs.
- if (has_voice_low_ || true) {
+ // TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary.
+ if (true) {
clear_variance_.Step(in_block, false);
- const float power_target = std::accumulate(
- clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.0f);
-
if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {
+ const float power_target = std::accumulate(
+ clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.f);
AnalyzeClearBlock(power_target);
++analysis_step_;
- if (analysis_step_ == variance_rate_) {
- analysis_step_ = 0;
- clear_variance_.Clear();
- noise_variance_.Clear();
- }
}
++block_count_;
}
- /* efidata(n,:) = sqrt(b(n)) * fidata(n,:) */
- gain_applier_.Apply(in_block, out_block);
+ if (active_) {
+ gain_applier_.Apply(in_block, out_block);
+ }
}
void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) {
@@ -406,4 +368,8 @@ float IntelligibilityEnhancer::DotProduct(const float* a,
return ret;
}
+bool IntelligibilityEnhancer::active() const {
+ return active_;
+}
+
} // namespace webrtc

Powered by Google App Engine
This is Rietveld 408576698