Chromium Code Reviews

Unified Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master
Patch Set: Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Index: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
index 1e766875caedc519004077e4a2ebfc1f993c9262..0e7e76717dc4ac7c4b670c91302cf5cb8d31b802 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
@@ -19,7 +19,6 @@
#include <math.h>
#include <stdlib.h>
-
#include <algorithm>
#include <numeric>
@@ -39,6 +38,8 @@ const float kConfigRho = 0.02f; // Default production and interpretation SNR.
const float kKbdAlpha = 1.5f;
const float kLambdaBot = -1.0f; // Extreme values in bisection
const float kLambdaTop = -10e-18f; // search for lambda.
+const float kVoiceDetected = 1.f;
+const float kNoiseDetected = 0.f;
} // namespace
@@ -65,39 +66,38 @@ void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(
}
}
-IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution,
- int sample_rate_hz,
- int channels,
- int cv_type,
- float cv_alpha,
- int cv_win,
- int analysis_rate,
- int variance_rate,
- float gain_limit)
+IntelligibilityEnhancer::IntelligibilityEnhancer()
+ : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) {
+}
+
+IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)
: freqs_(RealFourier::ComplexLength(
- RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
+ RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))),
window_size_(1 << RealFourier::FftOrder(freqs_)),
- chunk_length_(sample_rate_hz * kChunkSizeMs / 1000),
- bank_size_(GetBankSize(sample_rate_hz, erb_resolution)),
- sample_rate_hz_(sample_rate_hz),
- erb_resolution_(erb_resolution),
- channels_(channels),
- analysis_rate_(analysis_rate),
- variance_rate_(variance_rate),
+ chunk_length_(config.sample_rate_hz * kChunkSizeMs / 1000),
+ bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)),
+ sample_rate_hz_(config.sample_rate_hz),
+ erb_resolution_(kErbResolution),
+ channels_(config.channels),
+ analysis_rate_(config.analysis_rate),
+ capture_vad_thresh_(config.capture_vad_thresh),
+ render_vad_thresh_(config.render_vad_thresh),
clear_variance_(freqs_,
- static_cast<VarianceType>(cv_type),
- cv_win,
- cv_alpha),
- noise_variance_(freqs_, VarianceType::kStepInfinite, 475, 0.01f),
+ config.var_type,
+ config.var_window_size,
+ config.var_decay_rate),
+ noise_variance_(freqs_,
+ config.var_type,
+ config.var_window_size,
+ config.var_decay_rate),
filtered_clear_var_(new float[bank_size_]),
filtered_noise_var_(new float[bank_size_]),
filter_bank_(bank_size_),
center_freqs_(new float[bank_size_]),
rho_(new float[bank_size_]),
gains_eq_(new float[bank_size_]),
- gain_applier_(freqs_, gain_limit),
+ gain_applier_(freqs_, config.gain_change_limit),
temp_out_buffer_(nullptr),
- input_audio_(new float* [channels]),
kbd_window_(new float[window_size_]),
render_callback_(this, AudioSource::kRenderStream),
capture_callback_(this, AudioSource::kCaptureStream),
@@ -106,7 +106,7 @@ IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution,
vad_high_(WebRtcVad_Create()),
vad_low_(WebRtcVad_Create()),
vad_tmp_buffer_(new int16_t[chunk_length_]) {
- DCHECK_LE(kConfigRho, 1.0f);
+ DCHECK_LE(config.rho, 1.0f);
CreateErbBank();
@@ -126,13 +126,13 @@ IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution,
// Assumes all rho equal.
for (int i = 0; i < bank_size_; ++i) {
- rho_[i] = kConfigRho * kConfigRho;
+ rho_[i] = config.rho * config.rho;
}
float freqs_khz = kClipFreq / 1000.0f;
int erb_index = static_cast<int>(ceilf(
11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));
- start_freq_ = max(1, erb_index * kErbResolution);
+ start_freq_ = max(1, erb_index * erb_resolution_);
WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,
kbd_window_.get());
@@ -151,6 +151,11 @@ IntelligibilityEnhancer::~IntelligibilityEnhancer() {
}
void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {
+ ProcessRenderAudio(audio, kVoiceDetected);
+}
+
+void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
turaj 2015/07/14 18:28:51 I did not comprehend the logic here. |voice_proba
ekm 2015/07/17 19:59:38 Done. You're right, the logic was off and not full
+ float voice_probability) {
for (int i = 0; i < chunk_length_; ++i) {
vad_tmp_buffer_[i] = (int16_t)audio[0][i];
turaj 2015/07/14 18:28:51 You better check with APM guys, but I suppose you
aluebs-webrtc 2015/07/15 01:02:04 No, you get audio in the int16_t range. But for th
ekm 2015/07/17 19:59:38 Done. audio_util is great!
}
@@ -158,7 +163,9 @@ void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {
vad_tmp_buffer_.get(), chunk_length_) == 1;
// Process and enhance chunk of |audio|
- render_mangler_->ProcessChunk(audio, temp_out_buffer_);
+ if (voice_probability >= render_vad_thresh_) {
+ render_mangler_->ProcessChunk(audio, temp_out_buffer_);
+ }
for (int i = 0; i < channels_; ++i) {
memcpy(audio[i], temp_out_buffer_[i],
@@ -167,6 +174,11 @@ void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {
}
void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) {
+ ProcessCaptureAudio(audio, kNoiseDetected);
+}
+
+void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio,
+ float voice_probability) {
for (int i = 0; i < chunk_length_; ++i) {
vad_tmp_buffer_[i] = (int16_t)audio[0][i];
}
@@ -182,7 +194,9 @@ void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) {
printf("capture NO speech\n");
#endif
- capture_mangler_->ProcessChunk(audio, temp_out_buffer_);
+ if (voice_probability <= capture_vad_thresh_) {
+ capture_mangler_->ProcessChunk(audio, temp_out_buffer_);
+ }
}
void IntelligibilityEnhancer::DispatchAudio(
@@ -218,11 +232,6 @@ void IntelligibilityEnhancer::ProcessClearBlock(const complex<float>* in_block,
if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {
AnalyzeClearBlock(power_target);
++analysis_step_;
- if (analysis_step_ == variance_rate_) {
- analysis_step_ = 0;
- clear_variance_.Clear();
- noise_variance_.Clear();
- }
}
++block_count_;
}

Powered by Google App Engine