webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1234463003: Integrate Intelligibility with APM

Unified Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Updated interface, how VAD is used, other issues. Created 5 years, 5 months ago.

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« webrtc/modules/audio_processing/include/mock_audio_processing.h ('K') | « webrtc/modules/audio_processing/include/mock_audio_processing.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

index df47de597885ed61d9dd9a824d2c6505c1be99a4..ae369d2fa3a7e3d74cd2da15d9ac49b357b99c9e 100644

--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

@@ -20,11 +20,9 @@

#include "webrtc/base/scoped_ptr.h"

#include "webrtc/common_audio/lapped_transform.h"

+#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"

#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

-struct WebRtcVadInst;

-typedef struct WebRtcVadInst VadInst;

namespace webrtc {

// Speech intelligibility enhancement module. Reads render and capture

@@ -33,32 +31,65 @@ namespace webrtc {

// Note: assumes speech and noise streams are already separated.

class IntelligibilityEnhancer {

public:

- // Construct a new instance with the given filter bank resolution,

- // sampling rate, number of channels and analysis rates.

- // |analysis_rate| sets the number of input blocks (containing speech!)

- // to elapse before a new gain computation is made. |variance_rate| specifies

- // the number of gain recomputations after which the variances are reset.

- // |cv_*| are parameters for the VarianceArray constructor for the

- // clear speech stream.

- // TODO(bercic): the |cv_*|, |*_rate| and |gain_limit| parameters should

- // probably go away once fine tuning is done. They override the internal

- // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).

- IntelligibilityEnhancer(int erb_resolution,

- int sample_rate_hz,

- int channels,

- int cv_type,

- float cv_alpha,

- int cv_win,

- int analysis_rate,

- int variance_rate,

- float gain_limit);

+ struct Config {  // Parameter bundle accepted by the IntelligibilityEnhancer constructor.

+ // |var_*| are parameters for the VarianceArray constructor for the

+ // clear speech stream.

+ // TODO(bercic): the |var_*|, |*_rate| and |gain_change_limit| parameters

+ // should probably go away once fine tuning is done. They override internal

+ // class constants (kGainChangeLimit, kAnalyzeRate, kVarianceRate).

+ Config()  // Sets every field to its default value.

+ : sample_rate_hz(16000),

+ channels(1),

+ var_type(intelligibility::VarianceArray::kStepDecaying),

+ var_decay_rate(0.9f),

+ var_window_size(10),

+ analysis_rate(800),

+ gain_change_limit(0.1f),

+ rho(0.02f),

+ capture_vad_thresh(1.f),

+ render_vad_thresh(0.f),

+ activate_snr_thresh(0.f),

+ deactivate_snr_thresh(100000.f) {}

+ int sample_rate_hz;  // Defaults to 16000.

+ int channels;  // Num channels; defaults to 1.

+ intelligibility::VarianceArray::StepType var_type;  // VarianceArray step type (clear speech stream).

+ float var_decay_rate;  // VarianceArray constructor parameter (clear speech stream).

+ int var_window_size;  // VarianceArray constructor parameter (clear speech stream).

+ int analysis_rate;  // Presumably num blocks before gains are recalculated (cf. |analysis_rate_|); confirm.

+ float gain_change_limit;  // Overrides kGainChangeLimit (see TODO above).

+ float rho;  // NOTE(review): meaning is not documented in this header; defaults to 0.02f.

+ float capture_vad_thresh;  // Presumably threshold for updating noise estimate (cf. |capture_vad_thresh_|).

+ float render_vad_thresh;  // Presumably threshold for updating speech estimate (cf. |render_vad_thresh_|).

+ float activate_snr_thresh;  // Presumably threshold for activating gain updates (cf. |activate_snr_thresh_|).

+ float deactivate_snr_thresh;  // Presumably threshold for deactivating (cf. |deactivate_snr_thresh_|).

+ };

+ explicit IntelligibilityEnhancer(const Config& config);

+ IntelligibilityEnhancer(); // Initialize with default config.

~IntelligibilityEnhancer();

- // Reads and processes chunk of noise stream in time domain.

- void ProcessCaptureAudio(float* const* audio);

+ // Reads and processes a chunk of the noise stream in the time domain. Only

+ // updates the noise estimate when |voice_probability| is below a threshold.

+ // Uses the internal VAD when |voice_probability| is not provided.

+ void AnalyzeCaptureAudio(float* const* audio,

+ int sample_rate_hz,

+ int num_channels,

+ float voice_probability);

+ void AnalyzeCaptureAudio(float* const* audio,

+ int sample_rate_hz,

+ int num_channels);

// Reads chunk of speech in time domain and updates with modified signal.

- void ProcessRenderAudio(float* const* audio);

+ // Only updates speech estimate when |voice_probability| is above a threshold.

+ // Uses internal VAD when |voice_probability| is not provided.

+ void ProcessRenderAudio(float* const* audio,

+ int sample_rate_hz,

+ int num_channels,

+ float voice_probability);

+ void ProcessRenderAudio(float* const* audio,

+ int sample_rate_hz,

+ int num_channels);

private:

enum AudioSource {

@@ -124,6 +155,12 @@ class IntelligibilityEnhancer {

// Stores in |result|.

void FilterVariance(const float* var, float* result);

+ // Returns ratio of total clear-speech variance to total noise variance.

+ float SNR();

+ // Updates |active_| based on SNR.

+ void UpdateActivity();

// Returns dot product of vectors specified by size |length| arrays |a|,|b|.

static float DotProduct(const float* a, const float* b, int length);

@@ -135,7 +172,13 @@ class IntelligibilityEnhancer {

const int erb_resolution_;

const int channels_; // Num channels.

const int analysis_rate_; // Num blocks before gains recalculated.

- const int variance_rate_; // Num recalculations before history is cleared.

+ const float capture_vad_thresh_; // Threshold for updating noise estimate.

+ const float render_vad_thresh_; // Threshold for updating speech estimate.

+ const float activate_snr_thresh_; // Threshold for activating gain updates.

+ const float deactivate_snr_thresh_; // Threshold for deactivating.

+ bool active_; // Whether render gains are being updated.

+ bool deactivating_; // True when we are smoothing enhancer off.

intelligibility::VarianceArray clear_variance_;

intelligibility::VarianceArray noise_variance_;

@@ -154,7 +197,6 @@ class IntelligibilityEnhancer {

// TODO(ekmeyerson): Switch to using ChannelBuffer.

float** temp_out_buffer_;

- rtc::scoped_ptr<float* []> input_audio_;

rtc::scoped_ptr<float[]> kbd_window_;

TransformCallback render_callback_;

TransformCallback capture_callback_;

@@ -163,13 +205,13 @@ class IntelligibilityEnhancer {

int block_count_;

int analysis_step_;

- // TODO(bercic): Quick stopgap measure for voice detection in the clear

- // and noise streams.

- // Note: VAD currently does not affect anything in IntelligibilityEnhancer.

- VadInst* vad_high_;

- VadInst* vad_low_;

+ VoiceActivityDetector capture_vad_;

+ VoiceActivityDetector render_vad_;

+ float capture_voice_probability_;

+ float render_voice_probability_;

+ bool using_capture_vad_;

+ bool using_render_vad_;

rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;

- bool has_voice_low_; // Whether voice detected in speech stream.

};

} // namespace webrtc