| Index: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
|
| diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
|
| index df47de597885ed61d9dd9a824d2c6505c1be99a4..ae369d2fa3a7e3d74cd2da15d9ac49b357b99c9e 100644
|
| --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
|
| +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
|
| @@ -20,11 +20,9 @@
|
|
|
| #include "webrtc/base/scoped_ptr.h"
|
| #include "webrtc/common_audio/lapped_transform.h"
|
| +#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
|
| #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"
|
|
|
| -struct WebRtcVadInst;
|
| -typedef struct WebRtcVadInst VadInst;
|
| -
|
| namespace webrtc {
|
|
|
| // Speech intelligibility enhancement module. Reads render and capture
|
| @@ -33,32 +31,65 @@ namespace webrtc {
|
| // Note: assumes speech and noise streams are already separated.
|
| class IntelligibilityEnhancer {
|
| public:
|
| - // Construct a new instance with the given filter bank resolution,
|
| - // sampling rate, number of channels and analysis rates.
|
| - // |analysis_rate| sets the number of input blocks (containing speech!)
|
| - // to elapse before a new gain computation is made. |variance_rate| specifies
|
| - // the number of gain recomputations after which the variances are reset.
|
| - // |cv_*| are parameters for the VarianceArray constructor for the
|
| - // clear speech stream.
|
| - // TODO(bercic): the |cv_*|, |*_rate| and |gain_limit| parameters should
|
| - // probably go away once fine tuning is done. They override the internal
|
| - // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).
|
| - IntelligibilityEnhancer(int erb_resolution,
|
| - int sample_rate_hz,
|
| - int channels,
|
| - int cv_type,
|
| - float cv_alpha,
|
| - int cv_win,
|
| - int analysis_rate,
|
| - int variance_rate,
|
| - float gain_limit);
|
| + struct Config {
|
| + // |var_*| are parameters for the VarianceArray constructor for the
|
| + // clear speech stream.
|
| + // TODO(bercic): the |var_*|, |*_rate| and |gain_change_limit| parameters
|
| + // should probably go away once fine tuning is done. They override internal
|
| + // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).
|
| + Config()
|
| + : sample_rate_hz(16000),
|
| + channels(1),
|
| + var_type(intelligibility::VarianceArray::kStepDecaying),
|
| + var_decay_rate(0.9f),
|
| + var_window_size(10),
|
| + analysis_rate(800),
|
| + gain_change_limit(0.1f),
|
| + rho(0.02f),
|
| + capture_vad_thresh(1.f),
|
| + render_vad_thresh(0.f),
|
| + activate_snr_thresh(0.f),
|
| + deactivate_snr_thresh(100000.f) {}
|
| + int sample_rate_hz;
|
| + int channels;
|
| + intelligibility::VarianceArray::StepType var_type;
|
| + float var_decay_rate;
|
| + int var_window_size;
|
| + int analysis_rate;
|
| + float gain_change_limit;
|
| + float rho;
|
| + float capture_vad_thresh;
|
| + float render_vad_thresh;
|
| + float activate_snr_thresh;
|
| + float deactivate_snr_thresh;
|
| + };
|
| +
|
| + explicit IntelligibilityEnhancer(const Config& config);
|
| + IntelligibilityEnhancer(); // Initialize with default config.
|
| +
|
| ~IntelligibilityEnhancer();
|
|
|
| - // Reads and processes chunk of noise stream in time domain.
|
| - void ProcessCaptureAudio(float* const* audio);
|
| + // Reads and processes chunk of noise stream in time domain. Only updates
|
| + // noise estimate when |voice_probability| is below a threshold. Uses internal
|
| + // VAD when |voice_probability| is not provided.
|
| + void AnalyzeCaptureAudio(float* const* audio,
|
| + int sample_rate_hz,
|
| + int num_channels,
|
| + float voice_probability);
|
| + void AnalyzeCaptureAudio(float* const* audio,
|
| + int sample_rate_hz,
|
| + int num_channels);
|
|
|
| // Reads chunk of speech in time domain and updates with modified signal.
|
| - void ProcessRenderAudio(float* const* audio);
|
| + // Only updates speech estimate when |voice_probability| is above a threshold.
|
| + // Uses internal VAD when |voice_probability| is not provided.
|
| + void ProcessRenderAudio(float* const* audio,
|
| + int sample_rate_hz,
|
| + int num_channels,
|
| + float voice_probability);
|
| + void ProcessRenderAudio(float* const* audio,
|
| + int sample_rate_hz,
|
| + int num_channels);
|
|
|
| private:
|
| enum AudioSource {
|
| @@ -124,6 +155,12 @@ class IntelligibilityEnhancer {
|
| // Stores in |result|.
|
| void FilterVariance(const float* var, float* result);
|
|
|
| + // Returns ratio of total clear speech variance to total noise variance.
|
| + float SNR();
|
| +
|
| + // Updates |active_| based on SNR.
|
| + void UpdateActivity();
|
| +
|
| // Returns dot product of vectors specified by size |length| arrays |a|,|b|.
|
| static float DotProduct(const float* a, const float* b, int length);
|
|
|
| @@ -135,7 +172,13 @@ class IntelligibilityEnhancer {
|
| const int erb_resolution_;
|
| const int channels_; // Num channels.
|
| const int analysis_rate_; // Num blocks before gains recalculated.
|
| - const int variance_rate_; // Num recalculations before history is cleared.
|
| + const float capture_vad_thresh_; // Threshold for updating noise estimate.
|
| + const float render_vad_thresh_; // Threshold for updating speech estimate.
|
| + const float activate_snr_thresh_; // Threshold for activating gain updates.
|
| + const float deactivate_snr_thresh_; // Threshold for deactivating.
|
| +
|
| + bool active_; // Whether render gains are being updated.
|
| + bool deactivating_; // True when we are smoothing enhancer off.
|
|
|
| intelligibility::VarianceArray clear_variance_;
|
| intelligibility::VarianceArray noise_variance_;
|
| @@ -154,7 +197,6 @@ class IntelligibilityEnhancer {
|
| // TODO(ekmeyerson): Switch to using ChannelBuffer.
|
| float** temp_out_buffer_;
|
|
|
| - rtc::scoped_ptr<float* []> input_audio_;
|
| rtc::scoped_ptr<float[]> kbd_window_;
|
| TransformCallback render_callback_;
|
| TransformCallback capture_callback_;
|
| @@ -163,13 +205,13 @@ class IntelligibilityEnhancer {
|
| int block_count_;
|
| int analysis_step_;
|
|
|
| - // TODO(bercic): Quick stopgap measure for voice detection in the clear
|
| - // and noise streams.
|
| - // Note: VAD currently does not affect anything in IntelligibilityEnhancer.
|
| - VadInst* vad_high_;
|
| - VadInst* vad_low_;
|
| + VoiceActivityDetector capture_vad_;
|
| + VoiceActivityDetector render_vad_;
|
| + float capture_voice_probability_;
|
| + float render_voice_probability_;
|
| + bool using_capture_vad_;
|
| + bool using_render_vad_;
|
| rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;
|
| - bool has_voice_low_; // Whether voice detected in speech stream.
|
| };
|
|
|
| } // namespace webrtc
|
|
|