Chromium Code Reviews| Index: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h |
| diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h |
| index df47de597885ed61d9dd9a824d2c6505c1be99a4..12c7e732b99d9fd8568c7d108717a9d260acf5e6 100644 |
| --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h |
| +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h |
| @@ -20,11 +20,10 @@ |
| #include "webrtc/base/scoped_ptr.h" |
| #include "webrtc/common_audio/lapped_transform.h" |
| +#include "webrtc/common_audio/channel_buffer.h" |
| +#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h" |
| #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h" |
| -struct WebRtcVadInst; |
| -typedef struct WebRtcVadInst VadInst; |
| - |
| namespace webrtc { |
| // Speech intelligibility enhancement module. Reads render and capture |
| @@ -33,32 +32,64 @@ namespace webrtc { |
| // Note: assumes speech and noise streams are already separated. |
| class IntelligibilityEnhancer { |
| public: |
| - // Construct a new instance with the given filter bank resolution, |
| - // sampling rate, number of channels and analysis rates. |
| - // |analysis_rate| sets the number of input blocks (containing speech!) |
| - // to elapse before a new gain computation is made. |variance_rate| specifies |
| - // the number of gain recomputations after which the variances are reset. |
| - // |cv_*| are parameters for the VarianceArray constructor for the |
| - // clear speech stream. |
| - // TODO(bercic): the |cv_*|, |*_rate| and |gain_limit| parameters should |
| - // probably go away once fine tuning is done. They override the internal |
| - // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate). |
| - IntelligibilityEnhancer(int erb_resolution, |
| - int sample_rate_hz, |
| - int channels, |
| - int cv_type, |
| - float cv_alpha, |
| - int cv_win, |
| - int analysis_rate, |
| - int variance_rate, |
| - float gain_limit); |
| - ~IntelligibilityEnhancer(); |
| - |
| - // Reads and processes chunk of noise stream in time domain. |
| - void ProcessCaptureAudio(float* const* audio); |
| + struct Config { |
| + // |var_*| are parameters for the VarianceArray constructor for the |
| + // clear speech stream. |
| + // TODO(bercic): the |var_*|, |*_rate| and |gain_change_limit| parameters |
| + // should probably go away once fine tuning is done. |
| + Config() |
| + : sample_rate_hz(16000), |
| + num_capture_channels(1), |
| + num_render_channels(1), |
| + var_type(intelligibility::VarianceArray::kStepDecaying), |
| + var_decay_rate(0.9f), |
| + var_window_size(10), |
| + analysis_rate(800), |
| + gain_change_limit(0.1f), |
| + rho(0.02f), |
| + capture_vad_thresh(1.f), |
| + render_vad_thresh(0.f), |
| + activate_snr_thresh(0.f), |
| + deactivate_snr_thresh(100000.f) {} |
| + int sample_rate_hz; |
| + int num_capture_channels; |
| + int num_render_channels; |
| + intelligibility::VarianceArray::StepType var_type; |
| + float var_decay_rate; |
| + int var_window_size; |
| + int analysis_rate; |
| + float gain_change_limit; |
| + float rho; |
| + float capture_vad_thresh; |
| + float render_vad_thresh; |
| + float activate_snr_thresh; |
| + float deactivate_snr_thresh; |
| + }; |
| + |
| + explicit IntelligibilityEnhancer(const Config& config); |
| + IntelligibilityEnhancer(); // Initialize with default config. |
| + |
| + // Reads and processes a chunk of the noise stream in the time domain. Only |
| + // updates the noise estimate when |voice_probability| is below a threshold. |
| + // Uses the internal VAD when |voice_probability| is not provided. |
| + void AnalyzeCaptureAudio(float* const* audio, |
| + int sample_rate_hz, |
| + int num_channels, |
| + float voice_probability); |
| + void AnalyzeCaptureAudio(float* const* audio, |
| + int sample_rate_hz, |
| + int num_channels); |
| // Reads chunk of speech in time domain and updates with modified signal. |
| - void ProcessRenderAudio(float* const* audio); |
| + // Only updates the speech estimate when |voice_probability| is above a |
| + // threshold. Uses the internal VAD when |voice_probability| is not provided. |
| + void ProcessRenderAudio(float* const* audio, |
| + int sample_rate_hz, |
| + int num_channels, |
| + float voice_probability); |
| + void ProcessRenderAudio(float* const* audio, |
| + int sample_rate_hz, |
| + int num_channels); |
| private: |
| enum AudioSource { |
| @@ -124,6 +155,12 @@ class IntelligibilityEnhancer { |
| // Stores in |result|. |
| void FilterVariance(const float* var, float* result); |
| + // Returns the ratio of the total variance of the clear speech to that of |
| + // the noise. |
| + float SNR(); |
|
Andrew MacDonald
2015/07/24 23:50:40
Sorry to do this, but could you please move these
turaj
2015/07/27 20:01:05
This method is const.
ekm
2015/07/29 00:37:19
Done.
|
| + |
| + // Updates |active_| based on SNR. |
| + void UpdateActivity(); |
| + |
| // Returns dot product of vectors specified by size |length| arrays |a|,|b|. |
| static float DotProduct(const float* a, const float* b, int length); |
| @@ -133,9 +170,16 @@ class IntelligibilityEnhancer { |
| const int bank_size_; // Num ERB filters. |
| const int sample_rate_hz_; |
| const int erb_resolution_; |
| - const int channels_; // Num channels. |
| + const int num_capture_channels_; |
| + const int num_render_channels_; |
| const int analysis_rate_; // Num blocks before gains recalculated. |
| - const int variance_rate_; // Num recalculations before history is cleared. |
| + const float capture_vad_thresh_; // Threshold for updating noise estimate. |
| + const float render_vad_thresh_; // Threshold for updating speech estimate. |
| + const float activate_snr_thresh_; // Threshold for activating gain updates. |
| + const float deactivate_snr_thresh_; // Threshold for deactivating. |
| + |
| + bool active_; // Whether render gains are being updated. |
| + bool deactivating_; // True when we are smoothing enhancer off. |
| intelligibility::VarianceArray clear_variance_; |
| intelligibility::VarianceArray noise_variance_; |
| @@ -149,12 +193,11 @@ class IntelligibilityEnhancer { |
| rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains. |
| intelligibility::GainApplier gain_applier_; |
| - // Destination buffer used to reassemble blocked chunks before overwriting |
| + // Destination buffers used to reassemble blocked chunks before overwriting |
| // the original input array with modifications. |
| - // TODO(ekmeyerson): Switch to using ChannelBuffer. |
| - float** temp_out_buffer_; |
| + ChannelBuffer<float> temp_render_out_buffer_; |
| + ChannelBuffer<float> temp_capture_out_buffer_; |
| - rtc::scoped_ptr<float* []> input_audio_; |
| rtc::scoped_ptr<float[]> kbd_window_; |
| TransformCallback render_callback_; |
| TransformCallback capture_callback_; |
| @@ -163,13 +206,13 @@ class IntelligibilityEnhancer { |
| int block_count_; |
| int analysis_step_; |
| - // TODO(bercic): Quick stopgap measure for voice detection in the clear |
| - // and noise streams. |
| - // Note: VAD currently does not affect anything in IntelligibilityEnhancer. |
| - VadInst* vad_high_; |
| - VadInst* vad_low_; |
| + VoiceActivityDetector capture_vad_; |
| + VoiceActivityDetector render_vad_; |
| + float capture_voice_probability_; |
| + float render_voice_probability_; |
| + bool using_capture_vad_; |
| + bool using_render_vad_; |
| rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_; |
| - bool has_voice_low_; // Whether voice detected in speech stream. |
| }; |
| } // namespace webrtc |