Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(55)

Unified Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master
Patch Set: Fixed memcpy Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
index df47de597885ed61d9dd9a824d2c6505c1be99a4..12c7e732b99d9fd8568c7d108717a9d260acf5e6 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
@@ -20,11 +20,10 @@
#include "webrtc/base/scoped_ptr.h"
#include "webrtc/common_audio/lapped_transform.h"
+#include "webrtc/common_audio/channel_buffer.h"
+#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"
-struct WebRtcVadInst;
-typedef struct WebRtcVadInst VadInst;
-
namespace webrtc {
// Speech intelligibility enhancement module. Reads render and capture
@@ -33,32 +32,64 @@ namespace webrtc {
// Note: assumes speech and noise streams are already separated.
class IntelligibilityEnhancer {
public:
- // Construct a new instance with the given filter bank resolution,
- // sampling rate, number of channels and analysis rates.
- // |analysis_rate| sets the number of input blocks (containing speech!)
- // to elapse before a new gain computation is made. |variance_rate| specifies
- // the number of gain recomputations after which the variances are reset.
- // |cv_*| are parameters for the VarianceArray constructor for the
- // clear speech stream.
- // TODO(bercic): the |cv_*|, |*_rate| and |gain_limit| parameters should
- // probably go away once fine tuning is done. They override the internal
- // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).
- IntelligibilityEnhancer(int erb_resolution,
- int sample_rate_hz,
- int channels,
- int cv_type,
- float cv_alpha,
- int cv_win,
- int analysis_rate,
- int variance_rate,
- float gain_limit);
- ~IntelligibilityEnhancer();
-
- // Reads and processes chunk of noise stream in time domain.
- void ProcessCaptureAudio(float* const* audio);
+ struct Config {
+ // |var_*| are parameters for the VarianceArray constructor for the
+ // clear speech stream.
+ // TODO(bercic): the |var_*|, |*_rate| and |gain_change_limit| parameters
+ // should probably go away once fine tuning is done.
+ Config()
+ : sample_rate_hz(16000),
+ num_capture_channels(1),
+ num_render_channels(1),
+ var_type(intelligibility::VarianceArray::kStepDecaying),
+ var_decay_rate(0.9f),
+ var_window_size(10),
+ analysis_rate(800),
+ gain_change_limit(0.1f),
+ rho(0.02f),
+ capture_vad_thresh(1.f),
+ render_vad_thresh(0.f),
+ activate_snr_thresh(0.f),
+ deactivate_snr_thresh(100000.f) {}
+ int sample_rate_hz;
+ int num_capture_channels;
+ int num_render_channels;
+ intelligibility::VarianceArray::StepType var_type;
+ float var_decay_rate;
+ int var_window_size;
+ int analysis_rate;
+ float gain_change_limit;
+ float rho;
+ float capture_vad_thresh;
+ float render_vad_thresh;
+ float activate_snr_thresh;
+ float deactivate_snr_thresh;
+ };
+
+ explicit IntelligibilityEnhancer(const Config& config);
+ IntelligibilityEnhancer(); // Initialize with default config.
+
+ // Reads and processes a chunk of the noise stream in the time domain. Only
+ // updates the noise estimate when |voice_probability| is below a threshold.
+ // Uses the internal VAD when |voice_probability| is not provided.
+ void AnalyzeCaptureAudio(float* const* audio,
+ int sample_rate_hz,
+ int num_channels,
+ float voice_probability);
+ void AnalyzeCaptureAudio(float* const* audio,
+ int sample_rate_hz,
+ int num_channels);
// Reads chunk of speech in time domain and updates with modified signal.
- void ProcessRenderAudio(float* const* audio);
+ // Only updates the speech estimate when |voice_probability| is above a
+ // threshold. Uses the internal VAD when |voice_probability| is not provided.
+ void ProcessRenderAudio(float* const* audio,
+ int sample_rate_hz,
+ int num_channels,
+ float voice_probability);
+ void ProcessRenderAudio(float* const* audio,
+ int sample_rate_hz,
+ int num_channels);
private:
enum AudioSource {
@@ -124,6 +155,12 @@ class IntelligibilityEnhancer {
// Stores in |result|.
void FilterVariance(const float* var, float* result);
+ // Returns the ratio of total clear-speech variance to total noise variance.
+ float SNR();
Andrew MacDonald 2015/07/24 23:50:40 Sorry to do this, but could you please move these...
turaj 2015/07/27 20:01:05 This method is const.
ekm 2015/07/29 00:37:19 Done.
+
+ // Updates |active_| based on SNR.
+ void UpdateActivity();
+
// Returns dot product of vectors specified by size |length| arrays |a|,|b|.
static float DotProduct(const float* a, const float* b, int length);
@@ -133,9 +170,16 @@ class IntelligibilityEnhancer {
const int bank_size_; // Num ERB filters.
const int sample_rate_hz_;
const int erb_resolution_;
- const int channels_; // Num channels.
+ const int num_capture_channels_;
+ const int num_render_channels_;
const int analysis_rate_; // Num blocks before gains recalculated.
- const int variance_rate_; // Num recalculations before history is cleared.
+ const float capture_vad_thresh_; // Threshold for updating noise estimate.
+ const float render_vad_thresh_; // Threshold for updating speech estimate.
+ const float activate_snr_thresh_; // Threshold for activating gain updates.
+ const float deactivate_snr_thresh_; // Threshold for deactivating.
+
+ bool active_; // Whether render gains are being updated.
+ bool deactivating_; // True when we are smoothing enhancer off.
intelligibility::VarianceArray clear_variance_;
intelligibility::VarianceArray noise_variance_;
@@ -149,12 +193,11 @@ class IntelligibilityEnhancer {
rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.
intelligibility::GainApplier gain_applier_;
- // Destination buffer used to reassemble blocked chunks before overwriting
+ // Destination buffers used to reassemble blocked chunks before overwriting
// the original input array with modifications.
- // TODO(ekmeyerson): Switch to using ChannelBuffer.
- float** temp_out_buffer_;
+ ChannelBuffer<float> temp_render_out_buffer_;
+ ChannelBuffer<float> temp_capture_out_buffer_;
- rtc::scoped_ptr<float* []> input_audio_;
rtc::scoped_ptr<float[]> kbd_window_;
TransformCallback render_callback_;
TransformCallback capture_callback_;
@@ -163,13 +206,13 @@ class IntelligibilityEnhancer {
int block_count_;
int analysis_step_;
- // TODO(bercic): Quick stopgap measure for voice detection in the clear
- // and noise streams.
- // Note: VAD currently does not affect anything in IntelligibilityEnhancer.
- VadInst* vad_high_;
- VadInst* vad_low_;
+ VoiceActivityDetector capture_vad_;
+ VoiceActivityDetector render_vad_;
+ float capture_voice_probability_;
+ float render_voice_probability_;
+ bool using_capture_vad_;
+ bool using_render_vad_;
rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;
- bool has_voice_low_; // Whether voice detected in speech stream.
};
} // namespace webrtc

Powered by Google App Engine
This is Rietveld 408576698