webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1182323005: Allow intelligibility to compile in apm

Unified Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1182323005: Allow intelligibility to compile in apm (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Addressed comments Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « webrtc/modules/audio_processing/audio_processing_tests.gypi ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

index d0818f688c5e6750b430dd81a429ccb3432a7f10..8125707f120981c40817152a80bcbea43f2e3006 100644

--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

@@ -8,14 +8,18 @@

* be found in the AUTHORS file in the root of the source tree.

+//

+// Specifies core class for intelligibility enhancement.

+//

#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

#define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

#include <complex>

+#include "webrtc/base/scoped_ptr.h"

#include "webrtc/common_audio/lapped_transform.h"

#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

-#include "webrtc/system_wrappers/interface/scoped_ptr.h"

struct WebRtcVadInst;

typedef struct WebRtcVadInst VadInst;

@@ -25,6 +29,7 @@ namespace webrtc {

// Speech intelligibility enhancement module. Reads render and capture

// audio streams and modifies the render stream with a set of gains per

// frequency bin to enhance speech against the noise background.

+// Note: assumes speech and noise streams are already separated.

class IntelligibilityEnhancer {

public:

// Construct a new instance with the given filter bank resolution,

@@ -33,30 +38,43 @@ class IntelligibilityEnhancer {

// to elapse before a new gain computation is made. |variance_rate| specifies

// the number of gain recomputations after which the variances are reset.

// |cv_*| are parameters for the VarianceArray constructor for the

- // lear speech stream.

+ // clear speech stream.

// TODO(bercic): the |cv_*|, |*_rate| and |gain_limit| parameters should

// probably go away once fine tuning is done. They override the internal

// constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).

- IntelligibilityEnhancer(int erb_resolution, int sample_rate_hz, int channels,

- int cv_type, float cv_alpha, int cv_win,

- int analysis_rate, int variance_rate,

+ IntelligibilityEnhancer(int erb_resolution,

+ int sample_rate_hz,

+ int channels,

+ int cv_type,

+ float cv_alpha,

+ int cv_win,

+ int analysis_rate,

+ int variance_rate,

float gain_limit);

~IntelligibilityEnhancer();

- void ProcessRenderAudio(float* const* audio);

+ // Reads and processes a chunk of the noise stream in the time domain.

void ProcessCaptureAudio(float* const* audio);

+ // Reads a chunk of speech in the time domain and overwrites it with the modified signal.

+ void ProcessRenderAudio(float* const* audio);

private:

enum AudioSource {

- kRenderStream = 0,

- kCaptureStream,

+ kRenderStream = 0, // Clear speech stream.

+ kCaptureStream, // Noise stream.

};

+ // Provides access point to the frequency domain.

class TransformCallback : public LappedTransform::Callback {

public:

TransformCallback(IntelligibilityEnhancer* parent, AudioSource source);

+ // Operates in the frequency domain: receives input |in_block|, applies

+ // intelligibility enhancement, and writes result to |out_block|.

virtual void ProcessAudioBlock(const std::complex<float>* const* in_block,

- int in_channels, int frames,

+ int in_channels,

+ int frames,

int out_channels,

std::complex<float>* const* out_block);

@@ -66,72 +84,95 @@ class IntelligibilityEnhancer {

};

friend class TransformCallback;

- void DispatchAudio(AudioSource source, const std::complex<float>* in_block,

+ // Sends streams to ProcessClearBlock or ProcessNoiseBlock based on source.

+ void DispatchAudio(AudioSource source,

+ const std::complex<float>* in_block,

std::complex<float>* out_block);

+ // Updates variance computation and analysis with |in_block|,

+ // and writes modified speech to |out_block|.

void ProcessClearBlock(const std::complex<float>* in_block,

std::complex<float>* out_block);

+ // Computes and sets modified gains.

void AnalyzeClearBlock(float power_target);

+ // Updates variance calculation for noise input with |in_block|.

void ProcessNoiseBlock(const std::complex<float>* in_block,

std::complex<float>* out_block);

+ // Returns number of ERB filters.

static int GetBankSize(int sample_rate, int erb_resolution);

+ // Initializes ERB filterbank.

void CreateErbBank();

- void SolveEquation14(float lambda, int start_freq, float* sols);

+ // Analytically solves quadratic for optimal gains given |lambda|.

+ // Negative gains are set to 0. Stores the results in |sols|.

+ void SolveForGainsGivenLambda(float lambda, int start_freq, float* sols);

+ // Computes variance across ERB filters from freq variance |var|.

+ // Stores in |result|.

void FilterVariance(const float* var, float* result);

+ // Returns the dot product of the |length|-element arrays |a| and |b|.

static float DotProduct(const float* a, const float* b, int length);

static const int kErbResolution;

static const int kWindowSizeMs;

static const int kChunkSizeMs;

- static const int kAnalyzeRate;

- static const int kVarianceRate;

+ static const int kAnalyzeRate; // Default for |analysis_rate_|.

+ static const int kVarianceRate; // Default for |variance_rate_|.

static const float kClipFreq;

- static const float kConfigRho;

+ static const float kConfigRho; // Default production and interpretation SNR.

static const float kKbdAlpha;

static const float kGainChangeLimit;

- const int freqs_;

- const int window_size_; // window size in samples; also the block size

- const int chunk_length_; // chunk size in samples

- const int bank_size_;

+ const int freqs_; // Num frequencies in frequency domain.

+ const int window_size_; // Window size in samples; also the block size.

+ const int chunk_length_; // Chunk size in samples.

+ const int bank_size_; // Num ERB filters.

const int sample_rate_hz_;

const int erb_resolution_;

- const int channels_;

- const int analysis_rate_;

- const int variance_rate_;

+ const int channels_; // Num channels.

+ const int analysis_rate_; // Num blocks before gains recalculated.

+ const int variance_rate_; // Num recalculations before history is cleared.

intelligibility::VarianceArray clear_variance_;

intelligibility::VarianceArray noise_variance_;

- scoped_ptr<float[]> filtered_clear_var_;

- scoped_ptr<float[]> filtered_noise_var_;

- float** filter_bank_;

- scoped_ptr<float[]> center_freqs_;

+ rtc::scoped_ptr<float[]> filtered_clear_var_;

+ rtc::scoped_ptr<float[]> filtered_noise_var_;

+ float** filter_bank_; // TODO(ekmeyerson): Switch to using ChannelBuffer.

+ rtc::scoped_ptr<float[]> center_freqs_;

int start_freq_;

- scoped_ptr<float[]> rho_;

- scoped_ptr<float[]> gains_eq_;

+ rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR

+ // for each ERB band.

+ rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.

intelligibility::GainApplier gain_applier_;

// Destination buffer used to reassemble blocked chunks before overwriting

// the original input array with modifications.

+ // TODO(ekmeyerson): Switch to using ChannelBuffer.

float** temp_out_buffer_;

- scoped_ptr<float*[]> input_audio_;

- scoped_ptr<float[]> kbd_window_;

+ rtc::scoped_ptr<float* []> input_audio_;

+ rtc::scoped_ptr<float[]> kbd_window_;

TransformCallback render_callback_;

TransformCallback capture_callback_;

- scoped_ptr<LappedTransform> render_mangler_;

- scoped_ptr<LappedTransform> capture_mangler_;

+ rtc::scoped_ptr<LappedTransform> render_mangler_;

+ rtc::scoped_ptr<LappedTransform> capture_mangler_;

int block_count_;

int analysis_step_;

// TODO(bercic): Quick stopgap measure for voice detection in the clear

// and noise streams.

+ // Note: VAD currently does not affect anything in IntelligibilityEnhancer.

VadInst* vad_high_;

VadInst* vad_low_;

- scoped_ptr<int16_t[]> vad_tmp_buffer_;

- bool has_voice_low_;

+ rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;

+ bool has_voice_low_; // Whether voice detected in speech stream.

};

} // namespace webrtc

#endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_