webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1234463003: Integrate Intelligibility with APM

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Updated interface, how VAD is used, other issues Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« webrtc/modules/audio_processing/include/mock_audio_processing.h ('K') | « webrtc/modules/audio_processing/include/mock_audio_processing.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 //	11 //

12 // Specifies core class for intelligibility enhancement.	12 // Specifies core class for intelligibility enhancement.

13 //	13 //

14	14

15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

17	17

18 #include <complex>	18 #include <complex>

19 #include <vector>	19 #include <vector>

20	20

21 #include "webrtc/base/scoped_ptr.h"	21 #include "webrtc/base/scoped_ptr.h"

22 #include "webrtc/common_audio/lapped_transform.h"	22 #include "webrtc/common_audio/lapped_transform.h"

	23 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"

23 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	24 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

24	25

25 struct WebRtcVadInst;

26 typedef struct WebRtcVadInst VadInst;

27

28 namespace webrtc {	26 namespace webrtc {

29	27

30 // Speech intelligibility enhancement module. Reads render and capture	28 // Speech intelligibility enhancement module. Reads render and capture

31 // audio streams and modifies the render stream with a set of gains per	29 // audio streams and modifies the render stream with a set of gains per

32 // frequency bin to enhance speech against the noise background.	30 // frequency bin to enhance speech against the noise background.

33 // Note: assumes speech and noise streams are already separated.	31 // Note: assumes speech and noise streams are already separated.

34 class IntelligibilityEnhancer {	32 class IntelligibilityEnhancer {

35 public:	33 public:

36 // Construct a new instance with the given filter bank resolution,	34 struct Config {

37 // sampling rate, number of channels and analysis rates.	35 // \|var_*\| are parameters for the VarianceArray constructor for the

38 // \|analysis_rate\| sets the number of input blocks (containing speech!)	36 // clear speech stream.

39 // to elapse before a new gain computation is made. \|variance_rate\| specifies	37 // TODO(bercic): the \|var_\|, \|_rate\| and \|gain_limit\| parameters should

40 // the number of gain recomputations after which the variances are reset.	38 // probably go away once fine tuning is done. They override the internal

41 // \|cv_*\| are parameters for the VarianceArray constructor for the	39 // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).

42 // clear speech stream.	40 Config()

43 // TODO(bercic): the \|cv_\|, \|_rate\| and \|gain_limit\| parameters should	41 : sample_rate_hz(16000),

44 // probably go away once fine tuning is done. They override the internal	42 channels(1),

45 // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).	43 var_type(intelligibility::VarianceArray::kStepDecaying),

46 IntelligibilityEnhancer(int erb_resolution,	44 var_decay_rate(0.9f),

47 int sample_rate_hz,	45 var_window_size(10),

48 int channels,	46 analysis_rate(800),

49 int cv_type,	47 gain_change_limit(0.1f),

50 float cv_alpha,	48 rho(0.02f),

51 int cv_win,	49 capture_vad_thresh(1.f),

52 int analysis_rate,	50 render_vad_thresh(0.f),

53 int variance_rate,	51 activate_snr_thresh(0.f),

54 float gain_limit);	52 deactivate_snr_thresh(100000.f) {}

	53 int sample_rate_hz;

	54 int channels;

	55 intelligibility::VarianceArray::StepType var_type;

	56 float var_decay_rate;

	57 int var_window_size;

	58 int analysis_rate;

	59 float gain_change_limit;

	60 float rho;

	61 float capture_vad_thresh;

	62 float render_vad_thresh;

	63 float activate_snr_thresh;

	64 float deactivate_snr_thresh;

	65 };

	66

	67 explicit IntelligibilityEnhancer(const Config& config);

	68 IntelligibilityEnhancer(); // Initialize with default config.

	69

55 ~IntelligibilityEnhancer();	70 ~IntelligibilityEnhancer();

56	71

57 // Reads and processes chunk of noise stream in time domain.	72 // Reads and processes chunk of noise stream in time domain. Only updates

58 void ProcessCaptureAudio(float* const* audio);	73 // noise estimate when \|voice_probability\| below a threshold. Uses internal

	74 // VAD when \|voice_probability\| not provided.

	75 void AnalyzeCaptureAudio(float* const* audio,

	76 int sample_rate_hz,

	77 int num_channels,

	78 float voice_probability);

	79 void AnalyzeCaptureAudio(float* const* audio,

	80 int sample_rate_hz,

	81 int num_channels);

59	82

60 // Reads chunk of speech in time domain and updates with modified signal.	83 // Reads chunk of speech in time domain and updates with modified signal.

61 void ProcessRenderAudio(float* const* audio);	84 // Only updates speech estimate when \|voice_probability\| above a threshold.

	85 // Uses internal VAD when \|voice_probability\| not provided.

	86 void ProcessRenderAudio(float* const* audio,

	87 int sample_rate_hz,

	88 int num_channels,

	89 float voice_probability);

	90 void ProcessRenderAudio(float* const* audio,

	91 int sample_rate_hz,

	92 int num_channels);

62	93

63 private:	94 private:

64 enum AudioSource {	95 enum AudioSource {

65 kRenderStream = 0, // Clear speech stream.	96 kRenderStream = 0, // Clear speech stream.

66 kCaptureStream, // Noise stream.	97 kCaptureStream, // Noise stream.

67 };	98 };

68	99

69 // Provides access point to the frequency domain.	100 // Provides access point to the frequency domain.

70 class TransformCallback : public LappedTransform::Callback {	101 class TransformCallback : public LappedTransform::Callback {

71 public:	102 public:

(...skipping 45 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
117 void CreateErbBank();	148 void CreateErbBank();

118	149

119 // Analytically solves quadratic for optimal gains given \|lambda\|.	150 // Analytically solves quadratic for optimal gains given \|lambda\|.

120 // Negative gains are set to 0. Stores the results in \|sols\|.	151 // Negative gains are set to 0. Stores the results in \|sols\|.

121 void SolveForGainsGivenLambda(float lambda, int start_freq, float* sols);	152 void SolveForGainsGivenLambda(float lambda, int start_freq, float* sols);

122	153

123 // Computes variance across ERB filters from freq variance \|var\|.	154 // Computes variance across ERB filters from freq variance \|var\|.

124 // Stores in \|result\|.	155 // Stores in \|result\|.

125 void FilterVariance(const float* var, float* result);	156 void FilterVariance(const float* var, float* result);

126	157

	158 // Returns ratio of total variance of clear to noise.

	159 float SNR();

	160

	161 // Updates \|active_\| based on SNR.

	162 void UpdateActivity();

	163

127 // Returns dot product of vectors specified by size \|length\| arrays \|a\|,\|b\|.	164 // Returns dot product of vectors specified by size \|length\| arrays \|a\|,\|b\|.

128 static float DotProduct(const float* a, const float* b, int length);	165 static float DotProduct(const float* a, const float* b, int length);

129	166

130 const int freqs_; // Num frequencies in frequency domain.	167 const int freqs_; // Num frequencies in frequency domain.

131 const int window_size_; // Window size in samples; also the block size.	168 const int window_size_; // Window size in samples; also the block size.

132 const int chunk_length_; // Chunk size in samples.	169 const int chunk_length_; // Chunk size in samples.

133 const int bank_size_; // Num ERB filters.	170 const int bank_size_; // Num ERB filters.

134 const int sample_rate_hz_;	171 const int sample_rate_hz_;

135 const int erb_resolution_;	172 const int erb_resolution_;

136 const int channels_; // Num channels.	173 const int channels_; // Num channels.

137 const int analysis_rate_; // Num blocks before gains recalculated.	174 const int analysis_rate_; // Num blocks before gains recalculated.

138 const int variance_rate_; // Num recalculations before history is cleared.	175 const float capture_vad_thresh_; // Threshold for updating noise estimate.

	176 const float render_vad_thresh_; // Threshold for updating speech estimate.

	177 const float activate_snr_thresh_; // Threshold for activating gain updates.

	178 const float deactivate_snr_thresh_; // Threshold for deactivating.

	179

	180 bool active_; // Whether render gains are being updated.

	181 bool deactivating_; // True when we are smoothing enhancer off.

139	182

140 intelligibility::VarianceArray clear_variance_;	183 intelligibility::VarianceArray clear_variance_;

141 intelligibility::VarianceArray noise_variance_;	184 intelligibility::VarianceArray noise_variance_;

142 rtc::scoped_ptr<float[]> filtered_clear_var_;	185 rtc::scoped_ptr<float[]> filtered_clear_var_;

143 rtc::scoped_ptr<float[]> filtered_noise_var_;	186 rtc::scoped_ptr<float[]> filtered_noise_var_;

144 std::vector<std::vector<float>> filter_bank_;	187 std::vector<std::vector<float>> filter_bank_;

145 rtc::scoped_ptr<float[]> center_freqs_;	188 rtc::scoped_ptr<float[]> center_freqs_;

146 int start_freq_;	189 int start_freq_;

147 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.	190 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.

148 // for each ERB band.	191 // for each ERB band.

149 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.	192 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.

150 intelligibility::GainApplier gain_applier_;	193 intelligibility::GainApplier gain_applier_;

151	194

152 // Destination buffer used to reassemble blocked chunks before overwriting	195 // Destination buffer used to reassemble blocked chunks before overwriting

153 // the original input array with modifications.	196 // the original input array with modifications.

154 // TODO(ekmeyerson): Switch to using ChannelBuffer.	197 // TODO(ekmeyerson): Switch to using ChannelBuffer.

155 float** temp_out_buffer_;	198 float** temp_out_buffer_;

156	199

157 rtc::scoped_ptr<float* []> input_audio_;

158 rtc::scoped_ptr<float[]> kbd_window_;	200 rtc::scoped_ptr<float[]> kbd_window_;

159 TransformCallback render_callback_;	201 TransformCallback render_callback_;

160 TransformCallback capture_callback_;	202 TransformCallback capture_callback_;

161 rtc::scoped_ptr<LappedTransform> render_mangler_;	203 rtc::scoped_ptr<LappedTransform> render_mangler_;

162 rtc::scoped_ptr<LappedTransform> capture_mangler_;	204 rtc::scoped_ptr<LappedTransform> capture_mangler_;

163 int block_count_;	205 int block_count_;

164 int analysis_step_;	206 int analysis_step_;

165	207

166 // TODO(bercic): Quick stopgap measure for voice detection in the clear	208 VoiceActivityDetector capture_vad_;

167 // and noise streams.	209 VoiceActivityDetector render_vad_;

168 // Note: VAD currently does not affect anything in IntelligibilityEnhancer.	210 float capture_voice_probability_;

169 VadInst* vad_high_;	211 float render_voice_probability_;

170 VadInst* vad_low_;	212 bool using_capture_vad_;

	213 bool using_render_vad_;

171 rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;	214 rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;

172 bool has_voice_low_; // Whether voice detected in speech stream.

173 };	215 };

174	216

175 } // namespace webrtc	217 } // namespace webrtc

176	218

177 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	219 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

OLD	NEW