webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1234463003: Integrate Intelligibility with APM

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Fixed memcpy Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« webrtc/modules/audio_processing/audio_processing_impl.cc ('K') | « webrtc/modules/audio_processing/include/mock_audio_processing.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 //	11 //

12 // Specifies core class for intelligibility enhancement.	12 // Specifies core class for intelligibility enhancement.

13 //	13 //

14	14

15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

17	17

18 #include <complex>	18 #include <complex>

19 #include <vector>	19 #include <vector>

20	20

21 #include "webrtc/base/scoped_ptr.h"	21 #include "webrtc/base/scoped_ptr.h"

22 #include "webrtc/common_audio/lapped_transform.h"	22 #include "webrtc/common_audio/lapped_transform.h"

	23 #include "webrtc/common_audio/channel_buffer.h"

	24 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"

23 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	25 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

24	26

25 struct WebRtcVadInst;

26 typedef struct WebRtcVadInst VadInst;

27

28 namespace webrtc {	27 namespace webrtc {

29	28

30 // Speech intelligibility enhancement module. Reads render and capture	29 // Speech intelligibility enhancement module. Reads render and capture

31 // audio streams and modifies the render stream with a set of gains per	30 // audio streams and modifies the render stream with a set of gains per

32 // frequency bin to enhance speech against the noise background.	31 // frequency bin to enhance speech against the noise background.

33 // Note: assumes speech and noise streams are already separated.	32 // Note: assumes speech and noise streams are already separated.

34 class IntelligibilityEnhancer {	33 class IntelligibilityEnhancer {

35 public:	34 public:

36 // Construct a new instance with the given filter bank resolution,	35 struct Config {

37 // sampling rate, number of channels and analysis rates.	36 // \|var_*\| are parameters for the VarianceArray constructor for the

38 // \|analysis_rate\| sets the number of input blocks (containing speech!)	37 // clear speech stream.

39 // to elapse before a new gain computation is made. \|variance_rate\| specifies	38 // TODO(bercic): the \|var_\|, \|_rate\| and \|gain_limit\| parameters should

40 // the number of gain recomputations after which the variances are reset.	39 // probably go away once fine tuning is done.

41 // \|cv_*\| are parameters for the VarianceArray constructor for the	40 Config()

42 // clear speech stream.	41 : sample_rate_hz(16000),

43 // TODO(bercic): the \|cv_\|, \|_rate\| and \|gain_limit\| parameters should	42 num_capture_channels(1),

44 // probably go away once fine tuning is done. They override the internal	43 num_render_channels(1),

45 // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).	44 var_type(intelligibility::VarianceArray::kStepDecaying),

46 IntelligibilityEnhancer(int erb_resolution,	45 var_decay_rate(0.9f),

47 int sample_rate_hz,	46 var_window_size(10),

48 int channels,	47 analysis_rate(800),

49 int cv_type,	48 gain_change_limit(0.1f),

50 float cv_alpha,	49 rho(0.02f),

51 int cv_win,	50 capture_vad_thresh(1.f),

52 int analysis_rate,	51 render_vad_thresh(0.f),

53 int variance_rate,	52 activate_snr_thresh(0.f),

54 float gain_limit);	53 deactivate_snr_thresh(100000.f) {}

55 ~IntelligibilityEnhancer();	54 int sample_rate_hz;

	55 int num_capture_channels;

	56 int num_render_channels;

	57 intelligibility::VarianceArray::StepType var_type;

	58 float var_decay_rate;

	59 int var_window_size;

	60 int analysis_rate;

	61 float gain_change_limit;

	62 float rho;

	63 float capture_vad_thresh;

	64 float render_vad_thresh;

	65 float activate_snr_thresh;

	66 float deactivate_snr_thresh;

	67 };

56	68

57 // Reads and processes chunk of noise stream in time domain.	69 explicit IntelligibilityEnhancer(const Config& config);

58 void ProcessCaptureAudio(float* const* audio);	70 IntelligibilityEnhancer(); // Initialize with default config.

	71

	72 // Reads and processes chunk of noise stream in time domain. Only updates

	73 // noise estimate when \|voice_probability\| below a threshold. Uses internal

	74 // VAD when \|voice_probability\| not provided.

	75 void AnalyzeCaptureAudio(float* const* audio,

	76 int sample_rate_hz,

	77 int num_channels,

	78 float voice_probability);

	79 void AnalyzeCaptureAudio(float* const* audio,

	80 int sample_rate_hz,

	81 int num_channels);

59	82

60 // Reads chunk of speech in time domain and updates with modified signal.	83 // Reads chunk of speech in time domain and updates with modified signal.

61 void ProcessRenderAudio(float* const* audio);	84 // Only updates speech estimate when \|voice_probability\| above a threshold.

	85 // Uses internal VAD when \|voice_probability\| not provided.

	86 void ProcessRenderAudio(float* const* audio,

	87 int sample_rate_hz,

	88 int num_channels,

	89 float voice_probability);

	90 void ProcessRenderAudio(float* const* audio,

	91 int sample_rate_hz,

	92 int num_channels);

62	93

63 private:	94 private:

64 enum AudioSource {	95 enum AudioSource {

65 kRenderStream = 0, // Clear speech stream.	96 kRenderStream = 0, // Clear speech stream.

66 kCaptureStream, // Noise stream.	97 kCaptureStream, // Noise stream.

67 };	98 };

68	99

69 // Provides access point to the frequency domain.	100 // Provides access point to the frequency domain.

70 class TransformCallback : public LappedTransform::Callback {	101 class TransformCallback : public LappedTransform::Callback {

71 public:	102 public:

(...skipping 45 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
117 void CreateErbBank();	148 void CreateErbBank();

118	149

119 // Analytically solves quadratic for optimal gains given \|lambda\|.	150 // Analytically solves quadratic for optimal gains given \|lambda\|.

120 // Negative gains are set to 0. Stores the results in \|sols\|.	151 // Negative gains are set to 0. Stores the results in \|sols\|.

121 void SolveForGainsGivenLambda(float lambda, int start_freq, float* sols);	152 void SolveForGainsGivenLambda(float lambda, int start_freq, float* sols);

122	153

123 // Computes variance across ERB filters from freq variance \|var\|.	154 // Computes variance across ERB filters from freq variance \|var\|.

124 // Stores in \|result\|.	155 // Stores in \|result\|.

125 void FilterVariance(const float* var, float* result);	156 void FilterVariance(const float* var, float* result);

126	157

	158 // Returns ratio of total variance of clear to noise.

	159 float SNR();
	Andrew MacDonald 2015/07/24 23:50:40: Sorry to do this, but could you please move these new features which are modifying the output results of IE to another CL? You don't even mention these in the CL description :) -- turaj 2015/07/27 20:01:05: This method is const. -- ekm 2015/07/29 00:37:19: On 2015/07/24 23:50:40, andrew wrote: > Sorry to do this, but could you please move these new features which are > modifying the output results of IE to another CL? You don't even mention these > in the CL description :) Done.
	160

	161 // Updates \|active_\| based on SNR.

	162 void UpdateActivity();

	163

127 // Returns dot product of vectors specified by size \|length\| arrays \|a\|,\|b\|.	164 // Returns dot product of vectors specified by size \|length\| arrays \|a\|,\|b\|.

128 static float DotProduct(const float* a, const float* b, int length);	165 static float DotProduct(const float* a, const float* b, int length);

129	166

130 const int freqs_; // Num frequencies in frequency domain.	167 const int freqs_; // Num frequencies in frequency domain.
	turaj 2015/07/27 20:01:05: As comments eventually will be mis-aligned, maybe better to stick to the 2-spaces rule. -- ekm 2015/07/29 23:35:06: On 2015/07/27 20:01:05, turaj wrote: > As comments eventually will be mis-aligned, maybe better to stick to the > 2-spaces rule. 'git cl format' keeps re-aligning these. I'll just stick with whatever it does.
131 const int window_size_; // Window size in samples; also the block size.	168 const int window_size_; // Window size in samples; also the block size.

132 const int chunk_length_; // Chunk size in samples.	169 const int chunk_length_; // Chunk size in samples.

133 const int bank_size_; // Num ERB filters.	170 const int bank_size_; // Num ERB filters.

134 const int sample_rate_hz_;	171 const int sample_rate_hz_;

135 const int erb_resolution_;	172 const int erb_resolution_;

136 const int channels_; // Num channels.	173 const int num_capture_channels_;

	174 const int num_render_channels_;

137 const int analysis_rate_; // Num blocks before gains recalculated.	175 const int analysis_rate_; // Num blocks before gains recalculated.

138 const int variance_rate_; // Num recalculations before history is cleared.	176 const float capture_vad_thresh_; // Threshold for updating noise estimate.

	177 const float render_vad_thresh_; // Threshold for updating speech estimate.

	178 const float activate_snr_thresh_; // Threshold for activating gain updates.

	179 const float deactivate_snr_thresh_; // Threshold for deactivating.

	180

	181 bool active_; // Whether render gains are being updated.

	182 bool deactivating_; // True when we are smoothing enhancer off.

139	183

140 intelligibility::VarianceArray clear_variance_;	184 intelligibility::VarianceArray clear_variance_;

141 intelligibility::VarianceArray noise_variance_;	185 intelligibility::VarianceArray noise_variance_;

142 rtc::scoped_ptr<float[]> filtered_clear_var_;	186 rtc::scoped_ptr<float[]> filtered_clear_var_;

143 rtc::scoped_ptr<float[]> filtered_noise_var_;	187 rtc::scoped_ptr<float[]> filtered_noise_var_;

144 std::vector<std::vector<float>> filter_bank_;	188 std::vector<std::vector<float>> filter_bank_;

145 rtc::scoped_ptr<float[]> center_freqs_;	189 rtc::scoped_ptr<float[]> center_freqs_;

146 int start_freq_;	190 int start_freq_;

147 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.	191 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.

148 // for each ERB band.	192 // for each ERB band.

149 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.	193 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.

150 intelligibility::GainApplier gain_applier_;	194 intelligibility::GainApplier gain_applier_;

151	195

152 // Destination buffer used to reassemble blocked chunks before overwriting	196 // Destination buffers used to reassemble blocked chunks before overwriting

153 // the original input array with modifications.	197 // the original input array with modifications.

154 // TODO(ekmeyerson): Switch to using ChannelBuffer.	198 ChannelBuffer<float> temp_render_out_buffer_;

155 float** temp_out_buffer_;	199 ChannelBuffer<float> temp_capture_out_buffer_;

156	200

157 rtc::scoped_ptr<float* []> input_audio_;

158 rtc::scoped_ptr<float[]> kbd_window_;	201 rtc::scoped_ptr<float[]> kbd_window_;

159 TransformCallback render_callback_;	202 TransformCallback render_callback_;

160 TransformCallback capture_callback_;	203 TransformCallback capture_callback_;

161 rtc::scoped_ptr<LappedTransform> render_mangler_;	204 rtc::scoped_ptr<LappedTransform> render_mangler_;

162 rtc::scoped_ptr<LappedTransform> capture_mangler_;	205 rtc::scoped_ptr<LappedTransform> capture_mangler_;

163 int block_count_;	206 int block_count_;

164 int analysis_step_;	207 int analysis_step_;

165	208

166 // TODO(bercic): Quick stopgap measure for voice detection in the clear	209 VoiceActivityDetector capture_vad_;

167 // and noise streams.	210 VoiceActivityDetector render_vad_;

168 // Note: VAD currently does not affect anything in IntelligibilityEnhancer.	211 float capture_voice_probability_;

169 VadInst* vad_high_;	212 float render_voice_probability_;

170 VadInst* vad_low_;	213 bool using_capture_vad_;

	214 bool using_render_vad_;

171 rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;	215 rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;

172 bool has_voice_low_; // Whether voice detected in speech stream.

173 };	216 };

174	217

175 } // namespace webrtc	218 } // namespace webrtc

176	219

177 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	220 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

OLD	NEW