webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1234463003: Integrate Intelligibility with APM

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« webrtc/modules/audio_processing/audio_processing_impl.cc ('K') | « webrtc/modules/audio_processing/include/mock_audio_processing.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 15 matching lines...) Expand all Loading...
26 typedef struct WebRtcVadInst VadInst;	26 typedef struct WebRtcVadInst VadInst;

27	27

28 namespace webrtc {	28 namespace webrtc {

29	29

30 // Speech intelligibility enhancement module. Reads render and capture	30 // Speech intelligibility enhancement module. Reads render and capture

31 // audio streams and modifies the render stream with a set of gains per	31 // audio streams and modifies the render stream with a set of gains per

32 // frequency bin to enhance speech against the noise background.	32 // frequency bin to enhance speech against the noise background.

33 // Note: assumes speech and noise streams are already separated.	33 // Note: assumes speech and noise streams are already separated.

34 class IntelligibilityEnhancer {	34 class IntelligibilityEnhancer {

35 public:	35 public:

36 // Construct a new instance with the given filter bank resolution,	36 struct Config {

37 // sampling rate, number of channels and analysis rates.	37 // |var_*| are parameters for the VarianceArray constructor for the

38 // |analysis_rate| sets the number of input blocks (containing speech!)	38 // clear speech stream.

39 // to elapse before a new gain computation is made. |variance_rate| specifies	39 // TODO(bercic): the |var_|, |_rate| and |gain_limit| parameters should

40 // the number of gain recomputations after which the variances are reset.	40 // probably go away once fine tuning is done. They override the internal

41 // |cv_*| are parameters for the VarianceArray constructor for the	41 // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).

42 // clear speech stream.	42 Config()

43 // TODO(bercic): the |cv_|, |_rate| and |gain_limit| parameters should	43 : sample_rate_hz(16000),

44 // probably go away once fine tuning is done. They override the internal	44 channels(1),

45 // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).	45 var_type(intelligibility::VarianceArray::kStepDecaying),

46 IntelligibilityEnhancer(int erb_resolution,	46 var_decay_rate(0.9f),

47 int sample_rate_hz,	47 var_window_size(10),

48 int channels,	48 analysis_rate(800),

49 int cv_type,	49 gain_change_limit(0.1f),

50 float cv_alpha,	50 rho(0.02f),

51 int cv_win,	51 capture_vad_thresh(1.f),

52 int analysis_rate,	52 render_vad_thresh(0.f) {}

53 int variance_rate,	53 int sample_rate_hz;

54 float gain_limit);	54 int channels;

	55 intelligibility::VarianceArray::StepType var_type;

	56 float var_decay_rate;

	57 int var_window_size;

	58 int analysis_rate;

	59 float gain_change_limit;

	60 float rho;

	61 float capture_vad_thresh;

	62 float render_vad_thresh;

	63 };

	64

	65 explicit IntelligibilityEnhancer(const Config& config);

	66 IntelligibilityEnhancer(); // Initialize with default config.

	67

55 ~IntelligibilityEnhancer();	68 ~IntelligibilityEnhancer();

56	69

57 // Reads and processes chunk of noise stream in time domain.	70 // Reads and processes chunk of noise stream in time domain. Only updates

58 void ProcessCaptureAudio(float* const* audio);	71 // noise estimate when \|voice_probability\| below a threshold.

	72 void ProcessCaptureAudio(float* const* audio, const float voice_probability);
	aluebs-webrtc 2015/07/15 01:02:04 Does it actually process the capture audio or does it only analyze it? Maybe we need a naming change. aluebs-webrtc 2015/07/15 01:02:05 In all of these methods you assume the sample rate and number of channels did not change since the constructor. Maybe you want to have them as parameters and assert on that? ekm 2015/07/17 19:59:38 On 2015/07/15 01:02:04, aluebs-webrtc wrote: > Does it actually process the capture audio or does it only analyze it? Maybe > we need a naming change. Agreed. Similarly, in APM reverted AnalyzeReverseStream to const and added ProcessReverseStream. ekm 2015/07/17 19:59:38 On 2015/07/15 01:02:05, aluebs-webrtc wrote: > In all of these methods you assume the sample rate and number of channels did > not change since the constructor. Maybe you want to have them as parameters and > assert on that? Done.
	73 void ProcessCaptureAudio(float* const* audio); // Assumes noise.
	aluebs-webrtc 2015/07/15 01:02:04 Do we want to surface both interfaces to the user? ekm 2015/07/17 19:59:38 On 2015/07/15 01:02:04, aluebs-webrtc wrote: > Do we want to surface both interfaces to the user? I think it's nice to give the user the option of using their own voice probabilities if they want (especially if they have really good ones). It may be useful if there is some very expensive VAD in the future that we only want to run once for all APM. aluebs-webrtc 2015/07/20 19:33:42 On 2015/07/17 19:59:38, ekm wrote: > On 2015/07/15 01:02:04, aluebs-webrtc wrote: > > Do we want to surface both interfaces to the user? > > I think it's nice to give the user the option of using their own voice > probabilities if they want (especially if they have really good ones). May be > useful if there is some very expensive VAD in the future that we only want to > run once for all APM. Agreed.
59	74

60 // Reads chunk of speech in time domain and updates with modified signal.	75 // Reads chunk of speech in time domain and updates with modified signal.

61 void ProcessRenderAudio(float* const* audio);	76 // Only updates speech estimate when \|voice_probability\| above a threshold.

	77 void ProcessRenderAudio(float* const* audio, const float voice_probability);

	78 void ProcessRenderAudio(float* const* audio); // Assumes speech.
	aluebs-webrtc 2015/07/15 01:02:04 Do we want to surface both interfaces to the user? ekm 2015/07/17 19:59:38 On 2015/07/15 01:02:04, aluebs-webrtc wrote: > Do we want to surface both interfaces to the user? See above.
62	79

63 private:	80 private:

64 enum AudioSource {	81 enum AudioSource {

65 kRenderStream = 0, // Clear speech stream.	82 kRenderStream = 0, // Clear speech stream.

66 kCaptureStream, // Noise stream.	83 kCaptureStream, // Noise stream.

67 };	84 };

68	85

69 // Provides access point to the frequency domain.	86 // Provides access point to the frequency domain.

70 class TransformCallback : public LappedTransform::Callback {	87 class TransformCallback : public LappedTransform::Callback {

71 public:	88 public:

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
128 static float DotProduct(const float* a, const float* b, int length);	145 static float DotProduct(const float* a, const float* b, int length);

129	146

130 const int freqs_; // Num frequencies in frequency domain.	147 const int freqs_; // Num frequencies in frequency domain.

131 const int window_size_; // Window size in samples; also the block size.	148 const int window_size_; // Window size in samples; also the block size.

132 const int chunk_length_; // Chunk size in samples.	149 const int chunk_length_; // Chunk size in samples.

133 const int bank_size_; // Num ERB filters.	150 const int bank_size_; // Num ERB filters.

134 const int sample_rate_hz_;	151 const int sample_rate_hz_;

135 const int erb_resolution_;	152 const int erb_resolution_;

136 const int channels_; // Num channels.	153 const int channels_; // Num channels.

137 const int analysis_rate_; // Num blocks before gains recalculated.	154 const int analysis_rate_; // Num blocks before gains recalculated.

138 const int variance_rate_; // Num recalculations before history is cleared.	155 const float capture_vad_thresh_; // Threshold for updating noise estimate.

	156 const float render_vad_thresh_; // Threshold for updating speech estimate.

139	157

140 intelligibility::VarianceArray clear_variance_;	158 intelligibility::VarianceArray clear_variance_;

141 intelligibility::VarianceArray noise_variance_;	159 intelligibility::VarianceArray noise_variance_;

142 rtc::scoped_ptr<float[]> filtered_clear_var_;	160 rtc::scoped_ptr<float[]> filtered_clear_var_;

143 rtc::scoped_ptr<float[]> filtered_noise_var_;	161 rtc::scoped_ptr<float[]> filtered_noise_var_;

144 std::vector<std::vector<float>> filter_bank_;	162 std::vector<std::vector<float>> filter_bank_;

145 rtc::scoped_ptr<float[]> center_freqs_;	163 rtc::scoped_ptr<float[]> center_freqs_;

146 int start_freq_;	164 int start_freq_;

147 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.	165 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.

148 // for each ERB band.	166 // for each ERB band.

149 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.	167 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.

150 intelligibility::GainApplier gain_applier_;	168 intelligibility::GainApplier gain_applier_;

151	169

152 // Destination buffer used to reassemble blocked chunks before overwriting	170 // Destination buffer used to reassemble blocked chunks before overwriting

153 // the original input array with modifications.	171 // the original input array with modifications.

154 // TODO(ekmeyerson): Switch to using ChannelBuffer.	172 // TODO(ekmeyerson): Switch to using ChannelBuffer.

155 float** temp_out_buffer_;	173 float** temp_out_buffer_;

156	174

157 rtc::scoped_ptr<float* []> input_audio_;

158 rtc::scoped_ptr<float[]> kbd_window_;	175 rtc::scoped_ptr<float[]> kbd_window_;

159 TransformCallback render_callback_;	176 TransformCallback render_callback_;

160 TransformCallback capture_callback_;	177 TransformCallback capture_callback_;

161 rtc::scoped_ptr<LappedTransform> render_mangler_;	178 rtc::scoped_ptr<LappedTransform> render_mangler_;

162 rtc::scoped_ptr<LappedTransform> capture_mangler_;	179 rtc::scoped_ptr<LappedTransform> capture_mangler_;

163 int block_count_;	180 int block_count_;

164 int analysis_step_;	181 int analysis_step_;

165	182

166 // TODO(bercic): Quick stopgap measure for voice detection in the clear	183 // TODO(bercic): Quick stopgap measure for voice detection in the clear

167 // and noise streams.	184 // and noise streams.

168 // Note: VAD currently does not affect anything in IntelligibilityEnhancer.	185 // Note: VAD currently does not affect anything in IntelligibilityEnhancer.

169 VadInst* vad_high_;	186 VadInst* vad_high_;

170 VadInst* vad_low_;	187 VadInst* vad_low_;

171 rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;	188 rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;

172 bool has_voice_low_; // Whether voice detected in speech stream.	189 bool has_voice_low_; // Whether voice detected in speech stream.

173 };	190 };

174	191

175 } // namespace webrtc	192 } // namespace webrtc

176	193

177 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	194 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

OLD	NEW