webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1234463003: Integrate Intelligibility with APM

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Fix Mac Error (3) Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/include/mock_audio_processing.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 //	11 //

12 // Specifies core class for intelligibility enhancement.	12 // Specifies core class for intelligibility enhancement.

13 //	13 //

14	14

15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

17	17

18 #include <complex>	18 #include <complex>

19 #include <vector>	19 #include <vector>

20	20

21 #include "webrtc/base/scoped_ptr.h"	21 #include "webrtc/base/scoped_ptr.h"

22 #include "webrtc/common_audio/lapped_transform.h"	22 #include "webrtc/common_audio/lapped_transform.h"

	23 #include "webrtc/common_audio/channel_buffer.h"

23 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	24 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

24	25

25 struct WebRtcVadInst;

26 typedef struct WebRtcVadInst VadInst;

27

28 namespace webrtc {	26 namespace webrtc {

29	27

30 // Speech intelligibility enhancement module. Reads render and capture	28 // Speech intelligibility enhancement module. Reads render and capture

31 // audio streams and modifies the render stream with a set of gains per	29 // audio streams and modifies the render stream with a set of gains per

32 // frequency bin to enhance speech against the noise background.	30 // frequency bin to enhance speech against the noise background.

33 // Note: assumes speech and noise streams are already separated.	31 // Note: assumes speech and noise streams are already separated.

34 class IntelligibilityEnhancer {	32 class IntelligibilityEnhancer {

35 public:	33 public:

36 // Construct a new instance with the given filter bank resolution,	34 struct Config {

37 // sampling rate, number of channels and analysis rates.	35 // \|var_*\| are parameters for the VarianceArray constructor for the

38 // \|analysis_rate\| sets the number of input blocks (containing speech!)	36 // clear speech stream.

39 // to elapse before a new gain computation is made. \|variance_rate\| specifies	37 // TODO(bercic): the \|var_\|, \|_rate\| and \|gain_limit\| parameters should

40 // the number of gain recomputations after which the variances are reset.	38 // probably go away once fine tuning is done.

41 // \|cv_*\| are parameters for the VarianceArray constructor for the	39 Config()

42 // clear speech stream.	40 : sample_rate_hz(16000),

43 // TODO(bercic): the \|cv_\|, \|_rate\| and \|gain_limit\| parameters should	41 num_capture_channels(1),

44 // probably go away once fine tuning is done. They override the internal	42 num_render_channels(1),

45 // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).	43 var_type(intelligibility::VarianceArray::kStepDecaying),

46 IntelligibilityEnhancer(int erb_resolution,	44 var_decay_rate(0.9f),

47 int sample_rate_hz,	45 var_window_size(10),

48 int channels,	46 analysis_rate(800),

49 int cv_type,	47 gain_change_limit(0.1f),

50 float cv_alpha,	48 rho(0.02f) {}

51 int cv_win,	49 int sample_rate_hz;

52 int analysis_rate,	50 int num_capture_channels;

53 int variance_rate,	51 int num_render_channels;

54 float gain_limit);	52 intelligibility::VarianceArray::StepType var_type;

55 ~IntelligibilityEnhancer();	53 float var_decay_rate;

	54 int var_window_size;

	55 int analysis_rate;

	56 float gain_change_limit;

	57 float rho;

	58 };

	59

	60 explicit IntelligibilityEnhancer(const Config& config);

	61 IntelligibilityEnhancer(); // Initialize with default config.

56	62

57 // Reads and processes chunk of noise stream in time domain.	63 // Reads and processes chunk of noise stream in time domain.

58 void ProcessCaptureAudio(float* const* audio);	64 void AnalyzeCaptureAudio(float* const* audio,

	65 int sample_rate_hz,

	66 int num_channels);

59	67

60 // Reads chunk of speech in time domain and updates with modified signal.	68 // Reads chunk of speech in time domain and updates with modified signal.

61 void ProcessRenderAudio(float* const* audio);	69 void ProcessRenderAudio(float* const* audio,

	70 int sample_rate_hz,

	71 int num_channels);

	72 bool active() const;

62	73

63 private:	74 private:

64 enum AudioSource {	75 enum AudioSource {

65 kRenderStream = 0, // Clear speech stream.	76 kRenderStream = 0, // Clear speech stream.

66 kCaptureStream, // Noise stream.	77 kCaptureStream, // Noise stream.

67 };	78 };

68	79

69 // Provides access point to the frequency domain.	80 // Provides access point to the frequency domain.

70 class TransformCallback : public LappedTransform::Callback {	81 class TransformCallback : public LappedTransform::Callback {

71 public:	82 public:

(...skipping 54 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
126	137

127 // Returns dot product of vectors specified by size \|length\| arrays \|a\|,\|b\|.	138 // Returns dot product of vectors specified by size \|length\| arrays \|a\|,\|b\|.

128 static float DotProduct(const float* a, const float* b, int length);	139 static float DotProduct(const float* a, const float* b, int length);

129	140

130 const int freqs_; // Num frequencies in frequency domain.	141 const int freqs_; // Num frequencies in frequency domain.

131 const int window_size_; // Window size in samples; also the block size.	142 const int window_size_; // Window size in samples; also the block size.

132 const int chunk_length_; // Chunk size in samples.	143 const int chunk_length_; // Chunk size in samples.

133 const int bank_size_; // Num ERB filters.	144 const int bank_size_; // Num ERB filters.

134 const int sample_rate_hz_;	145 const int sample_rate_hz_;

135 const int erb_resolution_;	146 const int erb_resolution_;

136 const int channels_; // Num channels.	147 const int num_capture_channels_;

	148 const int num_render_channels_;

137 const int analysis_rate_; // Num blocks before gains recalculated.	149 const int analysis_rate_; // Num blocks before gains recalculated.

138 const int variance_rate_; // Num recalculations before history is cleared.	150

	151 const bool active_; // Whether render gains are being updated.

	152 // TODO(ekm): Add logic for updating \|active_\|.

139	153

140 intelligibility::VarianceArray clear_variance_;	154 intelligibility::VarianceArray clear_variance_;

141 intelligibility::VarianceArray noise_variance_;	155 intelligibility::VarianceArray noise_variance_;

142 rtc::scoped_ptr<float[]> filtered_clear_var_;	156 rtc::scoped_ptr<float[]> filtered_clear_var_;

143 rtc::scoped_ptr<float[]> filtered_noise_var_;	157 rtc::scoped_ptr<float[]> filtered_noise_var_;

144 std::vector<std::vector<float>> filter_bank_;	158 std::vector<std::vector<float>> filter_bank_;

145 rtc::scoped_ptr<float[]> center_freqs_;	159 rtc::scoped_ptr<float[]> center_freqs_;

146 int start_freq_;	160 int start_freq_;

147 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.	161 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.

148 // for each ERB band.	162 // for each ERB band.

149 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.	163 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.

150 intelligibility::GainApplier gain_applier_;	164 intelligibility::GainApplier gain_applier_;

151	165

152 // Destination buffer used to reassemble blocked chunks before overwriting	166 // Destination buffers used to reassemble blocked chunks before overwriting

153 // the original input array with modifications.	167 // the original input array with modifications.

154 // TODO(ekmeyerson): Switch to using ChannelBuffer.	168 ChannelBuffer<float> temp_render_out_buffer_;

155 float** temp_out_buffer_;	169 ChannelBuffer<float> temp_capture_out_buffer_;

156	170

157 rtc::scoped_ptr<float* []> input_audio_;

158 rtc::scoped_ptr<float[]> kbd_window_;	171 rtc::scoped_ptr<float[]> kbd_window_;

159 TransformCallback render_callback_;	172 TransformCallback render_callback_;

160 TransformCallback capture_callback_;	173 TransformCallback capture_callback_;

161 rtc::scoped_ptr<LappedTransform> render_mangler_;	174 rtc::scoped_ptr<LappedTransform> render_mangler_;

162 rtc::scoped_ptr<LappedTransform> capture_mangler_;	175 rtc::scoped_ptr<LappedTransform> capture_mangler_;

163 int block_count_;	176 int block_count_;

164 int analysis_step_;	177 int analysis_step_;

165

166 // TODO(bercic): Quick stopgap measure for voice detection in the clear

167 // and noise streams.

168 // Note: VAD currently does not affect anything in IntelligibilityEnhancer.

169 VadInst* vad_high_;

170 VadInst* vad_low_;

171 rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;

172 bool has_voice_low_; // Whether voice detected in speech stream.

173 };	178 };

174	179

175 } // namespace webrtc	180 } // namespace webrtc

176	181

177 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	182 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

OLD	NEW