webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1187033005: Revert of Allow intelligibility to compile in apm

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1187033005: Revert of Allow intelligibility to compile in apm (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/audio_processing_tests.gypi ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 //

12 // Specifies core class for intelligibility enhancement.

13 //

14

15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

17	13

18 #include <complex>	14 #include <complex>

19	15

20 #include "webrtc/base/scoped_ptr.h"

21 #include "webrtc/common_audio/lapped_transform.h"	16 #include "webrtc/common_audio/lapped_transform.h"

22 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	17 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

	18 #include "webrtc/system_wrappers/interface/scoped_ptr.h"

23	19

24 struct WebRtcVadInst;	20 struct WebRtcVadInst;

25 typedef struct WebRtcVadInst VadInst;	21 typedef struct WebRtcVadInst VadInst;

26	22

27 namespace webrtc {	23 namespace webrtc {

28	24

29 // Speech intelligibility enhancement module. Reads render and capture	25 // Speech intelligibility enhancement module. Reads render and capture

30 // audio streams and modifies the render stream with a set of gains per	26 // audio streams and modifies the render stream with a set of gains per

31 // frequency bin to enhance speech against the noise background.	27 // frequency bin to enhance speech against the noise background.

32 // Note: assumes speech and noise streams are already separated.

33 class IntelligibilityEnhancer {	28 class IntelligibilityEnhancer {

34 public:	29 public:

35 // Construct a new instance with the given filter bank resolution,	30 // Construct a new instance with the given filter bank resolution,

36 // sampling rate, number of channels and analysis rates.	31 // sampling rate, number of channels and analysis rates.

37 // \|analysis_rate\| sets the number of input blocks (containing speech!)	32 // \|analysis_rate\| sets the number of input blocks (containing speech!)

38 // to elapse before a new gain computation is made. \|variance_rate\| specifies	33 // to elapse before a new gain computation is made. \|variance_rate\| specifies

39 // the number of gain recomputations after which the variances are reset.	34 // the number of gain recomputations after which the variances are reset.

40 // \|cv_*\| are parameters for the VarianceArray constructor for the	35 // \|cv_*\| are parameters for the VarianceArray constructor for the

40 // clear speech stream.	36 // clear speech stream.

42 // TODO(bercic): the \|cv_\|, \|_rate\| and \|gain_limit\| parameters should	37 // TODO(bercic): the \|cv_\|, \|_rate\| and \|gain_limit\| parameters should

43 // probably go away once fine tuning is done. They override the internal	38 // probably go away once fine tuning is done. They override the internal

44 // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).	39 // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).

45 IntelligibilityEnhancer(int erb_resolution,	40 IntelligibilityEnhancer(int erb_resolution, int sample_rate_hz, int channels,

46 int sample_rate_hz,	41 int cv_type, float cv_alpha, int cv_win,

47 int channels,	42 int analysis_rate, int variance_rate,

48 int cv_type,

49 float cv_alpha,

50 int cv_win,

51 int analysis_rate,

52 int variance_rate,

53 float gain_limit);	43 float gain_limit);

54 ~IntelligibilityEnhancer();	44 ~IntelligibilityEnhancer();

55	45

56 // Reads and processes chunk of noise stream in time domain.	46 void ProcessRenderAudio(float* const* audio);

57 void ProcessCaptureAudio(float* const* audio);	47 void ProcessCaptureAudio(float* const* audio);

58	48

59 // Reads chunk of speech in time domain and updates with modified signal.

60 void ProcessRenderAudio(float* const* audio);

61

62 private:	49 private:

63 enum AudioSource {	50 enum AudioSource {

64 kRenderStream = 0, // Clear speech stream.	51 kRenderStream = 0,

65 kCaptureStream, // Noise stream.	52 kCaptureStream,

66 };	53 };

67	54

68 // Provides access point to the frequency domain.

69 class TransformCallback : public LappedTransform::Callback {	55 class TransformCallback : public LappedTransform::Callback {

70 public:	56 public:

71 TransformCallback(IntelligibilityEnhancer* parent, AudioSource source);	57 TransformCallback(IntelligibilityEnhancer* parent, AudioSource source);

72

73 // All in frequency domain, receives input \|in_block\|, applies

74 // intelligibility enhancement, and writes result to \|out_block\|.

75 virtual void ProcessAudioBlock(const std::complex<float>* const* in_block,	58 virtual void ProcessAudioBlock(const std::complex<float>* const* in_block,

76 int in_channels,	59 int in_channels, int frames,

77 int frames,

78 int out_channels,	60 int out_channels,

79 std::complex<float>* const* out_block);	61 std::complex<float>* const* out_block);

80	62

81 private:	63 private:

82 IntelligibilityEnhancer* parent_;	64 IntelligibilityEnhancer* parent_;

83 AudioSource source_;	65 AudioSource source_;

84 };	66 };

85 friend class TransformCallback;	67 friend class TransformCallback;

86	68

87 // Sends streams to ProcessClearBlock or ProcessNoiseBlock based on source.	69 void DispatchAudio(AudioSource source, const std::complex<float>* in_block,

88 void DispatchAudio(AudioSource source,

89 const std::complex<float>* in_block,

90 std::complex<float>* out_block);	70 std::complex<float>* out_block);

91

92 // Updates variance computation and analysis with \|in_block_\|,

93 // and writes modified speech to \|out_block\|.

94 void ProcessClearBlock(const std::complex<float>* in_block,	71 void ProcessClearBlock(const std::complex<float>* in_block,

95 std::complex<float>* out_block);	72 std::complex<float>* out_block);

96

97 // Computes and sets modified gains.

98 void AnalyzeClearBlock(float power_target);	73 void AnalyzeClearBlock(float power_target);

99

100 // Updates variance calculation for noise input with \|in_block\|.

101 void ProcessNoiseBlock(const std::complex<float>* in_block,	74 void ProcessNoiseBlock(const std::complex<float>* in_block,

102 std::complex<float>* out_block);	75 std::complex<float>* out_block);

103	76

104 // Returns number of ERB filters.

105 static int GetBankSize(int sample_rate, int erb_resolution);	77 static int GetBankSize(int sample_rate, int erb_resolution);

106

107 // Initializes ERB filterbank.

108 void CreateErbBank();	78 void CreateErbBank();

109	79 void SolveEquation14(float lambda, int start_freq, float* sols);

110 // Analytically solves quadratic for optimal gains given \|lambda\|.

111 // Negative gains are set to 0. Stores the results in \|sols\|.

112 void SolveForGainsGivenLambda(float lambda, int start_freq, float* sols);

113

114 // Computes variance across ERB filters from freq variance \|var\|.

115 // Stores in \|result\|.

116 void FilterVariance(const float* var, float* result);	80 void FilterVariance(const float* var, float* result);

117

118 // Returns dot product of vectors specified by size \|length\| arrays \|a\|,\|b\|.

119 static float DotProduct(const float* a, const float* b, int length);	81 static float DotProduct(const float* a, const float* b, int length);

120	82

121 static const int kErbResolution;	83 static const int kErbResolution;

122 static const int kWindowSizeMs;	84 static const int kWindowSizeMs;

123 static const int kChunkSizeMs;	85 static const int kChunkSizeMs;

124 static const int kAnalyzeRate; // Default for \|analysis_rate_\|.	86 static const int kAnalyzeRate;

125 static const int kVarianceRate; // Default for \|variance_rate_\|.	87 static const int kVarianceRate;

126 static const float kClipFreq;	88 static const float kClipFreq;

127 static const float kConfigRho; // Default production and interpretation SNR.	89 static const float kConfigRho;

128 static const float kKbdAlpha;	90 static const float kKbdAlpha;

129 static const float kGainChangeLimit;	91 static const float kGainChangeLimit;

130	92

131 const int freqs_; // Num frequencies in frequency domain.	93 const int freqs_;

132 const int window_size_; // Window size in samples; also the block size.	94 const int window_size_; // window size in samples; also the block size

133 const int chunk_length_; // Chunk size in samples.	95 const int chunk_length_; // chunk size in samples

134 const int bank_size_; // Num ERB filters.	96 const int bank_size_;

135 const int sample_rate_hz_;	97 const int sample_rate_hz_;

136 const int erb_resolution_;	98 const int erb_resolution_;

137 const int channels_; // Num channels.	99 const int channels_;

138 const int analysis_rate_; // Num blocks before gains recalculated.	100 const int analysis_rate_;

139 const int variance_rate_; // Num recalculations before history is cleared.	101 const int variance_rate_;

140	102

141 intelligibility::VarianceArray clear_variance_;	103 intelligibility::VarianceArray clear_variance_;

142 intelligibility::VarianceArray noise_variance_;	104 intelligibility::VarianceArray noise_variance_;

143 rtc::scoped_ptr<float[]> filtered_clear_var_;	105 scoped_ptr<float[]> filtered_clear_var_;

144 rtc::scoped_ptr<float[]> filtered_noise_var_;	106 scoped_ptr<float[]> filtered_noise_var_;

145 float** filter_bank_; // TODO(ekmeyerson): Switch to using ChannelBuffer.	107 float** filter_bank_;

146 rtc::scoped_ptr<float[]> center_freqs_;	108 scoped_ptr<float[]> center_freqs_;

147 int start_freq_;	109 int start_freq_;

148 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.	110 scoped_ptr<float[]> rho_;

149 // for each ERB band.	111 scoped_ptr<float[]> gains_eq_;

150 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.

151 intelligibility::GainApplier gain_applier_;	112 intelligibility::GainApplier gain_applier_;

152	113

153 // Destination buffer used to reassemble blocked chunks before overwriting	114 // Destination buffer used to reassemble blocked chunks before overwriting

154 // the original input array with modifications.	115 // the original input array with modifications.

155 // TODO(ekmeyerson): Switch to using ChannelBuffer.

156 float** temp_out_buffer_;	116 float** temp_out_buffer_;

157	117 scoped_ptr<float*[]> input_audio_;

158 rtc::scoped_ptr<float* []> input_audio_;	118 scoped_ptr<float[]> kbd_window_;

159 rtc::scoped_ptr<float[]> kbd_window_;

160 TransformCallback render_callback_;	119 TransformCallback render_callback_;

161 TransformCallback capture_callback_;	120 TransformCallback capture_callback_;

162 rtc::scoped_ptr<LappedTransform> render_mangler_;	121 scoped_ptr<LappedTransform> render_mangler_;

163 rtc::scoped_ptr<LappedTransform> capture_mangler_;	122 scoped_ptr<LappedTransform> capture_mangler_;

164 int block_count_;	123 int block_count_;

165 int analysis_step_;	124 int analysis_step_;

166	125

167 // TODO(bercic): Quick stopgap measure for voice detection in the clear	126 // TODO(bercic): Quick stopgap measure for voice detection in the clear

168 // and noise streams.	127 // and noise streams.

169 // Note: VAD currently does not affect anything in IntelligibilityEnhancer.

170 VadInst* vad_high_;	128 VadInst* vad_high_;

171 VadInst* vad_low_;	129 VadInst* vad_low_;

172 rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;	130 scoped_ptr<int16_t[]> vad_tmp_buffer_;

173 bool has_voice_low_; // Whether voice detected in speech stream.	131 bool has_voice_low_;

174 };	132 };

175	133

176 } // namespace webrtc	134 } // namespace webrtc

177	135

178 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	136 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

	137

OLD	NEW