webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1182323005: Allow intelligibility to compile in apm

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1182323005: Allow intelligibility to compile in apm (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Addressed comments Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/audio_processing_tests.gypi ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

	11 //

	12 // Specifies core class for intelligibility enhancement.

	13 //

	14

11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

13	17

14 #include <complex>	18 #include <complex>

15	19

	20 #include "webrtc/base/scoped_ptr.h"

16 #include "webrtc/common_audio/lapped_transform.h"	21 #include "webrtc/common_audio/lapped_transform.h"

17 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	22 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

18 #include "webrtc/system_wrappers/interface/scoped_ptr.h"

19	23

20 struct WebRtcVadInst;	24 struct WebRtcVadInst;

21 typedef struct WebRtcVadInst VadInst;	25 typedef struct WebRtcVadInst VadInst;

22	26

23 namespace webrtc {	27 namespace webrtc {

24	28

25 // Speech intelligibility enhancement module. Reads render and capture	29 // Speech intelligibility enhancement module. Reads render and capture

26 // audio streams and modifies the render stream with a set of gains per	30 // audio streams and modifies the render stream with a set of gains per

27 // frequency bin to enhance speech against the noise background.	31 // frequency bin to enhance speech against the noise background.

	32 // Note: assumes speech and noise streams are already separated.

28 class IntelligibilityEnhancer {	33 class IntelligibilityEnhancer {

29 public:	34 public:

30 // Construct a new instance with the given filter bank resolution,	35 // Construct a new instance with the given filter bank resolution,

31 // sampling rate, number of channels and analysis rates.	36 // sampling rate, number of channels and analysis rates.

32 // \|analysis_rate\| sets the number of input blocks (containing speech!)	37 // \|analysis_rate\| sets the number of input blocks (containing speech!)

33 // to elapse before a new gain computation is made. \|variance_rate\| specifies	38 // to elapse before a new gain computation is made. \|variance_rate\| specifies

34 // the number of gain recomputations after which the variances are reset.	39 // the number of gain recomputations after which the variances are reset.

35 // \|cv_*\| are parameters for the VarianceArray constructor for the	40 // \|cv_*\| are parameters for the VarianceArray constructor for the

36 // lear speech stream.	41 // clear speech stream.

37 // TODO(bercic): the \|cv_\|, \|_rate\| and \|gain_limit\| parameters should	42 // TODO(bercic): the \|cv_\|, \|_rate\| and \|gain_limit\| parameters should

38 // probably go away once fine tuning is done. They override the internal	43 // probably go away once fine tuning is done. They override the internal

39 // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).	44 // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).

40 IntelligibilityEnhancer(int erb_resolution, int sample_rate_hz, int channels,	45 IntelligibilityEnhancer(int erb_resolution,

41 int cv_type, float cv_alpha, int cv_win,	46 int sample_rate_hz,

42 int analysis_rate, int variance_rate,	47 int channels,

	48 int cv_type,

	49 float cv_alpha,

	50 int cv_win,

	51 int analysis_rate,

	52 int variance_rate,

43 float gain_limit);	53 float gain_limit);

44 ~IntelligibilityEnhancer();	54 ~IntelligibilityEnhancer();

45	55

	56 // Reads and processes chunk of noise stream in time domain.

	57 void ProcessCaptureAudio(float* const* audio);

	58

	59 // Reads chunk of speech in time domain and updates with modified signal.

46 void ProcessRenderAudio(float* const* audio);	60 void ProcessRenderAudio(float* const* audio);

47 void ProcessCaptureAudio(float* const* audio);

48	61

49 private:	62 private:

50 enum AudioSource {	63 enum AudioSource {

51 kRenderStream = 0,	64 kRenderStream = 0, // Clear speech stream.

52 kCaptureStream,	65 kCaptureStream, // Noise stream.

53 };	66 };

54	67

	68 // Provides access point to the frequency domain.

55 class TransformCallback : public LappedTransform::Callback {	69 class TransformCallback : public LappedTransform::Callback {

56 public:	70 public:

57 TransformCallback(IntelligibilityEnhancer* parent, AudioSource source);	71 TransformCallback(IntelligibilityEnhancer* parent, AudioSource source);

	72

	73 // All in frequency domain, receives input \|in_block\|, applies

	74 // intelligibility enhancement, and writes result to \|out_block\|.

58 virtual void ProcessAudioBlock(const std::complex<float>* const* in_block,	75 virtual void ProcessAudioBlock(const std::complex<float>* const* in_block,

59 int in_channels, int frames,	76 int in_channels,

	77 int frames,

60 int out_channels,	78 int out_channels,

61 std::complex<float>* const* out_block);	79 std::complex<float>* const* out_block);

62	80

63 private:	81 private:

64 IntelligibilityEnhancer* parent_;	82 IntelligibilityEnhancer* parent_;

65 AudioSource source_;	83 AudioSource source_;

66 };	84 };

67 friend class TransformCallback;	85 friend class TransformCallback;

68	86

69 void DispatchAudio(AudioSource source, const std::complex<float>* in_block,	87 // Sends streams to ProcessClearBlock or ProcessNoiseBlock based on source.

	88 void DispatchAudio(AudioSource source,

	89 const std::complex<float>* in_block,

70 std::complex<float>* out_block);	90 std::complex<float>* out_block);

	91

	92 // Updates variance computation and analysis with \|in_block\|,

	93 // and writes modified speech to \|out_block\|.

71 void ProcessClearBlock(const std::complex<float>* in_block,	94 void ProcessClearBlock(const std::complex<float>* in_block,

72 std::complex<float>* out_block);	95 std::complex<float>* out_block);

	96

	97 // Computes and sets modified gains.

73 void AnalyzeClearBlock(float power_target);	98 void AnalyzeClearBlock(float power_target);

	99

	100 // Updates variance calculation for noise input with \|in_block\|.

74 void ProcessNoiseBlock(const std::complex<float>* in_block,	101 void ProcessNoiseBlock(const std::complex<float>* in_block,

75 std::complex<float>* out_block);	102 std::complex<float>* out_block);

76	103

	104 // Returns number of ERB filters.

77 static int GetBankSize(int sample_rate, int erb_resolution);	105 static int GetBankSize(int sample_rate, int erb_resolution);

	106

	107 // Initializes ERB filterbank.

78 void CreateErbBank();	108 void CreateErbBank();

79 void SolveEquation14(float lambda, int start_freq, float* sols);	109

	110 // Analytically solves quadratic for optimal gains given \|lambda\|.

	111 // Negative gains are set to 0. Stores the results in \|sols\|.

	112 void SolveForGainsGivenLambda(float lambda, int start_freq, float* sols);

	113

	114 // Computes variance across ERB filters from freq variance \|var\|.

	115 // Stores in \|result\|.

80 void FilterVariance(const float* var, float* result);	116 void FilterVariance(const float* var, float* result);

	117

	118 // Returns dot product of vectors specified by size \|length\| arrays \|a\|,\|b\|.

81 static float DotProduct(const float* a, const float* b, int length);	119 static float DotProduct(const float* a, const float* b, int length);

82	120

83 static const int kErbResolution;	121 static const int kErbResolution;

84 static const int kWindowSizeMs;	122 static const int kWindowSizeMs;

85 static const int kChunkSizeMs;	123 static const int kChunkSizeMs;

86 static const int kAnalyzeRate;	124 static const int kAnalyzeRate; // Default for \|analysis_rate_\|.

87 static const int kVarianceRate;	125 static const int kVarianceRate; // Default for \|variance_rate_\|.

88 static const float kClipFreq;	126 static const float kClipFreq;

89 static const float kConfigRho;	127 static const float kConfigRho; // Default production and interpretation SNR.

90 static const float kKbdAlpha;	128 static const float kKbdAlpha;

91 static const float kGainChangeLimit;	129 static const float kGainChangeLimit;

92	130

93 const int freqs_;	131 const int freqs_; // Num frequencies in frequency domain.

94 const int window_size_; // window size in samples; also the block size	132 const int window_size_; // Window size in samples; also the block size.

95 const int chunk_length_; // chunk size in samples	133 const int chunk_length_; // Chunk size in samples.

96 const int bank_size_;	134 const int bank_size_; // Num ERB filters.

97 const int sample_rate_hz_;	135 const int sample_rate_hz_;

98 const int erb_resolution_;	136 const int erb_resolution_;

99 const int channels_;	137 const int channels_; // Num channels.

100 const int analysis_rate_;	138 const int analysis_rate_; // Num blocks before gains recalculated.

101 const int variance_rate_;	139 const int variance_rate_; // Num recalculations before history is cleared.

102	140

103 intelligibility::VarianceArray clear_variance_;	141 intelligibility::VarianceArray clear_variance_;

104 intelligibility::VarianceArray noise_variance_;	142 intelligibility::VarianceArray noise_variance_;

105 scoped_ptr<float[]> filtered_clear_var_;	143 rtc::scoped_ptr<float[]> filtered_clear_var_;

106 scoped_ptr<float[]> filtered_noise_var_;	144 rtc::scoped_ptr<float[]> filtered_noise_var_;

107 float** filter_bank_;	145 float** filter_bank_; // TODO(ekmeyerson): Switch to using ChannelBuffer.

108 scoped_ptr<float[]> center_freqs_;	146 rtc::scoped_ptr<float[]> center_freqs_;

109 int start_freq_;	147 int start_freq_;

110 scoped_ptr<float[]> rho_;	148 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.

111 scoped_ptr<float[]> gains_eq_;	149 // for each ERB band.

	150 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.

112 intelligibility::GainApplier gain_applier_;	151 intelligibility::GainApplier gain_applier_;

113	152

114 // Destination buffer used to reassemble blocked chunks before overwriting	153 // Destination buffer used to reassemble blocked chunks before overwriting

115 // the original input array with modifications.	154 // the original input array with modifications.

	155 // TODO(ekmeyerson): Switch to using ChannelBuffer.

116 float** temp_out_buffer_;	156 float** temp_out_buffer_;

117 scoped_ptr<float*[]> input_audio_;	157

118 scoped_ptr<float[]> kbd_window_;	158 rtc::scoped_ptr<float* []> input_audio_;

	159 rtc::scoped_ptr<float[]> kbd_window_;

119 TransformCallback render_callback_;	160 TransformCallback render_callback_;

120 TransformCallback capture_callback_;	161 TransformCallback capture_callback_;

121 scoped_ptr<LappedTransform> render_mangler_;	162 rtc::scoped_ptr<LappedTransform> render_mangler_;

122 scoped_ptr<LappedTransform> capture_mangler_;	163 rtc::scoped_ptr<LappedTransform> capture_mangler_;

123 int block_count_;	164 int block_count_;

124 int analysis_step_;	165 int analysis_step_;

125	166

126 // TODO(bercic): Quick stopgap measure for voice detection in the clear	167 // TODO(bercic): Quick stopgap measure for voice detection in the clear

127 // and noise streams.	168 // and noise streams.

	169 // Note: VAD currently does not affect anything in IntelligibilityEnhancer.

128 VadInst* vad_high_;	170 VadInst* vad_high_;

129 VadInst* vad_low_;	171 VadInst* vad_low_;

130 scoped_ptr<int16_t[]> vad_tmp_buffer_;	172 rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;

131 bool has_voice_low_;	173 bool has_voice_low_; // Whether voice detected in speech stream.

132 };	174 };

133	175

134 } // namespace webrtc	176 } // namespace webrtc

135	177

136 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	178 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

137

OLD	NEW