webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@pow

Patch Set: Make gain change limit relative Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/audio_processing_impl.cc ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 //

12 // Specifies core class for intelligibility enhancement.

13 //

14

15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

17	13

18 #include <complex>	14 #include <complex>

19 #include <vector>	15 #include <vector>

20	16

21 #include "webrtc/base/scoped_ptr.h"	17 #include "webrtc/base/scoped_ptr.h"

22 #include "webrtc/common_audio/lapped_transform.h"	18 #include "webrtc/common_audio/lapped_transform.h"

23 #include "webrtc/common_audio/channel_buffer.h"	19 #include "webrtc/common_audio/channel_buffer.h"

24 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	20 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

	21 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"

25	22

26 namespace webrtc {	23 namespace webrtc {

27	24

28 // Speech intelligibility enhancement module. Reads render and capture	25 // Speech intelligibility enhancement module. Reads render and capture

29 // audio streams and modifies the render stream with a set of gains per	26 // audio streams and modifies the render stream with a set of gains per

30 // frequency bin to enhance speech against the noise background.	27 // frequency bin to enhance speech against the noise background.

31 // Note: assumes speech and noise streams are already separated.	28 // Details of the model and algorithm can be found in the original paper:

	29 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788

32 class IntelligibilityEnhancer {	30 class IntelligibilityEnhancer {

33 public:	31 public:

34 struct Config {	32 IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);

35 // \|var_*\| are parameters for the VarianceArray constructor for the

36 // clear speech stream.

37 // TODO(bercic): the \|var_\|, \|_rate\| and \|gain_limit\| parameters should

38 // probably go away once fine tuning is done.

39 Config()

40 : sample_rate_hz(16000),

41 num_capture_channels(1),

42 num_render_channels(1),

43 var_type(intelligibility::VarianceArray::kStepDecaying),

44 var_decay_rate(0.9f),

45 var_window_size(10),

46 analysis_rate(800),

47 gain_change_limit(0.1f),

48 rho(0.02f) {}

49 int sample_rate_hz;

50 size_t num_capture_channels;

51 size_t num_render_channels;

52 intelligibility::VarianceArray::StepType var_type;

53 float var_decay_rate;

54 size_t var_window_size;

55 int analysis_rate;

56 float gain_change_limit;

57 float rho;

58 };

59

60 explicit IntelligibilityEnhancer(const Config& config);

61 IntelligibilityEnhancer(); // Initialize with default config.

62	33

63 // Sets the capture noise magnitude spectrum estimate.	34 // Sets the capture noise magnitude spectrum estimate.

64 void SetCaptureNoiseEstimate(std::vector<float> noise);	35 void SetCaptureNoiseEstimate(std::vector<float> noise);

65	36

66 // Reads chunk of speech in time domain and updates with modified signal.	37 // Reads chunk of speech in time domain and updates with modified signal.

67 void ProcessRenderAudio(float* const* audio,	38 void ProcessRenderAudio(float* const* audio,

68 int sample_rate_hz,	39 int sample_rate_hz,

69 size_t num_channels);	40 size_t num_channels);

70 bool active() const;	41 bool active() const;

71	42

(...skipping 11 matching lines...) Expand all Loading...
83 size_t out_channels,	54 size_t out_channels,

84 std::complex<float>* const* out_block) override;	55 std::complex<float>* const* out_block) override;

85	56

86 private:	57 private:

87 IntelligibilityEnhancer* parent_;	58 IntelligibilityEnhancer* parent_;

88 };	59 };

89 friend class TransformCallback;	60 friend class TransformCallback;

90 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);	61 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);

91 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);	62 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);

92	63

93 // Updates variance computation and analysis with \|in_block_\|,	64 // Updates power computation and analysis with \|in_block_\|,

94 // and writes modified speech to \|out_block\|.	65 // and writes modified speech to \|out_block\|.

95 void ProcessClearBlock(const std::complex<float>* in_block,	66 void ProcessClearBlock(const std::complex<float>* in_block,

96 std::complex<float>* out_block);	67 std::complex<float>* out_block);

97	68

98 // Computes and sets modified gains.

99 void AnalyzeClearBlock(float power_target);

100

101 // Bisection search for optimal \|lambda\|.	69 // Bisection search for optimal \|lambda\|.

102 void SolveForLambda(float power_target, float power_bot, float power_top);	70 void SolveForLambda(float power_target, float power_bot, float power_top);

103	71

104 // Transforms freq gains to ERB gains.	72 // Transforms freq gains to ERB gains.

105 void UpdateErbGains();	73 void UpdateErbGains();

106	74

107 // Returns number of ERB filters.	75 // Returns number of ERB filters.

108 static size_t GetBankSize(int sample_rate, size_t erb_resolution);	76 static size_t GetBankSize(int sample_rate, size_t erb_resolution);

109	77

110 // Initializes ERB filterbank.	78 // Initializes ERB filterbank.

111 std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);	79 std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);

112	80

113 // Analytically solves quadratic for optimal gains given \|lambda\|.	81 // Analytically solves quadratic for optimal gains given \|lambda\|.

114 // Negative gains are set to 0. Stores the results in \|sols\|.	82 // Negative gains are set to 0. Stores the results in \|sols\|.

115 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);	83 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);

116	84

	85 // Returns true if the audio is speech.

	86 bool IsSpeech(const float* audio);

	87

117 const size_t freqs_; // Num frequencies in frequency domain.	88 const size_t freqs_; // Num frequencies in frequency domain.

118 const size_t window_size_; // Window size in samples; also the block size.

119 const size_t chunk_length_; // Chunk size in samples.	89 const size_t chunk_length_; // Chunk size in samples.

120 const size_t bank_size_; // Num ERB filters.	90 const size_t bank_size_; // Num ERB filters.

121 const int sample_rate_hz_;	91 const int sample_rate_hz_;

122 const int erb_resolution_;

123 const size_t num_capture_channels_;

124 const size_t num_render_channels_;	92 const size_t num_render_channels_;

125 const int analysis_rate_; // Num blocks before gains recalculated.

126	93

127 const bool active_; // Whether render gains are being updated.	94 intelligibility::PowerEstimator clear_power_estimator_;

128 // TODO(ekm): Add logic for updating \|active_\|.	95 rtc::scoped_ptr<intelligibility::PowerEstimator> noise_power_estimator_;

129	96 rtc::scoped_ptr<float[]> filtered_clear_pow_;

130 intelligibility::VarianceArray clear_variance_;	97 rtc::scoped_ptr<float[]> filtered_noise_pow_;

131 std::vector<float> noise_power_;

132 rtc::scoped_ptr<float[]> filtered_clear_var_;

133 rtc::scoped_ptr<float[]> filtered_noise_var_;

134 rtc::scoped_ptr<float[]> center_freqs_;	98 rtc::scoped_ptr<float[]> center_freqs_;

135 std::vector<std::vector<float>> capture_filter_bank_;	99 std::vector<std::vector<float>> capture_filter_bank_;

136 std::vector<std::vector<float>> render_filter_bank_;	100 std::vector<std::vector<float>> render_filter_bank_;

137 size_t start_freq_;	101 size_t start_freq_;

138 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR.	102

139 // for each ERB band.

140 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.	103 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.

141 intelligibility::GainApplier gain_applier_;	104 intelligibility::GainApplier gain_applier_;

142	105

143 // Destination buffers used to reassemble blocked chunks before overwriting	106 // Destination buffers used to reassemble blocked chunks before overwriting

144 // the original input array with modifications.	107 // the original input array with modifications.

145 ChannelBuffer<float> temp_render_out_buffer_;	108 ChannelBuffer<float> temp_render_out_buffer_;

146	109

147 rtc::scoped_ptr<float[]> kbd_window_;

148 TransformCallback render_callback_;	110 TransformCallback render_callback_;

149 rtc::scoped_ptr<LappedTransform> render_mangler_;	111 rtc::scoped_ptr<LappedTransform> render_mangler_;

150 int block_count_;	112

151 int analysis_step_;	113 VoiceActivityDetector vad_;

	114 std::vector<int16_t> audio_s16_;

	115 size_t chunks_since_voice_;

	116 bool is_speech_;

152 };	117 };

153	118

154 } // namespace webrtc	119 } // namespace webrtc

155	120

156 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	121 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

OLD	NEW