webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@pow

Patch Set: Use f for float Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/audio_processing_impl.cc ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

13	13

14 #include <complex>	14 #include <complex>

15 #include <memory>	15 #include <memory>

16 #include <vector>	16 #include <vector>

17	17

18 #include "webrtc/common_audio/lapped_transform.h"	18 #include "webrtc/common_audio/lapped_transform.h"

19 #include "webrtc/common_audio/channel_buffer.h"	19 #include "webrtc/common_audio/channel_buffer.h"

20 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	20 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

	21 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"

21	22

22 namespace webrtc {	23 namespace webrtc {

23	24

24 // Speech intelligibility enhancement module. Reads render and capture	25 // Speech intelligibility enhancement module. Reads render and capture

25 // audio streams and modifies the render stream with a set of gains per	26 // audio streams and modifies the render stream with a set of gains per

26 // frequency bin to enhance speech against the noise background.	27 // frequency bin to enhance speech against the noise background.

27 // Details of the model and algorithm can be found in the original paper:	28 // Details of the model and algorithm can be found in the original paper:

28 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788	29 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788

29 class IntelligibilityEnhancer {	30 class IntelligibilityEnhancer {

30 public:	31 public:

31 struct Config {	32 IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);

32 // TODO(bercic): the |decay_rate|, |analysis_rate| and |gain_limit|

33 // parameters should probably go away once fine tuning is done.

34 Config()

35 : sample_rate_hz(16000),

36 num_capture_channels(1),

37 num_render_channels(1),

38 decay_rate(0.9f),

39 analysis_rate(60),

40 gain_change_limit(0.1f),

41 rho(0.02f) {}

42 int sample_rate_hz;

43 size_t num_capture_channels;

44 size_t num_render_channels;

45 float decay_rate;

46 int analysis_rate;

47 float gain_change_limit;

48 float rho;

49 };

50

51 explicit IntelligibilityEnhancer(const Config& config);

52 IntelligibilityEnhancer(); // Initialize with default config.

53	33

54 // Sets the capture noise magnitude spectrum estimate.	34 // Sets the capture noise magnitude spectrum estimate.

55 void SetCaptureNoiseEstimate(std::vector<float> noise);	35 void SetCaptureNoiseEstimate(std::vector<float> noise);

56	36

57 // Reads chunk of speech in time domain and updates with modified signal.	37 // Reads chunk of speech in time domain and updates with modified signal.

58 void ProcessRenderAudio(float* const* audio,	38 void ProcessRenderAudio(float* const* audio,

59 int sample_rate_hz,	39 int sample_rate_hz,

60 size_t num_channels);	40 size_t num_channels);

61 bool active() const;	41 bool active() const;

62	42

(...skipping 16 matching lines...) Expand all Loading...
79 };	59 };

80 friend class TransformCallback;	60 friend class TransformCallback;

81 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);	61 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);

82 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);	62 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);

83	63

84 // Updates power computation and analysis with |in_block_|,	64 // Updates power computation and analysis with |in_block_|,

85 // and writes modified speech to |out_block|.	65 // and writes modified speech to |out_block|.

86 void ProcessClearBlock(const std::complex<float>* in_block,	66 void ProcessClearBlock(const std::complex<float>* in_block,

87 std::complex<float>* out_block);	67 std::complex<float>* out_block);

88	68

89 // Computes and sets modified gains.

90 void AnalyzeClearBlock();

91

92 // Bisection search for optimal |lambda|.	69 // Bisection search for optimal |lambda|.

93 void SolveForLambda(float power_target, float power_bot, float power_top);	70 void SolveForLambda(float power_target, float power_bot, float power_top);

94	71

95 // Transforms freq gains to ERB gains.	72 // Transforms freq gains to ERB gains.

96 void UpdateErbGains();	73 void UpdateErbGains();

97	74

98 // Returns number of ERB filters.	75 // Returns number of ERB filters.

99 static size_t GetBankSize(int sample_rate, size_t erb_resolution);	76 static size_t GetBankSize(int sample_rate, size_t erb_resolution);

100	77

101 // Initializes ERB filterbank.	78 // Initializes ERB filterbank.

102 std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);	79 std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);

103	80

104 // Analytically solves quadratic for optimal gains given |lambda|.	81 // Analytically solves quadratic for optimal gains given |lambda|.

105 // Negative gains are set to 0. Stores the results in |sols|.	82 // Negative gains are set to 0. Stores the results in |sols|.

106 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);	83 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);

107	84

	85 // Returns true if the audio is speech.

	86 bool IsSpeech(const float* audio);

	87

108 const size_t freqs_; // Num frequencies in frequency domain.	88 const size_t freqs_; // Num frequencies in frequency domain.

109 const size_t window_size_; // Window size in samples; also the block size.

110 const size_t chunk_length_; // Chunk size in samples.	89 const size_t chunk_length_; // Chunk size in samples.

111 const size_t bank_size_; // Num ERB filters.	90 const size_t bank_size_; // Num ERB filters.

112 const int sample_rate_hz_;	91 const int sample_rate_hz_;

113 const int erb_resolution_;

114 const size_t num_capture_channels_;

115 const size_t num_render_channels_;	92 const size_t num_render_channels_;

116 const int analysis_rate_; // Num blocks before gains recalculated.

117	93

118 const bool active_; // Whether render gains are being updated.	94 intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;

119 // TODO(ekm): Add logic for updating |active_|.	95 std::unique_ptr<intelligibility::PowerEstimator<float>>

120	96 noise_power_estimator_;

121 intelligibility::PowerEstimator clear_power_;

122 std::vector<float> noise_power_;

123 std::unique_ptr<float[]> filtered_clear_pow_;	97 std::unique_ptr<float[]> filtered_clear_pow_;

124 std::unique_ptr<float[]> filtered_noise_pow_;	98 std::unique_ptr<float[]> filtered_noise_pow_;

125 std::unique_ptr<float[]> center_freqs_;	99 std::unique_ptr<float[]> center_freqs_;

126 std::vector<std::vector<float>> capture_filter_bank_;	100 std::vector<std::vector<float>> capture_filter_bank_;

127 std::vector<std::vector<float>> render_filter_bank_;	101 std::vector<std::vector<float>> render_filter_bank_;

128 size_t start_freq_;	102 size_t start_freq_;

129 std::unique_ptr<float[]> rho_; // Production and interpretation SNR.	103

130 // for each ERB band.

131 std::unique_ptr<float[]> gains_eq_; // Pre-filter modified gains.	104 std::unique_ptr<float[]> gains_eq_; // Pre-filter modified gains.

132 intelligibility::GainApplier gain_applier_;	105 intelligibility::GainApplier gain_applier_;

133	106

134 // Destination buffers used to reassemble blocked chunks before overwriting	107 // Destination buffers used to reassemble blocked chunks before overwriting

135 // the original input array with modifications.	108 // the original input array with modifications.

136 ChannelBuffer<float> temp_render_out_buffer_;	109 ChannelBuffer<float> temp_render_out_buffer_;

137	110

138 std::unique_ptr<float[]> kbd_window_;

139 TransformCallback render_callback_;	111 TransformCallback render_callback_;

140 std::unique_ptr<LappedTransform> render_mangler_;	112 std::unique_ptr<LappedTransform> render_mangler_;

141 int block_count_;	113

142 int analysis_step_;	114 VoiceActivityDetector vad_;

	115 std::vector<int16_t> audio_s16_;

	116 size_t chunks_since_voice_;

	117 bool is_speech_;

143 };	118 };

144	119

145 } // namespace webrtc	120 } // namespace webrtc

146	121

147 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	122 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

OLD	NEW