webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 1729753003: Fix the stereo support in IntelligibilityEnhancer

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 1729753003: Fix the stereo support in IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@gains2

Patch Set: Rebasing Created 4 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

13	13

14 #include <complex>	14 #include <complex>

15 #include <memory>	15 #include <memory>

16 #include <vector>	16 #include <vector>

17	17

18 #include "webrtc/common_audio/lapped_transform.h"	18 #include "webrtc/common_audio/lapped_transform.h"

19 #include "webrtc/common_audio/channel_buffer.h"	19 #include "webrtc/common_audio/channel_buffer.h"

20 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	20 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

21 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"	21 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"

22	22

23 namespace webrtc {	23 namespace webrtc {

24	24

25 // Speech intelligibility enhancement module. Reads render and capture	25 // Speech intelligibility enhancement module. Reads render and capture

26 // audio streams and modifies the render stream with a set of gains per	26 // audio streams and modifies the render stream with a set of gains per

27 // frequency bin to enhance speech against the noise background.	27 // frequency bin to enhance speech against the noise background.

28 // Details of the model and algorithm can be found in the original paper:	28 // Details of the model and algorithm can be found in the original paper:

29 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788	29 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788

30 class IntelligibilityEnhancer {	30 class IntelligibilityEnhancer : public LappedTransform::Callback {

31 public:	31 public:

32 IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);	32 IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);

33	33

34 // Sets the capture noise magnitude spectrum estimate.	34 // Sets the capture noise magnitude spectrum estimate.

35 void SetCaptureNoiseEstimate(std::vector<float> noise);	35 void SetCaptureNoiseEstimate(std::vector<float> noise);

36	36

37 // Reads chunk of speech in time domain and updates with modified signal.	37 // Reads chunk of speech in time domain and updates with modified signal.

38 void ProcessRenderAudio(float* const* audio,	38 void ProcessRenderAudio(float* const* audio,

39 int sample_rate_hz,	39 int sample_rate_hz,

40 size_t num_channels);	40 size_t num_channels);

41 bool active() const;	41 bool active() const;

42	42

	43 protected:

	44 // All in frequency domain, receives input |in_block|, applies

	45 // intelligibility enhancement, and writes result to |out_block|.

	46 void ProcessAudioBlock(const std::complex<float>* const* in_block,

	47 size_t in_channels,

	48 size_t frames,

	49 size_t out_channels,

	50 std::complex<float>* const* out_block) override;

	51

43 private:	52 private:

44 // Provides access point to the frequency domain.

45 class TransformCallback : public LappedTransform::Callback {

46 public:

47 TransformCallback(IntelligibilityEnhancer* parent);

48

49 // All in frequency domain, receives input |in_block|, applies

50 // intelligibility enhancement, and writes result to |out_block|.

51 void ProcessAudioBlock(const std::complex<float>* const* in_block,

52 size_t in_channels,

53 size_t frames,

54 size_t out_channels,

55 std::complex<float>* const* out_block) override;

56

57 private:

58 IntelligibilityEnhancer* parent_;

59 };

60 friend class TransformCallback;

61 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);	53 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);

62 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);	54 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);

63	55

64 // Updates power computation and analysis with |in_block_|,

65 // and writes modified speech to |out_block|.

66 void ProcessClearBlock(const std::complex<float>* in_block,

67 std::complex<float>* out_block);

68

69 // Bisection search for optimal |lambda|.	56 // Bisection search for optimal |lambda|.

70 void SolveForLambda(float power_target);	57 void SolveForLambda(float power_target);

71	58

72 // Transforms freq gains to ERB gains.	59 // Transforms freq gains to ERB gains.

73 void UpdateErbGains();	60 void UpdateErbGains();

74	61

75 // Returns number of ERB filters.	62 // Returns number of ERB filters.

76 static size_t GetBankSize(int sample_rate, size_t erb_resolution);	63 static size_t GetBankSize(int sample_rate, size_t erb_resolution);

77	64

78 // Initializes ERB filterbank.	65 // Initializes ERB filterbank.

79 std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);	66 std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);

80	67

81 // Analytically solves quadratic for optimal gains given |lambda|.	68 // Analytically solves quadratic for optimal gains given |lambda|.

82 // Negative gains are set to 0. Stores the results in |sols|.	69 // Negative gains are set to 0. Stores the results in |sols|.

83 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);	70 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);

84	71

85 // Returns true if the audio is speech.	72 // Returns true if the audio is speech.

86 bool IsSpeech(const float* audio);	73 bool IsSpeech(const float* audio);

87	74

88 const size_t freqs_; // Num frequencies in frequency domain.	75 const size_t freqs_; // Num frequencies in frequency domain.

89 const size_t chunk_length_; // Chunk size in samples.	76 const size_t chunk_length_; // Chunk size in samples.

90 const size_t bank_size_; // Num ERB filters.	77 const size_t bank_size_; // Num ERB filters.

91 const int sample_rate_hz_;	78 const int sample_rate_hz_;

92 const size_t num_render_channels_;	79 const size_t num_render_channels_;

93	80

94 intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;	81 intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;

95 std::unique_ptr<intelligibility::PowerEstimator<float>>	82 std::unique_ptr<intelligibility::PowerEstimator<float>>

96 noise_power_estimator_;	83 noise_power_estimator_;

97 std::unique_ptr<float[]> filtered_clear_pow_;	84 std::vector<float> filtered_clear_pow_;

98 std::unique_ptr<float[]> filtered_noise_pow_;	85 std::vector<float> filtered_noise_pow_;

99 std::unique_ptr<float[]> center_freqs_;	86 std::vector<float> center_freqs_;

100 std::vector<std::vector<float>> capture_filter_bank_;	87 std::vector<std::vector<float>> capture_filter_bank_;

101 std::vector<std::vector<float>> render_filter_bank_;	88 std::vector<std::vector<float>> render_filter_bank_;

102 size_t start_freq_;	89 size_t start_freq_;

103	90

104 std::unique_ptr<float[]> gains_eq_; // Pre-filter modified gains.	91 std::vector<float> gains_eq_; // Pre-filter modified gains.

105 intelligibility::GainApplier gain_applier_;	92 intelligibility::GainApplier gain_applier_;

106	93

107 // Destination buffers used to reassemble blocked chunks before overwriting

108 // the original input array with modifications.

109 ChannelBuffer<float> temp_render_out_buffer_;

110

111 TransformCallback render_callback_;

112 std::unique_ptr<LappedTransform> render_mangler_;	94 std::unique_ptr<LappedTransform> render_mangler_;

113	95

114 VoiceActivityDetector vad_;	96 VoiceActivityDetector vad_;

115 std::vector<int16_t> audio_s16_;	97 std::vector<int16_t> audio_s16_;

116 size_t chunks_since_voice_;	98 size_t chunks_since_voice_;

117 bool is_speech_;	99 bool is_speech_;

118 };	100 };

119	101

120 } // namespace webrtc	102 } // namespace webrtc

121	103

122 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	104 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

OLD	NEW

« no previous file with comments | « no previous file | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | no next file with comments »