webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 2320833002: Compensate for the IntelligibilityEnhancer processing delay in high bands

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 2320833002: Compensate for the IntelligibilityEnhancer processing delay in high bands (Closed)

Patch Set: Rebasing Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« webrtc/common_audio/lapped_transform.h ('K') | « webrtc/modules/audio_processing/audio_processing_impl.cc ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

13	13

14 #include <complex>	14 #include <complex>

15 #include <memory>	15 #include <memory>

16 #include <vector>	16 #include <vector>

17	17

18 #include "webrtc/base/swap_queue.h"	18 #include "webrtc/base/swap_queue.h"

	19 #include "webrtc/common_audio/channel_buffer.h"

19 #include "webrtc/common_audio/lapped_transform.h"	20 #include "webrtc/common_audio/lapped_transform.h"

20 #include "webrtc/common_audio/channel_buffer.h"	21 #include "webrtc/modules/audio_processing/audio_buffer.h"

21 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	22 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

22 #include "webrtc/modules/audio_processing/render_queue_item_verifier.h"	23 #include "webrtc/modules/audio_processing/render_queue_item_verifier.h"

23 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"	24 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"

24	25

25 namespace webrtc {	26 namespace webrtc {

26	27

27 // Speech intelligibility enhancement module. Reads render and capture	28 // Speech intelligibility enhancement module. Reads render and capture

28 // audio streams and modifies the render stream with a set of gains per	29 // audio streams and modifies the render stream with a set of gains per

29 // frequency bin to enhance speech against the noise background.	30 // frequency bin to enhance speech against the noise background.

30 // Details of the model and algorithm can be found in the original paper:	31 // Details of the model and algorithm can be found in the original paper:

31 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788	32 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788

32 class IntelligibilityEnhancer : public LappedTransform::Callback {	33 class IntelligibilityEnhancer : public LappedTransform::Callback {

33 public:	34 public:

34 IntelligibilityEnhancer(int sample_rate_hz,	35 IntelligibilityEnhancer(int sample_rate_hz,

35 size_t num_render_channels,	36 size_t num_render_channels,

	37 size_t num_bands,

36 size_t num_noise_bins);	38 size_t num_noise_bins);

37	39

38 ~IntelligibilityEnhancer() override;	40 ~IntelligibilityEnhancer() override;

39	41

40 // Sets the capture noise magnitude spectrum estimate.	42 // Sets the capture noise magnitude spectrum estimate.

41 void SetCaptureNoiseEstimate(std::vector<float> noise, float gain);	43 void SetCaptureNoiseEstimate(std::vector<float> noise, float gain);

42	44

43 // Reads chunk of speech in time domain and updates with modified signal.	45 // Reads chunk of speech in time domain and updates with modified signal.

44 void ProcessRenderAudio(float* const* audio,	46 void ProcessRenderAudio(AudioBuffer* audio);

45 int sample_rate_hz,

46 size_t num_channels);

47 bool active() const;	47 bool active() const;

48	48

49 protected:	49 protected:

50 // All in frequency domain, receives input \|in_block\|, applies	50 // All in frequency domain, receives input \|in_block\|, applies

51 // intelligibility enhancement, and writes result to \|out_block\|.	51 // intelligibility enhancement, and writes result to \|out_block\|.

52 void ProcessAudioBlock(const std::complex<float>* const* in_block,	52 void ProcessAudioBlock(const std::complex<float>* const* in_block,

53 size_t in_channels,	53 size_t in_channels,

54 size_t frames,	54 size_t frames,

55 size_t out_channels,	55 size_t out_channels,

56 std::complex<float>* const* out_block) override;	56 std::complex<float>* const* out_block) override;

57	57

58 private:	58 private:

	59 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestRenderUpdate);

59 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);	60 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);

60 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);	61 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);

61 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest,	62 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest,

62 TestNoiseGainHasExpectedResult);	63 TestNoiseGainHasExpectedResult);

	64 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest,

	65 TestAllBandsHaveSameDelay);

63	66

64 // Updates the SNR estimation and enables or disables this component using a	67 // Updates the SNR estimation and enables or disables this component using a

65 // hysteresis.	68 // hysteresis.

66 void SnrBasedEffectActivation();	69 void SnrBasedEffectActivation();

67	70

68 // Bisection search for optimal \|lambda\|.	71 // Bisection search for optimal \|lambda\|.

69 void SolveForLambda(float power_target);	72 void SolveForLambda(float power_target);

70	73

71 // Transforms freq gains to ERB gains.	74 // Transforms freq gains to ERB gains.

72 void UpdateErbGains();	75 void UpdateErbGains();

73	76

74 // Returns number of ERB filters.	77 // Returns number of ERB filters.

75 static size_t GetBankSize(int sample_rate, size_t erb_resolution);	78 static size_t GetBankSize(int sample_rate, size_t erb_resolution);

76	79

77 // Initializes ERB filterbank.	80 // Initializes ERB filterbank.

78 std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);	81 std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);

79	82

80 // Analytically solves quadratic for optimal gains given \|lambda\|.	83 // Analytically solves quadratic for optimal gains given \|lambda\|.

81 // Negative gains are set to 0. Stores the results in \|sols\|.	84 // Negative gains are set to 0. Stores the results in \|sols\|.

82 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);	85 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);

83	86

84 // Returns true if the audio is speech.	87 // Returns true if the audio is speech.

85 bool IsSpeech(const float* audio);	88 bool IsSpeech(const float* audio);

86	89

	90 // Delays the high bands to compensate for the processing delay in the low

	91 // band.

	92 void DelayHighBands(AudioBuffer* audio);

	93

87 static const size_t kMaxNumNoiseEstimatesToBuffer = 5;	94 static const size_t kMaxNumNoiseEstimatesToBuffer = 5;

88	95

89 const size_t freqs_; // Num frequencies in frequency domain.	96 const size_t freqs_; // Num frequencies in frequency domain.

90 const size_t num_noise_bins_;	97 const size_t num_noise_bins_;

91 const size_t chunk_length_; // Chunk size in samples.	98 const size_t chunk_length_; // Chunk size in samples.

92 const size_t bank_size_; // Num ERB filters.	99 const size_t bank_size_; // Num ERB filters.

93 const int sample_rate_hz_;	100 const int sample_rate_hz_;

94 const size_t num_render_channels_;	101 const size_t num_render_channels_;

95	102

96 intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;	103 intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;

(...skipping 16 matching lines...) Expand all Loading...
113 bool is_speech_;	120 bool is_speech_;

114 float snr_;	121 float snr_;

115 bool is_active_;	122 bool is_active_;

116	123

117 unsigned long int num_chunks_;	124 unsigned long int num_chunks_;

118 unsigned long int num_active_chunks_;	125 unsigned long int num_active_chunks_;

119	126

120 std::vector<float> noise_estimation_buffer_;	127 std::vector<float> noise_estimation_buffer_;

121 SwapQueue<std::vector<float>, RenderQueueItemVerifier<float>>	128 SwapQueue<std::vector<float>, RenderQueueItemVerifier<float>>

122 noise_estimation_queue_;	129 noise_estimation_queue_;

	130

	131 std::vector<std::unique_ptr<intelligibility::DelayBuffer>>

	132 high_bands_buffers_;

123 };	133 };

124	134

125 } // namespace webrtc	135 } // namespace webrtc

126	136

127 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	137 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

OLD	NEW