webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - Issue 2320833002: Compensate for the IntelligibilityEnhancer processing delay in high bands

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h

Issue 2320833002: Compensate for the IntelligibilityEnhancer processing delay in high bands (Closed)

Patch Set: Fix glitches Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « webrtc/modules/audio_processing/audio_processing_impl.cc ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

13	13

14 #include <complex>	14 #include <complex>

15 #include <memory>	15 #include <memory>

16 #include <vector>	16 #include <vector>

17	17

18 #include "webrtc/base/swap_queue.h"	18 #include "webrtc/base/swap_queue.h"

	19 #include "webrtc/common_audio/audio_ring_buffer.h"

	20 #include "webrtc/common_audio/channel_buffer.h"

19 #include "webrtc/common_audio/lapped_transform.h"	21 #include "webrtc/common_audio/lapped_transform.h"

20 #include "webrtc/common_audio/channel_buffer.h"	22 #include "webrtc/modules/audio_processing/audio_buffer.h"

21 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	23 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

22 #include "webrtc/modules/audio_processing/render_queue_item_verifier.h"	24 #include "webrtc/modules/audio_processing/render_queue_item_verifier.h"

23 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"	25 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"

24	26

25 namespace webrtc {	27 namespace webrtc {

26	28

27 // Speech intelligibility enhancement module. Reads render and capture	29 // Speech intelligibility enhancement module. Reads render and capture

28 // audio streams and modifies the render stream with a set of gains per	30 // audio streams and modifies the render stream with a set of gains per

29 // frequency bin to enhance speech against the noise background.	31 // frequency bin to enhance speech against the noise background.

30 // Details of the model and algorithm can be found in the original paper:	32 // Details of the model and algorithm can be found in the original paper:

31 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788	33 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788

32 class IntelligibilityEnhancer : public LappedTransform::Callback {	34 class IntelligibilityEnhancer : public LappedTransform::Callback {

33 public:	35 public:

34 IntelligibilityEnhancer(int sample_rate_hz,	36 IntelligibilityEnhancer(int sample_rate_hz,

35 size_t num_render_channels,	37 size_t num_render_channels,

	38 size_t num_bands,

36 size_t num_noise_bins);	39 size_t num_noise_bins);

37	40

38 ~IntelligibilityEnhancer() override;	41 ~IntelligibilityEnhancer() override;

39	42

40 // Sets the capture noise magnitude spectrum estimate.	43 // Sets the capture noise magnitude spectrum estimate.

41 void SetCaptureNoiseEstimate(std::vector<float> noise, float gain);	44 void SetCaptureNoiseEstimate(std::vector<float> noise, float gain);

42	45

43 // Reads chunk of speech in time domain and updates with modified signal.	46 // Reads chunk of speech in time domain and updates with modified signal.

44 void ProcessRenderAudio(float* const* audio,	47 void ProcessRenderAudio(AudioBuffer* audio);

45 int sample_rate_hz,

46 size_t num_channels);

47 bool active() const;	48 bool active() const;

48	49

49 protected:	50 protected:

50 // All in frequency domain, receives input |in_block|, applies	51 // All in frequency domain, receives input |in_block|, applies

51 // intelligibility enhancement, and writes result to |out_block|.	52 // intelligibility enhancement, and writes result to |out_block|.

52 void ProcessAudioBlock(const std::complex<float>* const* in_block,	53 void ProcessAudioBlock(const std::complex<float>* const* in_block,

53 size_t in_channels,	54 size_t in_channels,

54 size_t frames,	55 size_t frames,

55 size_t out_channels,	56 size_t out_channels,

56 std::complex<float>* const* out_block) override;	57 std::complex<float>* const* out_block) override;

57	58

58 private:	59 private:

	60 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestRenderUpdate);

59 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);	61 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);

60 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);	62 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);

61 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest,	63 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest,

62 TestNoiseGainHasExpectedResult);	64 TestNoiseGainHasExpectedResult);

63	65

64 // Updates the SNR estimation and enables or disables this component using a	66 // Updates the SNR estimation and enables or disables this component using a

65 // hysteresis.	67 // hysteresis.

66 void SnrBasedEffectActivation();	68 void SnrBasedEffectActivation();

67	69

68 // Bisection search for optimal |lambda|.	70 // Bisection search for optimal |lambda|.

69 void SolveForLambda(float power_target);	71 void SolveForLambda(float power_target);

70	72

71 // Transforms freq gains to ERB gains.	73 // Transforms freq gains to ERB gains.

72 void UpdateErbGains();	74 void UpdateErbGains();

73	75

74 // Returns number of ERB filters.	76 // Returns number of ERB filters.

75 static size_t GetBankSize(int sample_rate, size_t erb_resolution);	77 static size_t GetBankSize(int sample_rate, size_t erb_resolution);

76	78

77 // Initializes ERB filterbank.	79 // Initializes ERB filterbank.

78 std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);	80 std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);

79	81

80 // Analytically solves quadratic for optimal gains given |lambda|.	82 // Analytically solves quadratic for optimal gains given |lambda|.

81 // Negative gains are set to 0. Stores the results in |sols|.	83 // Negative gains are set to 0. Stores the results in |sols|.

82 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);	84 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);

83	85

84 // Returns true if the audio is speech.	86 // Returns true if the audio is speech.

85 bool IsSpeech(const float* audio);	87 bool IsSpeech(const float* audio);

86	88

	89 // Delays the high bands to compensate for the processing delay in the low

	90 // band.

	91 void DelayHighBands(AudioBuffer* audio);

	92

87 static const size_t kMaxNumNoiseEstimatesToBuffer = 5;	93 static const size_t kMaxNumNoiseEstimatesToBuffer = 5;

88	94

89 const size_t freqs_; // Num frequencies in frequency domain.	95 const size_t freqs_; // Num frequencies in frequency domain.

90 const size_t num_noise_bins_;	96 const size_t num_noise_bins_;

91 const size_t chunk_length_; // Chunk size in samples.	97 const size_t chunk_length_; // Chunk size in samples.

92 const size_t bank_size_; // Num ERB filters.	98 const size_t bank_size_; // Num ERB filters.

93 const int sample_rate_hz_;	99 const int sample_rate_hz_;

94 const size_t num_render_channels_;	100 const size_t num_render_channels_;

95	101

96 intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;	102 intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;

(...skipping 16 matching lines...) Expand all Loading...
113 bool is_speech_;	119 bool is_speech_;

114 float snr_;	120 float snr_;

115 bool is_active_;	121 bool is_active_;

116	122

117 unsigned long int num_chunks_;	123 unsigned long int num_chunks_;

118 unsigned long int num_active_chunks_;	124 unsigned long int num_active_chunks_;

119	125

120 std::vector<float> noise_estimation_buffer_;	126 std::vector<float> noise_estimation_buffer_;

121 SwapQueue<std::vector<float>, RenderQueueItemVerifier<float>>	127 SwapQueue<std::vector<float>, RenderQueueItemVerifier<float>>

122 noise_estimation_queue_;	128 noise_estimation_queue_;

	129

	130 std::vector<std::unique_ptr<AudioRingBuffer>> high_bands_buffers_;

123 };	131 };

124	132

125 } // namespace webrtc	133 } // namespace webrtc

126	134

127 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_	135 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_

OLD	NEW