webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 2320833002: Compensate for the IntelligibilityEnhancer processing delay in high bands

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 2320833002: Compensate for the IntelligibilityEnhancer processing delay in high bands (Closed)

Patch Set: Rebasing Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« webrtc/common_audio/lapped_transform.h ('K') | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
61 RTC_DCHECK_GT(filter_bank[i].size(), 0u);	61 RTC_DCHECK_GT(filter_bank[i].size(), 0u);

62 result[i] = kPowerNormalizationFactor *	62 result[i] = kPowerNormalizationFactor *

63 DotProduct(filter_bank[i].data(), pow, filter_bank[i].size());	63 DotProduct(filter_bank[i].data(), pow, filter_bank[i].size());

64 }	64 }

65 }	65 }

66	66

67 } // namespace	67 } // namespace

68	68

69 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,	69 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,

70 size_t num_render_channels,	70 size_t num_render_channels,

	71 size_t num_bands,

71 size_t num_noise_bins)	72 size_t num_noise_bins)

72 : freqs_(RealFourier::ComplexLength(	73 : freqs_(RealFourier::ComplexLength(

73 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),	74 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),

74 num_noise_bins_(num_noise_bins),	75 num_noise_bins_(num_noise_bins),

75 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),	76 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),

76 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),	77 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),

77 sample_rate_hz_(sample_rate_hz),	78 sample_rate_hz_(sample_rate_hz),

78 num_render_channels_(num_render_channels),	79 num_render_channels_(num_render_channels),

79 clear_power_estimator_(freqs_, kDecayRate),	80 clear_power_estimator_(freqs_, kDecayRate),

80 noise_power_estimator_(num_noise_bins, kDecayRate),	81 noise_power_estimator_(num_noise_bins, kDecayRate),

(...skipping 22 matching lines...) Expand all Loading...
103 43.f));	104 43.f));

104 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);	105 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);

105	106

106 size_t window_size = static_cast<size_t>(1) << RealFourier::FftOrder(freqs_);	107 size_t window_size = static_cast<size_t>(1) << RealFourier::FftOrder(freqs_);

107 std::vector<float> kbd_window(window_size);	108 std::vector<float> kbd_window(window_size);

108 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size,	109 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size,

109 kbd_window.data());	110 kbd_window.data());

110 render_mangler_.reset(new LappedTransform(	111 render_mangler_.reset(new LappedTransform(

111 num_render_channels_, num_render_channels_, chunk_length_,	112 num_render_channels_, num_render_channels_, chunk_length_,

112 kbd_window.data(), window_size, window_size / 2, this));	113 kbd_window.data(), window_size, window_size / 2, this));

	114

	115 const size_t initial_delay = render_mangler_->initial_delay();

	116 for (size_t i = 0u; i < num_bands - 1; ++i) {

	117 high_bands_buffers_.push_back(std::unique_ptr<intelligibility::DelayBuffer>(

	118 new intelligibility::DelayBuffer(initial_delay, num_render_channels_)));

	119 }

113 }	120 }

114	121

115 IntelligibilityEnhancer::~IntelligibilityEnhancer() {	122 IntelligibilityEnhancer::~IntelligibilityEnhancer() {

116 // Don't rely on this log, since the destructor isn't called when the app/tab	123 // Don't rely on this log, since the destructor isn't called when the

117 // is killed.	124 // app/tab is killed.

118 LOG(LS_INFO) << "Intelligibility Enhancer was active for "	125 if (num_chunks_ > 0) {

119 << static_cast<float>(num_active_chunks_) / num_chunks_	126 LOG(LS_INFO) << "Intelligibility Enhancer was active for "

120 << "% of the call.";	127 << 100.f * static_cast<float>(num_active_chunks_) / num_chunks_

	128 << "% of the call.";

	129 } else {

	130 LOG(LS_INFO) << "Intelligibility Enhancer processed no chunk.";

	131 }

121 }	132 }

122	133

123 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(	134 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(

124 std::vector<float> noise, float gain) {	135 std::vector<float> noise, float gain) {

125 RTC_DCHECK_EQ(noise.size(), num_noise_bins_);	136 RTC_DCHECK_EQ(noise.size(), num_noise_bins_);

126 for (auto& bin : noise) {	137 for (auto& bin : noise) {

127 bin *= gain;	138 bin *= gain;

128 }	139 }

129 // Disregarding return value since buffer overflow is acceptable, because it	140 // Disregarding return value since buffer overflow is acceptable, because it

130 // is not critical to get each noise estimate.	141 // is not critical to get each noise estimate.

131 if (noise_estimation_queue_.Insert(&noise)) {	142 if (noise_estimation_queue_.Insert(&noise)) {

132 };	143 };

133 }	144 }

134	145

135 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,	146 void IntelligibilityEnhancer::ProcessRenderAudio(AudioBuffer* audio) {

136 int sample_rate_hz,	147 RTC_DCHECK_EQ(num_render_channels_, audio->num_channels());

137 size_t num_channels) {

138 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);

139 RTC_CHECK_EQ(num_render_channels_, num_channels);

140 while (noise_estimation_queue_.Remove(&noise_estimation_buffer_)) {	148 while (noise_estimation_queue_.Remove(&noise_estimation_buffer_)) {

141 noise_power_estimator_.Step(noise_estimation_buffer_.data());	149 noise_power_estimator_.Step(noise_estimation_buffer_.data());

142 }	150 }

143 is_speech_ = IsSpeech(audio[0]);	151 float* const* low_band = audio->split_channels_f(kBand0To8kHz);

144 render_mangler_->ProcessChunk(audio, audio);	152 is_speech_ = IsSpeech(low_band[0]);

	153 render_mangler_->ProcessChunk(low_band, low_band);

	154 DelayHighBands(audio);

145 }	155 }

146	156

147 void IntelligibilityEnhancer::ProcessAudioBlock(	157 void IntelligibilityEnhancer::ProcessAudioBlock(

148 const std::complex<float>* const* in_block,	158 const std::complex<float>* const* in_block,

149 size_t in_channels,	159 size_t in_channels,

150 size_t frames,	160 size_t frames,

151 size_t /* out_channels */,	161 size_t /* out_channels */,

152 std::complex<float>* const* out_block) {	162 std::complex<float>* const* out_block) {

153 RTC_DCHECK_EQ(freqs_, frames);	163 RTC_DCHECK_EQ(freqs_, frames);

154 if (is_speech_) {	164 if (is_speech_) {

(...skipping 207 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
362 FloatToS16(audio, chunk_length_, audio_s16_.data());	372 FloatToS16(audio, chunk_length_, audio_s16_.data());

363 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);	373 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);

364 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {	374 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {

365 chunks_since_voice_ = 0;	375 chunks_since_voice_ = 0;

366 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {	376 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {

367 ++chunks_since_voice_;	377 ++chunks_since_voice_;

368 }	378 }

369 return chunks_since_voice_ < kSpeechOffsetDelay;	379 return chunks_since_voice_ < kSpeechOffsetDelay;

370 }	380 }

371	381

	382 void IntelligibilityEnhancer::DelayHighBands(AudioBuffer* audio) {

	383 RTC_DCHECK_EQ(audio->num_bands(), high_bands_buffers_.size() + 1u);

	384 for (size_t i = 0u; i < high_bands_buffers_.size(); ++i) {

	385 Band band = static_cast<Band>(i + 1);

	386 high_bands_buffers_[i]->Delay(audio->split_channels_f(band), chunk_length_);

	387 }

	388 }

	389

372 } // namespace webrtc	390 } // namespace webrtc

OLD	NEW