webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 2320833002: Compensate for the IntelligibilityEnhancer processing delay in high bands

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 2320833002: Compensate for the IntelligibilityEnhancer processing delay in high bands (Closed)

Patch Set: Fix glitches Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 50 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
61 RTC_DCHECK_GT(filter_bank[i].size(), 0u);	61 RTC_DCHECK_GT(filter_bank[i].size(), 0u);

62 result[i] = kPowerNormalizationFactor *	62 result[i] = kPowerNormalizationFactor *

63 DotProduct(filter_bank[i].data(), pow, filter_bank[i].size());	63 DotProduct(filter_bank[i].data(), pow, filter_bank[i].size());

64 }	64 }

65 }	65 }

66	66

67 } // namespace	67 } // namespace

68	68

69 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,	69 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,

70 size_t num_render_channels,	70 size_t num_render_channels,

	71 size_t num_bands,

71 size_t num_noise_bins)	72 size_t num_noise_bins)

72 : freqs_(RealFourier::ComplexLength(	73 : freqs_(RealFourier::ComplexLength(

73 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),	74 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),

74 num_noise_bins_(num_noise_bins),	75 num_noise_bins_(num_noise_bins),

75 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),	76 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),

76 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),	77 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),

77 sample_rate_hz_(sample_rate_hz),	78 sample_rate_hz_(sample_rate_hz),

78 num_render_channels_(num_render_channels),	79 num_render_channels_(num_render_channels),

79 clear_power_estimator_(freqs_, kDecayRate),	80 clear_power_estimator_(freqs_, kDecayRate),

80 noise_power_estimator_(num_noise_bins, kDecayRate),	81 noise_power_estimator_(num_noise_bins, kDecayRate),

(...skipping 22 matching lines...) Expand all Loading...
103 43.f));	104 43.f));

104 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);	105 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);

105	106

106 size_t window_size = static_cast<size_t>(1) << RealFourier::FftOrder(freqs_);	107 size_t window_size = static_cast<size_t>(1) << RealFourier::FftOrder(freqs_);

107 std::vector<float> kbd_window(window_size);	108 std::vector<float> kbd_window(window_size);

108 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size,	109 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size,

109 kbd_window.data());	110 kbd_window.data());

110 render_mangler_.reset(new LappedTransform(	111 render_mangler_.reset(new LappedTransform(

111 num_render_channels_, num_render_channels_, chunk_length_,	112 num_render_channels_, num_render_channels_, chunk_length_,

112 kbd_window.data(), window_size, window_size / 2, this));	113 kbd_window.data(), window_size, window_size / 2, this));

	114

	115 const size_t initial_delay = render_mangler_->initial_delay();
	peah-webrtc 2016/09/13 13:29:59: Have you verified that this is indeed the delay introduced by the IE processing?
	aluebs-webrtc 2016/09/14 00:35:54 (in reply): Yes.
	116 for (size_t i = 0u; i < num_bands - 1; ++i) {

	117 high_bands_buffers_.push_back(

	118 std::unique_ptr<AudioRingBuffer>(new AudioRingBuffer(

	119 num_render_channels_, chunk_length_ + initial_delay)));

	120 high_bands_buffers_[i]->MoveReadPositionBackward(initial_delay);

	121 }

113 }	122 }

114	123

115 IntelligibilityEnhancer::~IntelligibilityEnhancer() {	124 IntelligibilityEnhancer::~IntelligibilityEnhancer() {

116 // Don't rely on this log, since the destructor isn't called when the app/tab	125 if (num_chunks_ > 0) {

117 // is killed.	126 // Don't rely on this log, since the destructor isn't called when the

118 LOG(LS_INFO) << "Intelligibility Enhancer was active for "	127 // app/tab is killed.

119 << static_cast<float>(num_active_chunks_) / num_chunks_	128 LOG(LS_INFO) << "Intelligibility Enhancer was active for "
	peah-webrtc 2016/09/13 13:29:59: I think it would be good to have a log line for the case when num_chunks_ == 0 as well. When analyzing logs to find errors, missing log-lines is a source of confusion. WDYT?
	aluebs-webrtc 2016/09/14 00:35:54 (in reply): Done.
120 << "% of the call.";	129 << 100.f * static_cast<float>(num_active_chunks_) / num_chunks_

	130 << "% of the call.";

	131 }

121 }	132 }

122	133

123 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(	134 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(

124 std::vector<float> noise, float gain) {	135 std::vector<float> noise, float gain) {

125 RTC_DCHECK_EQ(noise.size(), num_noise_bins_);	136 RTC_DCHECK_EQ(noise.size(), num_noise_bins_);

126 for (auto& bin : noise) {	137 for (auto& bin : noise) {

127 bin *= gain;	138 bin *= gain;

128 }	139 }

129 // Disregarding return value since buffer overflow is acceptable, because it	140 // Disregarding return value since buffer overflow is acceptable, because it

130 // is not critical to get each noise estimate.	141 // is not critical to get each noise estimate.

131 if (noise_estimation_queue_.Insert(&noise)) {	142 if (noise_estimation_queue_.Insert(&noise)) {

132 };	143 };

133 }	144 }

134	145

135 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,	146 void IntelligibilityEnhancer::ProcessRenderAudio(AudioBuffer* audio) {

136 int sample_rate_hz,	147 RTC_DCHECK_EQ(num_render_channels_, audio->num_channels());

137 size_t num_channels) {

138 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);

139 RTC_CHECK_EQ(num_render_channels_, num_channels);

140 while (noise_estimation_queue_.Remove(&noise_estimation_buffer_)) {	148 while (noise_estimation_queue_.Remove(&noise_estimation_buffer_)) {

141 noise_power_estimator_.Step(noise_estimation_buffer_.data());	149 noise_power_estimator_.Step(noise_estimation_buffer_.data());

142 }	150 }

143 is_speech_ = IsSpeech(audio[0]);	151 float* const* low_band = audio->split_channels_f(kBand0To8kHz);

144 render_mangler_->ProcessChunk(audio, audio);	152 is_speech_ = IsSpeech(low_band[0]);

	153 render_mangler_->ProcessChunk(low_band, low_band);

	154 DelayHighBands(audio);

145 }	155 }

146	156

147 void IntelligibilityEnhancer::ProcessAudioBlock(	157 void IntelligibilityEnhancer::ProcessAudioBlock(

148 const std::complex<float>* const* in_block,	158 const std::complex<float>* const* in_block,

149 size_t in_channels,	159 size_t in_channels,

150 size_t frames,	160 size_t frames,

151 size_t /* out_channels */,	161 size_t /* out_channels */,

152 std::complex<float>* const* out_block) {	162 std::complex<float>* const* out_block) {

153 RTC_DCHECK_EQ(freqs_, frames);	163 RTC_DCHECK_EQ(freqs_, frames);

154 if (is_speech_) {	164 if (is_speech_) {

155 clear_power_estimator_.Step(in_block[0]);	165 clear_power_estimator_.Step(in_block[0]);

156 }	166 }

157 SnrBasedEffectActivation();

158 ++num_chunks_;

159 if (is_active_) {	167 if (is_active_) {

160 ++num_active_chunks_;	168 ++num_active_chunks_;

161 if (num_chunks_ % kGainUpdatePeriod == 0) {	169 if (num_chunks_ % kGainUpdatePeriod == 0) {

162 MapToErbBands(clear_power_estimator_.power().data(), render_filter_bank_,	170 MapToErbBands(clear_power_estimator_.power().data(), render_filter_bank_,

163 filtered_clear_pow_.data());	171 filtered_clear_pow_.data());

164 MapToErbBands(noise_power_estimator_.power().data(), capture_filter_bank_,	172 MapToErbBands(noise_power_estimator_.power().data(), capture_filter_bank_,

165 filtered_noise_pow_.data());	173 filtered_noise_pow_.data());

166 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());	174 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());

167 const float power_target = std::accumulate(	175 const float power_target = std::accumulate(

168 filtered_clear_pow_.data(),	176 filtered_clear_pow_.data(),

169 filtered_clear_pow_.data() + bank_size_,	177 filtered_clear_pow_.data() + bank_size_,

170 0.f);	178 0.f);

171 const float power_top =	179 const float power_top =

172 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);	180 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);

173 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());	181 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());

174 const float power_bot =	182 const float power_bot =

175 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);	183 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);

176 if (power_target >= power_bot && power_target <= power_top) {	184 if (power_target >= power_bot && power_target <= power_top) {

177 SolveForLambda(power_target);	185 SolveForLambda(power_target);

178 UpdateErbGains();	186 UpdateErbGains();

179 } // Else experiencing power underflow, so do nothing.	187 } // Else experiencing power underflow, so do nothing.

180 }	188 }

181 }	189 }

	190 SnrBasedEffectActivation();
	peah-webrtc 2016/09/13 13:29:59: I think it makes sense. But please comment in the CL description on why you moved this, as it impacts the processing and is not at all mentioned in the description of the CL.
	aluebs-webrtc 2016/09/14 00:35:54 (in reply): Moved it back. This was left from when is_active_ was needed in ProcessRenderAudio to decide what pointer to pass in.
	191 ++num_chunks_;

182 for (size_t i = 0; i < in_channels; ++i) {	192 for (size_t i = 0; i < in_channels; ++i) {

183 gain_applier_.Apply(in_block[i], out_block[i]);	193 gain_applier_.Apply(in_block[i], out_block[i]);

184 }	194 }

185 }	195 }

186	196

187 void IntelligibilityEnhancer::SnrBasedEffectActivation() {	197 void IntelligibilityEnhancer::SnrBasedEffectActivation() {

188 const float* clear_psd = clear_power_estimator_.power().data();	198 const float* clear_psd = clear_power_estimator_.power().data();

189 const float* noise_psd = noise_power_estimator_.power().data();	199 const float* noise_psd = noise_power_estimator_.power().data();

190 const float clear_power =	200 const float clear_power =

191 std::accumulate(clear_psd, clear_psd + freqs_, 0.f);	201 std::accumulate(clear_psd, clear_psd + freqs_, 0.f);

(...skipping 170 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
362 FloatToS16(audio, chunk_length_, audio_s16_.data());	372 FloatToS16(audio, chunk_length_, audio_s16_.data());

363 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);	373 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);

364 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {	374 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {

365 chunks_since_voice_ = 0;	375 chunks_since_voice_ = 0;

366 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {	376 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {

367 ++chunks_since_voice_;	377 ++chunks_since_voice_;

368 }	378 }

369 return chunks_since_voice_ < kSpeechOffsetDelay;	379 return chunks_since_voice_ < kSpeechOffsetDelay;

370 }	380 }

371	381

	382 void IntelligibilityEnhancer::DelayHighBands(AudioBuffer* audio) {

	383 RTC_DCHECK_EQ(audio->num_bands(), high_bands_buffers_.size() + 1u);

	384 for (size_t i = 0u; i < high_bands_buffers_.size(); ++i) {

	385 Band band = static_cast<Band>(i + 1);

	386 high_bands_buffers_[i]->Write(audio->split_channels_const_f(band),

	387 num_render_channels_, chunk_length_);

	388 high_bands_buffers_[i]->Read(audio->split_channels_f(band),

	389 num_render_channels_, chunk_length_);

	390 }

	391 }

	392

372 } // namespace webrtc	393 } // namespace webrtc

OLD	NEW