webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1729753003: Fix the stereo support in IntelligibilityEnhancer

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1729753003: Fix the stereo support in IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@gains2

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
53 const std::vector<std::vector<float>>& filter_bank,	53 const std::vector<std::vector<float>>& filter_bank,

54 float* result) {	54 float* result) {

55 for (size_t i = 0; i < filter_bank.size(); ++i) {	55 for (size_t i = 0; i < filter_bank.size(); ++i) {

56 RTC_DCHECK_GT(filter_bank[i].size(), 0u);	56 RTC_DCHECK_GT(filter_bank[i].size(), 0u);

57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size());	57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size());

58 }	58 }

59 }	59 }

60	60

61 } // namespace	61 } // namespace

62	62

63 IntelligibilityEnhancer::TransformCallback::TransformCallback(

64 IntelligibilityEnhancer* parent)

65 : parent_(parent) {

66 }

67

68 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(

69 const std::complex<float>* const* in_block,

70 size_t in_channels,

71 size_t frames,

72 size_t /* out_channels */,

73 std::complex<float>* const* out_block) {

74 RTC_DCHECK_EQ(parent_->freqs_, frames);

75 for (size_t i = 0; i < in_channels; ++i) {

76 parent_->ProcessClearBlock(in_block[i], out_block[i]);

77 }

78 }

79

80 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,	63 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,

81 size_t num_render_channels)	64 size_t num_render_channels)

82 : freqs_(RealFourier::ComplexLength(	65 : freqs_(RealFourier::ComplexLength(

83 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),	66 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),

84 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),	67 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),

85 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),	68 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),

86 sample_rate_hz_(sample_rate_hz),	69 sample_rate_hz_(sample_rate_hz),

87 num_render_channels_(num_render_channels),	70 num_render_channels_(num_render_channels),

88 clear_power_estimator_(freqs_, kDecayRate),	71 clear_power_estimator_(freqs_, kDecayRate),

89 noise_power_estimator_(	72 noise_power_estimator_(

90 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),	73 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),

91 filtered_clear_pow_(new float[bank_size_]),	74 filtered_clear_pow_(bank_size_, 0.f),

92 filtered_noise_pow_(new float[bank_size_]),	75 filtered_noise_pow_(bank_size_, 0.f),

93 center_freqs_(new float[bank_size_]),	76 center_freqs_(bank_size_),

94 render_filter_bank_(CreateErbBank(freqs_)),	77 render_filter_bank_(CreateErbBank(freqs_)),

95 gains_eq_(new float[bank_size_]),	78 gains_eq_(bank_size_),

96 gain_applier_(freqs_, kMaxRelativeGainChange),	79 gain_applier_(freqs_, kMaxRelativeGainChange),

97 temp_render_out_buffer_(chunk_length_, num_render_channels_),

98 render_callback_(this),

99 audio_s16_(chunk_length_),	80 audio_s16_(chunk_length_),

100 chunks_since_voice_(kSpeechOffsetDelay),	81 chunks_since_voice_(kSpeechOffsetDelay),

101 is_speech_(false) {	82 is_speech_(false) {

102 RTC_DCHECK_LE(kRho, 1.f);	83 RTC_DCHECK_LE(kRho, 1.f);

103	84

104 memset(filtered_clear_pow_.get(), 0,

105 bank_size_ * sizeof(filtered_clear_pow_[0]));

106 memset(filtered_noise_pow_.get(), 0,

107 bank_size_ * sizeof(filtered_noise_pow_[0]));

108

109 const size_t erb_index = static_cast<size_t>(	85 const size_t erb_index = static_cast<size_t>(

110 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +	86 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +

111 43.f));	87 43.f));

112 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);	88 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);

113	89

114 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));	90 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));

115 std::vector<float> kbd_window(window_size);	91 std::vector<float> kbd_window(window_size);

116 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);	92 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);

117 render_mangler_.reset(new LappedTransform(	93 render_mangler_.reset(new LappedTransform(

118 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],	94 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],

119 window_size, window_size / 2, &render_callback_));	95 window_size, window_size / 2, this));

120 }	96 }

121	97

122 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(	98 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(

123 std::vector<float> noise) {	99 std::vector<float> noise) {

124 if (capture_filter_bank_.size() != bank_size_ \|\|	100 if (capture_filter_bank_.size() != bank_size_ \|\|

125 capture_filter_bank_[0].size() != noise.size()) {	101 capture_filter_bank_[0].size() != noise.size()) {

126 capture_filter_bank_ = CreateErbBank(noise.size());	102 capture_filter_bank_ = CreateErbBank(noise.size());

127 noise_power_estimator_.reset(	103 noise_power_estimator_.reset(

128 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));	104 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));

129 }	105 }

130 noise_power_estimator_->Step(&noise[0]);	106 noise_power_estimator_->Step(&noise[0]);

131 }	107 }

132	108

133 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,	109 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,

134 int sample_rate_hz,	110 int sample_rate_hz,

135 size_t num_channels) {	111 size_t num_channels) {

136 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);	112 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);

137 RTC_CHECK_EQ(num_render_channels_, num_channels);	113 RTC_CHECK_EQ(num_render_channels_, num_channels);

138 is_speech_ = IsSpeech(audio[0]);	114 is_speech_ = IsSpeech(audio[0]);

139 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());	115 render_mangler_->ProcessChunk(audio, audio);

140 for (size_t i = 0; i < num_render_channels_; ++i) {

141 memcpy(audio[i], temp_render_out_buffer_.channels()[i],

142 chunk_length_ * sizeof(**audio));

143 }

144 }	116 }

145	117

146 void IntelligibilityEnhancer::ProcessClearBlock(	118 void IntelligibilityEnhancer::ProcessAudioBlock(

147 const std::complex<float>* in_block,	119 const std::complex<float>* const* in_block,

148 std::complex<float>* out_block) {	120 size_t in_channels,

	121 size_t frames,

	122 size_t /* out_channels */,

	123 std::complex<float>* const* out_block) {

	124 RTC_DCHECK_EQ(freqs_, frames);

149 if (is_speech_) {	125 if (is_speech_) {

150 clear_power_estimator_.Step(in_block);	126 clear_power_estimator_.Step(in_block[0]);
	turaj 2016/02/24 16:00:17: I suppose this change and the changes in lines 146-148 are the highlights of this CL, and the rest is refactoring, right?
	aluebs-webrtc 2016/02/25 00:18:37 (in reply to turaj, 2016/02/24 16:00:17): Yes, as you point out, the main change is removing the callback class and making the IE directly be the callback, only estimating the power once per chunk and applying the same gain to all channels. I probably should have done 2 separate CLs now that I look at it. Sorry about that.
151 }	127 }

152 const std::vector<float>& clear_power = clear_power_estimator_.power();	128 const std::vector<float>& clear_power = clear_power_estimator_.power();

153 const std::vector<float>& noise_power = noise_power_estimator_->power();	129 const std::vector<float>& noise_power = noise_power_estimator_->power();

154 MapToErbBands(&clear_power[0], render_filter_bank_,	130 MapToErbBands(&clear_power[0], render_filter_bank_,

155 filtered_clear_pow_.get());	131 &filtered_clear_pow_[0]);
	hlundin-webrtc 2016/02/24 10:17:03: I suggest you use .data() instead of &...[0]. See http://en.cppreference.com/w/cpp/container/vector/data. This is the preferred way in C++11. Several places below, too.
	aluebs-webrtc 2016/02/25 00:18:37 (in reply to hlundin-webrtc, 2016/02/24 10:17:03): I am aware of that, but I thought it was still not allowed in Chromium/WebRTC. Now I checked and it is: https://chromium-cpp.appspot.com/. Changed to use that method in all places.
156 MapToErbBands(&noise_power[0], capture_filter_bank_,	132 MapToErbBands(&noise_power[0], capture_filter_bank_,

157 filtered_noise_pow_.get());	133 &filtered_noise_pow_[0]);

158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());	134 SolveForGainsGivenLambda(kLambdaTop, start_freq_, &gains_eq_[0]);

159 const float power_target =	135 const float power_target =

160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);	136 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);

161 const float power_top =	137 const float power_top =

162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	138 DotProduct(&gains_eq_[0], &filtered_clear_pow_[0], bank_size_);

163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());	139 SolveForGainsGivenLambda(kLambdaBot, start_freq_, &gains_eq_[0]);

164 const float power_bot =	140 const float power_bot =

165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	141 DotProduct(&gains_eq_[0], &filtered_clear_pow_[0], bank_size_);

166 if (power_target >= power_bot && power_target <= power_top) {	142 if (power_target >= power_bot && power_target <= power_top) {

167 SolveForLambda(power_target);	143 SolveForLambda(power_target);

168 UpdateErbGains();	144 UpdateErbGains();

169 } // Else experiencing power underflow, so do nothing.	145 } // Else experiencing power underflow, so do nothing.

170 gain_applier_.Apply(in_block, out_block);	146 for (size_t i = 0; i < in_channels; ++i) {

	147 gain_applier_.Apply(in_block[i], out_block[i]);

	148 }

171 }	149 }

172	150

173 void IntelligibilityEnhancer::SolveForLambda(float power_target) {	151 void IntelligibilityEnhancer::SolveForLambda(float power_target) {

174 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values	152 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values

175 const int kMaxIters = 100; // for these, based on experiments.	153 const int kMaxIters = 100; // for these, based on experiments.

176	154

177 const float reciprocal_power_target =	155 const float reciprocal_power_target =

178 1.f / (power_target + std::numeric_limits<float>::epsilon());	156 1.f / (power_target + std::numeric_limits<float>::epsilon());

179 float lambda_bot = kLambdaBot;	157 float lambda_bot = kLambdaBot;

180 float lambda_top = kLambdaTop;	158 float lambda_top = kLambdaTop;

181 float power_ratio = 2.f; // Ratio of achieved power to target power.	159 float power_ratio = 2.f; // Ratio of achieved power to target power.

182 int iters = 0;	160 int iters = 0;

183 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {	161 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {

184 const float lambda = (lambda_bot + lambda_top) / 2.f;	162 const float lambda = (lambda_bot + lambda_top) / 2.f;

185 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());	163 SolveForGainsGivenLambda(lambda, start_freq_, &gains_eq_[0]);

186 const float power =	164 const float power =

187 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	165 DotProduct(&gains_eq_[0], &filtered_clear_pow_[0], bank_size_);

188 if (power < power_target) {	166 if (power < power_target) {

189 lambda_bot = lambda;	167 lambda_bot = lambda;

190 } else {	168 } else {

191 lambda_top = lambda;	169 lambda_top = lambda;

192 }	170 }

193 power_ratio = std::fabs(power * reciprocal_power_target);	171 power_ratio = std::fabs(power * reciprocal_power_target);

194 ++iters;	172 ++iters;

195 }	173 }

196 }	174 }

197	175

(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
279 }	257 }

280 }	258 }

281 return filter_bank;	259 return filter_bank;

282 }	260 }

283	261

284 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,	262 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,

285 size_t start_freq,	263 size_t start_freq,

286 float* sols) {	264 float* sols) {

287 const float kMinPower = 1e-5;	265 const float kMinPower = 1e-5;

288	266

289 const float* pow_x0 = filtered_clear_pow_.get();	267 const float* pow_x0 = &filtered_clear_pow_[0];

290 const float* pow_n0 = filtered_noise_pow_.get();	268 const float* pow_n0 = &filtered_noise_pow_[0];

291	269

292 for (size_t n = 0; n < start_freq; ++n) {	270 for (size_t n = 0; n < start_freq; ++n) {

293 sols[n] = 1.f;	271 sols[n] = 1.f;

294 }	272 }

295	273

296 // Analytic solution for optimal gains. See paper for derivation.	274 // Analytic solution for optimal gains. See paper for derivation.

297 for (size_t n = start_freq; n < bank_size_; ++n) {	275 for (size_t n = start_freq; n < bank_size_; ++n) {

298 if (pow_x0[n] < kMinPower \|\| pow_n0[n] < kMinPower) {	276 if (pow_x0[n] < kMinPower \|\| pow_n0[n] < kMinPower) {

299 sols[n] = 1.f;	277 sols[n] = 1.f;

300 } else {	278 } else {

(...skipping 19 matching lines...) Expand all Loading...
320 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);	298 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);

321 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {	299 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {

322 chunks_since_voice_ = 0;	300 chunks_since_voice_ = 0;

323 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {	301 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {

324 ++chunks_since_voice_;	302 ++chunks_since_voice_;

325 }	303 }

326 return chunks_since_voice_ < kSpeechOffsetDelay;	304 return chunks_since_voice_ < kSpeechOffsetDelay;

327 }	305 }

328	306

329 } // namespace webrtc	307 } // namespace webrtc

OLD	NEW