webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1729753003: Fix the stereo support in IntelligibilityEnhancer

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1729753003: Fix the stereo support in IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@gains2

Patch Set: Rebasing Created 4 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
47 return ret;	47 return ret;

48 }	48 }

49	49

50 // Computes the power across ERB bands from the power spectral density |pow|.	50 // Computes the power across ERB bands from the power spectral density |pow|.

51 // Stores it in |result|.	51 // Stores it in |result|.

52 void MapToErbBands(const float* pow,	52 void MapToErbBands(const float* pow,

53 const std::vector<std::vector<float>>& filter_bank,	53 const std::vector<std::vector<float>>& filter_bank,

54 float* result) {	54 float* result) {

55 for (size_t i = 0; i < filter_bank.size(); ++i) {	55 for (size_t i = 0; i < filter_bank.size(); ++i) {

56 RTC_DCHECK_GT(filter_bank[i].size(), 0u);	56 RTC_DCHECK_GT(filter_bank[i].size(), 0u);

57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size());	57 result[i] = DotProduct(filter_bank[i].data(), pow, filter_bank[i].size());

58 }	58 }

59 }	59 }

60	60

61 } // namespace	61 } // namespace

62	62

63 IntelligibilityEnhancer::TransformCallback::TransformCallback(

64 IntelligibilityEnhancer* parent)

65 : parent_(parent) {

66 }

67

68 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(

69 const std::complex<float>* const* in_block,

70 size_t in_channels,

71 size_t frames,

72 size_t /* out_channels */,

73 std::complex<float>* const* out_block) {

74 RTC_DCHECK_EQ(parent_->freqs_, frames);

75 for (size_t i = 0; i < in_channels; ++i) {

76 parent_->ProcessClearBlock(in_block[i], out_block[i]);

77 }

78 }

79

80 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,	63 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,

81 size_t num_render_channels)	64 size_t num_render_channels)

82 : freqs_(RealFourier::ComplexLength(	65 : freqs_(RealFourier::ComplexLength(

83 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),	66 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),

84 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),	67 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),

85 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),	68 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),

86 sample_rate_hz_(sample_rate_hz),	69 sample_rate_hz_(sample_rate_hz),

87 num_render_channels_(num_render_channels),	70 num_render_channels_(num_render_channels),

88 clear_power_estimator_(freqs_, kDecayRate),	71 clear_power_estimator_(freqs_, kDecayRate),

89 noise_power_estimator_(	72 noise_power_estimator_(

90 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),	73 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),

91 filtered_clear_pow_(new float[bank_size_]),	74 filtered_clear_pow_(bank_size_, 0.f),

92 filtered_noise_pow_(new float[bank_size_]),	75 filtered_noise_pow_(bank_size_, 0.f),

93 center_freqs_(new float[bank_size_]),	76 center_freqs_(bank_size_),

94 render_filter_bank_(CreateErbBank(freqs_)),	77 render_filter_bank_(CreateErbBank(freqs_)),

95 gains_eq_(new float[bank_size_]),	78 gains_eq_(bank_size_),

96 gain_applier_(freqs_, kMaxRelativeGainChange),	79 gain_applier_(freqs_, kMaxRelativeGainChange),

97 temp_render_out_buffer_(chunk_length_, num_render_channels_),

98 render_callback_(this),

99 audio_s16_(chunk_length_),	80 audio_s16_(chunk_length_),

100 chunks_since_voice_(kSpeechOffsetDelay),	81 chunks_since_voice_(kSpeechOffsetDelay),

101 is_speech_(false) {	82 is_speech_(false) {

102 RTC_DCHECK_LE(kRho, 1.f);	83 RTC_DCHECK_LE(kRho, 1.f);

103	84

104 memset(filtered_clear_pow_.get(), 0,

105 bank_size_ * sizeof(filtered_clear_pow_[0]));

106 memset(filtered_noise_pow_.get(), 0,

107 bank_size_ * sizeof(filtered_noise_pow_[0]));

108

109 const size_t erb_index = static_cast<size_t>(	85 const size_t erb_index = static_cast<size_t>(

110 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +	86 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +

111 43.f));	87 43.f));

112 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);	88 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);

113	89

114 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));	90 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));

115 std::vector<float> kbd_window(window_size);	91 std::vector<float> kbd_window(window_size);

116 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);	92 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size,

	93 kbd_window.data());

117 render_mangler_.reset(new LappedTransform(	94 render_mangler_.reset(new LappedTransform(

118 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],	95 num_render_channels_, num_render_channels_, chunk_length_,

119 window_size, window_size / 2, &render_callback_));	96 kbd_window.data(), window_size, window_size / 2, this));

120 }	97 }

121	98

122 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(	99 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(

123 std::vector<float> noise) {	100 std::vector<float> noise) {

124 if (capture_filter_bank_.size() != bank_size_ ||	101 if (capture_filter_bank_.size() != bank_size_ ||

125 capture_filter_bank_[0].size() != noise.size()) {	102 capture_filter_bank_[0].size() != noise.size()) {

126 capture_filter_bank_ = CreateErbBank(noise.size());	103 capture_filter_bank_ = CreateErbBank(noise.size());

127 noise_power_estimator_.reset(	104 noise_power_estimator_.reset(

128 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));	105 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));

129 }	106 }

130 noise_power_estimator_->Step(&noise[0]);	107 noise_power_estimator_->Step(noise.data());

131 }	108 }

132	109

133 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,	110 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,

134 int sample_rate_hz,	111 int sample_rate_hz,

135 size_t num_channels) {	112 size_t num_channels) {

136 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);	113 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);

137 RTC_CHECK_EQ(num_render_channels_, num_channels);	114 RTC_CHECK_EQ(num_render_channels_, num_channels);

138 is_speech_ = IsSpeech(audio[0]);	115 is_speech_ = IsSpeech(audio[0]);

139 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());	116 render_mangler_->ProcessChunk(audio, audio);

140 for (size_t i = 0; i < num_render_channels_; ++i) {

141 memcpy(audio[i], temp_render_out_buffer_.channels()[i],

142 chunk_length_ * sizeof(**audio));

143 }

144 }	117 }

145	118

146 void IntelligibilityEnhancer::ProcessClearBlock(	119 void IntelligibilityEnhancer::ProcessAudioBlock(

147 const std::complex<float>* in_block,	120 const std::complex<float>* const* in_block,

148 std::complex<float>* out_block) {	121 size_t in_channels,

	122 size_t frames,

	123 size_t /* out_channels */,

	124 std::complex<float>* const* out_block) {

	125 RTC_DCHECK_EQ(freqs_, frames);

149 if (is_speech_) {	126 if (is_speech_) {

150 clear_power_estimator_.Step(in_block);	127 clear_power_estimator_.Step(in_block[0]);

151 }	128 }

152 const std::vector<float>& clear_power = clear_power_estimator_.power();	129 const std::vector<float>& clear_power = clear_power_estimator_.power();

153 const std::vector<float>& noise_power = noise_power_estimator_->power();	130 const std::vector<float>& noise_power = noise_power_estimator_->power();

154 MapToErbBands(&clear_power[0], render_filter_bank_,	131 MapToErbBands(clear_power.data(), render_filter_bank_,

155 filtered_clear_pow_.get());	132 filtered_clear_pow_.data());

156 MapToErbBands(&noise_power[0], capture_filter_bank_,	133 MapToErbBands(noise_power.data(), capture_filter_bank_,

157 filtered_noise_pow_.get());	134 filtered_noise_pow_.data());

158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());	135 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());

159 const float power_target =	136 const float power_target =

160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);	137 std::accumulate(clear_power.data(), clear_power.data() + freqs_, 0.f);

161 const float power_top =	138 const float power_top =

162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	139 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);

163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());	140 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());

164 const float power_bot =	141 const float power_bot =

165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	142 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);

166 if (power_target >= power_bot && power_target <= power_top) {	143 if (power_target >= power_bot && power_target <= power_top) {

167 SolveForLambda(power_target);	144 SolveForLambda(power_target);

168 UpdateErbGains();	145 UpdateErbGains();

169 } // Else experiencing power underflow, so do nothing.	146 } // Else experiencing power underflow, so do nothing.

170 gain_applier_.Apply(in_block, out_block);	147 for (size_t i = 0; i < in_channels; ++i) {

	148 gain_applier_.Apply(in_block[i], out_block[i]);

	149 }

171 }	150 }

172	151

173 void IntelligibilityEnhancer::SolveForLambda(float power_target) {	152 void IntelligibilityEnhancer::SolveForLambda(float power_target) {

174 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values	153 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values

175 const int kMaxIters = 100; // for these, based on experiments.	154 const int kMaxIters = 100; // for these, based on experiments.

176	155

177 const float reciprocal_power_target =	156 const float reciprocal_power_target =

178 1.f / (power_target + std::numeric_limits<float>::epsilon());	157 1.f / (power_target + std::numeric_limits<float>::epsilon());

179 float lambda_bot = kLambdaBot;	158 float lambda_bot = kLambdaBot;

180 float lambda_top = kLambdaTop;	159 float lambda_top = kLambdaTop;

181 float power_ratio = 2.f; // Ratio of achieved power to target power.	160 float power_ratio = 2.f; // Ratio of achieved power to target power.

182 int iters = 0;	161 int iters = 0;

183 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {	162 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {

184 const float lambda = (lambda_bot + lambda_top) / 2.f;	163 const float lambda = (lambda_bot + lambda_top) / 2.f;

185 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());	164 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.data());

186 const float power =	165 const float power =

187 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	166 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);

188 if (power < power_target) {	167 if (power < power_target) {

189 lambda_bot = lambda;	168 lambda_bot = lambda;

190 } else {	169 } else {

191 lambda_top = lambda;	170 lambda_top = lambda;

192 }	171 }

193 power_ratio = std::fabs(power * reciprocal_power_target);	172 power_ratio = std::fabs(power * reciprocal_power_target);

194 ++iters;	173 ++iters;

195 }	174 }

196 }	175 }

197	176

(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
279 }	258 }

280 }	259 }

281 return filter_bank;	260 return filter_bank;

282 }	261 }

283	262

284 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,	263 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,

285 size_t start_freq,	264 size_t start_freq,

286 float* sols) {	265 float* sols) {

287 const float kMinPower = 1e-5f;	266 const float kMinPower = 1e-5f;

288	267

289 const float* pow_x0 = filtered_clear_pow_.get();	268 const float* pow_x0 = filtered_clear_pow_.data();

290 const float* pow_n0 = filtered_noise_pow_.get();	269 const float* pow_n0 = filtered_noise_pow_.data();

291	270

292 for (size_t n = 0; n < start_freq; ++n) {	271 for (size_t n = 0; n < start_freq; ++n) {

293 sols[n] = 1.f;	272 sols[n] = 1.f;

294 }	273 }

295	274

296 // Analytic solution for optimal gains. See paper for derivation.	275 // Analytic solution for optimal gains. See paper for derivation.

297 for (size_t n = start_freq; n < bank_size_; ++n) {	276 for (size_t n = start_freq; n < bank_size_; ++n) {

298 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) {	277 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) {

299 sols[n] = 1.f;	278 sols[n] = 1.f;

300 } else {	279 } else {

301 const float gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +	280 const float gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +

302 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];	281 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];

303 const float beta0 =	282 const float beta0 =

304 lambda * pow_x0[n] * (2.f - kRho) * pow_x0[n] * pow_n0[n];	283 lambda * pow_x0[n] * (2.f - kRho) * pow_x0[n] * pow_n0[n];

305 const float alpha0 =	284 const float alpha0 =

306 lambda * pow_x0[n] * (1.f - kRho) * pow_x0[n] * pow_x0[n];	285 lambda * pow_x0[n] * (1.f - kRho) * pow_x0[n] * pow_x0[n];

307 RTC_DCHECK_LT(alpha0, 0.f);	286 RTC_DCHECK_LT(alpha0, 0.f);

308 // The quadratic equation should always have real roots, but to guard	287 // The quadratic equation should always have real roots, but to guard

309 // against numerical errors we limit it to a minimum of zero.	288 // against numerical errors we limit it to a minimum of zero.

310 sols[n] = std::max(	289 sols[n] = std::max(

311 0.f, (-beta0 - std::sqrt(std::max(	290 0.f, (-beta0 - std::sqrt(std::max(

312 0.f, beta0 * beta0 - 4.f * alpha0 * gamma0))) /	291 0.f, beta0 * beta0 - 4.f * alpha0 * gamma0))) /

313 (2.f * alpha0));	292 (2.f * alpha0));

314 }	293 }

315 }	294 }

316 }	295 }

317	296

318 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {	297 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {

319 FloatToS16(audio, chunk_length_, &audio_s16_[0]);	298 FloatToS16(audio, chunk_length_, audio_s16_.data());

320 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);	299 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);

321 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {	300 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {

322 chunks_since_voice_ = 0;	301 chunks_since_voice_ = 0;

323 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {	302 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {

324 ++chunks_since_voice_;	303 ++chunks_since_voice_;

325 }	304 }

326 return chunks_since_voice_ < kSpeechOffsetDelay;	305 return chunks_since_voice_ < kSpeechOffsetDelay;

327 }	306 }

328	307

329 } // namespace webrtc	308 } // namespace webrtc

OLD	NEW