webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1718793002: Fix the gain calculation in IntelligibilityEnhancer

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1718793002: Fix the gain calculation in IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@vad

Patch Set: Make windows happy Created 4 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 12 matching lines...) Expand all Loading...
23 namespace webrtc {	23 namespace webrtc {

24	24

25 namespace {	25 namespace {

26	26

27 const size_t kErbResolution = 2;	27 const size_t kErbResolution = 2;

28 const int kWindowSizeMs = 16;	28 const int kWindowSizeMs = 16;

29 const int kChunkSizeMs = 10; // Size provided by APM.	29 const int kChunkSizeMs = 10; // Size provided by APM.

30 const float kClipFreqKhz = 0.2f;	30 const float kClipFreqKhz = 0.2f;

31 const float kKbdAlpha = 1.5f;	31 const float kKbdAlpha = 1.5f;

32 const float kLambdaBot = -1.0f; // Extreme values in bisection	32 const float kLambdaBot = -1.0f; // Extreme values in bisection

33 const float kLambdaTop = -10e-18f; // search for lambda.	33 const float kLambdaTop = -1e-5f; // search for lambda.

34 const float kVoiceProbabilityThreshold = 0.02f;	34 const float kVoiceProbabilityThreshold = 0.02f;

35 // Number of chunks after voice activity which is still considered speech.	35 // Number of chunks after voice activity which is still considered speech.

36 const size_t kSpeechOffsetDelay = 80;	36 const size_t kSpeechOffsetDelay = 80;

37 const float kDecayRate = 0.98f; // Power estimation decay rate.	37 const float kDecayRate = 0.98f; // Power estimation decay rate.

38 const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain.	38 const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain.

39 const float kRho = 0.0004f; // Default production and interpretation SNR.	39 const float kRho = 0.0004f; // Default production and interpretation SNR.

40	40

41 // Returns dot product of vectors |a| and |b| with size |length|.	41 // Returns dot product of vectors |a| and |b| with size |length|.

42 float DotProduct(const float* a, const float* b, size_t length) {	42 float DotProduct(const float* a, const float* b, size_t length) {

43 float ret = 0.f;	43 float ret = 0.f;

(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
157 filtered_noise_pow_.get());	157 filtered_noise_pow_.get());

158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());	158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());

159 const float power_target =	159 const float power_target =

160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);	160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);

161 const float power_top =	161 const float power_top =

162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);

163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());	163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());

164 const float power_bot =	164 const float power_bot =

165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);

166 if (power_target >= power_bot && power_target <= power_top) {	166 if (power_target >= power_bot && power_target <= power_top) {

167 SolveForLambda(power_target, power_bot, power_top);	167 SolveForLambda(power_target);

168 UpdateErbGains();	168 UpdateErbGains();

169 } // Else experiencing power underflow, so do nothing.	169 } // Else experiencing power underflow, so do nothing.

170 gain_applier_.Apply(in_block, out_block);	170 gain_applier_.Apply(in_block, out_block);

171 }	171 }

172	172

173 void IntelligibilityEnhancer::SolveForLambda(float power_target,	173 void IntelligibilityEnhancer::SolveForLambda(float power_target) {

174 float power_bot,

175 float power_top) {

176 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values	174 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values

177 const int kMaxIters = 100; // for these, based on experiments.	175 const int kMaxIters = 100; // for these, based on experiments.

178	176

179 const float reciprocal_power_target =	177 const float reciprocal_power_target =

180 1.f / (power_target + std::numeric_limits<float>::epsilon());	178 1.f / (power_target + std::numeric_limits<float>::epsilon());

181 float lambda_bot = kLambdaBot;	179 float lambda_bot = kLambdaBot;

182 float lambda_top = kLambdaTop;	180 float lambda_top = kLambdaTop;

183 float power_ratio = 2.f; // Ratio of achieved power to target power.	181 float power_ratio = 2.f; // Ratio of achieved power to target power.

184 int iters = 0;	182 int iters = 0;

185 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {	183 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {

186 const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.f;	184 const float lambda = (lambda_bot + lambda_top) / 2.f;

187 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());	185 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());

188 const float power =	186 const float power =

189 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	187 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);

190 if (power < power_target) {	188 if (power < power_target) {

191 lambda_bot = lambda;	189 lambda_bot = lambda;

192 } else {	190 } else {

193 lambda_top = lambda;	191 lambda_top = lambda;

194 }	192 }

195 power_ratio = std::fabs(power * reciprocal_power_target);	193 power_ratio = std::fabs(power * reciprocal_power_target);

196 ++iters;	194 ++iters;

(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
279 for (size_t j = 0; j < bank_size_; ++j) {	277 for (size_t j = 0; j < bank_size_; ++j) {

280 filter_bank[j][i] /= sum;	278 filter_bank[j][i] /= sum;

281 }	279 }

282 }	280 }

283 return filter_bank;	281 return filter_bank;

284 }	282 }

285	283

286 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,	284 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,

287 size_t start_freq,	285 size_t start_freq,

288 float* sols) {	286 float* sols) {

289 bool quadratic = (kRho < 1.f);	287 const float kMinPower = 1e-5f;

	288

290 const float* pow_x0 = filtered_clear_pow_.get();	289 const float* pow_x0 = filtered_clear_pow_.get();

291 const float* pow_n0 = filtered_noise_pow_.get();	290 const float* pow_n0 = filtered_noise_pow_.get();

292	291

293 for (size_t n = 0; n < start_freq; ++n) {	292 for (size_t n = 0; n < start_freq; ++n) {

294 sols[n] = 1.f;	293 sols[n] = 1.f;

295 }	294 }

296	295

297 // Analytic solution for optimal gains. See paper for derivation.	296 // Analytic solution for optimal gains. See paper for derivation.

298 for (size_t n = start_freq - 1; n < bank_size_; ++n) {	297 for (size_t n = start_freq; n < bank_size_; ++n) {

299 float alpha0, beta0, gamma0;	298 if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) {

300 gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +	299 sols[n] = 1.f;

301 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];

302 beta0 = lambda * pow_x0[n] * (2 - kRho) * pow_x0[n] * pow_n0[n];

303 if (quadratic) {

304 alpha0 = lambda * pow_x0[n] * (1 - kRho) * pow_x0[n] * pow_x0[n];

305 sols[n] =

306 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) /

307 (2 * alpha0 + std::numeric_limits<float>::epsilon());

308 } else {	300 } else {

309 sols[n] = -gamma0 / beta0;	301 const float gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +

	302 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];

	303 const float beta0 =

	304 lambda * pow_x0[n] * (2.f - kRho) * pow_x0[n] * pow_n0[n];

	305 const float alpha0 =

	306 lambda * pow_x0[n] * (1.f - kRho) * pow_x0[n] * pow_x0[n];

	307 RTC_DCHECK_LT(alpha0, 0.f);

	308 // The quadratic equation should always have real roots, but to guard

	309 // against numerical errors we limit it to a minimum of zero.

	310 sols[n] = std::max(

	311 0.f, (-beta0 - std::sqrt(std::max(

	312 0.f, beta0 * beta0 - 4.f * alpha0 * gamma0))) /

	313 (2.f * alpha0));

310 }	314 }

311 sols[n] = fmax(0, sols[n]);

312 }	315 }

313 }	316 }

314	317

315 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {	318 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {

316 FloatToS16(audio, chunk_length_, &audio_s16_[0]);	319 FloatToS16(audio, chunk_length_, &audio_s16_[0]);

317 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);	320 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);

318 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {	321 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {

319 chunks_since_voice_ = 0;	322 chunks_since_voice_ = 0;

320 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {	323 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {

321 ++chunks_since_voice_;	324 ++chunks_since_voice_;

322 }	325 }

323 return chunks_since_voice_ < kSpeechOffsetDelay;	326 return chunks_since_voice_ < kSpeechOffsetDelay;

324 }	327 }

325	328

326 } // namespace webrtc	329 } // namespace webrtc

OLD	NEW