webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1821443003: Fix normalization of noise estimate in NoiseSuppressor

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1821443003: Fix normalization of noise estimate in NoiseSuppressor (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Normalize dynamically Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc » ('j') | webrtc/modules/audio_processing/ns/noise_suppression_x.c » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 11 matching lines...) Expand all Loading...
22	22

23 namespace webrtc {	23 namespace webrtc {

24	24

25 namespace {	25 namespace {

26	26

27 const size_t kErbResolution = 2;	27 const size_t kErbResolution = 2;

28 const int kWindowSizeMs = 16;	28 const int kWindowSizeMs = 16;

29 const int kChunkSizeMs = 10; // Size provided by APM.	29 const int kChunkSizeMs = 10; // Size provided by APM.

30 const float kClipFreqKhz = 0.2f;	30 const float kClipFreqKhz = 0.2f;

31 const float kKbdAlpha = 1.5f;	31 const float kKbdAlpha = 1.5f;

32 const float kLambdaBot = -1.0f; // Extreme values in bisection	32 const double kLambdaBot = -1.0 / (1 << 30); // Extreme values in bisection
	turaj 2016/03/30 14:51:55 This is a relatively large change which scales the search region by 1/2^30. Is the change needed because of the change in noise level representation? Does it mean that the gains are 2^30 times more sensitive to the value of \lambda? aluebs-webrtc 2016/03/31 00:26:32 On 2016/03/30 14:51:55, turaj wrote: > This is a relatively large change which scales the search region by 1/2^30. Is > the change needed because of the change in noise level representation? Does > it mean that the gains are 2^30 times more sensitive to the value of \lambda? Yes, it needed to be normalized by the new noise scale. But now I changed the implementation to have a different normalization factor that is applied independently.
33 const float kLambdaTop = -1e-5f; // search for lamda.	33 const double kLambdaTop = -1e-5 / (1 << 30); // search for lamda.

34 const float kVoiceProbabilityThreshold = 0.02f;	34 const float kVoiceProbabilityThreshold = 0.02f;

35 // Number of chunks after voice activity which is still considered speech.	35 // Number of chunks after voice activity which is still considered speech.

36 const size_t kSpeechOffsetDelay = 80;	36 const size_t kSpeechOffsetDelay = 80;

37 const float kDecayRate = 0.98f; // Power estimation decay rate.	37 const float kDecayRate = 0.98f; // Power estimation decay rate.

38 const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain.	38 const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain.

39 const float kRho = 0.0004f; // Default production and interpretation SNR.	39 const float kRho = 0.0004f; // Default production and interpretation SNR.

40	40

41 // Returns dot product of vectors \|a\| and \|b\| with size \|length\|.	41 // Returns dot product of vectors \|a\| and \|b\| with size \|length\|.

42 float DotProduct(const float* a, const float* b, size_t length) {	42 float DotProduct(const float* a, const float* b, size_t length) {

43 float ret = 0.f;	43 float ret = 0.f;

(...skipping 111 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
155 gain_applier_.Apply(in_block[i], out_block[i]);	155 gain_applier_.Apply(in_block[i], out_block[i]);

156 }	156 }

157 }	157 }

158	158

159 void IntelligibilityEnhancer::SolveForLambda(float power_target) {	159 void IntelligibilityEnhancer::SolveForLambda(float power_target) {

160 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values	160 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values

161 const int kMaxIters = 100; // for these, based on experiments.	161 const int kMaxIters = 100; // for these, based on experiments.

162	162

163 const float reciprocal_power_target =	163 const float reciprocal_power_target =

164 1.f / (power_target + std::numeric_limits<float>::epsilon());	164 1.f / (power_target + std::numeric_limits<float>::epsilon());

165 float lambda_bot = kLambdaBot;	165 double lambda_bot = kLambdaBot;

166 float lambda_top = kLambdaTop;	166 double lambda_top = kLambdaTop;

167 float power_ratio = 2.f; // Ratio of achieved power to target power.	167 float power_ratio = 2.f; // Ratio of achieved power to target power.

168 int iters = 0;	168 int iters = 0;

169 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {	169 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {

170 const float lambda = (lambda_bot + lambda_top) / 2.f;	170 const double lambda = (lambda_bot + lambda_top) / 2.0;
	peah-webrtc 2016/03/30 13:44:45 lambda is always in between lambda_bot and lambda_top, right? And both of these are bounded by kLambdaBot and kLambdaTop, right? Then I don't see why the double precision is needed for lambda as kLambdaBot and kLambdaTop are sufficiently close to be possible to represent using floats. Or am I missing something? aluebs-webrtc 2016/03/31 00:26:32 On 2016/03/30 13:44:45, peah-webrtc wrote: > lambda is always in between lambda_bot and lambda_top, right? And both of these > are bounded by kLambdaBot and kLambdaTop, right? Then I don't see why the double > precision is needed for lambda as kLambdaBot and kLambdaTop are sufficiently close > to be possible to represent using floats. Or am I missing something? Good point, although I was trying to keep consistency. In any case, I found a way to normalize the power independently so that double precision is not necessary.
171 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.data());	171 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.data());

172 const float power =	172 const float power =

173 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);	173 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);

174 if (power < power_target) {	174 if (power < power_target) {

175 lambda_bot = lambda;	175 lambda_bot = lambda;

176 } else {	176 } else {

177 lambda_top = lambda;	177 lambda_top = lambda;

178 }	178 }

179 power_ratio = std::fabs(power * reciprocal_power_target);	179 power_ratio = std::fabs(power * reciprocal_power_target);

180 ++iters;	180 ++iters;

(...skipping 79 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
260 for (size_t j = 0; j < bank_size_; ++j) {	260 for (size_t j = 0; j < bank_size_; ++j) {

261 sum += filter_bank[j][i];	261 sum += filter_bank[j][i];

262 }	262 }

263 for (size_t j = 0; j < bank_size_; ++j) {	263 for (size_t j = 0; j < bank_size_; ++j) {

264 filter_bank[j][i] /= sum;	264 filter_bank[j][i] /= sum;

265 }	265 }

266 }	266 }

267 return filter_bank;	267 return filter_bank;

268 }	268 }

269	269

270 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,	270 void IntelligibilityEnhancer::SolveForGainsGivenLambda(double lambda,

271 size_t start_freq,	271 size_t start_freq,

272 float* sols) {	272 float* sols) {

273 const float kMinPower = 1e-5f;	273 const float kMinPower = 1e-5f;

274	274

275 const float* pow_x0 = filtered_clear_pow_.data();	275 const float* pow_x0 = filtered_clear_pow_.data();

276 const float* pow_n0 = filtered_noise_pow_.data();	276 const float* pow_n0 = filtered_noise_pow_.data();

277	277

278 for (size_t n = 0; n < start_freq; ++n) {	278 for (size_t n = 0; n < start_freq; ++n) {

279 sols[n] = 1.f;	279 sols[n] = 1.f;

280 }	280 }

281	281

282 // Analytic solution for optimal gains. See paper for derivation.	282 // Analytic solution for optimal gains. See paper for derivation.

283 for (size_t n = start_freq; n < bank_size_; ++n) {	283 for (size_t n = start_freq; n < bank_size_; ++n) {

284 if (pow_x0[n] < kMinPower \|\| pow_n0[n] < kMinPower) {	284 if (pow_x0[n] < kMinPower \|\| pow_n0[n] < kMinPower) {

285 sols[n] = 1.f;	285 sols[n] = 1.f;

286 } else {	286 } else {

287 const float gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +	287 const double gamma0 = 0.5 * kRho * pow_x0[n] * pow_n0[n] +
	peah-webrtc 2016/03/30 13:44:46 I cannot see from these equations what difference the double precision should make. Is it because beta, alpha and gamma are of highly different magnitudes? aluebs-webrtc 2016/03/31 00:26:32 On 2016/03/30 13:44:46, peah-webrtc wrote: > I cannot see from these equations what difference the double precision should > make. Is it because beta, alpha and gamma are of highly different magnitudes? Basically because each alpha0, beta0 or gamma0 has a product of 3 pow_, which are the PSD of a 256 length FFT of a signal of 2^15 maximum amplitude, so it can go up to ((256 * 2^15)^2)^3 = 2^138, which is approximately 3.5e41, which is above the float maximum (3.40282e38). But now I am normalizing the power independently, so that double precision is not necessary anymore.
288 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];	288 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];

289 const float beta0 =	289 const double beta0 =

290 lambda * pow_x0[n] * (2.f - kRho) * pow_x0[n] * pow_n0[n];	290 lambda * pow_x0[n] * (2.0 - kRho) * pow_x0[n] * pow_n0[n];

291 const float alpha0 =	291 const double alpha0 =

292 lambda * pow_x0[n] * (1.f - kRho) * pow_x0[n] * pow_x0[n];	292 lambda * pow_x0[n] * (1.0 - kRho) * pow_x0[n] * pow_x0[n];

293 RTC_DCHECK_LT(alpha0, 0.f);	293 RTC_DCHECK_LT(alpha0, 0.0);

294 // The quadratic equation should always have real roots, but to guard	294 // The quadratic equation should always have real roots, but to guard

295 // against numerical errors we limit it to a minimum of zero.	295 // against numerical errors we limit it to a minimum of zero.

296 sols[n] = std::max(	296 sols[n] = std::max(

297 0.f, (-beta0 - std::sqrt(std::max(	297 0.0, (-beta0 - std::sqrt(std::max(

298 0.f, beta0 * beta0 - 4.f * alpha0 * gamma0))) /	298 0.0, beta0 * beta0 - 4.0 * alpha0 * gamma0))) /

299 (2.f * alpha0));	299 (2.0 * alpha0));

300 }	300 }

301 }	301 }

302 }	302 }

303	303

304 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {	304 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {

305 FloatToS16(audio, chunk_length_, audio_s16_.data());	305 FloatToS16(audio, chunk_length_, audio_s16_.data());

306 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);	306 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);

307 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {	307 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {

308 chunks_since_voice_ = 0;	308 chunks_since_voice_ = 0;

309 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {	309 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {

310 ++chunks_since_voice_;	310 ++chunks_since_voice_;

311 }	311 }

312 return chunks_since_voice_ < kSpeechOffsetDelay;	312 return chunks_since_voice_ < kSpeechOffsetDelay;

313 }	313 }

314	314

315 } // namespace webrtc	315 } // namespace webrtc

OLD	NEW