webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1878133002: Disable Intelligibility Enhancer for high SNRs

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1878133002: Disable Intelligibility Enhancer for high SNRs (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 20 matching lines...) Expand all Loading...
31 const float kKbdAlpha = 1.5f;	31 const float kKbdAlpha = 1.5f;

32 const float kLambdaBot = -1.f; // Extreme values in bisection	32 const float kLambdaBot = -1.f; // Extreme values in bisection

33 const float kLambdaTop = -1e-5f; // search for lamda.	33 const float kLambdaTop = -1e-5f; // search for lamda.

34 const float kVoiceProbabilityThreshold = 0.02f;	34 const float kVoiceProbabilityThreshold = 0.02f;

35 // Number of chunks after voice activity which is still considered speech.	35 // Number of chunks after voice activity which is still considered speech.

36 const size_t kSpeechOffsetDelay = 80;	36 const size_t kSpeechOffsetDelay = 80;

37 const float kDecayRate = 0.994f; // Power estimation decay rate.	37 const float kDecayRate = 0.994f; // Power estimation decay rate.

38 const float kMaxRelativeGainChange = 0.006f;	38 const float kMaxRelativeGainChange = 0.006f;

39 const float kRho = 0.0004f; // Default production and interpretation SNR.	39 const float kRho = 0.0004f; // Default production and interpretation SNR.

40 const float kPowerNormalizationFactor = 1.f / (1 << 30);	40 const float kPowerNormalizationFactor = 1.f / (1 << 30);

	41 const float kMaxActiveSNR = 128.f; // 21dB

	42 const float kMinInactiveSNR = 32.f; // 15dB

41	43

42 // Returns dot product of vectors \|a\| and \|b\| with size \|length\|.	44 // Returns dot product of vectors \|a\| and \|b\| with size \|length\|.

43 float DotProduct(const float* a, const float* b, size_t length) {	45 float DotProduct(const float* a, const float* b, size_t length) {

44 float ret = 0.f;	46 float ret = 0.f;

45 for (size_t i = 0; i < length; ++i) {	47 for (size_t i = 0; i < length; ++i) {

46 ret += a[i] * b[i];	48 ret += a[i] * b[i];

47 }	49 }

48 return ret;	50 return ret;

49 }	51 }

50	52

(...skipping 26 matching lines...) Expand all Loading...
77 filtered_clear_pow_(bank_size_, 0.f),	79 filtered_clear_pow_(bank_size_, 0.f),

78 filtered_noise_pow_(num_noise_bins, 0.f),	80 filtered_noise_pow_(num_noise_bins, 0.f),

79 center_freqs_(bank_size_),	81 center_freqs_(bank_size_),

80 capture_filter_bank_(CreateErbBank(num_noise_bins)),	82 capture_filter_bank_(CreateErbBank(num_noise_bins)),

81 render_filter_bank_(CreateErbBank(freqs_)),	83 render_filter_bank_(CreateErbBank(freqs_)),

82 gains_eq_(bank_size_),	84 gains_eq_(bank_size_),

83 gain_applier_(freqs_, kMaxRelativeGainChange),	85 gain_applier_(freqs_, kMaxRelativeGainChange),

84 audio_s16_(chunk_length_),	86 audio_s16_(chunk_length_),

85 chunks_since_voice_(kSpeechOffsetDelay),	87 chunks_since_voice_(kSpeechOffsetDelay),

86 is_speech_(false),	88 is_speech_(false),

	89 snr_(kMaxActiveSNR),

	90 is_active_(false),

87 noise_estimation_buffer_(num_noise_bins),	91 noise_estimation_buffer_(num_noise_bins),

88 noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer,	92 noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer,

89 std::vector<float>(num_noise_bins),	93 std::vector<float>(num_noise_bins),

90 RenderQueueItemVerifier<float>(num_noise_bins)) {	94 RenderQueueItemVerifier<float>(num_noise_bins)) {

91 RTC_DCHECK_LE(kRho, 1.f);	95 RTC_DCHECK_LE(kRho, 1.f);

92	96

93 const size_t erb_index = static_cast<size_t>(	97 const size_t erb_index = static_cast<size_t>(

94 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +	98 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +

95 43.f));	99 43.f));

96 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);	100 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
128 void IntelligibilityEnhancer::ProcessAudioBlock(	132 void IntelligibilityEnhancer::ProcessAudioBlock(

129 const std::complex<float>* const* in_block,	133 const std::complex<float>* const* in_block,

130 size_t in_channels,	134 size_t in_channels,

131 size_t frames,	135 size_t frames,

132 size_t /* out_channels */,	136 size_t /* out_channels */,

133 std::complex<float>* const* out_block) {	137 std::complex<float>* const* out_block) {

134 RTC_DCHECK_EQ(freqs_, frames);	138 RTC_DCHECK_EQ(freqs_, frames);

135 if (is_speech_) {	139 if (is_speech_) {

136 clear_power_estimator_.Step(in_block[0]);	140 clear_power_estimator_.Step(in_block[0]);

137 }	141 }

138 const std::vector<float>& clear_power = clear_power_estimator_.power();	142 UpdateActivity();

139 const std::vector<float>& noise_power = noise_power_estimator_.power();	143 if (is_active_) {

140 MapToErbBands(clear_power.data(), render_filter_bank_,	144 MapToErbBands(clear_power_estimator_.power().data(), render_filter_bank_,

141 filtered_clear_pow_.data());	145 filtered_clear_pow_.data());

142 MapToErbBands(noise_power.data(), capture_filter_bank_,	146 MapToErbBands(noise_power_estimator_.power().data(), capture_filter_bank_,

143 filtered_noise_pow_.data());	147 filtered_noise_pow_.data());

144 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());	148 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());

145 const float power_target = std::accumulate(	149 const float power_target = std::accumulate(

146 filtered_clear_pow_.data(), filtered_clear_pow_.data() + bank_size_, 0.f);	150 filtered_clear_pow_.data(),

147 const float power_top =	151 filtered_clear_pow_.data() + bank_size_,

148 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);	152 0.f);

149 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());	153 const float power_top =

150 const float power_bot =	154 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);

151 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);	155 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());

152 if (power_target >= power_bot && power_target <= power_top) {	156 const float power_bot =

153 SolveForLambda(power_target);	157 DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);

154 UpdateErbGains();	158 if (power_target >= power_bot && power_target <= power_top) {

155 } // Else experiencing power underflow, so do nothing.	159 SolveForLambda(power_target);

	160 UpdateErbGains();

	161 } // Else experiencing power underflow, so do nothing.

	162 }

156 for (size_t i = 0; i < in_channels; ++i) {	163 for (size_t i = 0; i < in_channels; ++i) {

157 gain_applier_.Apply(in_block[i], out_block[i]);	164 gain_applier_.Apply(in_block[i], out_block[i]);

158 }	165 }

159 }	166 }

160	167

	168 void IntelligibilityEnhancer::UpdateActivity() {
	peah-webrtc 2016/04/12 13:39:21 What you are updating here is the is_active flag and gains, right? And the is_active flag is a flag for whether the effect of the IE should be active or not, and not for whether there is activity in the render signal, right? If I did not get that wrong, I think this method should be modified or renamed, since to me, activity means speech activity. What about renaming it to ControlEffectApplication or SnrBasedEffectActivation, which in my mind describe in more detail what is being done? aluebs-webrtc 2016/04/12 18:34:28 On 2016/04/12 13:39:21, peah-webrtc wrote: > What you are updating here is the is_active flag and gains, right? And the > is_active flag is a flag for whether the effect of the IE should be active or > not, and not for whether there is activity in the render signal, right? > > If I did not get that wrong, I think this method should be modified or renamed. > since to me, activity means speech activity. > What about renaming it to ControlEffectApplication, or SnrBasedEffectActivation > which in my mind describe in more detail what is being done. Yes, your understanding is completely right. And I agree that your naming suggestion is more intuitive. Done.
	169 const float* clear_psd = clear_power_estimator_.power().data();

	170 const float* noise_psd = noise_power_estimator_.power().data();

	171 const float clear_power =

	172 std::accumulate(clear_psd, clear_psd + freqs_, 0.f);

	173 const float noise_power =

	174 std::accumulate(noise_psd, noise_psd + freqs_, 0.f);

	175 snr_ = kDecayRate * snr_ + (1.f - kDecayRate) * clear_power / noise_power;
	peah-webrtc 2016/04/12 13:39:21 This SNR estimate is an average of the instantaneous SNR. An alternative could be to use the average of the overall SNR. Have you considered that? (I'm not saying this is wrong.) peah-webrtc 2016/04/12 13:39:21 This SNR estimate is assuming that the ratio of the clear_psd and noise_psd matches the ratio at the ear of the listener. What happens if the listener is using headphones? Then this ratio is very different from what the SNR is at the ear of the listener. The same is the case if a device is used in speaker mode. I don't really see any point in adding functionality for tuning the application of the IE effect based on the digital SNR until a mapping is in place to map this to the acoustic SNR at the ear. But I'm fine with the code change. aluebs-webrtc 2016/04/12 18:34:28 On 2016/04/12 13:39:21, peah-webrtc wrote: > This SNR estimate is an average of the instantaneous SNR. An alternative could > be to use the average of the overall SNR. Have you considered that (I'm not > saying this is wrong). That is an interesting point. Because the PSDs are already filtered over time, this is not exactly an average of the instantaneous SNR, but more of an average of an averaged SNR, if that makes some sense. This additional filtering is just to ensure its smoothness. aluebs-webrtc 2016/04/12 18:34:28 On 2016/04/12 13:39:21, peah-webrtc wrote: > This SNR estimate is assuming that the ratio of the clear_psd and noise_psd > matches the ratio at the ear of the listener. What happens if the listener is > using headphones? Then this ratio is very different from what the SNR is at the > ear of the listener. The same is the case if a device is used in speaker mode.
> > I don't really see any point in adding functionality for tuning the application > of the IE effect based on the digital SNR until a mapping is in place to map > this to the acoustic SNR at the ear. > > But I'm fine with the code change. As discussed offline at the beginning of this project, with some broad assumptions we can estimate acoustic SNRs from the digital one well enough for the IE to improve the intelligibility of the signal. But also, I am testing right now on a real device whether this holds true, and at the same time working on a mapping I suggested to see if it improves the relation between the SNRs. On the other hand, what we decided was to enable this feature first only for headphones and phone mode (no speaker phone), so we can focus on that and delay the additional tweaking to later in the process. I think this code is valuable as of today, but I agree that the thresholds will need to be adjusted if we apply any mapping.
	176 if (is_active_) {

	177 if (snr_ > kMaxActiveSNR) {

	178 is_active_ = false;

	179 // Set the target gains to unity.

	180 float* gains = gain_applier_.target();

	181 for (size_t i = 0; i < freqs_; ++i) {

	182 gains[i] = 1.f;

	183 }

	184 }

	185 } else {

	186 is_active_ = snr_ < kMinInactiveSNR;

	187 }

	188 }

	189

161 void IntelligibilityEnhancer::SolveForLambda(float power_target) {	190 void IntelligibilityEnhancer::SolveForLambda(float power_target) {

162 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values	191 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values

163 const int kMaxIters = 100; // for these, based on experiments.	192 const int kMaxIters = 100; // for these, based on experiments.

164	193

165 const float reciprocal_power_target =	194 const float reciprocal_power_target =

166 1.f / (power_target + std::numeric_limits<float>::epsilon());	195 1.f / (power_target + std::numeric_limits<float>::epsilon());

167 float lambda_bot = kLambdaBot;	196 float lambda_bot = kLambdaBot;

168 float lambda_top = kLambdaTop;	197 float lambda_top = kLambdaTop;

169 float power_ratio = 2.f; // Ratio of achieved power to target power.	198 float power_ratio = 2.f; // Ratio of achieved power to target power.

170 int iters = 0;	199 int iters = 0;

(...skipping 137 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
308 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);	337 vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);

309 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {	338 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {

310 chunks_since_voice_ = 0;	339 chunks_since_voice_ = 0;

311 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {	340 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {

312 ++chunks_since_voice_;	341 ++chunks_since_voice_;

313 }	342 }

314 return chunks_since_voice_ < kSpeechOffsetDelay;	343 return chunks_since_voice_ < kSpeechOffsetDelay;

315 }	344 }

316	345

317 } // namespace webrtc	346 } // namespace webrtc

OLD	NEW

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | no next file » | no next file with comments »