webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@pow

Patch Set: Make gain change limit relative Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 //

12 // Implements core class for intelligibility enhancer.

13 //

14 // Details of the model and algorithm can be found in the original paper:

15 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788

16 //

17

18 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"	11 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"

19	12

20 #include <math.h>	13 #include <math.h>

21 #include <stdlib.h>	14 #include <stdlib.h>

22 #include <algorithm>	15 #include <algorithm>

23 #include <limits>	16 #include <limits>

24 #include <numeric>	17 #include <numeric>

25	18

26 #include "webrtc/base/checks.h"	19 #include "webrtc/base/checks.h"

27 #include "webrtc/common_audio/include/audio_util.h"	20 #include "webrtc/common_audio/include/audio_util.h"

28 #include "webrtc/common_audio/window_generator.h"	21 #include "webrtc/common_audio/window_generator.h"

29	22

30 namespace webrtc {	23 namespace webrtc {

31	24

32 namespace {	25 namespace {

33	26

34 const size_t kErbResolution = 2;	27 const size_t kErbResolution = 2;

35 const int kWindowSizeMs = 2;	28 const int kWindowSizeMs = 16;

36 const int kChunkSizeMs = 10; // Size provided by APM.	29 const int kChunkSizeMs = 10; // Size provided by APM.

37 const float kClipFreq = 200.0f;	30 const float kClipFreqKhz = 0.2f;

38 const float kConfigRho = 0.02f; // Default production and interpretation SNR.

39 const float kKbdAlpha = 1.5f;	31 const float kKbdAlpha = 1.5f;

40 const float kLambdaBot = -1.0f; // Extreme values in bisection	32 const float kLambdaBot = -1.0f; // Extreme values in bisection

41 const float kLambdaTop = -10e-18f; // search for lamda.	33 const float kLambdaTop = -10e-18f; // search for lamda.

	34 const float kVoiceProbabilityThreshold = 0.02;

	35 // Number of chunks after voice activity which is still considered speech.

	36 const size_t kSpeechOffsetDelay = 80;

	37 const float kDecayRate = 0.98f; // Power estimation decay rate.

	38 const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain.

	39 const float kRho = 0.0004f; // Default production and interpretation SNR.

42	40

43 // Returns dot product of vectors |a| and |b| with size |length|.	41 // Returns dot product of vectors |a| and |b| with size |length|.

44 float DotProduct(const float* a, const float* b, size_t length) {	42 float DotProduct(const float* a, const float* b, size_t length) {

45 float ret = 0.f;	43 float ret = 0.f;

46 for (size_t i = 0; i < length; ++i) {	44 for (size_t i = 0; i < length; ++i) {

47 ret = fmaf(a[i], b[i], ret);	45 ret = fmaf(a[i], b[i], ret);

48 }	46 }

49 return ret;	47 return ret;

50 }	48 }

51	49

52 // Computes the power across ERB filters from the power spectral density |var|.	50 // Computes the power across ERB bands from the power spectral density |pow|.

53 // Stores it in |result|.	51 // Stores it in |result|.

54 void FilterVariance(const float* var,	52 void MapToErbBands(const float* pow,

55 const std::vector<std::vector<float>>& filter_bank,	53 const std::vector<std::vector<float>>& filter_bank,

56 float* result) {	54 float* result) {

57 for (size_t i = 0; i < filter_bank.size(); ++i) {	55 for (size_t i = 0; i < filter_bank.size(); ++i) {

58 RTC_DCHECK_GT(filter_bank[i].size(), 0u);	56 RTC_DCHECK_GT(filter_bank[i].size(), 0u);

59 result[i] = DotProduct(&filter_bank[i][0], var, filter_bank[i].size());	57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size());

60 }	58 }

61 }	59 }

62	60

63 } // namespace	61 } // namespace

64	62

65 using std::complex;

66 using std::max;

67 using std::min;

68 using VarianceType = intelligibility::VarianceArray::StepType;

69

70 IntelligibilityEnhancer::TransformCallback::TransformCallback(	63 IntelligibilityEnhancer::TransformCallback::TransformCallback(

71 IntelligibilityEnhancer* parent)	64 IntelligibilityEnhancer* parent)

72 : parent_(parent) {	65 : parent_(parent) {

73 }	66 }

74	67

75 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(	68 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(

76 const complex<float>* const* in_block,	69 const std::complex<float>* const* in_block,

77 size_t in_channels,	70 size_t in_channels,

78 size_t frames,	71 size_t frames,

79 size_t /* out_channels */,	72 size_t /* out_channels */,

80 complex<float>* const* out_block) {	73 std::complex<float>* const* out_block) {

81 RTC_DCHECK_EQ(parent_->freqs_, frames);	74 RTC_DCHECK_EQ(parent_->freqs_, frames);

82 for (size_t i = 0; i < in_channels; ++i) {	75 for (size_t i = 0; i < in_channels; ++i) {

83 parent_->ProcessClearBlock(in_block[i], out_block[i]);	76 parent_->ProcessClearBlock(in_block[i], out_block[i]);

84 }	77 }

85 }	78 }

86	79

87 IntelligibilityEnhancer::IntelligibilityEnhancer()	80 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,

88 : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) {	81 size_t num_render_channels)

89 }

90

91 IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)

92 : freqs_(RealFourier::ComplexLength(	82 : freqs_(RealFourier::ComplexLength(

93 RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))),	83 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),

94 window_size_(static_cast<size_t>(1 << RealFourier::FftOrder(freqs_))),	84 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),

95 chunk_length_(	85 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),

96 static_cast<size_t>(config.sample_rate_hz * kChunkSizeMs / 1000)),	86 sample_rate_hz_(sample_rate_hz),

97 bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)),	87 num_render_channels_(num_render_channels),

98 sample_rate_hz_(config.sample_rate_hz),	88 clear_power_estimator_(freqs_, kDecayRate),

99 erb_resolution_(kErbResolution),	89 noise_power_estimator_(

100 num_capture_channels_(config.num_capture_channels),	90 new intelligibility::PowerEstimator(freqs_, kDecayRate)),

101 num_render_channels_(config.num_render_channels),	91 filtered_clear_pow_(new float[bank_size_]),

102 analysis_rate_(config.analysis_rate),	92 filtered_noise_pow_(new float[bank_size_]),

103 active_(true),

104 clear_variance_(freqs_,

105 config.var_type,

106 config.var_window_size,

107 config.var_decay_rate),

108 noise_power_(freqs_, 0.f),

109 filtered_clear_var_(new float[bank_size_]),

110 filtered_noise_var_(new float[bank_size_]),

111 center_freqs_(new float[bank_size_]),	93 center_freqs_(new float[bank_size_]),

112 render_filter_bank_(CreateErbBank(freqs_)),	94 render_filter_bank_(CreateErbBank(freqs_)),

113 rho_(new float[bank_size_]),

114 gains_eq_(new float[bank_size_]),	95 gains_eq_(new float[bank_size_]),

115 gain_applier_(freqs_, config.gain_change_limit),	96 gain_applier_(freqs_, kMaxRelativeGainChange),

116 temp_render_out_buffer_(chunk_length_, num_render_channels_),	97 temp_render_out_buffer_(chunk_length_, num_render_channels_),

117 kbd_window_(new float[window_size_]),

118 render_callback_(this),	98 render_callback_(this),

119 block_count_(0),	99 audio_s16_(chunk_length_),

120 analysis_step_(0) {	100 chunks_since_voice_(kSpeechOffsetDelay),

121 RTC_DCHECK_LE(config.rho, 1.0f);	101 is_speech_(false) {

	102 RTC_DCHECK_LE(kRho, 1.f);

122	103

123 memset(filtered_clear_var_.get(),	104 memset(filtered_clear_pow_.get(), 0,

124 0,	105 bank_size_ * sizeof(filtered_clear_pow_[0]));

125 bank_size_ * sizeof(filtered_clear_var_[0]));	106 memset(filtered_noise_pow_.get(), 0,

126 memset(filtered_noise_var_.get(),	107 bank_size_ * sizeof(filtered_noise_pow_[0]));

127 0,

128 bank_size_ * sizeof(filtered_noise_var_[0]));

129	108

130 // Assumes all rho equal.	109 const size_t erb_index = static_cast<size_t>(

131 for (size_t i = 0; i < bank_size_; ++i) {	110 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +

132 rho_[i] = config.rho * config.rho;	111 43.f));

133 }	112 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);

134	113

135 float freqs_khz = kClipFreq / 1000.0f;	114 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));

136 size_t erb_index = static_cast<size_t>(ceilf(	115 std::vector<float> kbd_window(window_size);

137 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));	116 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);

138 start_freq_ = std::max(static_cast<size_t>(1), erb_index * erb_resolution_);

139

140 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,

141 kbd_window_.get());

142 render_mangler_.reset(new LappedTransform(	117 render_mangler_.reset(new LappedTransform(

143 num_render_channels_, num_render_channels_, chunk_length_,	118 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],

144 kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_));	119 window_size, window_size / 2, &render_callback_));

145 }	120 }

146	121

147 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(	122 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(

148 std::vector<float> noise) {	123 std::vector<float> noise) {

149 if (capture_filter_bank_.size() != bank_size_ ||	124 if (capture_filter_bank_.size() != bank_size_ ||

150 capture_filter_bank_[0].size() != noise.size()) {	125 capture_filter_bank_[0].size() != noise.size()) {

151 capture_filter_bank_ = CreateErbBank(noise.size());	126 capture_filter_bank_ = CreateErbBank(noise.size());

	127 noise_power_estimator_.reset(

	128 new intelligibility::PowerEstimator(noise.size(), kDecayRate));

152 }	129 }

153 if (noise.size() != noise_power_.size()) {	130 noise_power_estimator_->Step(&noise[0]);

154 noise_power_.resize(noise.size());

155 }

156 for (size_t i = 0; i < noise.size(); ++i) {

157 noise_power_[i] = noise[i] * noise[i];

158 }

159 }	131 }

160	132

161 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,	133 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,

162 int sample_rate_hz,	134 int sample_rate_hz,

163 size_t num_channels) {	135 size_t num_channels) {

164 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);	136 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);

165 RTC_CHECK_EQ(num_render_channels_, num_channels);	137 RTC_CHECK_EQ(num_render_channels_, num_channels);

166	138 is_speech_ = IsSpeech(audio[0]);

167 if (active_) {	139 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());

168 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());	140 for (size_t i = 0; i < num_render_channels_; ++i) {

169 }	141 memcpy(audio[i], temp_render_out_buffer_.channels()[i],

170	142 chunk_length_ * sizeof(**audio));

171 if (active_) {

172 for (size_t i = 0; i < num_render_channels_; ++i) {

173 memcpy(audio[i], temp_render_out_buffer_.channels()[i],

174 chunk_length_ * sizeof(**audio));

175 }

176 }	143 }

177 }	144 }

178	145

179 void IntelligibilityEnhancer::ProcessClearBlock(const complex<float>* in_block,	146 void IntelligibilityEnhancer::ProcessClearBlock(

180 complex<float>* out_block) {	147 const std::complex<float>* in_block,

181 if (block_count_ < 2) {	148 std::complex<float>* out_block) {

182 memset(out_block, 0, freqs_ * sizeof(*out_block));	149 if (is_speech_) {

183 ++block_count_;	150 clear_power_estimator_.Step(in_block);

184 return;

185 }	151 }

186	152 const std::vector<float>& clear_power = clear_power_estimator_.power();

187 // TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary.	153 const std::vector<float>& noise_power = noise_power_estimator_->power();

188 if (true) {	154 MapToErbBands(&clear_power[0], render_filter_bank_,

189 clear_variance_.Step(in_block, false);	155 filtered_clear_pow_.get());

190 if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {	156 MapToErbBands(&noise_power[0], capture_filter_bank_,

191 const float power_target = std::accumulate(	157 filtered_noise_pow_.get());

192 clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.f);

193 AnalyzeClearBlock(power_target);

194 ++analysis_step_;

195 }

196 ++block_count_;

197 }

198

199 if (active_) {

200 gain_applier_.Apply(in_block, out_block);

201 }

202 }

203

204 void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) {

205 FilterVariance(clear_variance_.variance(),

206 render_filter_bank_,

207 filtered_clear_var_.get());

208 FilterVariance(&noise_power_[0],

209 capture_filter_bank_,

210 filtered_noise_var_.get());

211 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());	158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());

	159 const float power_target =

	160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);

212 const float power_top =	161 const float power_top =

213 DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_);	162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);

214 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());	163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());

215 const float power_bot =	164 const float power_bot =

216 DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_);	165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);

217 if (power_target >= power_bot && power_target <= power_top) {	166 if (power_target >= power_bot && power_target <= power_top) {

218 SolveForLambda(power_target, power_bot, power_top);	167 SolveForLambda(power_target, power_bot, power_top);

219 UpdateErbGains();	168 UpdateErbGains();

220 } // Else experiencing variance underflow, so do nothing.	169 } // Else experiencing power underflow, so do nothing.

	170 gain_applier_.Apply(in_block, out_block);

221 }	171 }

222	172

223 void IntelligibilityEnhancer::SolveForLambda(float power_target,	173 void IntelligibilityEnhancer::SolveForLambda(float power_target,

224 float power_bot,	174 float power_bot,

225 float power_top) {	175 float power_top) {

226 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values	176 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values

227 const int kMaxIters = 100; // for these, based on experiments.	177 const int kMaxIters = 100; // for these, based on experiments.

228	178

229 const float reciprocal_power_target =	179 const float reciprocal_power_target =

230 1.f / (power_target + std::numeric_limits<float>::epsilon());	180 1.f / (power_target + std::numeric_limits<float>::epsilon());

231 float lambda_bot = kLambdaBot;	181 float lambda_bot = kLambdaBot;

232 float lambda_top = kLambdaTop;	182 float lambda_top = kLambdaTop;

233 float power_ratio = 2.0f; // Ratio of achieved power to target power.	183 float power_ratio = 2.f; // Ratio of achieved power to target power.

234 int iters = 0;	184 int iters = 0;

235 while (std::fabs(power_ratio - 1.0f) > kConvergeThresh &&	185 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {

236 iters <= kMaxIters) {	186 const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.f;

237 const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.0f;

238 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());	187 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());

239 const float power =	188 const float power =

240 DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_);	189 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);

241 if (power < power_target) {	190 if (power < power_target) {

242 lambda_bot = lambda;	191 lambda_bot = lambda;

243 } else {	192 } else {

244 lambda_top = lambda;	193 lambda_top = lambda;

245 }	194 }

246 power_ratio = std::fabs(power * reciprocal_power_target);	195 power_ratio = std::fabs(power * reciprocal_power_target);

247 ++iters;	196 ++iters;

248 }	197 }

249 }	198 }

250	199

251 void IntelligibilityEnhancer::UpdateErbGains() {	200 void IntelligibilityEnhancer::UpdateErbGains() {

252 // (ERB gain) = filterbank' * (freq gain)	201 // (ERB gain) = filterbank' * (freq gain)

253 float* gains = gain_applier_.target();	202 float* gains = gain_applier_.target();

254 for (size_t i = 0; i < freqs_; ++i) {	203 for (size_t i = 0; i < freqs_; ++i) {

255 gains[i] = 0.0f;	204 gains[i] = 0.f;

256 for (size_t j = 0; j < bank_size_; ++j) {	205 for (size_t j = 0; j < bank_size_; ++j) {

257 gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]);	206 gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]);

258 }	207 }

259 }	208 }

260 }	209 }

261	210

262 size_t IntelligibilityEnhancer::GetBankSize(int sample_rate,	211 size_t IntelligibilityEnhancer::GetBankSize(int sample_rate,

263 size_t erb_resolution) {	212 size_t erb_resolution) {

264 float freq_limit = sample_rate / 2000.0f;	213 float freq_limit = sample_rate / 2000.f;

265 size_t erb_scale = static_cast<size_t>(ceilf(	214 size_t erb_scale = static_cast<size_t>(ceilf(

266 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f));	215 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.f));

267 return erb_scale * erb_resolution;	216 return erb_scale * erb_resolution;

268 }	217 }

269	218

270 std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(	219 std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(

271 size_t num_freqs) {	220 size_t num_freqs) {

272 std::vector<std::vector<float>> filter_bank(bank_size_);	221 std::vector<std::vector<float>> filter_bank(bank_size_);

273 size_t lf = 1, rf = 4;	222 size_t lf = 1, rf = 4;

274	223

275 for (size_t i = 0; i < bank_size_; ++i) {	224 for (size_t i = 0; i < bank_size_; ++i) {

276 float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_));	225 float abs_temp = fabsf((i + 1.f) / static_cast<float>(kErbResolution));

277 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));	226 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));

278 center_freqs_[i] -= 14678.49f;	227 center_freqs_[i] -= 14678.49f;

279 }	228 }

280 float last_center_freq = center_freqs_[bank_size_ - 1];	229 float last_center_freq = center_freqs_[bank_size_ - 1];

281 for (size_t i = 0; i < bank_size_; ++i) {	230 for (size_t i = 0; i < bank_size_; ++i) {

282 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq;	231 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq;

283 }	232 }

284	233

285 for (size_t i = 0; i < bank_size_; ++i) {	234 for (size_t i = 0; i < bank_size_; ++i) {

286 filter_bank[i].resize(num_freqs);	235 filter_bank[i].resize(num_freqs);

287 }	236 }

288	237

289 for (size_t i = 1; i <= bank_size_; ++i) {	238 for (size_t i = 1; i <= bank_size_; ++i) {

290 size_t lll, ll, rr, rrr;	239 size_t lll, ll, rr, rrr;

291 static const size_t kOne = 1; // Avoids repeated static_cast<>s below.	240 lll = static_cast<size_t>(round(center_freqs_[std::max(1ul, i - lf) - 1] *

292 lll = static_cast<size_t>(round(	241 num_freqs / (0.5f * sample_rate_hz_)));

293 center_freqs_[max(kOne, i - lf) - 1] * num_freqs /	242 ll = static_cast<size_t>(round(center_freqs_[std::max(1ul, i) - 1] *

294 (0.5f * sample_rate_hz_)));	243 num_freqs / (0.5f * sample_rate_hz_)));

295 ll = static_cast<size_t>(round(	244 lll = std::min(num_freqs, std::max(lll, 1ul)) - 1;

296 center_freqs_[max(kOne, i) - 1] * num_freqs /	245 ll = std::min(num_freqs, std::max(ll, 1ul)) - 1;

297 (0.5f * sample_rate_hz_)));

298 lll = min(num_freqs, max(lll, kOne)) - 1;

299 ll = min(num_freqs, max(ll, kOne)) - 1;

300	246

301 rrr = static_cast<size_t>(round(	247 rrr = static_cast<size_t>(

302 center_freqs_[min(bank_size_, i + rf) - 1] * num_freqs /	248 round(center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /

303 (0.5f * sample_rate_hz_)));	249 (0.5f * sample_rate_hz_)));

304 rr = static_cast<size_t>(round(	250 rr = static_cast<size_t>(

305 center_freqs_[min(bank_size_, i + 1) - 1] * num_freqs /	251 round(center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /

306 (0.5f * sample_rate_hz_)));	252 (0.5f * sample_rate_hz_)));

307 rrr = min(num_freqs, max(rrr, kOne)) - 1;	253 rrr = std::min(num_freqs, std::max(rrr, 1ul)) - 1;

308 rr = min(num_freqs, max(rr, kOne)) - 1;	254 rr = std::min(num_freqs, std::max(rr, 1ul)) - 1;

309	255

310 float step, element;	256 float step, element;

311	257

312 step = ll == lll ? 0.f : 1.f / (ll - lll);	258 step = ll == lll ? 0.f : 1.f / (ll - lll);

313 element = 0.0f;	259 element = 0.f;

314 for (size_t j = lll; j <= ll; ++j) {	260 for (size_t j = lll; j <= ll; ++j) {

315 filter_bank[i - 1][j] = element;	261 filter_bank[i - 1][j] = element;

316 element += step;	262 element += step;

317 }	263 }

318 step = rr == rrr ? 0.f : 1.f / (rrr - rr);	264 step = rr == rrr ? 0.f : 1.f / (rrr - rr);

319 element = 1.0f;	265 element = 1.f;

320 for (size_t j = rr; j <= rrr; ++j) {	266 for (size_t j = rr; j <= rrr; ++j) {

321 filter_bank[i - 1][j] = element;	267 filter_bank[i - 1][j] = element;

322 element -= step;	268 element -= step;

323 }	269 }

324 for (size_t j = ll; j <= rr; ++j) {	270 for (size_t j = ll; j <= rr; ++j) {

325 filter_bank[i - 1][j] = 1.0f;	271 filter_bank[i - 1][j] = 1.f;

326 }	272 }

327 }	273 }

328	274

329 float sum;	275 float sum;

330 for (size_t i = 0; i < num_freqs; ++i) {	276 for (size_t i = 0; i < num_freqs; ++i) {

331 sum = 0.0f;	277 sum = 0.f;

332 for (size_t j = 0; j < bank_size_; ++j) {	278 for (size_t j = 0; j < bank_size_; ++j) {

333 sum += filter_bank[j][i];	279 sum += filter_bank[j][i];

334 }	280 }

335 for (size_t j = 0; j < bank_size_; ++j) {	281 for (size_t j = 0; j < bank_size_; ++j) {

336 filter_bank[j][i] /= sum;	282 filter_bank[j][i] /= sum;

337 }	283 }

338 }	284 }

339 return filter_bank;	285 return filter_bank;

340 }	286 }

341	287

342 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,	288 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,

343 size_t start_freq,	289 size_t start_freq,

344 float* sols) {	290 float* sols) {

345 bool quadratic = (kConfigRho < 1.0f);	291 bool quadratic = (kRho < 1.f);

346 const float* var_x0 = filtered_clear_var_.get();	292 const float* pow_x0 = filtered_clear_pow_.get();

347 const float* var_n0 = filtered_noise_var_.get();	293 const float* pow_n0 = filtered_noise_pow_.get();

348	294

349 for (size_t n = 0; n < start_freq; ++n) {	295 for (size_t n = 0; n < start_freq; ++n) {

350 sols[n] = 1.0f;	296 sols[n] = 1.f;

351 }	297 }

352	298

353 // Analytic solution for optimal gains. See paper for derivation.	299 // Analytic solution for optimal gains. See paper for derivation.

354 for (size_t n = start_freq - 1; n < bank_size_; ++n) {	300 for (size_t n = start_freq - 1; n < bank_size_; ++n) {

355 float alpha0, beta0, gamma0;	301 float alpha0, beta0, gamma0;

356 gamma0 = 0.5f * rho_[n] * var_x0[n] * var_n0[n] +	302 gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +

357 lambda * var_x0[n] * var_n0[n] * var_n0[n];	303 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];

358 beta0 = lambda * var_x0[n] * (2 - rho_[n]) * var_x0[n] * var_n0[n];	304 beta0 = lambda * pow_x0[n] * (2 - kRho) * pow_x0[n] * pow_n0[n];

359 if (quadratic) {	305 if (quadratic) {

360 alpha0 = lambda * var_x0[n] * (1 - rho_[n]) * var_x0[n] * var_x0[n];	306 alpha0 = lambda * pow_x0[n] * (1 - kRho) * pow_x0[n] * pow_x0[n];

361 sols[n] =	307 sols[n] =

362 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) /	308 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) /

363 (2 * alpha0 + std::numeric_limits<float>::epsilon());	309 (2 * alpha0 + std::numeric_limits<float>::epsilon());

364 } else {	310 } else {

365 sols[n] = -gamma0 / beta0;	311 sols[n] = -gamma0 / beta0;

366 }	312 }

367 sols[n] = fmax(0, sols[n]);	313 sols[n] = fmax(0, sols[n]);

368 }	314 }

369 }	315 }

370	316

371 bool IntelligibilityEnhancer::active() const {	317 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {

372 return active_;	318 FloatToS16(audio, chunk_length_, &audio_s16_[0]);

	319 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);

	320 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {

	321 chunks_since_voice_ = 0;

	322 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {

	323 ++chunks_since_voice_;

	324 }

	325 return chunks_since_voice_ < kSpeechOffsetDelay;

373 }	326 }

374	327

375 } // namespace webrtc	328 } // namespace webrtc

OLD	NEW