webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@pow

Patch Set: Use f for float Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"	11 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"

12	12

13 #include <math.h>	13 #include <math.h>

14 #include <stdlib.h>	14 #include <stdlib.h>

15 #include <algorithm>	15 #include <algorithm>

16 #include <limits>	16 #include <limits>

17 #include <numeric>	17 #include <numeric>

18	18

19 #include "webrtc/base/checks.h"	19 #include "webrtc/base/checks.h"

20 #include "webrtc/common_audio/include/audio_util.h"	20 #include "webrtc/common_audio/include/audio_util.h"

21 #include "webrtc/common_audio/window_generator.h"	21 #include "webrtc/common_audio/window_generator.h"

22	22

23 namespace webrtc {	23 namespace webrtc {

24	24

25 namespace {	25 namespace {

26	26

27 const size_t kErbResolution = 2;	27 const size_t kErbResolution = 2;

28 const int kWindowSizeMs = 16;	28 const int kWindowSizeMs = 16;

29 const int kChunkSizeMs = 10; // Size provided by APM.	29 const int kChunkSizeMs = 10; // Size provided by APM.

30 const float kClipFreq = 200.0f;	30 const float kClipFreqKhz = 0.2f;

31 const float kConfigRho = 0.02f; // Default production and interpretation SNR.

32 const float kKbdAlpha = 1.5f;	31 const float kKbdAlpha = 1.5f;

33 const float kLambdaBot = -1.0f; // Extreme values in bisection	32 const float kLambdaBot = -1.0f; // Extreme values in bisection

34 const float kLambdaTop = -10e-18f; // search for lamda.	33 const float kLambdaTop = -10e-18f; // search for lamda.

	34 const float kVoiceProbabilityThreshold = 0.02f;

	35 // Number of chunks after voice activity which is still considered speech.

	36 const size_t kSpeechOffsetDelay = 80;

	37 const float kDecayRate = 0.98f; // Power estimation decay rate.

	38 const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain.

	39 const float kRho = 0.0004f; // Default production and interpretation SNR.

35	40

36 // Returns dot product of vectors |a| and |b| with size |length|.	41 // Returns dot product of vectors |a| and |b| with size |length|.

37 float DotProduct(const float* a, const float* b, size_t length) {	42 float DotProduct(const float* a, const float* b, size_t length) {

38 float ret = 0.f;	43 float ret = 0.f;

39 for (size_t i = 0; i < length; ++i) {	44 for (size_t i = 0; i < length; ++i) {

40 ret = fmaf(a[i], b[i], ret);	45 ret = fmaf(a[i], b[i], ret);

41 }	46 }

42 return ret;	47 return ret;

43 }	48 }

44	49

(...skipping 20 matching lines...) Expand all Loading...
65 size_t in_channels,	70 size_t in_channels,

66 size_t frames,	71 size_t frames,

67 size_t /* out_channels */,	72 size_t /* out_channels */,

68 std::complex<float>* const* out_block) {	73 std::complex<float>* const* out_block) {

69 RTC_DCHECK_EQ(parent_->freqs_, frames);	74 RTC_DCHECK_EQ(parent_->freqs_, frames);

70 for (size_t i = 0; i < in_channels; ++i) {	75 for (size_t i = 0; i < in_channels; ++i) {

71 parent_->ProcessClearBlock(in_block[i], out_block[i]);	76 parent_->ProcessClearBlock(in_block[i], out_block[i]);

72 }	77 }

73 }	78 }

74	79

75 IntelligibilityEnhancer::IntelligibilityEnhancer()	80 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,

76 : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) {	81 size_t num_render_channels)

77 }

78

79 IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)

80 : freqs_(RealFourier::ComplexLength(	82 : freqs_(RealFourier::ComplexLength(

81 RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))),	83 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),

82 window_size_(static_cast<size_t>(1 << RealFourier::FftOrder(freqs_))),	84 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),

83 chunk_length_(	85 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),

84 static_cast<size_t>(config.sample_rate_hz * kChunkSizeMs / 1000)),	86 sample_rate_hz_(sample_rate_hz),

85 bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)),	87 num_render_channels_(num_render_channels),

86 sample_rate_hz_(config.sample_rate_hz),	88 clear_power_estimator_(freqs_, kDecayRate),

87 erb_resolution_(kErbResolution),	89 noise_power_estimator_(

88 num_capture_channels_(config.num_capture_channels),	90 new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),

89 num_render_channels_(config.num_render_channels),

90 analysis_rate_(config.analysis_rate),

91 active_(true),

92 clear_power_(freqs_, config.decay_rate),

93 noise_power_(freqs_, 0.f),

94 filtered_clear_pow_(new float[bank_size_]),	91 filtered_clear_pow_(new float[bank_size_]),

95 filtered_noise_pow_(new float[bank_size_]),	92 filtered_noise_pow_(new float[bank_size_]),

96 center_freqs_(new float[bank_size_]),	93 center_freqs_(new float[bank_size_]),

97 render_filter_bank_(CreateErbBank(freqs_)),	94 render_filter_bank_(CreateErbBank(freqs_)),

98 rho_(new float[bank_size_]),

99 gains_eq_(new float[bank_size_]),	95 gains_eq_(new float[bank_size_]),

100 gain_applier_(freqs_, config.gain_change_limit),	96 gain_applier_(freqs_, kMaxRelativeGainChange),

101 temp_render_out_buffer_(chunk_length_, num_render_channels_),	97 temp_render_out_buffer_(chunk_length_, num_render_channels_),

102 kbd_window_(new float[window_size_]),

103 render_callback_(this),	98 render_callback_(this),

104 block_count_(0),	99 audio_s16_(chunk_length_),

105 analysis_step_(0) {	100 chunks_since_voice_(kSpeechOffsetDelay),

106 RTC_DCHECK_LE(config.rho, 1.0f);	101 is_speech_(false) {

	102 RTC_DCHECK_LE(kRho, 1.f);

107	103

108 memset(filtered_clear_pow_.get(),	104 memset(filtered_clear_pow_.get(), 0,

109 0,

110 bank_size_ * sizeof(filtered_clear_pow_[0]));	105 bank_size_ * sizeof(filtered_clear_pow_[0]));

111 memset(filtered_noise_pow_.get(),	106 memset(filtered_noise_pow_.get(), 0,

112 0,

113 bank_size_ * sizeof(filtered_noise_pow_[0]));	107 bank_size_ * sizeof(filtered_noise_pow_[0]));

114	108

115 // Assumes all rho equal.	109 const size_t erb_index = static_cast<size_t>(

116 for (size_t i = 0; i < bank_size_; ++i) {	110 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +

117 rho_[i] = config.rho * config.rho;	111 43.f));

118 }	112 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);

119	113

120 float freqs_khz = kClipFreq / 1000.0f;	114 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));

121 size_t erb_index = static_cast<size_t>(ceilf(	115 std::vector<float> kbd_window(window_size);

122 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));	116 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);

123 start_freq_ = std::max(static_cast<size_t>(1), erb_index * erb_resolution_);

124

125 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,

126 kbd_window_.get());

127 render_mangler_.reset(new LappedTransform(	117 render_mangler_.reset(new LappedTransform(

128 num_render_channels_, num_render_channels_, chunk_length_,	118 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],

129 kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_));	119 window_size, window_size / 2, &render_callback_));

130 }	120 }

131	121

132 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(	122 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(

133 std::vector<float> noise) {	123 std::vector<float> noise) {

134 if (capture_filter_bank_.size() != bank_size_ ||	124 if (capture_filter_bank_.size() != bank_size_ ||

135 capture_filter_bank_[0].size() != noise.size()) {	125 capture_filter_bank_[0].size() != noise.size()) {

136 capture_filter_bank_ = CreateErbBank(noise.size());	126 capture_filter_bank_ = CreateErbBank(noise.size());

	127 noise_power_estimator_.reset(

	128 new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));

137 }	129 }

138 if (noise.size() != noise_power_.size()) {	130 noise_power_estimator_->Step(&noise[0]);

139 noise_power_.resize(noise.size());

140 }

141 for (size_t i = 0; i < noise.size(); ++i) {

142 noise_power_[i] = noise[i] * noise[i];

143 }

144 }	131 }

145	132

146 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,	133 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,

147 int sample_rate_hz,	134 int sample_rate_hz,

148 size_t num_channels) {	135 size_t num_channels) {

149 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);	136 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);

150 RTC_CHECK_EQ(num_render_channels_, num_channels);	137 RTC_CHECK_EQ(num_render_channels_, num_channels);

151	138 is_speech_ = IsSpeech(audio[0]);

152 if (active_) {	139 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());

153 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());	140 for (size_t i = 0; i < num_render_channels_; ++i) {

154 }	141 memcpy(audio[i], temp_render_out_buffer_.channels()[i],

155	142 chunk_length_ * sizeof(**audio));

156 if (active_) {

157 for (size_t i = 0; i < num_render_channels_; ++i) {

158 memcpy(audio[i], temp_render_out_buffer_.channels()[i],

159 chunk_length_ * sizeof(**audio));

160 }

161 }	143 }

162 }	144 }

163	145

164 void IntelligibilityEnhancer::ProcessClearBlock(	146 void IntelligibilityEnhancer::ProcessClearBlock(

165 const std::complex<float>* in_block,	147 const std::complex<float>* in_block,

166 std::complex<float>* out_block) {	148 std::complex<float>* out_block) {

167 if (block_count_ < 2) {	149 if (is_speech_) {

168 memset(out_block, 0, freqs_ * sizeof(*out_block));	150 clear_power_estimator_.Step(in_block);

169 ++block_count_;

170 return;

171 }	151 }

172	152 const std::vector<float>& clear_power = clear_power_estimator_.power();

173 // TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary.	153 const std::vector<float>& noise_power = noise_power_estimator_->power();

174 if (true) {	154 MapToErbBands(&clear_power[0], render_filter_bank_,

175 clear_power_.Step(in_block);

176 if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {

177 AnalyzeClearBlock();

178 ++analysis_step_;

179 }

180 ++block_count_;

181 }

182

183 if (active_) {

184 gain_applier_.Apply(in_block, out_block);

185 }

186 }

187

188 void IntelligibilityEnhancer::AnalyzeClearBlock() {

189 const float* clear_power = clear_power_.Power();

190 MapToErbBands(clear_power,

191 render_filter_bank_,

192 filtered_clear_pow_.get());	155 filtered_clear_pow_.get());

193 MapToErbBands(&noise_power_[0],	156 MapToErbBands(&noise_power[0], capture_filter_bank_,

194 capture_filter_bank_,

195 filtered_noise_pow_.get());	157 filtered_noise_pow_.get());

196 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());	158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());

197 const float power_target = std::accumulate(	159 const float power_target =

198 clear_power, clear_power + freqs_, 0.f);	160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);

199 const float power_top =	161 const float power_top =

200 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);

201 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());	163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());

202 const float power_bot =	164 const float power_bot =

203 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);

204 if (power_target >= power_bot && power_target <= power_top) {	166 if (power_target >= power_bot && power_target <= power_top) {

205 SolveForLambda(power_target, power_bot, power_top);	167 SolveForLambda(power_target, power_bot, power_top);

206 UpdateErbGains();	168 UpdateErbGains();

207 } // Else experiencing power underflow, so do nothing.	169 } // Else experiencing power underflow, so do nothing.

	170 gain_applier_.Apply(in_block, out_block);

208 }	171 }

209	172

210 void IntelligibilityEnhancer::SolveForLambda(float power_target,	173 void IntelligibilityEnhancer::SolveForLambda(float power_target,

211 float power_bot,	174 float power_bot,

212 float power_top) {	175 float power_top) {

213 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values	176 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values

214 const int kMaxIters = 100; // for these, based on experiments.	177 const int kMaxIters = 100; // for these, based on experiments.

215	178

216 const float reciprocal_power_target =	179 const float reciprocal_power_target =

217 1.f / (power_target + std::numeric_limits<float>::epsilon());	180 1.f / (power_target + std::numeric_limits<float>::epsilon());

218 float lambda_bot = kLambdaBot;	181 float lambda_bot = kLambdaBot;

219 float lambda_top = kLambdaTop;	182 float lambda_top = kLambdaTop;

220 float power_ratio = 2.0f; // Ratio of achieved power to target power.	183 float power_ratio = 2.f; // Ratio of achieved power to target power.

221 int iters = 0;	184 int iters = 0;

222 while (std::fabs(power_ratio - 1.0f) > kConvergeThresh &&	185 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {

223 iters <= kMaxIters) {	186 const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.f;

224 const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.0f;

225 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());	187 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());

226 const float power =	188 const float power =

227 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);	189 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);

228 if (power < power_target) {	190 if (power < power_target) {

229 lambda_bot = lambda;	191 lambda_bot = lambda;

230 } else {	192 } else {

231 lambda_top = lambda;	193 lambda_top = lambda;

232 }	194 }

233 power_ratio = std::fabs(power * reciprocal_power_target);	195 power_ratio = std::fabs(power * reciprocal_power_target);

234 ++iters;	196 ++iters;

235 }	197 }

236 }	198 }

237	199

238 void IntelligibilityEnhancer::UpdateErbGains() {	200 void IntelligibilityEnhancer::UpdateErbGains() {

239 // (ERB gain) = filterbank' * (freq gain)	201 // (ERB gain) = filterbank' * (freq gain)

240 float* gains = gain_applier_.target();	202 float* gains = gain_applier_.target();

241 for (size_t i = 0; i < freqs_; ++i) {	203 for (size_t i = 0; i < freqs_; ++i) {

242 gains[i] = 0.0f;	204 gains[i] = 0.f;

243 for (size_t j = 0; j < bank_size_; ++j) {	205 for (size_t j = 0; j < bank_size_; ++j) {

244 gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]);	206 gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]);

245 }	207 }

246 }	208 }

247 }	209 }

248	210

249 size_t IntelligibilityEnhancer::GetBankSize(int sample_rate,	211 size_t IntelligibilityEnhancer::GetBankSize(int sample_rate,

250 size_t erb_resolution) {	212 size_t erb_resolution) {

251 float freq_limit = sample_rate / 2000.0f;	213 float freq_limit = sample_rate / 2000.f;

252 size_t erb_scale = static_cast<size_t>(ceilf(	214 size_t erb_scale = static_cast<size_t>(ceilf(

253 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f));	215 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.f));

254 return erb_scale * erb_resolution;	216 return erb_scale * erb_resolution;

255 }	217 }

256	218

257 std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(	219 std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(

258 size_t num_freqs) {	220 size_t num_freqs) {

259 std::vector<std::vector<float>> filter_bank(bank_size_);	221 std::vector<std::vector<float>> filter_bank(bank_size_);

260 size_t lf = 1, rf = 4;	222 size_t lf = 1, rf = 4;

261	223

262 for (size_t i = 0; i < bank_size_; ++i) {	224 for (size_t i = 0; i < bank_size_; ++i) {

263 float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_));	225 float abs_temp = fabsf((i + 1.f) / static_cast<float>(kErbResolution));

264 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));	226 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));

265 center_freqs_[i] -= 14678.49f;	227 center_freqs_[i] -= 14678.49f;

266 }	228 }

267 float last_center_freq = center_freqs_[bank_size_ - 1];	229 float last_center_freq = center_freqs_[bank_size_ - 1];

268 for (size_t i = 0; i < bank_size_; ++i) {	230 for (size_t i = 0; i < bank_size_; ++i) {

269 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq;	231 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq;

270 }	232 }

271	233

272 for (size_t i = 0; i < bank_size_; ++i) {	234 for (size_t i = 0; i < bank_size_; ++i) {

273 filter_bank[i].resize(num_freqs);	235 filter_bank[i].resize(num_freqs);

274 }	236 }

275	237

276 for (size_t i = 1; i <= bank_size_; ++i) {	238 for (size_t i = 1; i <= bank_size_; ++i) {

277 size_t lll, ll, rr, rrr;

278 static const size_t kOne = 1; // Avoids repeated static_cast<>s below.	239 static const size_t kOne = 1; // Avoids repeated static_cast<>s below.

279 lll = static_cast<size_t>(round(	240 size_t lll =

280 center_freqs_[std::max(kOne, i - lf) - 1] * num_freqs /	241 static_cast<size_t>(round(center_freqs_[std::max(kOne, i - lf) - 1] *

281 (0.5f * sample_rate_hz_)));	242 num_freqs / (0.5f * sample_rate_hz_)));

282 ll = static_cast<size_t>(round(	243 size_t ll = static_cast<size_t>(round(center_freqs_[std::max(kOne, i) - 1] *

283 center_freqs_[std::max(kOne, i) - 1] * num_freqs /	244 num_freqs / (0.5f * sample_rate_hz_)));

284 (0.5f * sample_rate_hz_)));

285 lll = std::min(num_freqs, std::max(lll, kOne)) - 1;	245 lll = std::min(num_freqs, std::max(lll, kOne)) - 1;

286 ll = std::min(num_freqs, std::max(ll, kOne)) - 1;	246 ll = std::min(num_freqs, std::max(ll, kOne)) - 1;

287	247

288 rrr = static_cast<size_t>(round(	248 size_t rrr = static_cast<size_t>(

289 center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /	249 round(center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /

290 (0.5f * sample_rate_hz_)));	250 (0.5f * sample_rate_hz_)));

291 rr = static_cast<size_t>(round(	251 size_t rr = static_cast<size_t>(

292 center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /	252 round(center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /

293 (0.5f * sample_rate_hz_)));	253 (0.5f * sample_rate_hz_)));

294 rrr = std::min(num_freqs, std::max(rrr, kOne)) - 1;	254 rrr = std::min(num_freqs, std::max(rrr, kOne)) - 1;

295 rr = std::min(num_freqs, std::max(rr, kOne)) - 1;	255 rr = std::min(num_freqs, std::max(rr, kOne)) - 1;

296	256

297 float step, element;	257 float step = ll == lll ? 0.f : 1.f / (ll - lll);

298	258 float element = 0.f;

299 step = ll == lll ? 0.f : 1.f / (ll - lll);

300 element = 0.0f;

301 for (size_t j = lll; j <= ll; ++j) {	259 for (size_t j = lll; j <= ll; ++j) {

302 filter_bank[i - 1][j] = element;	260 filter_bank[i - 1][j] = element;

303 element += step;	261 element += step;

304 }	262 }

305 step = rr == rrr ? 0.f : 1.f / (rrr - rr);	263 step = rr == rrr ? 0.f : 1.f / (rrr - rr);

306 element = 1.0f;	264 element = 1.f;

307 for (size_t j = rr; j <= rrr; ++j) {	265 for (size_t j = rr; j <= rrr; ++j) {

308 filter_bank[i - 1][j] = element;	266 filter_bank[i - 1][j] = element;

309 element -= step;	267 element -= step;

310 }	268 }

311 for (size_t j = ll; j <= rr; ++j) {	269 for (size_t j = ll; j <= rr; ++j) {

312 filter_bank[i - 1][j] = 1.0f;	270 filter_bank[i - 1][j] = 1.f;

313 }	271 }

314 }	272 }

315	273

316 float sum;

317 for (size_t i = 0; i < num_freqs; ++i) {	274 for (size_t i = 0; i < num_freqs; ++i) {

318 sum = 0.0f;	275 float sum = 0.f;

319 for (size_t j = 0; j < bank_size_; ++j) {	276 for (size_t j = 0; j < bank_size_; ++j) {

320 sum += filter_bank[j][i];	277 sum += filter_bank[j][i];

321 }	278 }

322 for (size_t j = 0; j < bank_size_; ++j) {	279 for (size_t j = 0; j < bank_size_; ++j) {

323 filter_bank[j][i] /= sum;	280 filter_bank[j][i] /= sum;

324 }	281 }

325 }	282 }

326 return filter_bank;	283 return filter_bank;

327 }	284 }

328	285

329 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,	286 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,

330 size_t start_freq,	287 size_t start_freq,

331 float* sols) {	288 float* sols) {

332 bool quadratic = (kConfigRho < 1.0f);	289 bool quadratic = (kRho < 1.f);

333 const float* pow_x0 = filtered_clear_pow_.get();	290 const float* pow_x0 = filtered_clear_pow_.get();

334 const float* pow_n0 = filtered_noise_pow_.get();	291 const float* pow_n0 = filtered_noise_pow_.get();

335	292

336 for (size_t n = 0; n < start_freq; ++n) {	293 for (size_t n = 0; n < start_freq; ++n) {

337 sols[n] = 1.0f;	294 sols[n] = 1.f;

338 }	295 }

339	296

340 // Analytic solution for optimal gains. See paper for derivation.	297 // Analytic solution for optimal gains. See paper for derivation.

341 for (size_t n = start_freq - 1; n < bank_size_; ++n) {	298 for (size_t n = start_freq - 1; n < bank_size_; ++n) {

342 float alpha0, beta0, gamma0;	299 float alpha0, beta0, gamma0;

343 gamma0 = 0.5f * rho_[n] * pow_x0[n] * pow_n0[n] +	300 gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +

344 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];	301 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];

345 beta0 = lambda * pow_x0[n] * (2 - rho_[n]) * pow_x0[n] * pow_n0[n];	302 beta0 = lambda * pow_x0[n] * (2 - kRho) * pow_x0[n] * pow_n0[n];

346 if (quadratic) {	303 if (quadratic) {

347 alpha0 = lambda * pow_x0[n] * (1 - rho_[n]) * pow_x0[n] * pow_x0[n];	304 alpha0 = lambda * pow_x0[n] * (1 - kRho) * pow_x0[n] * pow_x0[n];

348 sols[n] =	305 sols[n] =

349 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) /	306 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) /

350 (2 * alpha0 + std::numeric_limits<float>::epsilon());	307 (2 * alpha0 + std::numeric_limits<float>::epsilon());

351 } else {	308 } else {

352 sols[n] = -gamma0 / beta0;	309 sols[n] = -gamma0 / beta0;

353 }	310 }

354 sols[n] = fmax(0, sols[n]);	311 sols[n] = fmax(0, sols[n]);

355 }	312 }

356 }	313 }

357	314

358 bool IntelligibilityEnhancer::active() const {	315 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {

359 return active_;	316 FloatToS16(audio, chunk_length_, &audio_s16_[0]);

	317 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);

	318 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {

	319 chunks_since_voice_ = 0;

	320 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {

	321 ++chunks_since_voice_;

	322 }

	323 return chunks_since_voice_ < kSpeechOffsetDelay;

360 }	324 }

361	325

362 } // namespace webrtc	326 } // namespace webrtc

OLD	NEW