webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1227213002: Update audio code to use size_t more correctly, webrtc/modules/audio_processing/

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1227213002: Update audio code to use size_t more correctly, webrtc/modules/audio_processing/ (Closed) Base URL: https://chromium.googlesource.com/external/webrtc@master

Patch Set: Resync Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
48	48

49 IntelligibilityEnhancer::TransformCallback::TransformCallback(	49 IntelligibilityEnhancer::TransformCallback::TransformCallback(

50 IntelligibilityEnhancer* parent,	50 IntelligibilityEnhancer* parent,

51 IntelligibilityEnhancer::AudioSource source)	51 IntelligibilityEnhancer::AudioSource source)

52 : parent_(parent), source_(source) {	52 : parent_(parent), source_(source) {

53 }	53 }

54	54

55 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(	55 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(

56 const complex<float>* const* in_block,	56 const complex<float>* const* in_block,

57 int in_channels,	57 int in_channels,

58 int frames,	58 size_t frames,

59 int /* out_channels */,	59 int /* out_channels */,

60 complex<float>* const* out_block) {	60 complex<float>* const* out_block) {

61 DCHECK_EQ(parent_->freqs_, frames);	61 DCHECK_EQ(parent_->freqs_, frames);

62 for (int i = 0; i < in_channels; ++i) {	62 for (int i = 0; i < in_channels; ++i) {

63 parent_->DispatchAudio(source_, in_block[i], out_block[i]);	63 parent_->DispatchAudio(source_, in_block[i], out_block[i]);

64 }	64 }

65 }	65 }

66	66

67 IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution,	67 IntelligibilityEnhancer::IntelligibilityEnhancer(size_t erb_resolution,

68 int sample_rate_hz,	68 int sample_rate_hz,

69 int channels,	69 int channels,

70 int cv_type,	70 int cv_type,

71 float cv_alpha,	71 float cv_alpha,

72 int cv_win,	72 size_t cv_win,

73 int analysis_rate,	73 int analysis_rate,

74 int variance_rate,	74 int variance_rate,

75 float gain_limit)	75 float gain_limit)

76 : freqs_(RealFourier::ComplexLength(	76 : freqs_(RealFourier::ComplexLength(

77 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),	77 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),

78 window_size_(1 << RealFourier::FftOrder(freqs_)),	78 window_size_(static_cast<size_t>(1 << RealFourier::FftOrder(freqs_))),

79 chunk_length_(sample_rate_hz * kChunkSizeMs / 1000),	79 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),

80 bank_size_(GetBankSize(sample_rate_hz, erb_resolution)),	80 bank_size_(GetBankSize(sample_rate_hz, erb_resolution)),

81 sample_rate_hz_(sample_rate_hz),	81 sample_rate_hz_(sample_rate_hz),

82 erb_resolution_(erb_resolution),	82 erb_resolution_(erb_resolution),

83 channels_(channels),	83 channels_(channels),

84 analysis_rate_(analysis_rate),	84 analysis_rate_(analysis_rate),

85 variance_rate_(variance_rate),	85 variance_rate_(variance_rate),

86 clear_variance_(freqs_,	86 clear_variance_(freqs_,

87 static_cast<VarianceType>(cv_type),	87 static_cast<VarianceType>(cv_type),

88 cv_win,	88 cv_win,

89 cv_alpha),	89 cv_alpha),

(...skipping 27 matching lines...) Expand all Loading...
117 temp_out_buffer_ = static_cast<float**>(	117 temp_out_buffer_ = static_cast<float**>(

118 malloc(sizeof(*temp_out_buffer_) * channels_ +	118 malloc(sizeof(*temp_out_buffer_) * channels_ +

119 sizeof(**temp_out_buffer_) * chunk_length_ * channels_));	119 sizeof(**temp_out_buffer_) * chunk_length_ * channels_));

120 for (int i = 0; i < channels_; ++i) {	120 for (int i = 0; i < channels_; ++i) {

121 temp_out_buffer_[i] =	121 temp_out_buffer_[i] =

122 reinterpret_cast<float*>(temp_out_buffer_ + channels_) +	122 reinterpret_cast<float*>(temp_out_buffer_ + channels_) +

123 chunk_length_ * i;	123 chunk_length_ * i;

124 }	124 }

125	125

126 // Assumes all rho equal.	126 // Assumes all rho equal.

127 for (int i = 0; i < bank_size_; ++i) {	127 for (size_t i = 0; i < bank_size_; ++i) {

128 rho_[i] = kConfigRho * kConfigRho;	128 rho_[i] = kConfigRho * kConfigRho;

129 }	129 }

130	130

131 float freqs_khz = kClipFreq / 1000.0f;	131 float freqs_khz = kClipFreq / 1000.0f;

132 int erb_index = static_cast<int>(ceilf(	132 size_t erb_index = static_cast<size_t>(ceilf(

133 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));	133 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));

134 start_freq_ = std::max(1, erb_index * erb_resolution);	134 start_freq_ = std::max(static_cast<size_t>(1), erb_index * erb_resolution);

135	135

136 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,	136 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,

137 kbd_window_.get());	137 kbd_window_.get());

138 render_mangler_.reset(new LappedTransform(	138 render_mangler_.reset(new LappedTransform(

139 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,	139 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,

140 window_size_ / 2, &render_callback_));	140 window_size_ / 2, &render_callback_));

141 capture_mangler_.reset(new LappedTransform(	141 capture_mangler_.reset(new LappedTransform(

142 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,	142 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,

143 window_size_ / 2, &capture_callback_));	143 window_size_ / 2, &capture_callback_));

144 }	144 }

145	145

146 IntelligibilityEnhancer::~IntelligibilityEnhancer() {	146 IntelligibilityEnhancer::~IntelligibilityEnhancer() {

147 WebRtcVad_Free(vad_low_);	147 WebRtcVad_Free(vad_low_);

148 WebRtcVad_Free(vad_high_);	148 WebRtcVad_Free(vad_high_);

149 free(temp_out_buffer_);	149 free(temp_out_buffer_);

150 }	150 }

151	151

152 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {	152 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {

153 for (int i = 0; i < chunk_length_; ++i) {	153 for (size_t i = 0; i < chunk_length_; ++i) {

154 vad_tmp_buffer_[i] = (int16_t)audio[0][i];	154 vad_tmp_buffer_[i] = (int16_t)audio[0][i];

155 }	155 }

156 has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_,	156 has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_,

157 vad_tmp_buffer_.get(), chunk_length_) == 1;	157 vad_tmp_buffer_.get(), chunk_length_) == 1;

158	158

159 // Process and enhance chunk of |audio|	159 // Process and enhance chunk of |audio|

160 render_mangler_->ProcessChunk(audio, temp_out_buffer_);	160 render_mangler_->ProcessChunk(audio, temp_out_buffer_);

161	161

162 for (int i = 0; i < channels_; ++i) {	162 for (int i = 0; i < channels_; ++i) {

163 memcpy(audio[i], temp_out_buffer_[i],	163 memcpy(audio[i], temp_out_buffer_[i],

164 chunk_length_ * sizeof(**temp_out_buffer_));	164 chunk_length_ * sizeof(**temp_out_buffer_));

165 }	165 }

166 }	166 }

167	167

168 void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) {	168 void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) {

169 for (int i = 0; i < chunk_length_; ++i) {	169 for (size_t i = 0; i < chunk_length_; ++i) {

170 vad_tmp_buffer_[i] = (int16_t)audio[0][i];	170 vad_tmp_buffer_[i] = (int16_t)audio[0][i];

171 }	171 }

172 // TODO(bercic): The VAD was always detecting voice in the noise stream,	172 // TODO(bercic): The VAD was always detecting voice in the noise stream,

173 // no matter what the aggressiveness, so it was temporarily disabled here.	173 // no matter what the aggressiveness, so it was temporarily disabled here.

174	174

175 #if 0	175 #if 0

176 if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(),	176 if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(),

177 chunk_length_) == 1) {	177 chunk_length_) == 1) {

178 printf("capture HAS speech\n");	178 printf("capture HAS speech\n");

179 return;	179 return;

(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
269 lambda_top = lambda;	269 lambda_top = lambda;

270 }	270 }

271 power_ratio = std::fabs(power * reciprocal_power_target);	271 power_ratio = std::fabs(power * reciprocal_power_target);

272 ++iters;	272 ++iters;

273 }	273 }

274 }	274 }

275	275

276 void IntelligibilityEnhancer::UpdateErbGains() {	276 void IntelligibilityEnhancer::UpdateErbGains() {

277 // (ERB gain) = filterbank' * (freq gain)	277 // (ERB gain) = filterbank' * (freq gain)

278 float* gains = gain_applier_.target();	278 float* gains = gain_applier_.target();

279 for (int i = 0; i < freqs_; ++i) {	279 for (size_t i = 0; i < freqs_; ++i) {

280 gains[i] = 0.0f;	280 gains[i] = 0.0f;

281 for (int j = 0; j < bank_size_; ++j) {	281 for (size_t j = 0; j < bank_size_; ++j) {

282 gains[i] = fmaf(filter_bank_[j][i], gains_eq_[j], gains[i]);	282 gains[i] = fmaf(filter_bank_[j][i], gains_eq_[j], gains[i]);

283 }	283 }

284 }	284 }

285 }	285 }

286	286

287 void IntelligibilityEnhancer::ProcessNoiseBlock(const complex<float>* in_block,	287 void IntelligibilityEnhancer::ProcessNoiseBlock(const complex<float>* in_block,

288 complex<float>* /*out_block*/) {	288 complex<float>* /*out_block*/) {

289 noise_variance_.Step(in_block);	289 noise_variance_.Step(in_block);

290 }	290 }

291	291

292 int IntelligibilityEnhancer::GetBankSize(int sample_rate, int erb_resolution) {	292 size_t IntelligibilityEnhancer::GetBankSize(int sample_rate,

	293 size_t erb_resolution) {

293 float freq_limit = sample_rate / 2000.0f;	294 float freq_limit = sample_rate / 2000.0f;

294 int erb_scale = ceilf(	295 size_t erb_scale = static_cast<size_t>(ceilf(

295 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f);	296 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f));

296 return erb_scale * erb_resolution;	297 return erb_scale * erb_resolution;

297 }	298 }

298	299

299 void IntelligibilityEnhancer::CreateErbBank() {	300 void IntelligibilityEnhancer::CreateErbBank() {

300 int lf = 1, rf = 4;	301 size_t lf = 1, rf = 4;

301	302

302 for (int i = 0; i < bank_size_; ++i) {	303 for (size_t i = 0; i < bank_size_; ++i) {

303 float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_));	304 float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_));

304 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));	305 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));

305 center_freqs_[i] -= 14678.49f;	306 center_freqs_[i] -= 14678.49f;

306 }	307 }

307 float last_center_freq = center_freqs_[bank_size_ - 1];	308 float last_center_freq = center_freqs_[bank_size_ - 1];

308 for (int i = 0; i < bank_size_; ++i) {	309 for (size_t i = 0; i < bank_size_; ++i) {

309 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq;	310 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq;

310 }	311 }

311	312

312 for (int i = 0; i < bank_size_; ++i) {	313 for (size_t i = 0; i < bank_size_; ++i) {

313 filter_bank_[i].resize(freqs_);	314 filter_bank_[i].resize(freqs_);

314 }	315 }

315	316

316 for (int i = 1; i <= bank_size_; ++i) {	317 for (size_t i = 1; i <= bank_size_; ++i) {

317 int lll, ll, rr, rrr;	318 size_t lll, ll, rr, rrr;

318 lll = round(center_freqs_[max(1, i - lf) - 1] * freqs_ /	319 static const size_t kOne = 1; // Avoids repeated static_cast<>s below.

319 (0.5f * sample_rate_hz_));	320 lll = static_cast<size_t>(round(

320 ll =	321 center_freqs_[max(kOne, i - lf) - 1] * freqs_ /

321 round(center_freqs_[max(1, i) - 1] * freqs_ / (0.5f * sample_rate_hz_));	322 (0.5f * sample_rate_hz_)));

322 lll = min(freqs_, max(lll, 1)) - 1;	323 ll = static_cast<size_t>(round(

323 ll = min(freqs_, max(ll, 1)) - 1;	324 center_freqs_[max(kOne, i) - 1] * freqs_ / (0.5f * sample_rate_hz_)));

	325 lll = min(freqs_, max(lll, kOne)) - 1;

	326 ll = min(freqs_, max(ll, kOne)) - 1;

324	327

325 rrr = round(center_freqs_[min(bank_size_, i + rf) - 1] * freqs_ /	328 rrr = static_cast<size_t>(round(

326 (0.5f * sample_rate_hz_));	329 center_freqs_[min(bank_size_, i + rf) - 1] * freqs_ /

327 rr = round(center_freqs_[min(bank_size_, i + 1) - 1] * freqs_ /	330 (0.5f * sample_rate_hz_)));

328 (0.5f * sample_rate_hz_));	331 rr = static_cast<size_t>(round(

329 rrr = min(freqs_, max(rrr, 1)) - 1;	332 center_freqs_[min(bank_size_, i + 1) - 1] * freqs_ /

330 rr = min(freqs_, max(rr, 1)) - 1;	333 (0.5f * sample_rate_hz_)));

	334 rrr = min(freqs_, max(rrr, kOne)) - 1;

	335 rr = min(freqs_, max(rr, kOne)) - 1;

331	336

332 float step, element;	337 float step, element;

333	338

334 step = 1.0f / (ll - lll);	339 step = 1.0f / (ll - lll);

335 element = 0.0f;	340 element = 0.0f;

336 for (int j = lll; j <= ll; ++j) {	341 for (size_t j = lll; j <= ll; ++j) {

337 filter_bank_[i - 1][j] = element;	342 filter_bank_[i - 1][j] = element;

338 element += step;	343 element += step;

339 }	344 }

340 step = 1.0f / (rrr - rr);	345 step = 1.0f / (rrr - rr);

341 element = 1.0f;	346 element = 1.0f;

342 for (int j = rr; j <= rrr; ++j) {	347 for (size_t j = rr; j <= rrr; ++j) {

343 filter_bank_[i - 1][j] = element;	348 filter_bank_[i - 1][j] = element;

344 element -= step;	349 element -= step;

345 }	350 }

346 for (int j = ll; j <= rr; ++j) {	351 for (size_t j = ll; j <= rr; ++j) {

347 filter_bank_[i - 1][j] = 1.0f;	352 filter_bank_[i - 1][j] = 1.0f;

348 }	353 }

349 }	354 }

350	355

351 float sum;	356 float sum;

352 for (int i = 0; i < freqs_; ++i) {	357 for (size_t i = 0; i < freqs_; ++i) {

353 sum = 0.0f;	358 sum = 0.0f;

354 for (int j = 0; j < bank_size_; ++j) {	359 for (size_t j = 0; j < bank_size_; ++j) {

355 sum += filter_bank_[j][i];	360 sum += filter_bank_[j][i];

356 }	361 }

357 for (int j = 0; j < bank_size_; ++j) {	362 for (size_t j = 0; j < bank_size_; ++j) {

358 filter_bank_[j][i] /= sum;	363 filter_bank_[j][i] /= sum;

359 }	364 }

360 }	365 }

361 }	366 }

362	367

363 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,	368 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,

364 int start_freq,	369 size_t start_freq,

365 float* sols) {	370 float* sols) {

366 bool quadratic = (kConfigRho < 1.0f);	371 bool quadratic = (kConfigRho < 1.0f);

367 const float* var_x0 = filtered_clear_var_.get();	372 const float* var_x0 = filtered_clear_var_.get();

368 const float* var_n0 = filtered_noise_var_.get();	373 const float* var_n0 = filtered_noise_var_.get();

369	374

370 for (int n = 0; n < start_freq; ++n) {	375 for (size_t n = 0; n < start_freq; ++n) {

371 sols[n] = 1.0f;	376 sols[n] = 1.0f;

372 }	377 }

373	378

374 // Analytic solution for optimal gains. See paper for derivation.	379 // Analytic solution for optimal gains. See paper for derivation.

375 for (int n = start_freq - 1; n < bank_size_; ++n) {	380 for (size_t n = start_freq - 1; n < bank_size_; ++n) {

376 float alpha0, beta0, gamma0;	381 float alpha0, beta0, gamma0;

377 gamma0 = 0.5f * rho_[n] * var_x0[n] * var_n0[n] +	382 gamma0 = 0.5f * rho_[n] * var_x0[n] * var_n0[n] +

378 lambda * var_x0[n] * var_n0[n] * var_n0[n];	383 lambda * var_x0[n] * var_n0[n] * var_n0[n];

379 beta0 = lambda * var_x0[n] * (2 - rho_[n]) * var_x0[n] * var_n0[n];	384 beta0 = lambda * var_x0[n] * (2 - rho_[n]) * var_x0[n] * var_n0[n];

380 if (quadratic) {	385 if (quadratic) {

381 alpha0 = lambda * var_x0[n] * (1 - rho_[n]) * var_x0[n] * var_x0[n];	386 alpha0 = lambda * var_x0[n] * (1 - rho_[n]) * var_x0[n] * var_x0[n];

382 sols[n] =	387 sols[n] =

383 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) / (2 * alpha0);	388 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) / (2 * alpha0);

384 } else {	389 } else {

385 sols[n] = -gamma0 / beta0;	390 sols[n] = -gamma0 / beta0;

386 }	391 }

387 sols[n] = fmax(0, sols[n]);	392 sols[n] = fmax(0, sols[n]);

388 }	393 }

389 }	394 }

390	395

391 void IntelligibilityEnhancer::FilterVariance(const float* var, float* result) {	396 void IntelligibilityEnhancer::FilterVariance(const float* var, float* result) {

392 DCHECK_GT(freqs_, 0);	397 DCHECK_GT(freqs_, 0u);

393 for (int i = 0; i < bank_size_; ++i) {	398 for (size_t i = 0; i < bank_size_; ++i) {

394 result[i] = DotProduct(&filter_bank_[i][0], var, freqs_);	399 result[i] = DotProduct(&filter_bank_[i][0], var, freqs_);

395 }	400 }

396 }	401 }

397	402

398 float IntelligibilityEnhancer::DotProduct(const float* a,	403 float IntelligibilityEnhancer::DotProduct(const float* a,

399 const float* b,	404 const float* b,

400 int length) {	405 size_t length) {

401 float ret = 0.0f;	406 float ret = 0.0f;

402	407

403 for (int i = 0; i < length; ++i) {	408 for (size_t i = 0; i < length; ++i) {

404 ret = fmaf(a[i], b[i], ret);	409 ret = fmaf(a[i], b[i], ret);

405 }	410 }

406 return ret;	411 return ret;

407 }	412 }

408	413

409 } // namespace webrtc	414 } // namespace webrtc

OLD	NEW