webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc - Issue 2320833002: Compensate for the IntelligibilityEnhancer processing delay in high bands

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc

Issue 2320833002: Compensate for the IntelligibilityEnhancer processing delay in high bands (Closed)

Patch Set: optimize Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 184 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
195 const float kTestNonZeroVarLambdaTop[] = {	195 const float kTestNonZeroVarLambdaTop[] = {

196 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f,	196 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f,

197 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,	197 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,

198 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};	198 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

199 static_assert(arraysize(kTestCenterFreqs) ==	199 static_assert(arraysize(kTestCenterFreqs) ==

200 arraysize(kTestNonZeroVarLambdaTop),	200 arraysize(kTestNonZeroVarLambdaTop),

201 "Power test data badly initialized.");	201 "Power test data badly initialized.");

202 const float kMaxTestError = 0.005f;	202 const float kMaxTestError = 0.005f;

203	203

204 // Enhancer initialization parameters.	204 // Enhancer initialization parameters.

205 const int kSamples = 1000;	205 const int kSamples = 10000;

206 const int kSampleRate = 4000;	206 const int kSampleRate = 4000;

207 const int kNumChannels = 1;	207 const int kNumChannels = 1;

208 const int kFragmentSize = kSampleRate / 100;	208 const int kFragmentSize = kSampleRate / 100;

209 const size_t kNumNoiseBins = 129;	209 const size_t kNumNoiseBins = 129;

	210 const size_t kNumBands = 1;

210	211

211 // Number of frames to process in the bitexactness tests.	212 // Number of frames to process in the bitexactness tests.

212 const size_t kNumFramesToProcess = 1000;	213 const size_t kNumFramesToProcess = 1000;

213	214

214 int IntelligibilityEnhancerSampleRate(int sample_rate_hz) {	215 int IntelligibilityEnhancerSampleRate(int sample_rate_hz) {

215 return (sample_rate_hz > AudioProcessing::kSampleRate16kHz	216 return (sample_rate_hz > AudioProcessing::kSampleRate16kHz

216 ? AudioProcessing::kSampleRate16kHz	217 ? AudioProcessing::kSampleRate16kHz

217 : sample_rate_hz);	218 : sample_rate_hz);

218 }	219 }

219	220

220 // Process one frame of data and produce the output.	221 // Process one frame of data and produce the output.

221 void ProcessOneFrame(int sample_rate_hz,	222 void ProcessOneFrame(int sample_rate_hz,

222 AudioBuffer* render_audio_buffer,	223 AudioBuffer* render_audio_buffer,

223 AudioBuffer* capture_audio_buffer,	224 AudioBuffer* capture_audio_buffer,

224 NoiseSuppressionImpl* noise_suppressor,	225 NoiseSuppressionImpl* noise_suppressor,

225 IntelligibilityEnhancer* intelligibility_enhancer) {	226 IntelligibilityEnhancer* intelligibility_enhancer) {

226 if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) {	227 if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) {

227 render_audio_buffer->SplitIntoFrequencyBands();	228 render_audio_buffer->SplitIntoFrequencyBands();

228 capture_audio_buffer->SplitIntoFrequencyBands();	229 capture_audio_buffer->SplitIntoFrequencyBands();

229 }	230 }

230	231

231 intelligibility_enhancer->ProcessRenderAudio(	232 intelligibility_enhancer->ProcessRenderAudio(render_audio_buffer);

232 render_audio_buffer->split_channels_f(kBand0To8kHz),

233 IntelligibilityEnhancerSampleRate(sample_rate_hz),

234 render_audio_buffer->num_channels());

235	233

236 noise_suppressor->AnalyzeCaptureAudio(capture_audio_buffer);	234 noise_suppressor->AnalyzeCaptureAudio(capture_audio_buffer);

237 noise_suppressor->ProcessCaptureAudio(capture_audio_buffer);	235 noise_suppressor->ProcessCaptureAudio(capture_audio_buffer);

238	236

239 intelligibility_enhancer->SetCaptureNoiseEstimate(	237 intelligibility_enhancer->SetCaptureNoiseEstimate(

240 noise_suppressor->NoiseEstimate(), 0);	238 noise_suppressor->NoiseEstimate(), 0);

241	239

242 if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) {	240 if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) {

243 render_audio_buffer->MergeFrequencyBands();	241 render_audio_buffer->MergeFrequencyBands();

244 }	242 }

(...skipping 24 matching lines...) Expand all Loading...
269 std::vector<float> capture_input(render_buffer.num_frames() *	267 std::vector<float> capture_input(render_buffer.num_frames() *

270 capture_buffer.num_channels());	268 capture_buffer.num_channels());

271	269

272 rtc::CriticalSection crit_capture;	270 rtc::CriticalSection crit_capture;

273 NoiseSuppressionImpl noise_suppressor(&crit_capture);	271 NoiseSuppressionImpl noise_suppressor(&crit_capture);

274 noise_suppressor.Initialize(capture_config.num_channels(), sample_rate_hz);	272 noise_suppressor.Initialize(capture_config.num_channels(), sample_rate_hz);

275 noise_suppressor.Enable(true);	273 noise_suppressor.Enable(true);

276	274

277 IntelligibilityEnhancer intelligibility_enhancer(	275 IntelligibilityEnhancer intelligibility_enhancer(

278 IntelligibilityEnhancerSampleRate(sample_rate_hz),	276 IntelligibilityEnhancerSampleRate(sample_rate_hz),

279 render_config.num_channels(), NoiseSuppressionImpl::num_noise_bins());	277 render_config.num_channels(), kNumBands,

	278 NoiseSuppressionImpl::num_noise_bins());

280	279

281 for (size_t frame_no = 0u; frame_no < kNumFramesToProcess; ++frame_no) {	280 for (size_t frame_no = 0u; frame_no < kNumFramesToProcess; ++frame_no) {

282 ReadFloatSamplesFromStereoFile(render_buffer.num_frames(),	281 ReadFloatSamplesFromStereoFile(render_buffer.num_frames(),

283 render_buffer.num_channels(), &render_file,	282 render_buffer.num_channels(), &render_file,

284 render_input);	283 render_input);

285 ReadFloatSamplesFromStereoFile(capture_buffer.num_frames(),	284 ReadFloatSamplesFromStereoFile(capture_buffer.num_frames(),

286 capture_buffer.num_channels(), &capture_file,	285 capture_buffer.num_channels(), &capture_file,

287 capture_input);	286 capture_input);

288	287

289 test::CopyVectorToAudioBuffer(render_config, render_input, &render_buffer);	288 test::CopyVectorToAudioBuffer(render_config, render_input, &render_buffer);

(...skipping 23 matching lines...) Expand all Loading...
313	312

314 float float_rand() {	313 float float_rand() {

315 return std::rand() * 2.f / RAND_MAX - 1;	314 return std::rand() * 2.f / RAND_MAX - 1;

316 }	315 }

317	316

318 } // namespace	317 } // namespace

319	318

320 class IntelligibilityEnhancerTest : public ::testing::Test {	319 class IntelligibilityEnhancerTest : public ::testing::Test {

321 protected:	320 protected:

322 IntelligibilityEnhancerTest()	321 IntelligibilityEnhancerTest()

323 : clear_data_(kSamples), noise_data_(kSamples), orig_data_(kSamples) {	322 : clear_buffer_(kFragmentSize,

	323 kNumChannels,

	324 kFragmentSize,

	325 kNumChannels,

	326 kFragmentSize),

	327 stream_config_(kSampleRate, kNumChannels),

	328 clear_data_(kSamples),

	329 noise_data_(kNumNoiseBins),

	330 orig_data_(kSamples) {

324 std::srand(1);	331 std::srand(1);

325 enh_.reset(	332 enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels, kNumBands,

326 new IntelligibilityEnhancer(kSampleRate, kNumChannels, kNumNoiseBins));	333 kNumNoiseBins));

327 }	334 }

328	335

329 bool CheckUpdate() {	336 bool CheckUpdate() {

330 enh_.reset(	337 enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels, kNumBands,

331 new IntelligibilityEnhancer(kSampleRate, kNumChannels, kNumNoiseBins));	338 kNumNoiseBins));

332 float* clear_cursor = clear_data_.data();	339 float* clear_cursor = clear_data_.data();

333 float* noise_cursor = noise_data_.data();

334 for (int i = 0; i < kSamples; i += kFragmentSize) {	340 for (int i = 0; i < kSamples; i += kFragmentSize) {

335 enh_->ProcessRenderAudio(&clear_cursor, kSampleRate, kNumChannels);	341 enh_->SetCaptureNoiseEstimate(noise_data_, 1);

	342 clear_buffer_.CopyFrom(&clear_cursor, stream_config_);

	343 enh_->ProcessRenderAudio(&clear_buffer_);

	344 clear_buffer_.CopyTo(stream_config_, &clear_cursor);

336 clear_cursor += kFragmentSize;	345 clear_cursor += kFragmentSize;

337 noise_cursor += kFragmentSize;

338 }	346 }

339 for (int i = 0; i < kSamples; i++) {	347 for (int i = initial_delay_; i < kSamples; i++) {

340 if (std::fabs(clear_data_[i] - orig_data_[i]) > kMaxTestError) {	348 if (std::fabs(clear_data_[i] - orig_data_[i - initial_delay_]) >

	349 kMaxTestError) {

341 return true;	350 return true;

342 }	351 }

343 }	352 }

344 return false;	353 return false;

345 }	354 }

346	355

347 std::unique_ptr<IntelligibilityEnhancer> enh_;	356 std::unique_ptr<IntelligibilityEnhancer> enh_;

	357 // Render clean speech buffer.

	358 AudioBuffer clear_buffer_;

	359 StreamConfig stream_config_;

348 std::vector<float> clear_data_;	360 std::vector<float> clear_data_;

349 std::vector<float> noise_data_;	361 std::vector<float> noise_data_;

350 std::vector<float> orig_data_;	362 std::vector<float> orig_data_;

	363 size_t initial_delay_;

351 };	364 };

352	365

353 // For each class of generated data, tests that render stream is updated when	366 // For each class of generated data, tests that render stream is updated when

354 // it should be.	367 // it should be.

355 TEST_F(IntelligibilityEnhancerTest, TestRenderUpdate) {	368 TEST_F(IntelligibilityEnhancerTest, TestRenderUpdate) {

	369 initial_delay_ = enh_->render_mangler_->initial_delay();

356 std::fill(noise_data_.begin(), noise_data_.end(), 0.f);	370 std::fill(noise_data_.begin(), noise_data_.end(), 0.f);

357 std::fill(orig_data_.begin(), orig_data_.end(), 0.f);	371 std::fill(orig_data_.begin(), orig_data_.end(), 0.f);

358 std::fill(clear_data_.begin(), clear_data_.end(), 0.f);	372 std::fill(clear_data_.begin(), clear_data_.end(), 0.f);

359 EXPECT_FALSE(CheckUpdate());	373 EXPECT_FALSE(CheckUpdate());

360 std::generate(noise_data_.begin(), noise_data_.end(), float_rand);	374 std::generate(clear_data_.begin(), clear_data_.end(), float_rand);

	375 orig_data_ = clear_data_;

361 EXPECT_FALSE(CheckUpdate());	376 EXPECT_FALSE(CheckUpdate());

362 std::generate(clear_data_.begin(), clear_data_.end(), float_rand);	377 std::generate(clear_data_.begin(), clear_data_.end(), float_rand);

363 orig_data_ = clear_data_;	378 orig_data_ = clear_data_;

	379 std::generate(noise_data_.begin(), noise_data_.end(), float_rand);

	380 FloatToFloatS16(noise_data_.data(), noise_data_.size(), noise_data_.data());

364 EXPECT_TRUE(CheckUpdate());	381 EXPECT_TRUE(CheckUpdate());

365 }	382 }

366	383

367 // Tests ERB bank creation, comparing against matlab output.	384 // Tests ERB bank creation, comparing against matlab output.

368 TEST_F(IntelligibilityEnhancerTest, TestErbCreation) {	385 TEST_F(IntelligibilityEnhancerTest, TestErbCreation) {

369 ASSERT_EQ(arraysize(kTestCenterFreqs), enh_->bank_size_);	386 ASSERT_EQ(arraysize(kTestCenterFreqs), enh_->bank_size_);

370 for (size_t i = 0; i < enh_->bank_size_; ++i) {	387 for (size_t i = 0; i < enh_->bank_size_; ++i) {

371 EXPECT_NEAR(kTestCenterFreqs[i], enh_->center_freqs_[i], kMaxTestError);	388 EXPECT_NEAR(kTestCenterFreqs[i], enh_->center_freqs_[i], kMaxTestError);

372 ASSERT_EQ(arraysize(kTestFilterBank[0]), enh_->freqs_);	389 ASSERT_EQ(arraysize(kTestFilterBank[0]), enh_->freqs_);

373 for (size_t j = 0; j < enh_->freqs_; ++j) {	390 for (size_t j = 0; j < enh_->freqs_; ++j) {

(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
411 const float kTolerance = 0.007f;	428 const float kTolerance = 0.007f;

412 std::vector<float> noise(kNumNoiseBins);	429 std::vector<float> noise(kNumNoiseBins);

413 std::vector<float> noise_psd(kNumNoiseBins);	430 std::vector<float> noise_psd(kNumNoiseBins);

414 std::generate(noise.begin(), noise.end(), float_rand);	431 std::generate(noise.begin(), noise.end(), float_rand);

415 for (size_t i = 0; i < kNumNoiseBins; ++i) {	432 for (size_t i = 0; i < kNumNoiseBins; ++i) {

416 noise_psd[i] = kGain * kGain * noise[i] * noise[i];	433 noise_psd[i] = kGain * kGain * noise[i] * noise[i];

417 }	434 }

418 float* clear_cursor = clear_data_.data();	435 float* clear_cursor = clear_data_.data();

419 for (size_t i = 0; i < kNumFramesToProcess; ++i) {	436 for (size_t i = 0; i < kNumFramesToProcess; ++i) {

420 enh_->SetCaptureNoiseEstimate(noise, kGain);	437 enh_->SetCaptureNoiseEstimate(noise, kGain);

421 enh_->ProcessRenderAudio(&clear_cursor, kSampleRate, kNumChannels);	438 clear_buffer_.CopyFrom(&clear_cursor, stream_config_);

	439 enh_->ProcessRenderAudio(&clear_buffer_);

422 }	440 }

423 const std::vector<float>& estimated_psd =	441 const std::vector<float>& estimated_psd =

424 enh_->noise_power_estimator_.power();	442 enh_->noise_power_estimator_.power();

425 for (size_t i = 0; i < kNumNoiseBins; ++i) {	443 for (size_t i = 0; i < kNumNoiseBins; ++i) {

426 EXPECT_LT(std::abs(estimated_psd[i] - noise_psd[i]) / noise_psd[i],	444 EXPECT_LT(std::abs(estimated_psd[i] - noise_psd[i]) / noise_psd[i],

427 kTolerance);	445 kTolerance);

428 }	446 }

429 }	447 }

430	448

	449 TEST_F(IntelligibilityEnhancerTest, TestAllBandsHaveSameDelay) {

	450 const int kTestSampleRate = AudioProcessing::kSampleRate32kHz;

	451 const int kTestSplitRate = AudioProcessing::kSampleRate16kHz;

	452 const size_t kTestNumBands =

	453 rtc::CheckedDivExact(kTestSampleRate, kTestSplitRate);

	454 const size_t kTestFragmentSize = rtc::CheckedDivExact(kTestSampleRate, 100);

	455 const size_t kTestSplitFragmentSize =

	456 rtc::CheckedDivExact(kTestSplitRate, 100);

	457 enh_.reset(new IntelligibilityEnhancer(kTestSplitRate, kNumChannels,

	458 kTestNumBands, kNumNoiseBins));

	459 size_t initial_delay = enh_->render_mangler_->initial_delay();

	460 std::vector<float> rand_gen_buf(kTestFragmentSize);

	461 AudioBuffer original_buffer(kTestFragmentSize, kNumChannels,

	462 kTestFragmentSize, kNumChannels,

	463 kTestFragmentSize);

	464 AudioBuffer audio_buffer(kTestFragmentSize, kNumChannels, kTestFragmentSize,

	465 kNumChannels, kTestFragmentSize);

	466 for (size_t i = 0u; i < kTestNumBands; ++i) {

	467 std::generate(rand_gen_buf.begin(), rand_gen_buf.end(), float_rand);

	468 original_buffer.split_data_f()->SetDataForTesting(rand_gen_buf.data(),

	469 rand_gen_buf.size());

	470 audio_buffer.split_data_f()->SetDataForTesting(rand_gen_buf.data(),

	471 rand_gen_buf.size());

	472 }

	473 enh_->ProcessRenderAudio(&audio_buffer);

	474 for (size_t i = 0u; i < kTestNumBands; ++i) {

	475 const float* original_ptr = original_buffer.split_bands_const_f(0)[i];

	476 const float* audio_ptr = audio_buffer.split_bands_const_f(0)[i];

	477 for (size_t j = initial_delay; j < kTestSplitFragmentSize; ++j) {

	478 EXPECT_LT(std::fabs(original_ptr[j - initial_delay] - audio_ptr[j]),

	479 kMaxTestError);

	480 }

	481 }

	482 }

	483

431 TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Mono8kHz) {	484 TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Mono8kHz) {

432 const float kOutputReference[] = {-0.001892f, -0.003296f, -0.001953f};	485 const float kOutputReference[] = {-0.001892f, -0.003296f, -0.001953f};

433	486

434 RunBitexactnessTest(AudioProcessing::kSampleRate8kHz, 1, kOutputReference);	487 RunBitexactnessTest(AudioProcessing::kSampleRate8kHz, 1, kOutputReference);

435 }	488 }

436	489

437 TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Mono16kHz) {	490 TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Mono16kHz) {

438 const float kOutputReference[] = {-0.000977f, -0.003296f, -0.002441f};	491 const float kOutputReference[] = {-0.000977f, -0.003296f, -0.002441f};

439	492

440 RunBitexactnessTest(AudioProcessing::kSampleRate16kHz, 1, kOutputReference);	493 RunBitexactnessTest(AudioProcessing::kSampleRate16kHz, 1, kOutputReference);

(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
474 }	527 }

475	528

476 TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Stereo48kHz) {	529 TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Stereo48kHz) {

477 const float kOutputReference[] = {-0.009276f, -0.001601f, -0.008255f,	530 const float kOutputReference[] = {-0.009276f, -0.001601f, -0.008255f,

478 -0.012975f, -0.015940f, -0.017820f};	531 -0.012975f, -0.015940f, -0.017820f};

479	532

480 RunBitexactnessTest(AudioProcessing::kSampleRate48kHz, 2, kOutputReference);	533 RunBitexactnessTest(AudioProcessing::kSampleRate48kHz, 2, kOutputReference);

481 }	534 }

482	535

483 } // namespace webrtc	536 } // namespace webrtc

OLD	NEW