webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc - Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc

Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@pow

Patch Set: Make gain change limit relative Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc ('K') | « webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc ('k') | webrtc/modules/audio_processing/noise_suppression_impl.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 //	11 //

12 // Command line tool for speech intelligibility enhancement. Provides for	12 // Command line tool for speech intelligibility enhancement. Provides for

13 // running and testing intelligibility_enhancer as an independent process.	13 // running and testing intelligibility_enhancer as an independent process.

14 // Use --help for options.	14 // Use --help for options.

15 //	15 //

16	16

17 #include <stdint.h>	17 #include <stdint.h>

18 #include <stdlib.h>	18 #include <stdlib.h>

19 #include <sys/stat.h>	19 #include <sys/stat.h>

20 #include <sys/types.h>	20 #include <sys/types.h>

21 #include <string>	21 #include <string>

22	22

23 #include "gflags/gflags.h"	23 #include "gflags/gflags.h"

24 #include "testing/gtest/include/gtest/gtest.h"	24 #include "testing/gtest/include/gtest/gtest.h"

25 #include "webrtc/base/checks.h"	25 #include "webrtc/base/checks.h"

26 #include "webrtc/base/criticalsection.h"	26 #include "webrtc/base/criticalsection.h"

	27 #include "webrtc/common_audio/include/audio_util.h"

27 #include "webrtc/common_audio/real_fourier.h"	28 #include "webrtc/common_audio/real_fourier.h"

28 #include "webrtc/common_audio/wav_file.h"	29 #include "webrtc/common_audio/wav_file.h"

29 #include "webrtc/modules/audio_processing/audio_buffer.h"	30 #include "webrtc/modules/audio_processing/audio_buffer.h"

30 #include "webrtc/modules/audio_processing/include/audio_processing.h"	31 #include "webrtc/modules/audio_processing/include/audio_processing.h"

31 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"	32 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"

32 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"	33 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"

33 #include "webrtc/modules/audio_processing/noise_suppression_impl.h"	34 #include "webrtc/modules/audio_processing/noise_suppression_impl.h"

34 #include "webrtc/system_wrappers/include/critical_section_wrapper.h"	35 #include "webrtc/system_wrappers/include/critical_section_wrapper.h"

35 #include "webrtc/test/testsupport/fileutils.h"	36 #include "webrtc/test/testsupport/fileutils.h"

36	37

37 using std::complex;	38 using std::complex;

38 using webrtc::intelligibility::VarianceArray;

39	39

40 namespace webrtc {	40 namespace webrtc {

41 namespace {	41 namespace {

42	42

43 bool ValidateClearWindow(const char* flagname, int32_t value) {

44 return value > 0;

45 }

46

47 DEFINE_int32(clear_type,

48 webrtc::intelligibility::VarianceArray::kStepDecaying,

49 "Variance algorithm for clear data.");

50 DEFINE_double(clear_alpha, 0.9, "Variance decay factor for clear data.");

51 DEFINE_int32(clear_window,

52 475,

53 "Window size for windowed variance for clear data.");

54 const bool clear_window_dummy =

55 google::RegisterFlagValidator(&FLAGS_clear_window, &ValidateClearWindow);

56 DEFINE_int32(sample_rate,

57 16000,

58 "Audio sample rate used in the input and output files.");

59 DEFINE_int32(ana_rate,

60 800,

61 "Analysis rate; gains recalculated every N blocks.");

62 DEFINE_int32(

63 var_rate,

64 2,

65 "Variance clear rate; history is forgotten every N gain recalculations.");

66 DEFINE_double(gain_limit, 1000.0, "Maximum gain change in one block.");

67

68 DEFINE_string(clear_file, "speech.wav", "Input file with clear speech.");	43 DEFINE_string(clear_file, "speech.wav", "Input file with clear speech.");

69 DEFINE_string(noise_file, "noise.wav", "Input file with noise data.");	44 DEFINE_string(noise_file, "noise.wav", "Input file with noise data.");

70 DEFINE_string(out_file,	45 DEFINE_string(out_file,

71 "proc_enhanced.wav",	46 "proc_enhanced.wav",

72 "Enhanced output. Use '-' to "	47 "Enhanced output. Use '-' to "

73 "play through aplay immediately.");	48 "play through aplay immediately.");

74	49

75 const size_t kNumChannels = 1;

76

77 // void function for gtest	50 // void function for gtest

78 void void_main(int argc, char* argv[]) {	51 void void_main(int argc, char* argv[]) {

79 google::SetUsageMessage(	52 google::SetUsageMessage(

80 "\n\nVariance algorithm types are:\n"	53 "\n\nInput files must be little-endian 16-bit signed raw PCM.\n");

81 " 0 - infinite/normal,\n"

82 " 1 - exponentially decaying,\n"

83 " 2 - rolling window.\n"

84 "\nInput files must be little-endian 16-bit signed raw PCM.\n");

85 google::ParseCommandLineFlags(&argc, &argv, true);	54 google::ParseCommandLineFlags(&argc, &argv, true);

86	55

87 size_t samples; // Number of samples in input PCM file

88 size_t fragment_size; // Number of samples to process at a time

89 // to simulate APM stream processing

90

91 // Load settings and wav input.	56 // Load settings and wav input.

92

93 fragment_size = FLAGS_sample_rate / 100; // Mirror real time APM chunk size.

94 // Duplicates chunk_length_ in

95 // IntelligibilityEnhancer.

96

97 struct stat in_stat, noise_stat;	57 struct stat in_stat, noise_stat;

98 ASSERT_EQ(stat(FLAGS_clear_file.c_str(), &in_stat), 0)	58 ASSERT_EQ(stat(FLAGS_clear_file.c_str(), &in_stat), 0)

99 << "Empty speech file.";	59 << "Empty speech file.";

100 ASSERT_EQ(stat(FLAGS_noise_file.c_str(), &noise_stat), 0)	60 ASSERT_EQ(stat(FLAGS_noise_file.c_str(), &noise_stat), 0)

101 << "Empty noise file.";	61 << "Empty noise file.";

102	62

103 samples = std::min(in_stat.st_size, noise_stat.st_size) / 2;	63 const size_t samples = std::min(in_stat.st_size, noise_stat.st_size) / 2;

104	64

105 WavReader in_file(FLAGS_clear_file);	65 WavReader in_file(FLAGS_clear_file);

106 std::vector<float> in_fpcm(samples);	66 std::vector<float> in_fpcm(samples);

107 in_file.ReadSamples(samples, &in_fpcm[0]);	67 in_file.ReadSamples(samples, &in_fpcm[0]);

	68 FloatS16ToFloat(&in_fpcm[0], samples, &in_fpcm[0]);

108	69

109 WavReader noise_file(FLAGS_noise_file);	70 WavReader noise_file(FLAGS_noise_file);

110 std::vector<float> noise_fpcm(samples);	71 std::vector<float> noise_fpcm(samples);

111 noise_file.ReadSamples(samples, &noise_fpcm[0]);	72 noise_file.ReadSamples(samples, &noise_fpcm[0]);

	73 FloatS16ToFloat(&noise_fpcm[0], samples, &noise_fpcm[0]);

112	74

113 // Run intelligibility enhancement.	75 // Run intelligibility enhancement.

114 IntelligibilityEnhancer::Config config;	76 IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels());

115 config.sample_rate_hz = FLAGS_sample_rate;

116 config.var_type = static_cast<VarianceArray::StepType>(FLAGS_clear_type);

117 config.var_decay_rate = static_cast<float>(FLAGS_clear_alpha);

118 config.var_window_size = static_cast<size_t>(FLAGS_clear_window);

119 config.analysis_rate = FLAGS_ana_rate;

120 config.gain_change_limit = FLAGS_gain_limit;

121 IntelligibilityEnhancer enh(config);

122 rtc::CriticalSection crit;	77 rtc::CriticalSection crit;

123 NoiseSuppressionImpl ns(&crit);	78 NoiseSuppressionImpl ns(&crit);

124 ns.Initialize(kNumChannels, FLAGS_sample_rate);	79 ns.Initialize(noise_file.num_channels(), noise_file.sample_rate());

125 ns.Enable(true);	80 ns.Enable(true);

126	81

127 AudioBuffer capture_audio(fragment_size,	82 // Mirror real time APM chunk size. Duplicates chunk_length_ in

128 kNumChannels,	83 // IntelligibilityEnhancer.

129 fragment_size,	84 size_t fragment_size = in_file.sample_rate() / 100;

130 kNumChannels,	85 AudioBuffer capture_audio(fragment_size, noise_file.num_channels(),

	86 fragment_size, noise_file.num_channels(),

131 fragment_size);	87 fragment_size);

132 StreamConfig stream_config(FLAGS_sample_rate, kNumChannels);	88 StreamConfig stream_config(in_file.sample_rate(), noise_file.num_channels());

133	89

134 // Slice the input into smaller chunks, as the APM would do, and feed them	90 // Slice the input into smaller chunks, as the APM would do, and feed them

135 // through the enhancer.	91 // through the enhancer.

136 float* clear_cursor = &in_fpcm[0];	92 float* clear_cursor = &in_fpcm[0];

137 float* noise_cursor = &noise_fpcm[0];	93 float* noise_cursor = &noise_fpcm[0];

138	94

139 for (size_t i = 0; i < samples; i += fragment_size) {	95 for (size_t i = 0; i < samples; i += fragment_size) {

140 capture_audio.CopyFrom(&noise_cursor, stream_config);	96 capture_audio.CopyFrom(&noise_cursor, stream_config);

141 ns.AnalyzeCaptureAudio(&capture_audio);	97 ns.AnalyzeCaptureAudio(&capture_audio);

142 ns.ProcessCaptureAudio(&capture_audio);	98 ns.ProcessCaptureAudio(&capture_audio);

143 enh.SetCaptureNoiseEstimate(ns.NoiseEstimate());	99 enh.SetCaptureNoiseEstimate(ns.NoiseEstimate());

144 enh.ProcessRenderAudio(&clear_cursor, FLAGS_sample_rate, kNumChannels);	100 enh.ProcessRenderAudio(&clear_cursor, in_file.sample_rate(),

	101 in_file.num_channels());

145 clear_cursor += fragment_size;	102 clear_cursor += fragment_size;

146 noise_cursor += fragment_size;	103 noise_cursor += fragment_size;

147 }	104 }

148	105

	106 FloatToFloatS16(&in_fpcm[0], samples, &in_fpcm[0]);

	107

149 if (FLAGS_out_file.compare("-") == 0) {	108 if (FLAGS_out_file.compare("-") == 0) {

150 const std::string temp_out_filename =	109 const std::string temp_out_filename =

151 test::TempFilename(test::WorkingDir(), "temp_wav_file");	110 test::TempFilename(test::WorkingDir(), "temp_wav_file");

152 {	111 {

153 WavWriter out_file(temp_out_filename, FLAGS_sample_rate, kNumChannels);	112 WavWriter out_file(temp_out_filename, in_file.sample_rate(),

	113 in_file.num_channels());

154 out_file.WriteSamples(&in_fpcm[0], samples);	114 out_file.WriteSamples(&in_fpcm[0], samples);

155 }	115 }

156 system(("aplay " + temp_out_filename).c_str());	116 system(("aplay " + temp_out_filename).c_str());

157 system(("rm " + temp_out_filename).c_str());	117 system(("rm " + temp_out_filename).c_str());

158 } else {	118 } else {

159 WavWriter out_file(FLAGS_out_file, FLAGS_sample_rate, kNumChannels);	119 WavWriter out_file(FLAGS_out_file, in_file.sample_rate(),

	120 in_file.num_channels());

160 out_file.WriteSamples(&in_fpcm[0], samples);	121 out_file.WriteSamples(&in_fpcm[0], samples);

161 }	122 }

162 }	123 }

163	124

164 } // namespace	125 } // namespace

165 } // namespace webrtc	126 } // namespace webrtc

166	127

167 int main(int argc, char* argv[]) {	128 int main(int argc, char* argv[]) {

168 webrtc::void_main(argc, argv);	129 webrtc::void_main(argc, argv);

169 return 0;	130 return 0;

170 }	131 }

OLD	NEW