webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1234463003: Integrate Intelligibility with APM

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Added resampling support to InterleaveTo; removed VAD logic Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('K') | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('j') | webrtc/modules/audio_processing/test/audioproc_float.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 //	11 //

12 // Implements core class for intelligibility enhancer.	12 // Implements core class for intelligibility enhancer.

13 //	13 //

14 // Details of the model and algorithm can be found in the original paper:	14 // Details of the model and algorithm can be found in the original paper:

15 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788	15 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788

16 //	16 //

17	17

18 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"	18 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"

19	19

20 #include <math.h>	20 #include <math.h>

21 #include <stdlib.h>	21 #include <stdlib.h>

22

23 #include <algorithm>	22 #include <algorithm>

24 #include <numeric>	23 #include <numeric>

25	24

26 #include "webrtc/base/checks.h"	25 #include "webrtc/base/checks.h"

27 #include "webrtc/common_audio/vad/include/webrtc_vad.h"

28 #include "webrtc/common_audio/window_generator.h"	26 #include "webrtc/common_audio/window_generator.h"

	27 #include "webrtc/common_audio/include/audio_util.h"
	Andrew MacDonald 2015/07/29 03:52:27: Alpha order. ekm 2015/07/29 23:35:06: On 2015/07/29 03:52:27, andrew wrote: > Alpha order. Done.
29	28

30 namespace webrtc {	29 namespace webrtc {

31	30

32 namespace {	31 namespace {

33	32

34 const int kErbResolution = 2;	33 const int kErbResolution = 2;

35 const int kWindowSizeMs = 2;	34 const int kWindowSizeMs = 2;

36 const int kChunkSizeMs = 10; // Size provided by APM.	35 const int kChunkSizeMs = 10; // Size provided by APM.

37 const float kClipFreq = 200.0f;	36 const float kClipFreq = 200.0f;

38 const float kConfigRho = 0.02f; // Default production and interpretation SNR.	37 const float kConfigRho = 0.02f; // Default production and interpretation SNR.

(...skipping 19 matching lines...) Expand all Loading...
58 int in_channels,	57 int in_channels,

59 int frames,	58 int frames,

60 int /* out_channels */,	59 int /* out_channels */,

61 complex<float>* const* out_block) {	60 complex<float>* const* out_block) {

62 DCHECK_EQ(parent_->freqs_, frames);	61 DCHECK_EQ(parent_->freqs_, frames);

63 for (int i = 0; i < in_channels; ++i) {	62 for (int i = 0; i < in_channels; ++i) {

64 parent_->DispatchAudio(source_, in_block[i], out_block[i]);	63 parent_->DispatchAudio(source_, in_block[i], out_block[i]);

65 }	64 }

66 }	65 }

67	66

68 IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution,	67 IntelligibilityEnhancer::IntelligibilityEnhancer()

69 int sample_rate_hz,	68 : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) {

70 int channels,	69 }

71 int cv_type,	70

72 float cv_alpha,	71 IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)

73 int cv_win,

74 int analysis_rate,

75 int variance_rate,

76 float gain_limit)

77 : freqs_(RealFourier::ComplexLength(	72 : freqs_(RealFourier::ComplexLength(

78 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),	73 RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))),

79 window_size_(1 << RealFourier::FftOrder(freqs_)),	74 window_size_(1 << RealFourier::FftOrder(freqs_)),

80 chunk_length_(sample_rate_hz * kChunkSizeMs / 1000),	75 chunk_length_(config.sample_rate_hz * kChunkSizeMs / 1000),

81 bank_size_(GetBankSize(sample_rate_hz, erb_resolution)),	76 bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)),

82 sample_rate_hz_(sample_rate_hz),	77 sample_rate_hz_(config.sample_rate_hz),

83 erb_resolution_(erb_resolution),	78 erb_resolution_(kErbResolution),

84 channels_(channels),	79 num_capture_channels_(config.num_capture_channels),

85 analysis_rate_(analysis_rate),	80 num_render_channels_(config.num_render_channels),

86 variance_rate_(variance_rate),	81 analysis_rate_(config.analysis_rate),

	82 active_(true),

87 clear_variance_(freqs_,	83 clear_variance_(freqs_,

88 static_cast<VarianceType>(cv_type),	84 config.var_type,

89 cv_win,	85 config.var_window_size,

90 cv_alpha),	86 config.var_decay_rate),

91 noise_variance_(freqs_, VarianceType::kStepInfinite, 475, 0.01f),	87 noise_variance_(freqs_,

	88 config.var_type,

	89 config.var_window_size,

	90 config.var_decay_rate),

92 filtered_clear_var_(new float[bank_size_]),	91 filtered_clear_var_(new float[bank_size_]),

93 filtered_noise_var_(new float[bank_size_]),	92 filtered_noise_var_(new float[bank_size_]),

94 filter_bank_(bank_size_),	93 filter_bank_(bank_size_),

95 center_freqs_(new float[bank_size_]),	94 center_freqs_(new float[bank_size_]),

96 rho_(new float[bank_size_]),	95 rho_(new float[bank_size_]),

97 gains_eq_(new float[bank_size_]),	96 gains_eq_(new float[bank_size_]),

98 gain_applier_(freqs_, gain_limit),	97 gain_applier_(freqs_, config.gain_change_limit),

99 temp_out_buffer_(nullptr),	98 temp_render_out_buffer_(chunk_length_, num_render_channels_),

100 input_audio_(new float* [channels]),	99 temp_capture_out_buffer_(chunk_length_, num_capture_channels_),

101 kbd_window_(new float[window_size_]),	100 kbd_window_(new float[window_size_]),

102 render_callback_(this, AudioSource::kRenderStream),	101 render_callback_(this, AudioSource::kRenderStream),

103 capture_callback_(this, AudioSource::kCaptureStream),	102 capture_callback_(this, AudioSource::kCaptureStream),

104 block_count_(0),	103 block_count_(0),

105 analysis_step_(0),	104 analysis_step_(0) {

106 vad_high_(WebRtcVad_Create()),	105 DCHECK_LE(config.rho, 1.0f);

107 vad_low_(WebRtcVad_Create()),

108 vad_tmp_buffer_(new int16_t[chunk_length_]) {

109 DCHECK_LE(kConfigRho, 1.0f);

110	106

111 CreateErbBank();	107 CreateErbBank();

112	108

113 WebRtcVad_Init(vad_high_);

114 WebRtcVad_set_mode(vad_high_, 0); // High likelihood of speech.

115 WebRtcVad_Init(vad_low_);

116 WebRtcVad_set_mode(vad_low_, 3); // Low likelihood of speech.

117

118 temp_out_buffer_ = static_cast<float**>(

119 malloc(sizeof(*temp_out_buffer_) * channels_ +

120 sizeof(**temp_out_buffer_) * chunk_length_ * channels_));

121 for (int i = 0; i < channels_; ++i) {

122 temp_out_buffer_[i] =

123 reinterpret_cast<float*>(temp_out_buffer_ + channels_) +

124 chunk_length_ * i;

125 }

126

127 // Assumes all rho equal.	109 // Assumes all rho equal.

128 for (int i = 0; i < bank_size_; ++i) {	110 for (int i = 0; i < bank_size_; ++i) {

129 rho_[i] = kConfigRho * kConfigRho;	111 rho_[i] = config.rho * config.rho;

130 }	112 }

131	113

132 float freqs_khz = kClipFreq / 1000.0f;	114 float freqs_khz = kClipFreq / 1000.0f;

133 int erb_index = static_cast<int>(ceilf(	115 int erb_index = static_cast<int>(ceilf(

134 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));	116 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));

135 start_freq_ = max(1, erb_index * kErbResolution);	117 start_freq_ = max(1, erb_index * erb_resolution_);

136	118

137 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,	119 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,

138 kbd_window_.get());	120 kbd_window_.get());

139 render_mangler_.reset(new LappedTransform(	121 render_mangler_.reset(new LappedTransform(

140 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,	122 num_render_channels_, num_render_channels_, chunk_length_,

141 window_size_ / 2, &render_callback_));	123 kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_));

142 capture_mangler_.reset(new LappedTransform(	124 capture_mangler_.reset(new LappedTransform(

143 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,	125 num_capture_channels_, num_capture_channels_, chunk_length_,

144 window_size_ / 2, &capture_callback_));	126 kbd_window_.get(), window_size_, window_size_ / 2, &capture_callback_));

145 }	127 }

146	128

147 IntelligibilityEnhancer::~IntelligibilityEnhancer() {	129 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,

148 WebRtcVad_Free(vad_low_);	130 int sample_rate_hz,

149 WebRtcVad_Free(vad_high_);	131 int num_channels) {

150 free(temp_out_buffer_);	132 CHECK_EQ(sample_rate_hz_, sample_rate_hz);

151 }	133 CHECK_EQ(num_render_channels_, num_channels);

152	134

153 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {	135 if (active_) {

154 for (int i = 0; i < chunk_length_; ++i) {	136 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());

155 vad_tmp_buffer_[i] = (int16_t)audio[0][i];

156 }	137 }

157 has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_,

158 vad_tmp_buffer_.get(), chunk_length_) == 1;

159	138

160 // Process and enhance chunk of |audio|	139 if (active_) {
	Andrew MacDonald 2015/07/29 03:52:27: This is where you need smoothing as you switch states. aluebs-webrtc 2015/07/29 22:17:10: There is an if (active_) just above where you can probably merge this into. ekm 2015/07/29 23:35:06: On 2015/07/29 03:52:27, andrew wrote: > This is where you need smoothing as you switch states. Acknowledged. ekm 2015/07/29 23:35:06: On 2015/07/29 22:17:10, aluebs-webrtc wrote: > There is an if (active_) just above where you can probably merge this into. Although active_ is const in this CL, in the next one it will be non-const and may change between these two blocks, and the two conditions will be different, so I'd prefer to keep them separate if that's OK. aluebs-webrtc 2015/07/30 15:28:07: On 2015/07/29 23:35:06, ekm wrote: > On 2015/07/29 22:17:10, aluebs-webrtc wrote: > > There is an if (active_) just above where you can probably merge this into. > > Although active_ is const in this CL, in the next one it will be non-const and may change between these two blocks, and the two conditions will be different, so I'd prefer to keep them separate if that's OK. Agreed.
161 render_mangler_->ProcessChunk(audio, temp_out_buffer_);	140 for (int i = 0; i < num_render_channels_; ++i) {

162	141 memcpy(audio[i], temp_render_out_buffer_.channels()[i],

163 for (int i = 0; i < channels_; ++i) {	142 chunk_length_ * sizeof(**audio));

164 memcpy(audio[i], temp_out_buffer_[i],	143 }

165 chunk_length_ * sizeof(**temp_out_buffer_));

166 }	144 }

167 }	145 }

168	146

169 void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) {	147 void IntelligibilityEnhancer::AnalyzeCaptureAudio(float* const* audio,

170 for (int i = 0; i < chunk_length_; ++i) {	148 int sample_rate_hz,

171 vad_tmp_buffer_[i] = (int16_t)audio[0][i];	149 int num_channels) {

172 }	150 CHECK_EQ(sample_rate_hz_, sample_rate_hz);

173 // TODO(bercic): The VAD was always detecting voice in the noise stream,	151 CHECK_EQ(num_capture_channels_, num_channels);

174 // no matter what the aggressiveness, so it was temporarily disabled here.

175	152

176 #if 0	153 capture_mangler_->ProcessChunk(audio, temp_capture_out_buffer_.channels());

177 if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(),

178 chunk_length_) == 1) {

179 printf("capture HAS speech\n");

180 return;

181 }

182 printf("capture NO speech\n");

183 #endif

184

185 capture_mangler_->ProcessChunk(audio, temp_out_buffer_);

186 }	154 }

187	155

188 void IntelligibilityEnhancer::DispatchAudio(	156 void IntelligibilityEnhancer::DispatchAudio(

189 IntelligibilityEnhancer::AudioSource source,	157 IntelligibilityEnhancer::AudioSource source,

190 const complex<float>* in_block,	158 const complex<float>* in_block,

191 complex<float>* out_block) {	159 complex<float>* out_block) {

192 switch (source) {	160 switch (source) {

193 case kRenderStream:	161 case kRenderStream:

194 ProcessClearBlock(in_block, out_block);	162 ProcessClearBlock(in_block, out_block);

195 break;	163 break;

196 case kCaptureStream:	164 case kCaptureStream:

197 ProcessNoiseBlock(in_block, out_block);	165 ProcessNoiseBlock(in_block, out_block);

198 break;	166 break;

199 }	167 }

200 }	168 }

201	169

202 void IntelligibilityEnhancer::ProcessClearBlock(const complex<float>* in_block,	170 void IntelligibilityEnhancer::ProcessClearBlock(const complex<float>* in_block,

203 complex<float>* out_block) {	171 complex<float>* out_block) {

204 if (block_count_ < 2) {	172 if (block_count_ < 2) {

205 memset(out_block, 0, freqs_ * sizeof(*out_block));	173 memset(out_block, 0, freqs_ * sizeof(*out_block));

206 ++block_count_;	174 ++block_count_;

207 return;	175 return;

208 }	176 }

209	177

210 // For now, always assumes enhancement is necessary.	178 // TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary.

211 // TODO(ekmeyerson): Change to only enhance if necessary,	179 if (true) {

212 // based on experiments with different cutoffs.

213 if (has_voice_low_ || true) {

214 clear_variance_.Step(in_block, false);	180 clear_variance_.Step(in_block, false);

215 const float power_target = std::accumulate(

216 clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.0f);

217

218 if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {	181 if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {

	182 const float power_target = std::accumulate(

	183 clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.f);

219 AnalyzeClearBlock(power_target);	184 AnalyzeClearBlock(power_target);

220 ++analysis_step_;	185 ++analysis_step_;

221 if (analysis_step_ == variance_rate_) {

222 analysis_step_ = 0;

223 clear_variance_.Clear();

224 noise_variance_.Clear();

225 }

226 }	186 }

227 ++block_count_;	187 ++block_count_;

228 }	188 }

229	189

230 /* efidata(n,:) = sqrt(b(n)) * fidata(n,:) */	190 if (active_) {

231 gain_applier_.Apply(in_block, out_block);	191 gain_applier_.Apply(in_block, out_block);

	192 }

232 }	193 }

233	194

234 void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) {	195 void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) {

235 FilterVariance(clear_variance_.variance(), filtered_clear_var_.get());	196 FilterVariance(clear_variance_.variance(), filtered_clear_var_.get());

236 FilterVariance(noise_variance_.variance(), filtered_noise_var_.get());	197 FilterVariance(noise_variance_.variance(), filtered_noise_var_.get());

237	198

238 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());	199 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());

239 const float power_top =	200 const float power_top =

240 DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_);	201 DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_);

241 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());	202 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());

(...skipping 157 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
399 const float* b,	360 const float* b,

400 int length) {	361 int length) {

401 float ret = 0.0f;	362 float ret = 0.0f;

402	363

403 for (int i = 0; i < length; ++i) {	364 for (int i = 0; i < length; ++i) {

404 ret = fmaf(a[i], b[i], ret);	365 ret = fmaf(a[i], b[i], ret);

405 }	366 }

406 return ret;	367 return ret;

407 }	368 }

408	369

	370 bool IntelligibilityEnhancer::active() const {

	371 return active_;

	372 }

	373

409 } // namespace webrtc	374 } // namespace webrtc

OLD	NEW