webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1234463003: Integrate Intelligibility with APM

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Fix Mac Error (3) Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 //	11 //

12 // Implements core class for intelligibility enhancer.	12 // Implements core class for intelligibility enhancer.

13 //	13 //

14 // Details of the model and algorithm can be found in the original paper:	14 // Details of the model and algorithm can be found in the original paper:

15 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788	15 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788

16 //	16 //

17	17

18 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"	18 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"

19	19

20 #include <math.h>	20 #include <math.h>

21 #include <stdlib.h>	21 #include <stdlib.h>

22

23 #include <algorithm>	22 #include <algorithm>

24 #include <numeric>	23 #include <numeric>

25	24

26 #include "webrtc/base/checks.h"	25 #include "webrtc/base/checks.h"

27 #include "webrtc/common_audio/vad/include/webrtc_vad.h"	26 #include "webrtc/common_audio/include/audio_util.h"

28 #include "webrtc/common_audio/window_generator.h"	27 #include "webrtc/common_audio/window_generator.h"

29	28

30 namespace webrtc {	29 namespace webrtc {

31	30

32 namespace {	31 namespace {

33	32

	33 const int kErbResolution = 2;

34 const int kWindowSizeMs = 2;	34 const int kWindowSizeMs = 2;

35 const int kChunkSizeMs = 10; // Size provided by APM.	35 const int kChunkSizeMs = 10; // Size provided by APM.

36 const float kClipFreq = 200.0f;	36 const float kClipFreq = 200.0f;

37 const float kConfigRho = 0.02f; // Default production and interpretation SNR.	37 const float kConfigRho = 0.02f; // Default production and interpretation SNR.

38 const float kKbdAlpha = 1.5f;	38 const float kKbdAlpha = 1.5f;

39 const float kLambdaBot = -1.0f; // Extreme values in bisection	39 const float kLambdaBot = -1.0f; // Extreme values in bisection

40 const float kLambdaTop = -10e-18f; // search for lambda.	40 const float kLambdaTop = -10e-18f; // search for lambda.

41	41

42 } // namespace	42 } // namespace

43	43

(...skipping 13 matching lines...) Expand all Loading...
57 int in_channels,	57 int in_channels,

58 int frames,	58 int frames,

59 int /* out_channels */,	59 int /* out_channels */,

60 complex<float>* const* out_block) {	60 complex<float>* const* out_block) {

61 DCHECK_EQ(parent_->freqs_, frames);	61 DCHECK_EQ(parent_->freqs_, frames);

62 for (int i = 0; i < in_channels; ++i) {	62 for (int i = 0; i < in_channels; ++i) {

63 parent_->DispatchAudio(source_, in_block[i], out_block[i]);	63 parent_->DispatchAudio(source_, in_block[i], out_block[i]);

64 }	64 }

65 }	65 }

66	66

67 IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution,	67 IntelligibilityEnhancer::IntelligibilityEnhancer()

68 int sample_rate_hz,	68 : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) {

69 int channels,	69 }

70 int cv_type,	70

71 float cv_alpha,	71 IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)

72 int cv_win,

73 int analysis_rate,

74 int variance_rate,

75 float gain_limit)

76 : freqs_(RealFourier::ComplexLength(	72 : freqs_(RealFourier::ComplexLength(

77 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),	73 RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))),

78 window_size_(1 << RealFourier::FftOrder(freqs_)),	74 window_size_(1 << RealFourier::FftOrder(freqs_)),

79 chunk_length_(sample_rate_hz * kChunkSizeMs / 1000),	75 chunk_length_(config.sample_rate_hz * kChunkSizeMs / 1000),

80 bank_size_(GetBankSize(sample_rate_hz, erb_resolution)),	76 bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)),

81 sample_rate_hz_(sample_rate_hz),	77 sample_rate_hz_(config.sample_rate_hz),

82 erb_resolution_(erb_resolution),	78 erb_resolution_(kErbResolution),

83 channels_(channels),	79 num_capture_channels_(config.num_capture_channels),

84 analysis_rate_(analysis_rate),	80 num_render_channels_(config.num_render_channels),

85 variance_rate_(variance_rate),	81 analysis_rate_(config.analysis_rate),

	82 active_(true),

86 clear_variance_(freqs_,	83 clear_variance_(freqs_,

87 static_cast<VarianceType>(cv_type),	84 config.var_type,

88 cv_win,	85 config.var_window_size,

89 cv_alpha),	86 config.var_decay_rate),

90 noise_variance_(freqs_, VarianceType::kStepInfinite, 475, 0.01f),	87 noise_variance_(freqs_,

	88 config.var_type,

	89 config.var_window_size,

	90 config.var_decay_rate),

91 filtered_clear_var_(new float[bank_size_]),	91 filtered_clear_var_(new float[bank_size_]),

92 filtered_noise_var_(new float[bank_size_]),	92 filtered_noise_var_(new float[bank_size_]),

93 filter_bank_(bank_size_),	93 filter_bank_(bank_size_),

94 center_freqs_(new float[bank_size_]),	94 center_freqs_(new float[bank_size_]),

95 rho_(new float[bank_size_]),	95 rho_(new float[bank_size_]),

96 gains_eq_(new float[bank_size_]),	96 gains_eq_(new float[bank_size_]),

97 gain_applier_(freqs_, gain_limit),	97 gain_applier_(freqs_, config.gain_change_limit),

98 temp_out_buffer_(nullptr),	98 temp_render_out_buffer_(chunk_length_, num_render_channels_),

99 input_audio_(new float* [channels]),	99 temp_capture_out_buffer_(chunk_length_, num_capture_channels_),

100 kbd_window_(new float[window_size_]),	100 kbd_window_(new float[window_size_]),

101 render_callback_(this, AudioSource::kRenderStream),	101 render_callback_(this, AudioSource::kRenderStream),

102 capture_callback_(this, AudioSource::kCaptureStream),	102 capture_callback_(this, AudioSource::kCaptureStream),

103 block_count_(0),	103 block_count_(0),

104 analysis_step_(0),	104 analysis_step_(0) {

105 vad_high_(WebRtcVad_Create()),	105 DCHECK_LE(config.rho, 1.0f);

106 vad_low_(WebRtcVad_Create()),

107 vad_tmp_buffer_(new int16_t[chunk_length_]) {

108 DCHECK_LE(kConfigRho, 1.0f);

109	106

110 CreateErbBank();	107 CreateErbBank();

111	108

112 WebRtcVad_Init(vad_high_);

113 WebRtcVad_set_mode(vad_high_, 0); // High likelihood of speech.

114 WebRtcVad_Init(vad_low_);

115 WebRtcVad_set_mode(vad_low_, 3); // Low likelihood of speech.

116

117 temp_out_buffer_ = static_cast<float**>(

118 malloc(sizeof(*temp_out_buffer_) * channels_ +

119 sizeof(**temp_out_buffer_) * chunk_length_ * channels_));

120 for (int i = 0; i < channels_; ++i) {

121 temp_out_buffer_[i] =

122 reinterpret_cast<float*>(temp_out_buffer_ + channels_) +

123 chunk_length_ * i;

124 }

125

126 // Assumes all rho equal.	109 // Assumes all rho equal.

127 for (int i = 0; i < bank_size_; ++i) {	110 for (int i = 0; i < bank_size_; ++i) {

128 rho_[i] = kConfigRho * kConfigRho;	111 rho_[i] = config.rho * config.rho;

129 }	112 }

130	113

131 float freqs_khz = kClipFreq / 1000.0f;	114 float freqs_khz = kClipFreq / 1000.0f;

132 int erb_index = static_cast<int>(ceilf(	115 int erb_index = static_cast<int>(ceilf(

133 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));	116 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));

134 start_freq_ = std::max(1, erb_index * erb_resolution);	117 start_freq_ = max(1, erb_index * erb_resolution_);

135	118

136 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,	119 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,

137 kbd_window_.get());	120 kbd_window_.get());

138 render_mangler_.reset(new LappedTransform(	121 render_mangler_.reset(new LappedTransform(

139 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,	122 num_render_channels_, num_render_channels_, chunk_length_,

140 window_size_ / 2, &render_callback_));	123 kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_));

141 capture_mangler_.reset(new LappedTransform(	124 capture_mangler_.reset(new LappedTransform(

142 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,	125 num_capture_channels_, num_capture_channels_, chunk_length_,

143 window_size_ / 2, &capture_callback_));	126 kbd_window_.get(), window_size_, window_size_ / 2, &capture_callback_));

144 }	127 }

145	128

146 IntelligibilityEnhancer::~IntelligibilityEnhancer() {	129 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,

147 WebRtcVad_Free(vad_low_);	130 int sample_rate_hz,

148 WebRtcVad_Free(vad_high_);	131 int num_channels) {

149 free(temp_out_buffer_);	132 CHECK_EQ(sample_rate_hz_, sample_rate_hz);

150 }	133 CHECK_EQ(num_render_channels_, num_channels);

151	134

152 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {	135 if (active_) {

153 for (int i = 0; i < chunk_length_; ++i) {	136 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());

154 vad_tmp_buffer_[i] = (int16_t)audio[0][i];

155 }	137 }

156 has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_,

157 vad_tmp_buffer_.get(), chunk_length_) == 1;

158	138

159 // Process and enhance chunk of |audio|	139 if (active_) {

160 render_mangler_->ProcessChunk(audio, temp_out_buffer_);	140 for (int i = 0; i < num_render_channels_; ++i) {

161	141 memcpy(audio[i], temp_render_out_buffer_.channels()[i],

162 for (int i = 0; i < channels_; ++i) {	142 chunk_length_ * sizeof(**audio));

163 memcpy(audio[i], temp_out_buffer_[i],	143 }

164 chunk_length_ * sizeof(**temp_out_buffer_));

165 }	144 }

166 }	145 }

167	146

168 void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) {	147 void IntelligibilityEnhancer::AnalyzeCaptureAudio(float* const* audio,

169 for (int i = 0; i < chunk_length_; ++i) {	148 int sample_rate_hz,

170 vad_tmp_buffer_[i] = (int16_t)audio[0][i];	149 int num_channels) {

171 }	150 CHECK_EQ(sample_rate_hz_, sample_rate_hz);

172 // TODO(bercic): The VAD was always detecting voice in the noise stream,	151 CHECK_EQ(num_capture_channels_, num_channels);

173 // no matter what the aggressiveness, so it was temporarily disabled here.

174	152

175 #if 0	153 capture_mangler_->ProcessChunk(audio, temp_capture_out_buffer_.channels());

176 if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(),

177 chunk_length_) == 1) {

178 printf("capture HAS speech\n");

179 return;

180 }

181 printf("capture NO speech\n");

182 #endif

183

184 capture_mangler_->ProcessChunk(audio, temp_out_buffer_);

185 }	154 }

186	155

187 void IntelligibilityEnhancer::DispatchAudio(	156 void IntelligibilityEnhancer::DispatchAudio(

188 IntelligibilityEnhancer::AudioSource source,	157 IntelligibilityEnhancer::AudioSource source,

189 const complex<float>* in_block,	158 const complex<float>* in_block,

190 complex<float>* out_block) {	159 complex<float>* out_block) {

191 switch (source) {	160 switch (source) {

192 case kRenderStream:	161 case kRenderStream:

193 ProcessClearBlock(in_block, out_block);	162 ProcessClearBlock(in_block, out_block);

194 break;	163 break;

195 case kCaptureStream:	164 case kCaptureStream:

196 ProcessNoiseBlock(in_block, out_block);	165 ProcessNoiseBlock(in_block, out_block);

197 break;	166 break;

198 }	167 }

199 }	168 }

200	169

201 void IntelligibilityEnhancer::ProcessClearBlock(const complex<float>* in_block,	170 void IntelligibilityEnhancer::ProcessClearBlock(const complex<float>* in_block,

202 complex<float>* out_block) {	171 complex<float>* out_block) {

203 if (block_count_ < 2) {	172 if (block_count_ < 2) {

204 memset(out_block, 0, freqs_ * sizeof(*out_block));	173 memset(out_block, 0, freqs_ * sizeof(*out_block));

205 ++block_count_;	174 ++block_count_;

206 return;	175 return;

207 }	176 }

208	177

209 // For now, always assumes enhancement is necessary.	178 // TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary.

210 // TODO(ekmeyerson): Change to only enhance if necessary,	179 if (true) {

211 // based on experiments with different cutoffs.

212 if (has_voice_low_ || true) {

213 clear_variance_.Step(in_block, false);	180 clear_variance_.Step(in_block, false);

214 const float power_target = std::accumulate(

215 clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.0f);

216

217 if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {	181 if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {

	182 const float power_target = std::accumulate(

	183 clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.f);

218 AnalyzeClearBlock(power_target);	184 AnalyzeClearBlock(power_target);

219 ++analysis_step_;	185 ++analysis_step_;

220 if (analysis_step_ == variance_rate_) {

221 analysis_step_ = 0;

222 clear_variance_.Clear();

223 noise_variance_.Clear();

224 }

225 }	186 }

226 ++block_count_;	187 ++block_count_;

227 }	188 }

228	189

229 /* efidata(n,:) = sqrt(b(n)) * fidata(n,:) */	190 if (active_) {

230 gain_applier_.Apply(in_block, out_block);	191 gain_applier_.Apply(in_block, out_block);

	192 }

231 }	193 }

232	194

233 void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) {	195 void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) {

234 FilterVariance(clear_variance_.variance(), filtered_clear_var_.get());	196 FilterVariance(clear_variance_.variance(), filtered_clear_var_.get());

235 FilterVariance(noise_variance_.variance(), filtered_noise_var_.get());	197 FilterVariance(noise_variance_.variance(), filtered_noise_var_.get());

236	198

237 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());	199 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());

238 const float power_top =	200 const float power_top =

239 DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_);	201 DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_);

240 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());	202 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());

(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
399 const float* b,	361 const float* b,

400 int length) {	362 int length) {

401 float ret = 0.0f;	363 float ret = 0.0f;

402	364

403 for (int i = 0; i < length; ++i) {	365 for (int i = 0; i < length; ++i) {

404 ret = fmaf(a[i], b[i], ret);	366 ret = fmaf(a[i], b[i], ret);

405 }	367 }

406 return ret;	368 return ret;

407 }	369 }

408	370

	371 bool IntelligibilityEnhancer::active() const {

	372 return active_;

	373 }

	374

409 } // namespace webrtc	375 } // namespace webrtc

OLD	NEW