webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - Issue 1234463003: Integrate Intelligibility with APM

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1234463003: Integrate Intelligibility with APM (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('K') | « webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h ('k') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('j') | webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 //	11 //

12 // Implements core class for intelligibility enhancer.	12 // Implements core class for intelligibility enhancer.

13 //	13 //

14 // Details of the model and algorithm can be found in the original paper:	14 // Details of the model and algorithm can be found in the original paper:

15 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788	15 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788

16 //	16 //

17	17

18 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"	18 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"

19	19

20 #include <math.h>	20 #include <math.h>

21 #include <stdlib.h>	21 #include <stdlib.h>

22

23 #include <algorithm>	22 #include <algorithm>

24 #include <numeric>	23 #include <numeric>

25	24

26 #include "webrtc/base/checks.h"	25 #include "webrtc/base/checks.h"

27 #include "webrtc/common_audio/vad/include/webrtc_vad.h"	26 #include "webrtc/common_audio/vad/include/webrtc_vad.h"

28 #include "webrtc/common_audio/window_generator.h"	27 #include "webrtc/common_audio/window_generator.h"

29	28

30 namespace webrtc {	29 namespace webrtc {

31	30

32 namespace {	31 namespace {

33	32

34 const int kErbResolution = 2;	33 const int kErbResolution = 2;

35 const int kWindowSizeMs = 2;	34 const int kWindowSizeMs = 2;

36 const int kChunkSizeMs = 10; // Size provided by APM.	35 const int kChunkSizeMs = 10; // Size provided by APM.

37 const float kClipFreq = 200.0f;	36 const float kClipFreq = 200.0f;

38 const float kConfigRho = 0.02f; // Default production and interpretation SNR.	37 const float kConfigRho = 0.02f; // Default production and interpretation SNR.

39 const float kKbdAlpha = 1.5f;	38 const float kKbdAlpha = 1.5f;

40 const float kLambdaBot = -1.0f; // Extreme values in bisection	39 const float kLambdaBot = -1.0f; // Extreme values in bisection

41 const float kLambdaTop = -10e-18f; // search for lambda.	40 const float kLambdaTop = -10e-18f; // search for lambda.

	41 const float kVoiceDetected = 1.f;

	42 const float kNoiseDetected = 0.f;

42	43

43 } // namespace	44 } // namespace

44	45

45 using std::complex;	46 using std::complex;

46 using std::max;	47 using std::max;

47 using std::min;	48 using std::min;

48 using VarianceType = intelligibility::VarianceArray::StepType;	49 using VarianceType = intelligibility::VarianceArray::StepType;

49	50

50 IntelligibilityEnhancer::TransformCallback::TransformCallback(	51 IntelligibilityEnhancer::TransformCallback::TransformCallback(

51 IntelligibilityEnhancer* parent,	52 IntelligibilityEnhancer* parent,

52 IntelligibilityEnhancer::AudioSource source)	53 IntelligibilityEnhancer::AudioSource source)

53 : parent_(parent), source_(source) {	54 : parent_(parent), source_(source) {

54 }	55 }

55	56

56 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(	57 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(

57 const complex<float>* const* in_block,	58 const complex<float>* const* in_block,

58 int in_channels,	59 int in_channels,

59 int frames,	60 int frames,

60 int /* out_channels */,	61 int /* out_channels */,

61 complex<float>* const* out_block) {	62 complex<float>* const* out_block) {

62 DCHECK_EQ(parent_->freqs_, frames);	63 DCHECK_EQ(parent_->freqs_, frames);

63 for (int i = 0; i < in_channels; ++i) {	64 for (int i = 0; i < in_channels; ++i) {

64 parent_->DispatchAudio(source_, in_block[i], out_block[i]);	65 parent_->DispatchAudio(source_, in_block[i], out_block[i]);

65 }	66 }

66 }	67 }

67	68

68 IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution,	69 IntelligibilityEnhancer::IntelligibilityEnhancer()

69 int sample_rate_hz,	70 : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) {

70 int channels,	71 }

71 int cv_type,	72

72 float cv_alpha,	73 IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)

73 int cv_win,

74 int analysis_rate,

75 int variance_rate,

76 float gain_limit)

77 : freqs_(RealFourier::ComplexLength(	74 : freqs_(RealFourier::ComplexLength(

78 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),	75 RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))),

79 window_size_(1 << RealFourier::FftOrder(freqs_)),	76 window_size_(1 << RealFourier::FftOrder(freqs_)),

80 chunk_length_(sample_rate_hz * kChunkSizeMs / 1000),	77 chunk_length_(config.sample_rate_hz * kChunkSizeMs / 1000),

81 bank_size_(GetBankSize(sample_rate_hz, erb_resolution)),	78 bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)),

82 sample_rate_hz_(sample_rate_hz),	79 sample_rate_hz_(config.sample_rate_hz),

83 erb_resolution_(erb_resolution),	80 erb_resolution_(kErbResolution),

84 channels_(channels),	81 channels_(config.channels),

85 analysis_rate_(analysis_rate),	82 analysis_rate_(config.analysis_rate),

86 variance_rate_(variance_rate),	83 capture_vad_thresh_(config.capture_vad_thresh),

	84 render_vad_thresh_(config.render_vad_thresh),

87 clear_variance_(freqs_,	85 clear_variance_(freqs_,

88 static_cast<VarianceType>(cv_type),	86 config.var_type,

89 cv_win,	87 config.var_window_size,

90 cv_alpha),	88 config.var_decay_rate),

91 noise_variance_(freqs_, VarianceType::kStepInfinite, 475, 0.01f),	89 noise_variance_(freqs_,

	90 config.var_type,

	91 config.var_window_size,

	92 config.var_decay_rate),

92 filtered_clear_var_(new float[bank_size_]),	93 filtered_clear_var_(new float[bank_size_]),

93 filtered_noise_var_(new float[bank_size_]),	94 filtered_noise_var_(new float[bank_size_]),

94 filter_bank_(bank_size_),	95 filter_bank_(bank_size_),

95 center_freqs_(new float[bank_size_]),	96 center_freqs_(new float[bank_size_]),

96 rho_(new float[bank_size_]),	97 rho_(new float[bank_size_]),

97 gains_eq_(new float[bank_size_]),	98 gains_eq_(new float[bank_size_]),

98 gain_applier_(freqs_, gain_limit),	99 gain_applier_(freqs_, config.gain_change_limit),

99 temp_out_buffer_(nullptr),	100 temp_out_buffer_(nullptr),

100 input_audio_(new float* [channels]),

101 kbd_window_(new float[window_size_]),	101 kbd_window_(new float[window_size_]),

102 render_callback_(this, AudioSource::kRenderStream),	102 render_callback_(this, AudioSource::kRenderStream),

103 capture_callback_(this, AudioSource::kCaptureStream),	103 capture_callback_(this, AudioSource::kCaptureStream),

104 block_count_(0),	104 block_count_(0),

105 analysis_step_(0),	105 analysis_step_(0),

106 vad_high_(WebRtcVad_Create()),	106 vad_high_(WebRtcVad_Create()),

107 vad_low_(WebRtcVad_Create()),	107 vad_low_(WebRtcVad_Create()),

108 vad_tmp_buffer_(new int16_t[chunk_length_]) {	108 vad_tmp_buffer_(new int16_t[chunk_length_]) {

109 DCHECK_LE(kConfigRho, 1.0f);	109 DCHECK_LE(config.rho, 1.0f);

110	110

111 CreateErbBank();	111 CreateErbBank();

112	112

113 WebRtcVad_Init(vad_high_);	113 WebRtcVad_Init(vad_high_);

114 WebRtcVad_set_mode(vad_high_, 0); // High likelihood of speech.	114 WebRtcVad_set_mode(vad_high_, 0); // High likelihood of speech.

115 WebRtcVad_Init(vad_low_);	115 WebRtcVad_Init(vad_low_);

116 WebRtcVad_set_mode(vad_low_, 3); // Low likelihood of speech.	116 WebRtcVad_set_mode(vad_low_, 3); // Low likelihood of speech.

117	117

118 temp_out_buffer_ = static_cast<float**>(	118 temp_out_buffer_ = static_cast<float**>(

119 malloc(sizeof(*temp_out_buffer_) * channels_ +	119 malloc(sizeof(*temp_out_buffer_) * channels_ +

120 sizeof(**temp_out_buffer_) * chunk_length_ * channels_));	120 sizeof(**temp_out_buffer_) * chunk_length_ * channels_));

121 for (int i = 0; i < channels_; ++i) {	121 for (int i = 0; i < channels_; ++i) {

122 temp_out_buffer_[i] =	122 temp_out_buffer_[i] =

123 reinterpret_cast<float*>(temp_out_buffer_ + channels_) +	123 reinterpret_cast<float*>(temp_out_buffer_ + channels_) +

124 chunk_length_ * i;	124 chunk_length_ * i;

125 }	125 }

126	126

127 // Assumes all rho equal.	127 // Assumes all rho equal.

128 for (int i = 0; i < bank_size_; ++i) {	128 for (int i = 0; i < bank_size_; ++i) {

129 rho_[i] = kConfigRho * kConfigRho;	129 rho_[i] = config.rho * config.rho;

130 }	130 }

131	131

132 float freqs_khz = kClipFreq / 1000.0f;	132 float freqs_khz = kClipFreq / 1000.0f;

133 int erb_index = static_cast<int>(ceilf(	133 int erb_index = static_cast<int>(ceilf(

134 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));	134 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));

135 start_freq_ = max(1, erb_index * kErbResolution);	135 start_freq_ = max(1, erb_index * erb_resolution_);

136	136

137 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,	137 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,

138 kbd_window_.get());	138 kbd_window_.get());

139 render_mangler_.reset(new LappedTransform(	139 render_mangler_.reset(new LappedTransform(

140 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,	140 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,

141 window_size_ / 2, &render_callback_));	141 window_size_ / 2, &render_callback_));

142 capture_mangler_.reset(new LappedTransform(	142 capture_mangler_.reset(new LappedTransform(

143 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,	143 channels_, channels_, chunk_length_, kbd_window_.get(), window_size_,

144 window_size_ / 2, &capture_callback_));	144 window_size_ / 2, &capture_callback_));

145 }	145 }

146	146

147 IntelligibilityEnhancer::~IntelligibilityEnhancer() {	147 IntelligibilityEnhancer::~IntelligibilityEnhancer() {

148 WebRtcVad_Free(vad_low_);	148 WebRtcVad_Free(vad_low_);

149 WebRtcVad_Free(vad_high_);	149 WebRtcVad_Free(vad_high_);

150 free(temp_out_buffer_);	150 free(temp_out_buffer_);

151 }	151 }

152	152

153 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {	153 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {

	154 ProcessRenderAudio(audio, kVoiceDetected);

	155 }

	156

	157 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
	turaj 2015/07/14 18:28:51 I did not comprehend the logic here. \|voice_probability\| specifies whether the current chunk is active (see below for more explanation on voice probability). Therefore, it has to be used as an indicator of whether the current chunk should be considered in variance computation or not. If we have \|voice_probability\|, we should not need the WebRTC VAD decision. Furthermore, why is a call to \|render_mangler_\| based on \|voice_probability\|? I don't think we should stop applying enhancement gains as soon as a chunk has low activity probability. I think we should keep on applying the enhancer, but low-activity chunks are discarded from variance computation. We need yet another threshold which specifies whether enhancement should be applied at all. So I think the flow of the program is: 1) Update variance of the to-be-rendered signal on active chunks. 2) Update variance of the captured signal on inactive chunks. 3) Make a decision, based on the current state of the enhancer (on or off) and the current signal-to-noise ratio, whether the enhancer should be on or off. Obviously steps 1) and 2) are done through different API calls. I like the idea of having two ProcessRenderAudio() overloads, but I thought the caller is supposed to call ProcessRenderAudio(float* const* audio) if they do not have any information regarding the activity of the given audio chunk; then we call WebRTC VAD to get the activity flag. On the other hand, if the caller has info regarding the activity, they call ProcessRenderAudio(float* const* audio, float voice_probability); then the given voice_probability is compared with a threshold to decide whether the given frame should be included in variance computation. Same thing for ProcessCaptureAudio(). ekm 2015/07/17 19:59:38 On 2015/07/14 18:28:51, turaj wrote: > I did not comprehend the logic here. 
> > \|voice_probability\| specifies whether the current chunk is active (see below for more > explanation on voice probability). Therefore, it has to be used as an indicator > of whether the current chunk should be considered in variance computation or not. > If we have \|voice_probability\|, we should not need the WebRTC VAD decision. > > Furthermore, why is a call to \|render_mangler_\| based on \|voice_probability\|? I > don't think we should stop applying enhancement gains as soon as a chunk has low > activity probability. I think we should keep on applying the enhancer, but > low-activity chunks are discarded from variance computation. > > We need yet another threshold which specifies whether enhancement should be applied > at all. So I think the flow of the program is: > 1) Update variance of the to-be-rendered signal on active chunks. > 2) Update variance of the captured signal on inactive chunks. > 3) Make a decision, based on the current state of the enhancer (on or off) and the > current signal-to-noise ratio, whether the enhancer should be on or off. > > Obviously steps 1) and 2) are done through different API calls. > > I like the idea of having two ProcessRenderAudio() overloads, but I thought the caller is > supposed to call ProcessRenderAudio(float* const* audio) if they do not have any > information regarding the activity of the given audio chunk; then we call WebRTC > VAD to get the activity flag. On the other hand, if the caller has info regarding > the activity, they call ProcessRenderAudio(float* const* audio, float > voice_probability); then the given voice_probability is compared with a > threshold to decide whether the given frame should be included in variance > computation. > > Same thing for ProcessCaptureAudio(). Done. You're right, the logic was off and not fully implemented. I've updated everything to meet this spec and use the new VAD. It also uses smoothing when deactivating. There may be a more intuitive logic than what I have now. See UpdateActivity().
	158 float voice_probability) {

154 for (int i = 0; i < chunk_length_; ++i) {	159 for (int i = 0; i < chunk_length_; ++i) {

155 vad_tmp_buffer_[i] = (int16_t)audio[0][i];	160 vad_tmp_buffer_[i] = (int16_t)audio[0][i];
	turaj 2015/07/14 18:28:51 You had better check with the APM guys, but I suppose you get audio in the range -1 to 1; therefore, a cast to int16_t is not a good idea. aluebs-webrtc 2015/07/15 01:02:04 On 2015/07/14 18:28:51, turaj wrote: > You had better check with the APM guys, but I suppose you get audio in the range -1 to 1; > therefore, a cast to int16_t is not a good idea. No, you get audio in the int16_t range. But for this there are beautiful tools under audio_util. Also, if you need the audio in both int16_t and float, you can receive an IFChannelBuffer and let it take care of this. Not sure if it makes it easier or not. ekm 2015/07/17 19:59:38 On 2015/07/14 18:28:51, turaj wrote: > You had better check with the APM guys, but I suppose you get audio in the range -1 to 1; > therefore, a cast to int16_t is not a good idea. Done. audio_util is great!
156 }	161 }

157 has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_,	162 has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_,

158 vad_tmp_buffer_.get(), chunk_length_) == 1;	163 vad_tmp_buffer_.get(), chunk_length_) == 1;

159	164

160 // Process and enhance chunk of \|audio\|	165 // Process and enhance chunk of \|audio\|

161 render_mangler_->ProcessChunk(audio, temp_out_buffer_);	166 if (voice_probability >= render_vad_thresh_) {

	167 render_mangler_->ProcessChunk(audio, temp_out_buffer_);

	168 }

162	169

163 for (int i = 0; i < channels_; ++i) {	170 for (int i = 0; i < channels_; ++i) {

164 memcpy(audio[i], temp_out_buffer_[i],	171 memcpy(audio[i], temp_out_buffer_[i],

165 chunk_length_ * sizeof(**temp_out_buffer_));	172 chunk_length_ * sizeof(**temp_out_buffer_));

166 }	173 }

167 }	174 }

168	175

169 void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) {	176 void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) {

	177 ProcessCaptureAudio(audio, kNoiseDetected);

	178 }

	179

	180 void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio,

	181 float voice_probability) {

170 for (int i = 0; i < chunk_length_; ++i) {	182 for (int i = 0; i < chunk_length_; ++i) {

171 vad_tmp_buffer_[i] = (int16_t)audio[0][i];	183 vad_tmp_buffer_[i] = (int16_t)audio[0][i];

172 }	184 }

173 // TODO(bercic): The VAD was always detecting voice in the noise stream,	185 // TODO(bercic): The VAD was always detecting voice in the noise stream,

174 // no matter what the aggressiveness, so it was temporarily disabled here.	186 // no matter what the aggressiveness, so it was temporarily disabled here.

175	187

176 #if 0	188 #if 0

177 if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(),	189 if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(),

178 chunk_length_) == 1) {	190 chunk_length_) == 1) {

179 printf("capture HAS speech\n");	191 printf("capture HAS speech\n");

180 return;	192 return;

181 }	193 }

182 printf("capture NO speech\n");	194 printf("capture NO speech\n");

183 #endif	195 #endif

184	196

185 capture_mangler_->ProcessChunk(audio, temp_out_buffer_);	197 if (voice_probability <= capture_vad_thresh_) {

	198 capture_mangler_->ProcessChunk(audio, temp_out_buffer_);

	199 }

186 }	200 }

187	201

188 void IntelligibilityEnhancer::DispatchAudio(	202 void IntelligibilityEnhancer::DispatchAudio(

189 IntelligibilityEnhancer::AudioSource source,	203 IntelligibilityEnhancer::AudioSource source,

190 const complex<float>* in_block,	204 const complex<float>* in_block,

191 complex<float>* out_block) {	205 complex<float>* out_block) {

192 switch (source) {	206 switch (source) {

193 case kRenderStream:	207 case kRenderStream:

194 ProcessClearBlock(in_block, out_block);	208 ProcessClearBlock(in_block, out_block);

195 break;	209 break;

(...skipping 15 matching lines...) Expand all Loading...
211 // TODO(ekmeyerson): Change to only enhance if necessary,	225 // TODO(ekmeyerson): Change to only enhance if necessary,

212 // based on experiments with different cutoffs.	226 // based on experiments with different cutoffs.

213 if (has_voice_low_ \|\| true) {	227 if (has_voice_low_ \|\| true) {

214 clear_variance_.Step(in_block, false);	228 clear_variance_.Step(in_block, false);

215 const float power_target = std::accumulate(	229 const float power_target = std::accumulate(

216 clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.0f);	230 clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.0f);

217	231

218 if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {	232 if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {

219 AnalyzeClearBlock(power_target);	233 AnalyzeClearBlock(power_target);

220 ++analysis_step_;	234 ++analysis_step_;

221 if (analysis_step_ == variance_rate_) {

222 analysis_step_ = 0;

223 clear_variance_.Clear();

224 noise_variance_.Clear();

225 }

226 }	235 }

227 ++block_count_;	236 ++block_count_;

228 }	237 }

229	238

230 /* efidata(n,:) = sqrt(b(n)) * fidata(n,:) */	239 /* efidata(n,:) = sqrt(b(n)) * fidata(n,:) */

231 gain_applier_.Apply(in_block, out_block);	240 gain_applier_.Apply(in_block, out_block);

232 }	241 }

233	242

234 void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) {	243 void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) {

235 FilterVariance(clear_variance_.variance(), filtered_clear_var_.get());	244 FilterVariance(clear_variance_.variance(), filtered_clear_var_.get());

(...skipping 164 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
400 int length) {	409 int length) {

401 float ret = 0.0f;	410 float ret = 0.0f;

402	411

403 for (int i = 0; i < length; ++i) {	412 for (int i = 0; i < length; ++i) {

404 ret = fmaf(a[i], b[i], ret);	413 ret = fmaf(a[i], b[i], ret);

405 }	414 }

406 return ret;	415 return ret;

407 }	416 }

408	417

409 } // namespace webrtc	418 } // namespace webrtc

OLD	NEW