Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(159)

Side by Side Diff: webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc

Issue 1693823004: Use VAD to get a better speech power estimation in the IntelligibilityEnhancer (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@pow
Patch Set: Make gain change limit relative Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 //
12 // Implements core class for intelligibility enhancer.
13 //
14 // Details of the model and algorithm can be found in the original paper:
15 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788
16 //
17
18 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h" 11 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"
19 12
20 #include <math.h> 13 #include <math.h>
21 #include <stdlib.h> 14 #include <stdlib.h>
22 #include <algorithm> 15 #include <algorithm>
23 #include <limits> 16 #include <limits>
24 #include <numeric> 17 #include <numeric>
25 18
26 #include "webrtc/base/checks.h" 19 #include "webrtc/base/checks.h"
27 #include "webrtc/common_audio/include/audio_util.h" 20 #include "webrtc/common_audio/include/audio_util.h"
28 #include "webrtc/common_audio/window_generator.h" 21 #include "webrtc/common_audio/window_generator.h"
29 22
30 namespace webrtc { 23 namespace webrtc {
31 24
32 namespace { 25 namespace {
33 26
34 const size_t kErbResolution = 2; 27 const size_t kErbResolution = 2;
35 const int kWindowSizeMs = 2; 28 const int kWindowSizeMs = 16;
36 const int kChunkSizeMs = 10; // Size provided by APM. 29 const int kChunkSizeMs = 10; // Size provided by APM.
37 const float kClipFreq = 200.0f; 30 const float kClipFreqKhz = 0.2f;
38 const float kConfigRho = 0.02f; // Default production and interpretation SNR.
39 const float kKbdAlpha = 1.5f; 31 const float kKbdAlpha = 1.5f;
40 const float kLambdaBot = -1.0f; // Extreme values in bisection 32 const float kLambdaBot = -1.0f; // Extreme values in bisection
41 const float kLambdaTop = -10e-18f; // search for lambda. 33 const float kLambdaTop = -10e-18f; // search for lambda.
34 const float kVoiceProbabilityThreshold = 0.02;
35 // Number of chunks after voice activity which is still considered speech.
36 const size_t kSpeechOffsetDelay = 80;
37 const float kDecayRate = 0.98f; // Power estimation decay rate.
38 const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain.
39 const float kRho = 0.0004f; // Default production and interpretation SNR.
42 40
43 // Returns dot product of vectors |a| and |b| with size |length|. 41 // Returns dot product of vectors |a| and |b| with size |length|.
44 float DotProduct(const float* a, const float* b, size_t length) { 42 float DotProduct(const float* a, const float* b, size_t length) {
45 float ret = 0.f; 43 float ret = 0.f;
46 for (size_t i = 0; i < length; ++i) { 44 for (size_t i = 0; i < length; ++i) {
47 ret = fmaf(a[i], b[i], ret); 45 ret = fmaf(a[i], b[i], ret);
48 } 46 }
49 return ret; 47 return ret;
50 } 48 }
51 49
52 // Computes the power across ERB filters from the power spectral density |var|. 50 // Computes the power across ERB bands from the power spectral density |pow|.
53 // Stores it in |result|. 51 // Stores it in |result|.
54 void FilterVariance(const float* var, 52 void MapToErbBands(const float* pow,
55 const std::vector<std::vector<float>>& filter_bank, 53 const std::vector<std::vector<float>>& filter_bank,
56 float* result) { 54 float* result) {
57 for (size_t i = 0; i < filter_bank.size(); ++i) { 55 for (size_t i = 0; i < filter_bank.size(); ++i) {
58 RTC_DCHECK_GT(filter_bank[i].size(), 0u); 56 RTC_DCHECK_GT(filter_bank[i].size(), 0u);
59 result[i] = DotProduct(&filter_bank[i][0], var, filter_bank[i].size()); 57 result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size());
60 } 58 }
61 } 59 }
62 60
63 } // namespace 61 } // namespace
64 62
65 using std::complex;
66 using std::max;
67 using std::min;
68 using VarianceType = intelligibility::VarianceArray::StepType;
69
70 IntelligibilityEnhancer::TransformCallback::TransformCallback( 63 IntelligibilityEnhancer::TransformCallback::TransformCallback(
71 IntelligibilityEnhancer* parent) 64 IntelligibilityEnhancer* parent)
72 : parent_(parent) { 65 : parent_(parent) {
73 } 66 }
74 67
75 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock( 68 void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(
76 const complex<float>* const* in_block, 69 const std::complex<float>* const* in_block,
77 size_t in_channels, 70 size_t in_channels,
78 size_t frames, 71 size_t frames,
79 size_t /* out_channels */, 72 size_t /* out_channels */,
80 complex<float>* const* out_block) { 73 std::complex<float>* const* out_block) {
81 RTC_DCHECK_EQ(parent_->freqs_, frames); 74 RTC_DCHECK_EQ(parent_->freqs_, frames);
82 for (size_t i = 0; i < in_channels; ++i) { 75 for (size_t i = 0; i < in_channels; ++i) {
83 parent_->ProcessClearBlock(in_block[i], out_block[i]); 76 parent_->ProcessClearBlock(in_block[i], out_block[i]);
84 } 77 }
85 } 78 }
86 79
87 IntelligibilityEnhancer::IntelligibilityEnhancer() 80 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
88 : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) { 81 size_t num_render_channels)
89 }
90
91 IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)
92 : freqs_(RealFourier::ComplexLength( 82 : freqs_(RealFourier::ComplexLength(
93 RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))), 83 RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
94 window_size_(static_cast<size_t>(1 << RealFourier::FftOrder(freqs_))), 84 chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),
95 chunk_length_( 85 bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),
96 static_cast<size_t>(config.sample_rate_hz * kChunkSizeMs / 1000)), 86 sample_rate_hz_(sample_rate_hz),
97 bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)), 87 num_render_channels_(num_render_channels),
98 sample_rate_hz_(config.sample_rate_hz), 88 clear_power_estimator_(freqs_, kDecayRate),
99 erb_resolution_(kErbResolution), 89 noise_power_estimator_(
100 num_capture_channels_(config.num_capture_channels), 90 new intelligibility::PowerEstimator(freqs_, kDecayRate)),
101 num_render_channels_(config.num_render_channels), 91 filtered_clear_pow_(new float[bank_size_]),
102 analysis_rate_(config.analysis_rate), 92 filtered_noise_pow_(new float[bank_size_]),
103 active_(true),
104 clear_variance_(freqs_,
105 config.var_type,
106 config.var_window_size,
107 config.var_decay_rate),
108 noise_power_(freqs_, 0.f),
109 filtered_clear_var_(new float[bank_size_]),
110 filtered_noise_var_(new float[bank_size_]),
111 center_freqs_(new float[bank_size_]), 93 center_freqs_(new float[bank_size_]),
112 render_filter_bank_(CreateErbBank(freqs_)), 94 render_filter_bank_(CreateErbBank(freqs_)),
113 rho_(new float[bank_size_]),
114 gains_eq_(new float[bank_size_]), 95 gains_eq_(new float[bank_size_]),
115 gain_applier_(freqs_, config.gain_change_limit), 96 gain_applier_(freqs_, kMaxRelativeGainChange),
116 temp_render_out_buffer_(chunk_length_, num_render_channels_), 97 temp_render_out_buffer_(chunk_length_, num_render_channels_),
117 kbd_window_(new float[window_size_]),
118 render_callback_(this), 98 render_callback_(this),
119 block_count_(0), 99 audio_s16_(chunk_length_),
120 analysis_step_(0) { 100 chunks_since_voice_(kSpeechOffsetDelay),
121 RTC_DCHECK_LE(config.rho, 1.0f); 101 is_speech_(false) {
102 RTC_DCHECK_LE(kRho, 1.f);
122 103
123 memset(filtered_clear_var_.get(), 104 memset(filtered_clear_pow_.get(), 0,
124 0, 105 bank_size_ * sizeof(filtered_clear_pow_[0]));
125 bank_size_ * sizeof(filtered_clear_var_[0])); 106 memset(filtered_noise_pow_.get(), 0,
126 memset(filtered_noise_var_.get(), 107 bank_size_ * sizeof(filtered_noise_pow_[0]));
127 0,
128 bank_size_ * sizeof(filtered_noise_var_[0]));
129 108
130 // Assumes all rho equal. 109 const size_t erb_index = static_cast<size_t>(
131 for (size_t i = 0; i < bank_size_; ++i) { 110 ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +
132 rho_[i] = config.rho * config.rho; 111 43.f));
133 } 112 start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);
134 113
135 float freqs_khz = kClipFreq / 1000.0f; 114 size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));
136 size_t erb_index = static_cast<size_t>(ceilf( 115 std::vector<float> kbd_window(window_size);
137 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f)); 116 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);
138 start_freq_ = std::max(static_cast<size_t>(1), erb_index * erb_resolution_);
139
140 WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,
141 kbd_window_.get());
142 render_mangler_.reset(new LappedTransform( 117 render_mangler_.reset(new LappedTransform(
143 num_render_channels_, num_render_channels_, chunk_length_, 118 num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],
144 kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_)); 119 window_size, window_size / 2, &render_callback_));
145 } 120 }
146 121
147 void IntelligibilityEnhancer::SetCaptureNoiseEstimate( 122 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
148 std::vector<float> noise) { 123 std::vector<float> noise) {
149 if (capture_filter_bank_.size() != bank_size_ || 124 if (capture_filter_bank_.size() != bank_size_ ||
150 capture_filter_bank_[0].size() != noise.size()) { 125 capture_filter_bank_[0].size() != noise.size()) {
151 capture_filter_bank_ = CreateErbBank(noise.size()); 126 capture_filter_bank_ = CreateErbBank(noise.size());
127 noise_power_estimator_.reset(
128 new intelligibility::PowerEstimator(noise.size(), kDecayRate));
152 } 129 }
153 if (noise.size() != noise_power_.size()) { 130 noise_power_estimator_->Step(&noise[0]);
154 noise_power_.resize(noise.size());
155 }
156 for (size_t i = 0; i < noise.size(); ++i) {
157 noise_power_[i] = noise[i] * noise[i];
158 }
159 } 131 }
160 132
161 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, 133 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
162 int sample_rate_hz, 134 int sample_rate_hz,
163 size_t num_channels) { 135 size_t num_channels) {
164 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz); 136 RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
165 RTC_CHECK_EQ(num_render_channels_, num_channels); 137 RTC_CHECK_EQ(num_render_channels_, num_channels);
166 138 is_speech_ = IsSpeech(audio[0]);
167 if (active_) { 139 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());
168 render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels()); 140 for (size_t i = 0; i < num_render_channels_; ++i) {
169 } 141 memcpy(audio[i], temp_render_out_buffer_.channels()[i],
170 142 chunk_length_ * sizeof(**audio));
171 if (active_) {
172 for (size_t i = 0; i < num_render_channels_; ++i) {
173 memcpy(audio[i], temp_render_out_buffer_.channels()[i],
174 chunk_length_ * sizeof(**audio));
175 }
176 } 143 }
177 } 144 }
178 145
179 void IntelligibilityEnhancer::ProcessClearBlock(const complex<float>* in_block, 146 void IntelligibilityEnhancer::ProcessClearBlock(
180 complex<float>* out_block) { 147 const std::complex<float>* in_block,
181 if (block_count_ < 2) { 148 std::complex<float>* out_block) {
182 memset(out_block, 0, freqs_ * sizeof(*out_block)); 149 if (is_speech_) {
183 ++block_count_; 150 clear_power_estimator_.Step(in_block);
184 return;
185 } 151 }
186 152 const std::vector<float>& clear_power = clear_power_estimator_.power();
187 // TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary. 153 const std::vector<float>& noise_power = noise_power_estimator_->power();
188 if (true) { 154 MapToErbBands(&clear_power[0], render_filter_bank_,
189 clear_variance_.Step(in_block, false); 155 filtered_clear_pow_.get());
190 if (block_count_ % analysis_rate_ == analysis_rate_ - 1) { 156 MapToErbBands(&noise_power[0], capture_filter_bank_,
191 const float power_target = std::accumulate( 157 filtered_noise_pow_.get());
192 clear_variance_.variance(), clear_variance_.variance() + freqs_, 0.f);
193 AnalyzeClearBlock(power_target);
194 ++analysis_step_;
195 }
196 ++block_count_;
197 }
198
199 if (active_) {
200 gain_applier_.Apply(in_block, out_block);
201 }
202 }
203
204 void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) {
205 FilterVariance(clear_variance_.variance(),
206 render_filter_bank_,
207 filtered_clear_var_.get());
208 FilterVariance(&noise_power_[0],
209 capture_filter_bank_,
210 filtered_noise_var_.get());
211 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get()); 158 SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());
159 const float power_target =
160 std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);
212 const float power_top = 161 const float power_top =
213 DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_); 162 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
214 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get()); 163 SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());
215 const float power_bot = 164 const float power_bot =
216 DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_); 165 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
217 if (power_target >= power_bot && power_target <= power_top) { 166 if (power_target >= power_bot && power_target <= power_top) {
218 SolveForLambda(power_target, power_bot, power_top); 167 SolveForLambda(power_target, power_bot, power_top);
219 UpdateErbGains(); 168 UpdateErbGains();
220 } // Else experiencing variance underflow, so do nothing. 169 } // Else experiencing power underflow, so do nothing.
170 gain_applier_.Apply(in_block, out_block);
221 } 171 }
222 172
223 void IntelligibilityEnhancer::SolveForLambda(float power_target, 173 void IntelligibilityEnhancer::SolveForLambda(float power_target,
224 float power_bot, 174 float power_bot,
225 float power_top) { 175 float power_top) {
226 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values 176 const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values
227 const int kMaxIters = 100; // for these, based on experiments. 177 const int kMaxIters = 100; // for these, based on experiments.
228 178
229 const float reciprocal_power_target = 179 const float reciprocal_power_target =
230 1.f / (power_target + std::numeric_limits<float>::epsilon()); 180 1.f / (power_target + std::numeric_limits<float>::epsilon());
231 float lambda_bot = kLambdaBot; 181 float lambda_bot = kLambdaBot;
232 float lambda_top = kLambdaTop; 182 float lambda_top = kLambdaTop;
233 float power_ratio = 2.0f; // Ratio of achieved power to target power. 183 float power_ratio = 2.f; // Ratio of achieved power to target power.
234 int iters = 0; 184 int iters = 0;
235 while (std::fabs(power_ratio - 1.0f) > kConvergeThresh && 185 while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {
236 iters <= kMaxIters) { 186 const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.f;
237 const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.0f;
238 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get()); 187 SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());
239 const float power = 188 const float power =
240 DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_); 189 DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
241 if (power < power_target) { 190 if (power < power_target) {
242 lambda_bot = lambda; 191 lambda_bot = lambda;
243 } else { 192 } else {
244 lambda_top = lambda; 193 lambda_top = lambda;
245 } 194 }
246 power_ratio = std::fabs(power * reciprocal_power_target); 195 power_ratio = std::fabs(power * reciprocal_power_target);
247 ++iters; 196 ++iters;
248 } 197 }
249 } 198 }
250 199
251 void IntelligibilityEnhancer::UpdateErbGains() { 200 void IntelligibilityEnhancer::UpdateErbGains() {
252 // (ERB gain) = filterbank' * (freq gain) 201 // (ERB gain) = filterbank' * (freq gain)
253 float* gains = gain_applier_.target(); 202 float* gains = gain_applier_.target();
254 for (size_t i = 0; i < freqs_; ++i) { 203 for (size_t i = 0; i < freqs_; ++i) {
255 gains[i] = 0.0f; 204 gains[i] = 0.f;
256 for (size_t j = 0; j < bank_size_; ++j) { 205 for (size_t j = 0; j < bank_size_; ++j) {
257 gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]); 206 gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]);
258 } 207 }
259 } 208 }
260 } 209 }
261 210
262 size_t IntelligibilityEnhancer::GetBankSize(int sample_rate, 211 size_t IntelligibilityEnhancer::GetBankSize(int sample_rate,
263 size_t erb_resolution) { 212 size_t erb_resolution) {
264 float freq_limit = sample_rate / 2000.0f; 213 float freq_limit = sample_rate / 2000.f;
265 size_t erb_scale = static_cast<size_t>(ceilf( 214 size_t erb_scale = static_cast<size_t>(ceilf(
266 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f)); 215 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.f));
267 return erb_scale * erb_resolution; 216 return erb_scale * erb_resolution;
268 } 217 }
269 218
270 std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank( 219 std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(
271 size_t num_freqs) { 220 size_t num_freqs) {
272 std::vector<std::vector<float>> filter_bank(bank_size_); 221 std::vector<std::vector<float>> filter_bank(bank_size_);
273 size_t lf = 1, rf = 4; 222 size_t lf = 1, rf = 4;
274 223
275 for (size_t i = 0; i < bank_size_; ++i) { 224 for (size_t i = 0; i < bank_size_; ++i) {
276 float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_)); 225 float abs_temp = fabsf((i + 1.f) / static_cast<float>(kErbResolution));
277 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp)); 226 center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));
278 center_freqs_[i] -= 14678.49f; 227 center_freqs_[i] -= 14678.49f;
279 } 228 }
280 float last_center_freq = center_freqs_[bank_size_ - 1]; 229 float last_center_freq = center_freqs_[bank_size_ - 1];
281 for (size_t i = 0; i < bank_size_; ++i) { 230 for (size_t i = 0; i < bank_size_; ++i) {
282 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq; 231 center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq;
283 } 232 }
284 233
285 for (size_t i = 0; i < bank_size_; ++i) { 234 for (size_t i = 0; i < bank_size_; ++i) {
286 filter_bank[i].resize(num_freqs); 235 filter_bank[i].resize(num_freqs);
287 } 236 }
288 237
289 for (size_t i = 1; i <= bank_size_; ++i) { 238 for (size_t i = 1; i <= bank_size_; ++i) {
290 size_t lll, ll, rr, rrr; 239 size_t lll, ll, rr, rrr;
291 static const size_t kOne = 1; // Avoids repeated static_cast<>s below. 240 lll = static_cast<size_t>(round(center_freqs_[std::max(1ul, i - lf) - 1] *
292 lll = static_cast<size_t>(round( 241 num_freqs / (0.5f * sample_rate_hz_)));
293 center_freqs_[max(kOne, i - lf) - 1] * num_freqs / 242 ll = static_cast<size_t>(round(center_freqs_[std::max(1ul, i) - 1] *
294 (0.5f * sample_rate_hz_))); 243 num_freqs / (0.5f * sample_rate_hz_)));
295 ll = static_cast<size_t>(round( 244 lll = std::min(num_freqs, std::max(lll, 1ul)) - 1;
296 center_freqs_[max(kOne, i) - 1] * num_freqs / 245 ll = std::min(num_freqs, std::max(ll, 1ul)) - 1;
297 (0.5f * sample_rate_hz_)));
298 lll = min(num_freqs, max(lll, kOne)) - 1;
299 ll = min(num_freqs, max(ll, kOne)) - 1;
300 246
301 rrr = static_cast<size_t>(round( 247 rrr = static_cast<size_t>(
302 center_freqs_[min(bank_size_, i + rf) - 1] * num_freqs / 248 round(center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /
303 (0.5f * sample_rate_hz_))); 249 (0.5f * sample_rate_hz_)));
304 rr = static_cast<size_t>(round( 250 rr = static_cast<size_t>(
305 center_freqs_[min(bank_size_, i + 1) - 1] * num_freqs / 251 round(center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /
306 (0.5f * sample_rate_hz_))); 252 (0.5f * sample_rate_hz_)));
307 rrr = min(num_freqs, max(rrr, kOne)) - 1; 253 rrr = std::min(num_freqs, std::max(rrr, 1ul)) - 1;
308 rr = min(num_freqs, max(rr, kOne)) - 1; 254 rr = std::min(num_freqs, std::max(rr, 1ul)) - 1;
309 255
310 float step, element; 256 float step, element;
311 257
312 step = ll == lll ? 0.f : 1.f / (ll - lll); 258 step = ll == lll ? 0.f : 1.f / (ll - lll);
313 element = 0.0f; 259 element = 0.f;
314 for (size_t j = lll; j <= ll; ++j) { 260 for (size_t j = lll; j <= ll; ++j) {
315 filter_bank[i - 1][j] = element; 261 filter_bank[i - 1][j] = element;
316 element += step; 262 element += step;
317 } 263 }
318 step = rr == rrr ? 0.f : 1.f / (rrr - rr); 264 step = rr == rrr ? 0.f : 1.f / (rrr - rr);
319 element = 1.0f; 265 element = 1.f;
320 for (size_t j = rr; j <= rrr; ++j) { 266 for (size_t j = rr; j <= rrr; ++j) {
321 filter_bank[i - 1][j] = element; 267 filter_bank[i - 1][j] = element;
322 element -= step; 268 element -= step;
323 } 269 }
324 for (size_t j = ll; j <= rr; ++j) { 270 for (size_t j = ll; j <= rr; ++j) {
325 filter_bank[i - 1][j] = 1.0f; 271 filter_bank[i - 1][j] = 1.f;
326 } 272 }
327 } 273 }
328 274
329 float sum; 275 float sum;
330 for (size_t i = 0; i < num_freqs; ++i) { 276 for (size_t i = 0; i < num_freqs; ++i) {
331 sum = 0.0f; 277 sum = 0.f;
332 for (size_t j = 0; j < bank_size_; ++j) { 278 for (size_t j = 0; j < bank_size_; ++j) {
333 sum += filter_bank[j][i]; 279 sum += filter_bank[j][i];
334 } 280 }
335 for (size_t j = 0; j < bank_size_; ++j) { 281 for (size_t j = 0; j < bank_size_; ++j) {
336 filter_bank[j][i] /= sum; 282 filter_bank[j][i] /= sum;
337 } 283 }
338 } 284 }
339 return filter_bank; 285 return filter_bank;
340 } 286 }
341 287
342 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, 288 void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
343 size_t start_freq, 289 size_t start_freq,
344 float* sols) { 290 float* sols) {
345 bool quadratic = (kConfigRho < 1.0f); 291 bool quadratic = (kRho < 1.f);
346 const float* var_x0 = filtered_clear_var_.get(); 292 const float* pow_x0 = filtered_clear_pow_.get();
347 const float* var_n0 = filtered_noise_var_.get(); 293 const float* pow_n0 = filtered_noise_pow_.get();
348 294
349 for (size_t n = 0; n < start_freq; ++n) { 295 for (size_t n = 0; n < start_freq; ++n) {
350 sols[n] = 1.0f; 296 sols[n] = 1.f;
351 } 297 }
352 298
353 // Analytic solution for optimal gains. See paper for derivation. 299 // Analytic solution for optimal gains. See paper for derivation.
354 for (size_t n = start_freq - 1; n < bank_size_; ++n) { 300 for (size_t n = start_freq - 1; n < bank_size_; ++n) {
355 float alpha0, beta0, gamma0; 301 float alpha0, beta0, gamma0;
356 gamma0 = 0.5f * rho_[n] * var_x0[n] * var_n0[n] + 302 gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +
357 lambda * var_x0[n] * var_n0[n] * var_n0[n]; 303 lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];
358 beta0 = lambda * var_x0[n] * (2 - rho_[n]) * var_x0[n] * var_n0[n]; 304 beta0 = lambda * pow_x0[n] * (2 - kRho) * pow_x0[n] * pow_n0[n];
359 if (quadratic) { 305 if (quadratic) {
360 alpha0 = lambda * var_x0[n] * (1 - rho_[n]) * var_x0[n] * var_x0[n]; 306 alpha0 = lambda * pow_x0[n] * (1 - kRho) * pow_x0[n] * pow_x0[n];
361 sols[n] = 307 sols[n] =
362 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) / 308 (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) /
363 (2 * alpha0 + std::numeric_limits<float>::epsilon()); 309 (2 * alpha0 + std::numeric_limits<float>::epsilon());
364 } else { 310 } else {
365 sols[n] = -gamma0 / beta0; 311 sols[n] = -gamma0 / beta0;
366 } 312 }
367 sols[n] = fmax(0, sols[n]); 313 sols[n] = fmax(0, sols[n]);
368 } 314 }
369 } 315 }
370 316
371 bool IntelligibilityEnhancer::active() const { 317 bool IntelligibilityEnhancer::IsSpeech(const float* audio) {
372 return active_; 318 FloatToS16(audio, chunk_length_, &audio_s16_[0]);
319 vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);
320 if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {
321 chunks_since_voice_ = 0;
322 } else if (chunks_since_voice_ < kSpeechOffsetDelay) {
323 ++chunks_since_voice_;
324 }
325 return chunks_since_voice_ < kSpeechOffsetDelay;
373 } 326 }
374 327
375 } // namespace webrtc 328 } // namespace webrtc
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698