webrtc/modules/audio_processing/vad/vad_audio_proc.cc - Issue 1212543002: Pull the Voice Activity Detector out from the AGC

Unified Diff: webrtc/modules/audio_processing/vad/vad_audio_proc.cc

Issue 1212543002: Pull the Voice Activity Detector out from the AGC (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « webrtc/modules/audio_processing/vad/vad_audio_proc.h ('k') | webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h » ('j') | webrtc/modules/audio_processing/vad/voice_activity_detector.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: webrtc/modules/audio_processing/vad/vad_audio_proc.cc

diff --git a/webrtc/modules/audio_processing/agc/agc_audio_proc.cc b/webrtc/modules/audio_processing/vad/vad_audio_proc.cc

similarity index 78%

rename from webrtc/modules/audio_processing/agc/agc_audio_proc.cc

rename to webrtc/modules/audio_processing/vad/vad_audio_proc.cc

index dc4a5a711c7ef168e7e534190e03c6384dbd0cd8..e8f27f802de5468ca97e36e3e06ea82ef6d7cf44 100644

--- a/webrtc/modules/audio_processing/agc/agc_audio_proc.cc

+++ b/webrtc/modules/audio_processing/vad/vad_audio_proc.cc

@@ -8,15 +8,15 @@

* be found in the AUTHORS file in the root of the source tree.

-#include "webrtc/modules/audio_processing/agc/agc_audio_proc.h"

+#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"

#include <math.h>

#include <stdio.h>

#include "webrtc/common_audio/fft4g.h"

-#include "webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h"

-#include "webrtc/modules/audio_processing/agc/pitch_internal.h"

-#include "webrtc/modules/audio_processing/agc/pole_zero_filter.h"

+#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"

+#include "webrtc/modules/audio_processing/vad/pitch_internal.h"

+#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"

extern "C" {

#include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"

#include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"

@@ -29,23 +29,25 @@ namespace webrtc {

// The following structures are declared anonymous in iSAC's structs.h. To

// forward declare them, we use this derived class trick.

-struct AgcAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};

-struct AgcAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};

+struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};

+struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};

-static const float kFrequencyResolution = kSampleRateHz /

- static_cast<float>(AgcAudioProc::kDftSize);

+static const float kFrequencyResolution =

+ kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);

static const int kSilenceRms = 5;

-// TODO(turajs): Make a Create or Init for AgcAudioProc.

-AgcAudioProc::AgcAudioProc()

+// TODO(turajs): Make a Create or Init for VadAudioProc.

+VadAudioProc::VadAudioProc()

: audio_buffer_(),

num_buffer_samples_(kNumPastSignalSamples),

log_old_gain_(-2),

old_lag_(50), // Arbitrary but valid as pitch-lag (in samples).

pitch_analysis_handle_(new PitchAnalysisStruct),

pre_filter_handle_(new PreFiltBankstr),

- high_pass_filter_(PoleZeroFilter::Create(

- kCoeffNumerator, kFilterOrder, kCoeffDenominator, kFilterOrder)) {

+ high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,

+ kFilterOrder,

+ kCoeffDenominator,

+ kFilterOrder)) {

static_assert(kNumPastSignalSamples + kNumSubframeSamples ==

sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),

"lpc analysis window incorrect size");

@@ -64,15 +66,16 @@ AgcAudioProc::AgcAudioProc()

WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());

}

-AgcAudioProc::~AgcAudioProc() {}

+VadAudioProc::~VadAudioProc() {

-void AgcAudioProc::ResetBuffer() {

+void VadAudioProc::ResetBuffer() {

memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],

sizeof(audio_buffer_[0]) * kNumPastSignalSamples);

num_buffer_samples_ = kNumPastSignalSamples;

}

-int AgcAudioProc::ExtractFeatures(const int16_t* frame,

+int VadAudioProc::ExtractFeatures(const int16_t* frame,

int length,

AudioFeatures* features) {

features->num_frames = 0;

@@ -85,7 +88,7 @@ int AgcAudioProc::ExtractFeatures(const int16_t* frame,

// classification.

if (high_pass_filter_->Filter(frame, kNumSubframeSamples,

&audio_buffer_[num_buffer_samples_]) != 0) {

- return -1;

+ return -1;

}

num_buffer_samples_ += kNumSubframeSamples;

@@ -115,7 +118,8 @@ int AgcAudioProc::ExtractFeatures(const int16_t* frame,

}

// Computes |kLpcOrder + 1| correlation coefficients.

-void AgcAudioProc::SubframeCorrelation(double* corr, int length_corr,

+void VadAudioProc::SubframeCorrelation(double* corr,

+ int length_corr,

int subframe_index) {

assert(length_corr >= kLpcOrder + 1);

double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];

@@ -124,20 +128,20 @@ void AgcAudioProc::SubframeCorrelation(double* corr, int length_corr,

for (int n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)

windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];

- WebRtcIsac_AutoCorr(corr, windowed_audio, kNumSubframeSamples +

- kNumPastSignalSamples, kLpcOrder);

+ WebRtcIsac_AutoCorr(corr, windowed_audio,

+ kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);

}

// Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.

// The analysis window is 15 ms long and it is centered on the first half of

// each 10ms sub-frame. This is equivalent to computing LPC coefficients for the

// first half of each 10 ms subframe.

-void AgcAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) {

+void VadAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) {

assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));

double corr[kLpcOrder + 1];

double reflec_coeff[kLpcOrder];

for (int i = 0, offset_lpc = 0; i < kNum10msSubframes;

- i++, offset_lpc += kLpcOrder + 1) {

+ i++, offset_lpc += kLpcOrder + 1) {

SubframeCorrelation(corr, kLpcOrder + 1, i);

corr[0] *= 1.0001;

// This makes Lev-Durb a bit more stable.

@@ -150,7 +154,8 @@ void AgcAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) {

// Fit a second order curve to these 3 points and find the location of the

// extremum. The points are inverted before curve fitting.

-static float QuadraticInterpolation(float prev_val, float curr_val,

+static float QuadraticInterpolation(float prev_val,

+ float curr_val,

float next_val) {

// Doing the interpolation in |1 / A(z)|^2.

float fractional_index = 0;

@@ -158,8 +163,8 @@ static float QuadraticInterpolation(float prev_val, float curr_val,

prev_val = 1.0f / prev_val;

curr_val = 1.0f / curr_val;

- fractional_index = -(next_val - prev_val) * 0.5f / (next_val + prev_val -

- 2.f * curr_val);

+ fractional_index =

+ -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);

assert(fabs(fractional_index) < 1);

return fractional_index;

}

@@ -169,7 +174,7 @@ static float QuadraticInterpolation(float prev_val, float curr_val,

// with the local minimum of A(z). It saves complexity, as we save one

// inversion. Furthermore, we find the first local maximum of magnitude squared,

// to save on one square root.

-void AgcAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) {

+void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) {

assert(length_f_peak >= kNum10msSubframes);

double lpc[kNum10msSubframes * (kLpcOrder + 1)];

// For all sub-frames.

@@ -193,8 +198,8 @@ void AgcAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) {

float next_magn_sqr;

bool found_peak = false;

for (int n = 2; n < kNumDftCoefficients - 1; n++) {

- next_magn_sqr = data[2 * n] * data[2 * n] +

- data[2 * n + 1] * data[2 * n + 1];

+ next_magn_sqr =

+ data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];

if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {

found_peak = true;

index_peak = n - 1;

@@ -213,15 +218,16 @@ void AgcAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) {

} else {

// A peak is found, do a simple quadratic interpolation to get a more

// accurate estimate of the peak location.

- fractional_index = QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr,

- next_magn_sqr);

+ fractional_index =

+ QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);

}

f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;

}

// Using iSAC functions to estimate pitch gains & lags.

-void AgcAudioProc::PitchAnalysis(double* log_pitch_gains, double* pitch_lags_hz,

+void VadAudioProc::PitchAnalysis(double* log_pitch_gains,

+ double* pitch_lags_hz,

int length) {

// TODO(turajs): This can be "imported" from iSAC & and the next two

// constants.

@@ -241,28 +247,27 @@ void AgcAudioProc::PitchAnalysis(double* log_pitch_gains, double* pitch_lags_hz,

kNumLookaheadSamples];

// Split signal to lower and upper bands

- WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples],

- lower, upper, lower_lookahead, upper_lookahead,

+ WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,

+ upper, lower_lookahead, upper_lookahead,

pre_filter_handle_.get());

WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,

pitch_analysis_handle_.get(), lags, gains);

// Lags are computed on lower-band signal with sampling rate half of the

// input signal.

- GetSubframesPitchParameters(kSampleRateHz / 2, gains, lags,

- kNumPitchSubframes, kNum10msSubframes,

- &log_old_gain_, &old_lag_,

- log_pitch_gains, pitch_lags_hz);

+ GetSubframesPitchParameters(

+ kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,

+ &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);

}

-void AgcAudioProc::Rms(double* rms, int length_rms) {

+void VadAudioProc::Rms(double* rms, int length_rms) {

assert(length_rms >= kNum10msSubframes);

int offset = kNumPastSignalSamples;

for (int i = 0; i < kNum10msSubframes; i++) {

rms[i] = 0;

for (int n = 0; n < kNumSubframeSamples; n++, offset++)

rms[i] += audio_buffer_[offset] * audio_buffer_[offset];

- rms[i] = sqrt(rms[i] / kNumSubframeSamples);

+ rms[i] = sqrt(rms[i] / kNumSubframeSamples);

}