webrtc/modules/audio_processing/vad/pitch_based_vad.cc - Issue 1212543002: Pull the Voice Activity Detector out from the AGC

Side by Side Diff: webrtc/modules/audio_processing/vad/pitch_based_vad.cc

Issue 1212543002: Pull the Voice Activity Detector out from the AGC (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « webrtc/modules/audio_processing/vad/pitch_based_vad.h ('k') | webrtc/modules/audio_processing/vad/pitch_based_vad_unittest.cc » ('j') | webrtc/modules/audio_processing/vad/voice_activity_detector.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include "webrtc/modules/audio_processing/agc/pitch_based_vad.h"	11 #include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"

12	12

13 #include <assert.h>	13 #include <assert.h>

14 #include <math.h>	14 #include <math.h>

15 #include <string.h>	15 #include <string.h>

16	16

17 #include "webrtc/modules/audio_processing/agc/circular_buffer.h"	17 #include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h"

18 #include "webrtc/modules/audio_processing/agc/common.h"	18 #include "webrtc/modules/audio_processing/vad/common.h"

19 #include "webrtc/modules/audio_processing/agc/noise_gmm_tables.h"	19 #include "webrtc/modules/audio_processing/vad/noise_gmm_tables.h"

20 #include "webrtc/modules/audio_processing/agc/voice_gmm_tables.h"	20 #include "webrtc/modules/audio_processing/vad/voice_gmm_tables.h"

21 #include "webrtc/modules/interface/module_common_types.h"	21 #include "webrtc/modules/interface/module_common_types.h"

22	22

23 namespace webrtc {	23 namespace webrtc {

24	24

25 static_assert(kNoiseGmmDim == kVoiceGmmDim,	25 static_assert(kNoiseGmmDim == kVoiceGmmDim,

26 "noise and voice gmm dimension not equal");	26 "noise and voice gmm dimension not equal");

27	27

28 // These values should match MATLAB counterparts for unit-tests to pass.	28 // These values should match MATLAB counterparts for unit-tests to pass.

29 static const int kPosteriorHistorySize = 500; // 5 sec of 10 ms frames.	29 static const int kPosteriorHistorySize = 500; // 5 sec of 10 ms frames.

30 static const double kInitialPriorProbability = 0.3;	30 static const double kInitialPriorProbability = 0.3;

31 static const int kTransientWidthThreshold = 7;	31 static const int kTransientWidthThreshold = 7;

32 static const double kLowProbabilityThreshold = 0.2;	32 static const double kLowProbabilityThreshold = 0.2;

33	33

34 static double LimitProbability(double p) {	34 static double LimitProbability(double p) {

35 const double kLimHigh = 0.99;	35 const double kLimHigh = 0.99;

36 const double kLimLow = 0.01;	36 const double kLimLow = 0.01;

37	37

38 if (p > kLimHigh)	38 if (p > kLimHigh)

39 p = kLimHigh;	39 p = kLimHigh;

40 else if (p < kLimLow)	40 else if (p < kLimLow)

41 p = kLimLow;	41 p = kLimLow;

42 return p;	42 return p;

43 }	43 }

44	44

45 PitchBasedVad::PitchBasedVad()	45 PitchBasedVad::PitchBasedVad()

46 : p_prior_(kInitialPriorProbability),	46 : p_prior_(kInitialPriorProbability),

47 circular_buffer_(AgcCircularBuffer::Create(kPosteriorHistorySize)) {	47 circular_buffer_(VadCircularBuffer::Create(kPosteriorHistorySize)) {

48 // Setup noise GMM.	48 // Setup noise GMM.

49 noise_gmm_.dimension = kNoiseGmmDim;	49 noise_gmm_.dimension = kNoiseGmmDim;

50 noise_gmm_.num_mixtures = kNoiseGmmNumMixtures;	50 noise_gmm_.num_mixtures = kNoiseGmmNumMixtures;

51 noise_gmm_.weight = kNoiseGmmWeights;	51 noise_gmm_.weight = kNoiseGmmWeights;

52 noise_gmm_.mean = &kNoiseGmmMean[0][0];	52 noise_gmm_.mean = &kNoiseGmmMean[0][0];

53 noise_gmm_.covar_inverse = &kNoiseGmmCovarInverse[0][0][0];	53 noise_gmm_.covar_inverse = &kNoiseGmmCovarInverse[0][0][0];

54	54

55 // Setup voice GMM.	55 // Setup voice GMM.

56 voice_gmm_.dimension = kVoiceGmmDim;	56 voice_gmm_.dimension = kVoiceGmmDim;

57 voice_gmm_.num_mixtures = kVoiceGmmNumMixtures;	57 voice_gmm_.num_mixtures = kVoiceGmmNumMixtures;

58 voice_gmm_.weight = kVoiceGmmWeights;	58 voice_gmm_.weight = kVoiceGmmWeights;

59 voice_gmm_.mean = &kVoiceGmmMean[0][0];	59 voice_gmm_.mean = &kVoiceGmmMean[0][0];

60 voice_gmm_.covar_inverse = &kVoiceGmmCovarInverse[0][0][0];	60 voice_gmm_.covar_inverse = &kVoiceGmmCovarInverse[0][0][0];

61 }	61 }

62	62

63 PitchBasedVad::~PitchBasedVad() {}	63 PitchBasedVad::~PitchBasedVad() {

	64 }

64	65

65 int PitchBasedVad::VoicingProbability(const AudioFeatures& features,	66 int PitchBasedVad::VoicingProbability(const AudioFeatures& features,

66 double* p_combined) {	67 double* p_combined) {

67 double p;	68 double p;

68 double gmm_features[3];	69 double gmm_features[3];

69 double pdf_features_given_voice;	70 double pdf_features_given_voice;

70 double pdf_features_given_noise;	71 double pdf_features_given_noise;

71 // These limits are the same in matlab implementation 'VoicingProbGMM().'	72 // These limits are the same in matlab implementation 'VoicingProbGMM().'

72 const double kLimLowLogPitchGain = -2.0;	73 const double kLimLowLogPitchGain = -2.0;

73 const double kLimHighLogPitchGain = -0.9;	74 const double kLimHighLogPitchGain = -0.9;

74 const double kLimLowSpectralPeak = 200;	75 const double kLimLowSpectralPeak = 200;

75 const double kLimHighSpectralPeak = 2000;	76 const double kLimHighSpectralPeak = 2000;

76 const double kEps = 1e-12;	77 const double kEps = 1e-12;

77 for (int n = 0; n < features.num_frames; n++) {	78 for (int n = 0; n < features.num_frames; n++) {

78 gmm_features[0] = features.log_pitch_gain[n];	79 gmm_features[0] = features.log_pitch_gain[n];

79 gmm_features[1] = features.spectral_peak[n];	80 gmm_features[1] = features.spectral_peak[n];

80 gmm_features[2] = features.pitch_lag_hz[n];	81 gmm_features[2] = features.pitch_lag_hz[n];

81	82

82 pdf_features_given_voice = EvaluateGmm(gmm_features, voice_gmm_);	83 pdf_features_given_voice = EvaluateGmm(gmm_features, voice_gmm_);

83 pdf_features_given_noise = EvaluateGmm(gmm_features, noise_gmm_);	84 pdf_features_given_noise = EvaluateGmm(gmm_features, noise_gmm_);

84	85

85 if (features.spectral_peak[n] < kLimLowSpectralPeak ||	86 if (features.spectral_peak[n] < kLimLowSpectralPeak ||

86 features.spectral_peak[n] > kLimHighSpectralPeak ||	87 features.spectral_peak[n] > kLimHighSpectralPeak ||

87 features.log_pitch_gain[n] < kLimLowLogPitchGain) {	88 features.log_pitch_gain[n] < kLimLowLogPitchGain) {

88 pdf_features_given_voice = kEps * pdf_features_given_noise;	89 pdf_features_given_voice = kEps * pdf_features_given_noise;

89 } else if (features.log_pitch_gain[n] > kLimHighLogPitchGain) {	90 } else if (features.log_pitch_gain[n] > kLimHighLogPitchGain) {

90 pdf_features_given_noise = kEps * pdf_features_given_voice;	91 pdf_features_given_noise = kEps * pdf_features_given_voice;

91 }	92 }

92	93

93 p = p_prior_ * pdf_features_given_voice / (pdf_features_given_voice *	94 p = p_prior_ * pdf_features_given_voice /

94 p_prior_ + pdf_features_given_noise * (1 - p_prior_));	95 (pdf_features_given_voice * p_prior_ +

	96 pdf_features_given_noise * (1 - p_prior_));

95	97

96 p = LimitProbability(p);	98 p = LimitProbability(p);

97	99

98 // Combine pitch-based probability with standalone probability, before	100 // Combine pitch-based probability with standalone probability, before

99 // updating prior probabilities.	101 // updating prior probabilities.

100 double prod_active = p * p_combined[n];	102 double prod_active = p * p_combined[n];

101 double prod_inactive = (1 - p) * (1 - p_combined[n]);	103 double prod_inactive = (1 - p) * (1 - p_combined[n]);

102 p_combined[n] = prod_active / (prod_active + prod_inactive);	104 p_combined[n] = prod_active / (prod_active + prod_inactive);

103	105

104 if (UpdatePrior(p_combined[n]) < 0)	106 if (UpdatePrior(p_combined[n]) < 0)

105 return -1;	107 return -1;

106 // Limit prior probability. With a zero prior probability the posterior	108 // Limit prior probability. With a zero prior probability the posterior

107 // probability is always zero.	109 // probability is always zero.

108 p_prior_ = LimitProbability(p_prior_);	110 p_prior_ = LimitProbability(p_prior_);

109 }	111 }

110 return 0;	112 return 0;

111 }	113 }

112	114

113 int PitchBasedVad::UpdatePrior(double p) {	115 int PitchBasedVad::UpdatePrior(double p) {

114 circular_buffer_->Insert(p);	116 circular_buffer_->Insert(p);

115 if (circular_buffer_->RemoveTransient(kTransientWidthThreshold,	117 if (circular_buffer_->RemoveTransient(kTransientWidthThreshold,

116 kLowProbabilityThreshold) < 0)	118 kLowProbabilityThreshold) < 0)

117 return -1;	119 return -1;

118 p_prior_ = circular_buffer_->Mean();	120 p_prior_ = circular_buffer_->Mean();

119 return 0;	121 return 0;

120 }	122 }

121	123

122 } // namespace webrtc	124 } // namespace webrtc

OLD	NEW