OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
115 // type of signal is most probable. | 115 // type of signal is most probable. |
116 // | 116 // |
117 // - self [i/o] : Pointer to VAD instance | 117 // - self [i/o] : Pointer to VAD instance |
118 // - features [i] : Feature vector of length |kNumChannels| | 118 // - features [i] : Feature vector of length |kNumChannels| |
119 // = log10(energy in frequency band) | 119 // = log10(energy in frequency band) |
120 // - total_power [i] : Total power in audio frame. | 120 // - total_power [i] : Total power in audio frame. |
121 // - frame_length [i] : Number of input samples | 121 // - frame_length [i] : Number of input samples |
122 // | 122 // |
123 // - returns : the VAD decision (0 - noise, 1 - speech). | 123 // - returns : the VAD decision (0 - noise, 1 - speech). |
124 static int16_t GmmProbability(VadInstT* self, int16_t* features, | 124 static int16_t GmmProbability(VadInstT* self, int16_t* features, |
125 int16_t total_power, int frame_length) { | 125 int16_t total_power, size_t frame_length) { |
126 int channel, k; | 126 int channel, k; |
127 int16_t feature_minimum; | 127 int16_t feature_minimum; |
128 int16_t h0, h1; | 128 int16_t h0, h1; |
129 int16_t log_likelihood_ratio; | 129 int16_t log_likelihood_ratio; |
130 int16_t vadflag = 0; | 130 int16_t vadflag = 0; |
131 int16_t shifts_h0, shifts_h1; | 131 int16_t shifts_h0, shifts_h1; |
132 int16_t tmp_s16, tmp1_s16, tmp2_s16; | 132 int16_t tmp_s16, tmp1_s16, tmp2_s16; |
133 int16_t diff; | 133 int16_t diff; |
134 int gaussian; | 134 int gaussian; |
135 int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk; | 135 int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk; |
(...skipping 453 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
589 break; | 589 break; |
590 } | 590 } |
591 | 591 |
592 return return_value; | 592 return return_value; |
593 } | 593 } |
594 | 594 |
595 // Calculate VAD decision by first extracting feature values and then calculate | 595 // Calculate VAD decision by first extracting feature values and then calculate |
596 // probability for both speech and background noise. | 596 // probability for both speech and background noise. |
597 | 597 |
598 int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame, | 598 int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame, |
599 int frame_length) { | 599 size_t frame_length) { |
600 int vad; | 600 int vad; |
601 int i; | 601 size_t i; |
602 int16_t speech_nb[240]; // 30 ms in 8 kHz. | 602 int16_t speech_nb[240]; // 30 ms in 8 kHz. |
603 // |tmp_mem| is a temporary memory used by resample function, length is | 603 // |tmp_mem| is a temporary memory used by resample function, length is |
604 // frame length in 10 ms (480 samples) + 256 extra. | 604 // frame length in 10 ms (480 samples) + 256 extra. |
605 int32_t tmp_mem[480 + 256] = { 0 }; | 605 int32_t tmp_mem[480 + 256] = { 0 }; |
606 const int kFrameLen10ms48khz = 480; | 606 const size_t kFrameLen10ms48khz = 480; |
607 const int kFrameLen10ms8khz = 80; | 607 const size_t kFrameLen10ms8khz = 80; |
608 int num_10ms_frames = frame_length / kFrameLen10ms48khz; | 608 size_t num_10ms_frames = frame_length / kFrameLen10ms48khz; |
609 | 609 |
610 for (i = 0; i < num_10ms_frames; i++) { | 610 for (i = 0; i < num_10ms_frames; i++) { |
611 WebRtcSpl_Resample48khzTo8khz(speech_frame, | 611 WebRtcSpl_Resample48khzTo8khz(speech_frame, |
612 &speech_nb[i * kFrameLen10ms8khz], | 612 &speech_nb[i * kFrameLen10ms8khz], |
613 &inst->state_48_to_8, | 613 &inst->state_48_to_8, |
614 tmp_mem); | 614 tmp_mem); |
615 } | 615 } |
616 | 616 |
617 // Do VAD on an 8 kHz signal | 617 // Do VAD on an 8 kHz signal |
618 vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6); | 618 vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6); |
619 | 619 |
620 return vad; | 620 return vad; |
621 } | 621 } |
622 | 622 |
623 int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame, | 623 int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame, |
624 int frame_length) | 624 size_t frame_length) |
625 { | 625 { |
626 int len, vad; | 626 size_t len; |
| 627 int vad; |
627 int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB
) | 628 int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB
) |
628 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) | 629 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) |
629 | 630 |
630 | 631 |
631 // Downsample signal 32->16->8 before doing VAD | 632 // Downsample signal 32->16->8 before doing VAD |
632 WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_s
tates[2]), | 633 WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_s
tates[2]), |
633 frame_length); | 634 frame_length); |
634 len = frame_length / 2; | 635 len = frame_length / 2; |
635 | 636 |
636 WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states,
len); | 637 WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states,
len); |
637 len /= 2; | 638 len /= 2; |
638 | 639 |
639 // Do VAD on an 8 kHz signal | 640 // Do VAD on an 8 kHz signal |
640 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); | 641 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); |
641 | 642 |
642 return vad; | 643 return vad; |
643 } | 644 } |
644 | 645 |
645 int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame, | 646 int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame, |
646 int frame_length) | 647 size_t frame_length) |
647 { | 648 { |
648 int len, vad; | 649 size_t len; |
| 650 int vad; |
649 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) | 651 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) |
650 | 652 |
651 // Wideband: Downsample signal before doing VAD | 653 // Wideband: Downsample signal before doing VAD |
652 WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_sta
tes, | 654 WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_sta
tes, |
653 frame_length); | 655 frame_length); |
654 | 656 |
655 len = frame_length / 2; | 657 len = frame_length / 2; |
656 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); | 658 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); |
657 | 659 |
658 return vad; | 660 return vad; |
659 } | 661 } |
660 | 662 |
661 int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame, | 663 int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame, |
662 int frame_length) | 664 size_t frame_length) |
663 { | 665 { |
664 int16_t feature_vector[kNumChannels], total_power; | 666 int16_t feature_vector[kNumChannels], total_power; |
665 | 667 |
666 // Get power in the bands | 668 // Get power in the bands |
667 total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length, | 669 total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length, |
668 feature_vector); | 670 feature_vector); |
669 | 671 |
670 // Make a VAD | 672 // Make a VAD |
671 inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length); | 673 inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length); |
672 | 674 |
673 return inst->vad; | 675 return inst->vad; |
674 } | 676 } |
OLD | NEW |