| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 115 // type of signal is most probable. | 115 // type of signal is most probable. |
| 116 // | 116 // |
| 117 // - self [i/o] : Pointer to VAD instance | 117 // - self [i/o] : Pointer to VAD instance |
| 118 // - features [i] : Feature vector of length |kNumChannels| | 118 // - features [i] : Feature vector of length |kNumChannels| |
| 119 // = log10(energy in frequency band) | 119 // = log10(energy in frequency band) |
| 120 // - total_power [i] : Total power in audio frame. | 120 // - total_power [i] : Total power in audio frame. |
| 121 // - frame_length [i] : Number of input samples | 121 // - frame_length [i] : Number of input samples |
| 122 // | 122 // |
| 123 // - returns : the VAD decision (0 - noise, 1 - speech). | 123 // - returns : the VAD decision (0 - noise, 1 - speech). |
| 124 static int16_t GmmProbability(VadInstT* self, int16_t* features, | 124 static int16_t GmmProbability(VadInstT* self, int16_t* features, |
| 125 int16_t total_power, int frame_length) { | 125 int16_t total_power, size_t frame_length) { |
| 126 int channel, k; | 126 int channel, k; |
| 127 int16_t feature_minimum; | 127 int16_t feature_minimum; |
| 128 int16_t h0, h1; | 128 int16_t h0, h1; |
| 129 int16_t log_likelihood_ratio; | 129 int16_t log_likelihood_ratio; |
| 130 int16_t vadflag = 0; | 130 int16_t vadflag = 0; |
| 131 int16_t shifts_h0, shifts_h1; | 131 int16_t shifts_h0, shifts_h1; |
| 132 int16_t tmp_s16, tmp1_s16, tmp2_s16; | 132 int16_t tmp_s16, tmp1_s16, tmp2_s16; |
| 133 int16_t diff; | 133 int16_t diff; |
| 134 int gaussian; | 134 int gaussian; |
| 135 int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk; | 135 int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk; |
| (...skipping 453 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 589 break; | 589 break; |
| 590 } | 590 } |
| 591 | 591 |
| 592 return return_value; | 592 return return_value; |
| 593 } | 593 } |
| 594 | 594 |
| 595 // Calculate VAD decision by first extracting feature values and then calculate | 595 // Calculate VAD decision by first extracting feature values and then calculate |
| 596 // probability for both speech and background noise. | 596 // probability for both speech and background noise. |
| 597 | 597 |
| 598 int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame, | 598 int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame, |
| 599 int frame_length) { | 599 size_t frame_length) { |
| 600 int vad; | 600 int vad; |
| 601 int i; | 601 size_t i; |
| 602 int16_t speech_nb[240]; // 30 ms in 8 kHz. | 602 int16_t speech_nb[240]; // 30 ms in 8 kHz. |
| 603 // |tmp_mem| is a temporary memory used by resample function, length is | 603 // |tmp_mem| is a temporary memory used by resample function, length is |
| 604 // frame length in 10 ms (480 samples) + 256 extra. | 604 // frame length in 10 ms (480 samples) + 256 extra. |
| 605 int32_t tmp_mem[480 + 256] = { 0 }; | 605 int32_t tmp_mem[480 + 256] = { 0 }; |
| 606 const int kFrameLen10ms48khz = 480; | 606 const size_t kFrameLen10ms48khz = 480; |
| 607 const int kFrameLen10ms8khz = 80; | 607 const size_t kFrameLen10ms8khz = 80; |
| 608 int num_10ms_frames = frame_length / kFrameLen10ms48khz; | 608 size_t num_10ms_frames = frame_length / kFrameLen10ms48khz; |
| 609 | 609 |
| 610 for (i = 0; i < num_10ms_frames; i++) { | 610 for (i = 0; i < num_10ms_frames; i++) { |
| 611 WebRtcSpl_Resample48khzTo8khz(speech_frame, | 611 WebRtcSpl_Resample48khzTo8khz(speech_frame, |
| 612 &speech_nb[i * kFrameLen10ms8khz], | 612 &speech_nb[i * kFrameLen10ms8khz], |
| 613 &inst->state_48_to_8, | 613 &inst->state_48_to_8, |
| 614 tmp_mem); | 614 tmp_mem); |
| 615 } | 615 } |
| 616 | 616 |
| 617 // Do VAD on an 8 kHz signal | 617 // Do VAD on an 8 kHz signal |
| 618 vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6); | 618 vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6); |
| 619 | 619 |
| 620 return vad; | 620 return vad; |
| 621 } | 621 } |
| 622 | 622 |
| 623 int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame, | 623 int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame, |
| 624 int frame_length) | 624 size_t frame_length) |
| 625 { | 625 { |
| 626 int len, vad; | 626 size_t len; |
| 627 int vad; |
| 627 int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB
) | 628 int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB
) |
| 628 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) | 629 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) |
| 629 | 630 |
| 630 | 631 |
| 631 // Downsample signal 32->16->8 before doing VAD | 632 // Downsample signal 32->16->8 before doing VAD |
| 632 WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_s
tates[2]), | 633 WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_s
tates[2]), |
| 633 frame_length); | 634 frame_length); |
| 634 len = frame_length / 2; | 635 len = frame_length / 2; |
| 635 | 636 |
| 636 WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states,
len); | 637 WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states,
len); |
| 637 len /= 2; | 638 len /= 2; |
| 638 | 639 |
| 639 // Do VAD on an 8 kHz signal | 640 // Do VAD on an 8 kHz signal |
| 640 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); | 641 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); |
| 641 | 642 |
| 642 return vad; | 643 return vad; |
| 643 } | 644 } |
| 644 | 645 |
| 645 int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame, | 646 int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame, |
| 646 int frame_length) | 647 size_t frame_length) |
| 647 { | 648 { |
| 648 int len, vad; | 649 size_t len; |
| 650 int vad; |
| 649 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) | 651 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) |
| 650 | 652 |
| 651 // Wideband: Downsample signal before doing VAD | 653 // Wideband: Downsample signal before doing VAD |
| 652 WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_sta
tes, | 654 WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_sta
tes, |
| 653 frame_length); | 655 frame_length); |
| 654 | 656 |
| 655 len = frame_length / 2; | 657 len = frame_length / 2; |
| 656 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); | 658 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); |
| 657 | 659 |
| 658 return vad; | 660 return vad; |
| 659 } | 661 } |
| 660 | 662 |
| 661 int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame, | 663 int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame, |
| 662 int frame_length) | 664 size_t frame_length) |
| 663 { | 665 { |
| 664 int16_t feature_vector[kNumChannels], total_power; | 666 int16_t feature_vector[kNumChannels], total_power; |
| 665 | 667 |
| 666 // Get power in the bands | 668 // Get power in the bands |
| 667 total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length, | 669 total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length, |
| 668 feature_vector); | 670 feature_vector); |
| 669 | 671 |
| 670 // Make a VAD | 672 // Make a VAD |
| 671 inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length); | 673 inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length); |
| 672 | 674 |
| 673 return inst->vad; | 675 return inst->vad; |
| 674 } | 676 } |
| OLD | NEW |