| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 43 matching lines...) |
| 54 | 54 |
| 55 // Add some dither to quiet frames. This prevents ExtractFeatures from skipping | 55 // Add some dither to quiet frames. This prevents ExtractFeatures from skipping |
| 56 // a silence frame. Otherwise the true VAD would drift with respect to the audio. | 56 // a silence frame. Otherwise the true VAD would drift with respect to the audio. |
| 57 // We only consider mono inputs. | 57 // We only consider mono inputs. |
| 58 static void DitherSilence(AudioFrame* frame) { | 58 static void DitherSilence(AudioFrame* frame) { |
| 59 ASSERT_EQ(1, frame->num_channels_); | 59 ASSERT_EQ(1, frame->num_channels_); |
| 60 const double kRmsSilence = 5; | 60 const double kRmsSilence = 5; |
| 61 const double sum_squared_silence = kRmsSilence * kRmsSilence * | 61 const double sum_squared_silence = kRmsSilence * kRmsSilence * |
| 62 frame->samples_per_channel_; | 62 frame->samples_per_channel_; |
| 63 double sum_squared = 0; | 63 double sum_squared = 0; |
| 64 for (int n = 0; n < frame->samples_per_channel_; n++) | 64 for (size_t n = 0; n < frame->samples_per_channel_; n++) |
| 65 sum_squared += frame->data_[n] * frame->data_[n]; | 65 sum_squared += frame->data_[n] * frame->data_[n]; |
| 66 if (sum_squared <= sum_squared_silence) { | 66 if (sum_squared <= sum_squared_silence) { |
| 67 for (int n = 0; n < frame->samples_per_channel_; n++) | 67 for (size_t n = 0; n < frame->samples_per_channel_; n++) |
| 68 frame->data_[n] = (rand() & 0xF) - 8; | 68 frame->data_[n] = (rand() & 0xF) - 8; |
| 69 } | 69 } |
| 70 } | 70 } |
| 71 | 71 |
| 72 class AgcStat { | 72 class AgcStat { |
| 73 public: | 73 public: |
| 74 AgcStat() | 74 AgcStat() |
| 75 : video_index_(0), | 75 : video_index_(0), |
| 76 activity_threshold_(kDefaultActivityThreshold), | 76 activity_threshold_(kDefaultActivityThreshold), |
| 77 audio_content_(Histogram::Create(kAgcAnalWindowSamples)), | 77 audio_content_(Histogram::Create(kAgcAnalWindowSamples)), |
| 78 audio_processing_(new VadAudioProc()), | 78 audio_processing_(new VadAudioProc()), |
| 79 vad_(new PitchBasedVad()), | 79 vad_(new PitchBasedVad()), |
| 80 standalone_vad_(StandaloneVad::Create()), | 80 standalone_vad_(StandaloneVad::Create()), |
| 81 audio_content_fid_(NULL) { | 81 audio_content_fid_(NULL) { |
| 82 for (int n = 0; n < kMaxNumFrames; n++) | 82 for (size_t n = 0; n < kMaxNumFrames; n++) |
| 83 video_vad_[n] = 0.5; | 83 video_vad_[n] = 0.5; |
| 84 } | 84 } |
| 85 | 85 |
| 86 ~AgcStat() { | 86 ~AgcStat() { |
| 87 if (audio_content_fid_ != NULL) { | 87 if (audio_content_fid_ != NULL) { |
| 88 fclose(audio_content_fid_); | 88 fclose(audio_content_fid_); |
| 89 } | 89 } |
| 90 } | 90 } |
| 91 | 91 |
| 92 void set_audio_content_file(FILE* audio_content_fid) { | 92 void set_audio_content_file(FILE* audio_content_fid) { |
| (...skipping 16 matching lines...) |
| 109 frame.samples_per_channel_); | 109 frame.samples_per_channel_); |
| 110 } | 110 } |
| 111 if (features.num_frames > 0) { | 111 if (features.num_frames > 0) { |
| 112 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; | 112 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; |
| 113 if (FLAGS_standalone_vad) { | 113 if (FLAGS_standalone_vad) { |
| 114 standalone_vad_->GetActivity(p, kMaxNumFrames); | 114 standalone_vad_->GetActivity(p, kMaxNumFrames); |
| 115 } | 115 } |
| 116 // TODO(turajs): Combining and limiting are used in the source files as | 116 // TODO(turajs): Combining and limiting are used in the source files as |
| 117 // well; they can be moved to a utility. | 117 // well; they can be moved to a utility. |
| 118 // Combine video and stand-alone VAD. | 118 // Combine video and stand-alone VAD. |
| 119 for (int n = 0; n < features.num_frames; n++) { | 119 for (size_t n = 0; n < features.num_frames; n++) { |
| 120 double p_active = p[n] * video_vad_[n]; | 120 double p_active = p[n] * video_vad_[n]; |
| 121 double p_passive = (1 - p[n]) * (1 - video_vad_[n]); | 121 double p_passive = (1 - p[n]) * (1 - video_vad_[n]); |
| 122 p[n] = p_active / (p_active + p_passive); | 122 p[n] = p_active / (p_active + p_passive); |
| 123 // Limit probabilities. | 123 // Limit probabilities. |
| 124 p[n] = std::min(std::max(p[n], 0.01), 0.99); | 124 p[n] = std::min(std::max(p[n], 0.01), 0.99); |
| 125 } | 125 } |
| 126 if (vad_->VoicingProbability(features, p) < 0) | 126 if (vad_->VoicingProbability(features, p) < 0) |
| 127 return -1; | 127 return -1; |
| 128 for (int n = 0; n < features.num_frames; n++) { | 128 for (size_t n = 0; n < features.num_frames; n++) { |
| 129 audio_content_->Update(features.rms[n], p[n]); | 129 audio_content_->Update(features.rms[n], p[n]); |
| 130 double ac = audio_content_->AudioContent(); | 130 double ac = audio_content_->AudioContent(); |
| 131 if (audio_content_fid_ != NULL) { | 131 if (audio_content_fid_ != NULL) { |
| 132 fwrite(&ac, sizeof(ac), 1, audio_content_fid_); | 132 fwrite(&ac, sizeof(ac), 1, audio_content_fid_); |
| 133 } | 133 } |
| 134 if (ac > kAgcAnalWindowSamples * activity_threshold_) { | 134 if (ac > kAgcAnalWindowSamples * activity_threshold_) { |
| 135 combined_vad[n] = 1; | 135 combined_vad[n] = 1; |
| 136 } else { | 136 } else { |
| 137 combined_vad[n] = 0; | 137 combined_vad[n] = 0; |
| 138 } | 138 } |
| 139 } | 139 } |
| 140 video_index_ = 0; | 140 video_index_ = 0; |
| 141 } | 141 } |
| 142 return features.num_frames; | 142 return static_cast<int>(features.num_frames); |
| 143 } | 143 } |
| 144 | 144 |
| 145 void Reset() { | 145 void Reset() { |
| 146 audio_content_->Reset(); | 146 audio_content_->Reset(); |
| 147 } | 147 } |
| 148 | 148 |
| 149 void SetActivityThreshold(double activity_threshold) { | 149 void SetActivityThreshold(double activity_threshold) { |
| 150 activity_threshold_ = activity_threshold; | 150 activity_threshold_ = activity_threshold; |
| 151 } | 151 } |
| 152 | 152 |
| (...skipping 86 matching lines...) |
| 239 double p_video = 0.5; | 239 double p_video = 0.5; |
| 240 int total_active = 0; | 240 int total_active = 0; |
| 241 int total_passive = 0; | 241 int total_passive = 0; |
| 242 int total_false_positive = 0; | 242 int total_false_positive = 0; |
| 243 int total_missed_detection = 0; | 243 int total_missed_detection = 0; |
| 244 int onset_adaptation = 0; | 244 int onset_adaptation = 0; |
| 245 int num_onsets = 0; | 245 int num_onsets = 0; |
| 246 bool onset = false; | 246 bool onset = false; |
| 247 uint8_t previous_true_vad = 0; | 247 uint8_t previous_true_vad = 0; |
| 248 int num_not_adapted = 0; | 248 int num_not_adapted = 0; |
| 249 int true_vad_index = 0; | 249 size_t true_vad_index = 0; |
| 250 bool in_false_positive_region = false; | 250 bool in_false_positive_region = false; |
| 251 int total_false_positive_duration = 0; | 251 int total_false_positive_duration = 0; |
| 252 bool video_adapted = false; | 252 bool video_adapted = false; |
| 253 while (kSamplesToRead == fread(frame.data_, sizeof(int16_t), | 253 while (kSamplesToRead == fread(frame.data_, sizeof(int16_t), |
| 254 kSamplesToRead, pcm_fid)) { | 254 kSamplesToRead, pcm_fid)) { |
| 255 assert(true_vad_index < kMaxNumFrames); | 255 assert(true_vad_index < kMaxNumFrames); |
| 256 ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, | 256 ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, |
| 257 true_vad_fid)) | 257 true_vad_fid)) |
| 258 << "Size mismatch between True-VAD and the PCM file.\n"; | 258 << "Size mismatch between True-VAD and the PCM file.\n"; |
| 259 if (video_vad_fid != NULL) { | 259 if (video_vad_fid != NULL) { |
| (...skipping 25 matching lines...) |
| 285 agc_stat.Reset(); | 285 agc_stat.Reset(); |
| 286 } | 286 } |
| 287 true_vad_index++; | 287 true_vad_index++; |
| 288 | 288 |
| 289 DitherSilence(&frame); | 289 DitherSilence(&frame); |
| 290 | 290 |
| 291 ret_val = agc_stat.AddAudio(frame, p_video, agc_vad); | 291 ret_val = agc_stat.AddAudio(frame, p_video, agc_vad); |
| 292 ASSERT_GE(ret_val, 0); | 292 ASSERT_GE(ret_val, 0); |
| 293 | 293 |
| 294 if (ret_val > 0) { | 294 if (ret_val > 0) { |
| 295 ASSERT_EQ(true_vad_index, ret_val); | 295 ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val)); |
| 296 for (int n = 0; n < ret_val; n++) { | 296 for (int n = 0; n < ret_val; n++) { |
| 297 if (true_vad[n] == 1) { | 297 if (true_vad[n] == 1) { |
| 298 total_active++; | 298 total_active++; |
| 299 if (previous_true_vad == 0) { | 299 if (previous_true_vad == 0) { |
| 300 num_onsets++; | 300 num_onsets++; |
| 301 onset = true; | 301 onset = true; |
| 302 } | 302 } |
| 303 if (agc_vad[n] == 0) { | 303 if (agc_vad[n] == 0) { |
| 304 total_missed_detection++; | 304 total_missed_detection++; |
| 305 if (onset) | 305 if (onset) |
| (...skipping 69 matching lines...) |
| 375 " one probability per frame.\n" | 375 " one probability per frame.\n" |
| 376 "\nUsage:\n\n" | 376 "\nUsage:\n\n" |
| 377 "activity_metric input_pcm [options]\n" | 377 "activity_metric input_pcm [options]\n" |
| 378 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " | 378 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " |
| 379 "format.\n\n"; | 379 "format.\n\n"; |
| 380 google::SetUsageMessage(kUsage); | 380 google::SetUsageMessage(kUsage); |
| 381 google::ParseCommandLineFlags(&argc, &argv, true); | 381 google::ParseCommandLineFlags(&argc, &argv, true); |
| 382 webrtc::void_main(argc, argv); | 382 webrtc::void_main(argc, argv); |
| 383 return 0; | 383 return 0; |
| 384 } | 384 } |
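The combine-and-limit step that the TODO(turajs) note above refers to is a straightforward Bayesian fusion of the per-frame audio and video activity probabilities followed by clamping, so it is a natural candidate for a shared utility. Below is a minimal sketch of what such a helper might look like, assuming the same 0.01/0.99 limits as the loop in AddAudio; CombineAndLimit is an illustrative name, not an existing WebRTC function.

#include <algorithm>
#include <cstddef>

// Fuse two independent per-frame activity probabilities into one posterior
// and keep it away from 0 and 1 so downstream model updates stay well-behaved.
void CombineAndLimit(double* audio_p, const double* video_p, size_t num_frames) {
  for (size_t n = 0; n < num_frames; ++n) {
    const double p_active = audio_p[n] * video_p[n];
    const double p_passive = (1.0 - audio_p[n]) * (1.0 - video_p[n]);
    audio_p[n] = p_active / (p_active + p_passive);
    // Same clamp as in AddAudio: limit probabilities to [0.01, 0.99].
    audio_p[n] = std::min(std::max(audio_p[n], 0.01), 0.99);
  }
}

Calling this with the standalone-VAD probabilities and the buffered video probabilities would reproduce the combining loop in AddAudio above.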