OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
55 | 55 |
56 // Add some dither to quiet frames. This avoids ExtractFeatures skipping a | 56 // Add some dither to quiet frames. This avoids ExtractFeatures skipping a |
57 // silence frame. Otherwise true VAD would drift with respect to the audio. | 57 // silence frame. Otherwise true VAD would drift with respect to the audio. |
58 // We only consider mono inputs. | 58 // We only consider mono inputs. |
59 static void DitherSilence(AudioFrame* frame) { | 59 static void DitherSilence(AudioFrame* frame) { |
60 ASSERT_EQ(1u, frame->num_channels_); | 60 ASSERT_EQ(1u, frame->num_channels_); |
61 const double kRmsSilence = 5; | 61 const double kRmsSilence = 5; |
62 const double sum_squared_silence = kRmsSilence * kRmsSilence * | 62 const double sum_squared_silence = kRmsSilence * kRmsSilence * |
63 frame->samples_per_channel_; | 63 frame->samples_per_channel_; |
64 double sum_squared = 0; | 64 double sum_squared = 0; |
| 65 int16_t* frame_data = frame->mutable_data(); |
65 for (size_t n = 0; n < frame->samples_per_channel_; n++) | 66 for (size_t n = 0; n < frame->samples_per_channel_; n++) |
66 sum_squared += frame->data_[n] * frame->data_[n]; | 67 sum_squared += frame_data[n] * frame_data[n]; |
67 if (sum_squared <= sum_squared_silence) { | 68 if (sum_squared <= sum_squared_silence) { |
68 for (size_t n = 0; n < frame->samples_per_channel_; n++) | 69 for (size_t n = 0; n < frame->samples_per_channel_; n++) |
69 frame->data_[n] = (rand() & 0xF) - 8; // NOLINT: ignore non-threadsafe. | 70 frame_data[n] = (rand() & 0xF) - 8; // NOLINT: ignore non-threadsafe. |
70 } | 71 } |
71 } | 72 } |
72 | 73 |
73 class AgcStat { | 74 class AgcStat { |
74 public: | 75 public: |
75 AgcStat() | 76 AgcStat() |
76 : video_index_(0), | 77 : video_index_(0), |
77 activity_threshold_(kDefaultActivityThreshold), | 78 activity_threshold_(kDefaultActivityThreshold), |
78 audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)), | 79 audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)), |
79 audio_processing_(new VadAudioProc()), | 80 audio_processing_(new VadAudioProc()), |
(...skipping 16 matching lines...) Expand all Loading... |
96 | 97 |
97 int AddAudio(const AudioFrame& frame, double p_video, | 98 int AddAudio(const AudioFrame& frame, double p_video, |
98 int* combined_vad) { | 99 int* combined_vad) { |
99 if (frame.num_channels_ != 1 || | 100 if (frame.num_channels_ != 1 || |
100 frame.samples_per_channel_ != | 101 frame.samples_per_channel_ != |
101 kSampleRateHz / 100 || | 102 kSampleRateHz / 100 || |
102 frame.sample_rate_hz_ != kSampleRateHz) | 103 frame.sample_rate_hz_ != kSampleRateHz) |
103 return -1; | 104 return -1; |
104 video_vad_[video_index_++] = p_video; | 105 video_vad_[video_index_++] = p_video; |
105 AudioFeatures features; | 106 AudioFeatures features; |
| 107 const int16_t* frame_data = frame.data(); |
106 audio_processing_->ExtractFeatures( | 108 audio_processing_->ExtractFeatures( |
107 frame.data_, frame.samples_per_channel_, &features); | 109 frame_data, frame.samples_per_channel_, &features); |
108 if (FLAGS_standalone_vad) { | 110 if (FLAGS_standalone_vad) { |
109 standalone_vad_->AddAudio(frame.data_, | 111 standalone_vad_->AddAudio(frame_data, |
110 frame.samples_per_channel_); | 112 frame.samples_per_channel_); |
111 } | 113 } |
112 if (features.num_frames > 0) { | 114 if (features.num_frames > 0) { |
113 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; | 115 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; |
114 if (FLAGS_standalone_vad) { | 116 if (FLAGS_standalone_vad) { |
115 standalone_vad_->GetActivity(p, kMaxNumFrames); | 117 standalone_vad_->GetActivity(p, kMaxNumFrames); |
116 } | 118 } |
117 // TODO(turajs) combining and limiting are used in the source files as | 119 // TODO(turajs) combining and limiting are used in the source files as |
118 // well; they can be moved to a utility. | 120 // well; they can be moved to a utility. |
119 // Combine Video and stand-alone VAD. | 121 // Combine Video and stand-alone VAD. |
(...skipping 124 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
244 int total_missed_detection = 0; | 246 int total_missed_detection = 0; |
245 int onset_adaptation = 0; | 247 int onset_adaptation = 0; |
246 int num_onsets = 0; | 248 int num_onsets = 0; |
247 bool onset = false; | 249 bool onset = false; |
248 uint8_t previous_true_vad = 0; | 250 uint8_t previous_true_vad = 0; |
249 int num_not_adapted = 0; | 251 int num_not_adapted = 0; |
250 size_t true_vad_index = 0; | 252 size_t true_vad_index = 0; |
251 bool in_false_positive_region = false; | 253 bool in_false_positive_region = false; |
252 int total_false_positive_duration = 0; | 254 int total_false_positive_duration = 0; |
253 bool video_adapted = false; | 255 bool video_adapted = false; |
254 while (kSamplesToRead == fread(frame.data_, sizeof(int16_t), | 256 while (kSamplesToRead == fread(frame.mutable_data(), sizeof(int16_t), |
255 kSamplesToRead, pcm_fid)) { | 257 kSamplesToRead, pcm_fid)) { |
256 assert(true_vad_index < kMaxNumFrames); | 258 assert(true_vad_index < kMaxNumFrames); |
257 ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, | 259 ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, |
258 true_vad_fid)) | 260 true_vad_fid)) |
259 << "Size mismatch between True-VAD and the PCM file.\n"; | 261 << "Size mismatch between True-VAD and the PCM file.\n"; |
260 if (video_vad_fid != NULL) { | 262 if (video_vad_fid != NULL) { |
261 ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) << | 263 ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) << |
262 "Not enough video-based VAD probabilities."; | 264 "Not enough video-based VAD probabilities."; |
263 } | 265 } |
264 | 266 |
(...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
376 " one probability per frame.\n" | 378 " one probability per frame.\n" |
377 "\nUsage:\n\n" | 379 "\nUsage:\n\n" |
378 "activity_metric input_pcm [options]\n" | 380 "activity_metric input_pcm [options]\n" |
379 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " | 381 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " |
380 "format.\n\n"; | 382 "format.\n\n"; |
381 google::SetUsageMessage(kUsage); | 383 google::SetUsageMessage(kUsage); |
382 google::ParseCommandLineFlags(&argc, &argv, true); | 384 google::ParseCommandLineFlags(&argc, &argv, true); |
383 webrtc::void_main(argc, argv); | 385 webrtc::void_main(argc, argv); |
384 return 0; | 386 return 0; |
385 } | 387 } |
OLD | NEW |