OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 | 11 |
12 #include <math.h> | 12 #include <math.h> |
13 #include <stdio.h> | 13 #include <stdio.h> |
14 #include <stdlib.h> | 14 #include <stdlib.h> |
15 | 15 |
16 #include <algorithm> | 16 #include <algorithm> |
17 #include <memory> | 17 #include <memory> |
18 | 18 |
19 #include "gflags/gflags.h" | 19 #include "webrtc/base/flags.h" |
20 #include "webrtc/modules/audio_processing/agc/agc.h" | 20 #include "webrtc/modules/audio_processing/agc/agc.h" |
21 #include "webrtc/modules/audio_processing/agc/loudness_histogram.h" | 21 #include "webrtc/modules/audio_processing/agc/loudness_histogram.h" |
22 #include "webrtc/modules/audio_processing/agc/utility.h" | 22 #include "webrtc/modules/audio_processing/agc/utility.h" |
23 #include "webrtc/modules/audio_processing/vad/common.h" | 23 #include "webrtc/modules/audio_processing/vad/common.h" |
24 #include "webrtc/modules/audio_processing/vad/pitch_based_vad.h" | 24 #include "webrtc/modules/audio_processing/vad/pitch_based_vad.h" |
25 #include "webrtc/modules/audio_processing/vad/standalone_vad.h" | 25 #include "webrtc/modules/audio_processing/vad/standalone_vad.h" |
26 #include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" | 26 #include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" |
27 #include "webrtc/modules/include/module_common_types.h" | 27 #include "webrtc/modules/include/module_common_types.h" |
28 #include "webrtc/test/gtest.h" | 28 #include "webrtc/test/gtest.h" |
29 | 29 |
30 static const int kAgcAnalWindowSamples = 100; | 30 static const int kAgcAnalWindowSamples = 100; |
31 static const double kDefaultActivityThreshold = 0.3; | 31 static const float kDefaultActivityThreshold = 0.3f; |
32 | 32 |
33 DEFINE_bool(standalone_vad, true, "enable stand-alone VAD"); | 33 DEFINE_bool(standalone_vad, true, "enable stand-alone VAD"); |
34 DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'" | 34 DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'" |
35 " format"); | 35 " format"); |
36 DEFINE_string(video_vad, "", "name of a file containing video VAD (activity" | 36 DEFINE_string(video_vad, "", "name of a file containing video VAD (activity" |
37 " probabilities) in double format. One activity per 10ms is" | 37 " probabilities) in double format. One activity per 10ms is" |
38 " required. If no file is given the video information is not" | 38 " required. If no file is given the video information is not" |
39 " incorporated. Negative activity is interpreted as video is" | 39 " incorporated. Negative activity is interpreted as video is" |
40 " not adapted and the statistics are not computed during" | 40 " not adapted and the statistics are not computed during" |
41 " the learning phase. Note that the negative video activities" | 41 " the learning phase. Note that the negative video activities" |
42 " are ONLY allowed at the beginning."); | 42 " are ONLY allowed at the beginning."); |
43 DEFINE_string(result, "", "name of a file to write the results. The results" | 43 DEFINE_string(result, "", "name of a file to write the results. The results" |
44 " will be appended to the end of the file. This is optional."); | 44 " will be appended to the end of the file. This is optional."); |
45 DEFINE_string(audio_content, "", "name of a file where audio content is written" | 45 DEFINE_string(audio_content, "", "name of a file where audio content is written" |
46 " to, in double format."); | 46 " to, in double format."); |
47 DEFINE_double(activity_threshold, kDefaultActivityThreshold, | 47 DEFINE_float(activity_threshold, kDefaultActivityThreshold, |
48 "Activity threshold"); | 48 "Activity threshold"); |
49 | 49 |
50 namespace webrtc { | 50 namespace webrtc { |
51 | 51 |
52 // TODO(turajs) A new CL will be committed soon where ExtractFeatures will | 52 // TODO(turajs) A new CL will be committed soon where ExtractFeatures will |
53 // notify the caller of "silence" input, instead of bailing out. We would not | 53 // notify the caller of "silence" input, instead of bailing out. We would not |
54 // need the following function when such a change is made. | 54 // need the following function when such a change is made. |
55 | 55 |
56 // Add some dither to quiet frames. This prevents ExtractFeatures from | 56 // Add some dither to quiet frames. This prevents ExtractFeatures from |
57 // skipping a silent frame; otherwise true VAD would drift with respect to the audio. | 57 // skipping a silent frame; otherwise true VAD would drift with respect to the audio. |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
98 int* combined_vad) { | 98 int* combined_vad) { |
99 if (frame.num_channels_ != 1 || | 99 if (frame.num_channels_ != 1 || |
100 frame.samples_per_channel_ != | 100 frame.samples_per_channel_ != |
101 kSampleRateHz / 100 || | 101 kSampleRateHz / 100 || |
102 frame.sample_rate_hz_ != kSampleRateHz) | 102 frame.sample_rate_hz_ != kSampleRateHz) |
103 return -1; | 103 return -1; |
104 video_vad_[video_index_++] = p_video; | 104 video_vad_[video_index_++] = p_video; |
105 AudioFeatures features; | 105 AudioFeatures features; |
106 audio_processing_->ExtractFeatures( | 106 audio_processing_->ExtractFeatures( |
107 frame.data_, frame.samples_per_channel_, &features); | 107 frame.data_, frame.samples_per_channel_, &features); |
108 if (FLAGS_standalone_vad) { | 108 if (FLAG_standalone_vad) { |
109 standalone_vad_->AddAudio(frame.data_, | 109 standalone_vad_->AddAudio(frame.data_, |
110 frame.samples_per_channel_); | 110 frame.samples_per_channel_); |
111 } | 111 } |
112 if (features.num_frames > 0) { | 112 if (features.num_frames > 0) { |
113 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; | 113 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; |
114 if (FLAGS_standalone_vad) { | 114 if (FLAG_standalone_vad) { |
115 standalone_vad_->GetActivity(p, kMaxNumFrames); | 115 standalone_vad_->GetActivity(p, kMaxNumFrames); |
116 } | 116 } |
117 // TODO(turajs) combining and limiting are used in the source files as | 117 // TODO(turajs) combining and limiting are used in the source files as |
118 // well they can be moved to utility. | 118 // well they can be moved to utility. |
119 // Combine Video and stand-alone VAD. | 119 // Combine Video and stand-alone VAD. |
120 for (size_t n = 0; n < features.num_frames; n++) { | 120 for (size_t n = 0; n < features.num_frames; n++) { |
121 double p_active = p[n] * video_vad_[n]; | 121 double p_active = p[n] * video_vad_[n]; |
122 double p_passive = (1 - p[n]) * (1 - video_vad_[n]); | 122 double p_passive = (1 - p[n]) * (1 - video_vad_[n]); |
123 p[n] = p_active / (p_active + p_passive); | 123 p[n] = p_active / (p_active + p_passive); |
124 // Limit probabilities. | 124 // Limit probabilities. |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
168 webrtc::AgcStat agc_stat; | 168 webrtc::AgcStat agc_stat; |
169 | 169 |
170 FILE* pcm_fid = fopen(argv[1], "rb"); | 170 FILE* pcm_fid = fopen(argv[1], "rb"); |
171 ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1]; | 171 ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1]; |
172 | 172 |
173 if (argc < 2) { | 173 if (argc < 2) { |
174 fprintf(stderr, "\nNot Enough arguments\n"); | 174 fprintf(stderr, "\nNot Enough arguments\n"); |
175 } | 175 } |
176 | 176 |
177 FILE* true_vad_fid = NULL; | 177 FILE* true_vad_fid = NULL; |
178 ASSERT_GT(FLAGS_true_vad.size(), 0u) << "Specify the file containing true " | 178 ASSERT_GT(strlen(FLAG_true_vad), 0u) << "Specify the file containing true " |
179 "VADs using --true_vad flag."; | 179 "VADs using --true_vad flag."; |
180 true_vad_fid = fopen(FLAGS_true_vad.c_str(), "rb"); | 180 true_vad_fid = fopen(FLAG_true_vad, "rb"); |
181 ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " << | 181 ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " << |
182 FLAGS_true_vad; | 182 FLAG_true_vad; |
183 | 183 |
184 FILE* results_fid = NULL; | 184 FILE* results_fid = NULL; |
185 if (FLAGS_result.size() > 0) { | 185 if (strlen(FLAG_result) > 0) { |
186 // True if this is the first time writing to this file, in which case we | 186 // True if this is the first time writing to this file, in which case we |
187 // add a header to the beginning of the file. | 187 // add a header to the beginning of the file. |
188 bool write_header; | 188 bool write_header; |
189 // Open in the read mode. If it fails, the file doesn't exist and has to | 189 // Open in the read mode. If it fails, the file doesn't exist and has to |
190 // write a header for it. Otherwise no need to write a header. | 190 // write a header for it. Otherwise no need to write a header. |
191 results_fid = fopen(FLAGS_result.c_str(), "r"); | 191 results_fid = fopen(FLAG_result, "r"); |
192 if (results_fid == NULL) { | 192 if (results_fid == NULL) { |
193 write_header = true; | 193 write_header = true; |
194 } else { | 194 } else { |
195 fclose(results_fid); | 195 fclose(results_fid); |
196 write_header = false; | 196 write_header = false; |
197 } | 197 } |
198 // Open in append mode. | 198 // Open in append mode. |
199 results_fid = fopen(FLAGS_result.c_str(), "a"); | 199 results_fid = fopen(FLAG_result, "a"); |
200 ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " << | 200 ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " << |
201 FLAGS_result << ", to write the results."; | 201 FLAG_result << ", to write the results."; |
202 // Write the header if required. | 202 // Write the header if required. |
203 if (write_header) { | 203 if (write_header) { |
204 fprintf(results_fid, "%% Total Active, Misdetection, " | 204 fprintf(results_fid, "%% Total Active, Misdetection, " |
205 "Total inactive, False Positive, On-sets, Missed segments, " | 205 "Total inactive, False Positive, On-sets, Missed segments, " |
206 "Average response\n"); | 206 "Average response\n"); |
207 } | 207 } |
208 } | 208 } |
209 | 209 |
210 FILE* video_vad_fid = NULL; | 210 FILE* video_vad_fid = NULL; |
211 if (FLAGS_video_vad.size() > 0) { | 211 if (strlen(FLAG_video_vad) > 0) { |
212 video_vad_fid = fopen(FLAGS_video_vad.c_str(), "rb"); | 212 video_vad_fid = fopen(FLAG_video_vad, "rb"); |
213 ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " << | 213 ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " << |
214 FLAGS_video_vad << " to read video-based VAD decisions.\n"; | 214 FLAG_video_vad << " to read video-based VAD decisions.\n"; |
215 } | 215 } |
216 | 216 |
217 // AgcStat will be the owner of this file and will close it in its | 217 // AgcStat will be the owner of this file and will close it in its |
218 // destructor. | 218 // destructor. |
219 FILE* audio_content_fid = NULL; | 219 FILE* audio_content_fid = NULL; |
220 if (FLAGS_audio_content.size() > 0) { | 220 if (strlen(FLAG_audio_content) > 0) { |
221 audio_content_fid = fopen(FLAGS_audio_content.c_str(), "wb"); | 221 audio_content_fid = fopen(FLAG_audio_content, "wb"); |
222 ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " << | 222 ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " << |
223 FLAGS_audio_content << " to write audio-content.\n"; | 223 FLAG_audio_content << " to write audio-content.\n"; |
224 agc_stat.set_audio_content_file(audio_content_fid); | 224 agc_stat.set_audio_content_file(audio_content_fid); |
225 } | 225 } |
226 | 226 |
227 webrtc::AudioFrame frame; | 227 webrtc::AudioFrame frame; |
228 frame.num_channels_ = 1; | 228 frame.num_channels_ = 1; |
229 frame.sample_rate_hz_ = 16000; | 229 frame.sample_rate_hz_ = 16000; |
230 frame.samples_per_channel_ = frame.sample_rate_hz_ / 100; | 230 frame.samples_per_channel_ = frame.sample_rate_hz_ / 100; |
231 const size_t kSamplesToRead = frame.num_channels_ * | 231 const size_t kSamplesToRead = frame.num_channels_ * |
232 frame.samples_per_channel_; | 232 frame.samples_per_channel_; |
233 | 233 |
234 agc_stat.SetActivityThreshold(FLAGS_activity_threshold); | 234 agc_stat.SetActivityThreshold(FLAG_activity_threshold); |
235 | 235 |
236 int ret_val = 0; | 236 int ret_val = 0; |
237 int num_frames = 0; | 237 int num_frames = 0; |
238 int agc_vad[kMaxNumFrames]; | 238 int agc_vad[kMaxNumFrames]; |
239 uint8_t true_vad[kMaxNumFrames]; | 239 uint8_t true_vad[kMaxNumFrames]; |
240 double p_video = 0.5; | 240 double p_video = 0.5; |
241 int total_active = 0; | 241 int total_active = 0; |
242 int total_passive = 0; | 242 int total_passive = 0; |
243 int total_false_positive = 0; | 243 int total_false_positive = 0; |
244 int total_missed_detection = 0; | 244 int total_missed_detection = 0; |
(...skipping 117 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
362 fclose(video_vad_fid); | 362 fclose(video_vad_fid); |
363 } | 363 } |
364 if (results_fid != NULL) { | 364 if (results_fid != NULL) { |
365 fclose(results_fid); | 365 fclose(results_fid); |
366 } | 366 } |
367 } | 367 } |
368 | 368 |
369 } // namespace webrtc | 369 } // namespace webrtc |
370 | 370 |
371 int main(int argc, char* argv[]) { | 371 int main(int argc, char* argv[]) { |
372 char kUsage[] = | 372 std::string usage = |
kwiberg-webrtc
2017/05/16 04:10:09
Wouldn't it make more sense to change this to a co
kjellander_webrtc
2017/05/16 05:44:17
Right, I changed that now.
| |
373 "\nCompute the number of misdetected and false-positive frames. Not\n" | 373 "\nCompute the number of misdetected and false-positive frames. Not\n" |
374 " that for each frame of audio (10 ms) there should be one true\n" | 374 " that for each frame of audio (10 ms) there should be one true\n" |
375 " activity. If any video-based activity is given, there should also be\n" | 375 " activity. If any video-based activity is given, there should also be\n" |
376 " one probability per frame.\n" | 376 " one probability per frame.\n" |
377 "\nUsage:\n\n" | 377 "\nUsage:\n\n" |
378 "activity_metric input_pcm [options]\n" | 378 "activity_metric input_pcm [options]\n" |
379 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " | 379 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " |
380 "format.\n\n"; | 380 "format.\n\n"; |
381 google::SetUsageMessage(kUsage); | 381 if (argc == 1) { |
382 google::ParseCommandLineFlags(&argc, &argv, true); | 382 // Print usage information. |
383 std::cout << usage; | |
384 return 0; | |
385 } | |
386 rtc::FlagList::SetFlagsFromCommandLine(&argc, argv, true); | |
383 webrtc::void_main(argc, argv); | 387 webrtc::void_main(argc, argv); |
384 return 0; | 388 return 0; |
385 } | 389 } |
OLD | NEW |