OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 | |
12 #include <math.h> | |
13 #include <stdio.h> | |
14 #include <stdlib.h> | |
15 | |
16 #include <algorithm> | |
17 #include <memory> | |
18 | |
19 #include "webrtc/base/flags.h" | |
20 #include "webrtc/base/safe_minmax.h" | |
21 #include "webrtc/modules/audio_processing/agc/agc.h" | |
22 #include "webrtc/modules/audio_processing/agc/loudness_histogram.h" | |
23 #include "webrtc/modules/audio_processing/agc/utility.h" | |
24 #include "webrtc/modules/audio_processing/vad/common.h" | |
25 #include "webrtc/modules/audio_processing/vad/pitch_based_vad.h" | |
26 #include "webrtc/modules/audio_processing/vad/standalone_vad.h" | |
27 #include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" | |
28 #include "webrtc/modules/include/module_common_types.h" | |
29 #include "webrtc/test/gtest.h" | |
30 | |
31 static const int kAgcAnalWindowSamples = 100; | |
32 static const float kDefaultActivityThreshold = 0.3f; | |
33 | |
34 DEFINE_bool(standalone_vad, true, "enable stand-alone VAD"); | |
35 DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'" | |
36 " format"); | |
37 DEFINE_string(video_vad, "", "name of a file containing video VAD (activity" | |
38 " probabilities) in double format. One activity per 10ms is" | |
39 " required. If no file is given the video information is not" | |
40 " incorporated. Negative activity is interpreted as video is" | |
41 " not adapted and the statistics are not computed during" | |
42 " the learning phase. Note that the negative video activities" | |
43 " are ONLY allowed at the beginning."); | |
44 DEFINE_string(result, "", "name of a file to write the results. The results" | |
45 " will be appended to the end of the file. This is optional."); | |
46 DEFINE_string(audio_content, "", "name of a file where audio content is written" | |
47 " to, in double format."); | |
48 DEFINE_float(activity_threshold, kDefaultActivityThreshold, | |
49 "Activity threshold"); | |
50 DEFINE_bool(help, false, "prints this message"); | |
51 | |
52 namespace webrtc { | |
53 | |
54 // TODO(turajs) A new CL will be committed soon where ExtractFeatures will | |
55 // notify the caller of "silence" input, instead of bailing out. We would not | |
56 // need the following function when such a change is made. | |
57 | |
58 // Add some dither to quiet frames. This avoids the ExtractFeatures skip a | |
59 // silence frame. Otherwise true VAD would drift with respect to the audio. | |
60 // We only consider mono inputs. | |
61 static void DitherSilence(AudioFrame* frame) { | |
62 ASSERT_EQ(1u, frame->num_channels_); | |
63 const double kRmsSilence = 5; | |
64 const double sum_squared_silence = kRmsSilence * kRmsSilence * | |
65 frame->samples_per_channel_; | |
66 double sum_squared = 0; | |
67 int16_t* frame_data = frame->mutable_data(); | |
68 for (size_t n = 0; n < frame->samples_per_channel_; n++) | |
69 sum_squared += frame_data[n] * frame_data[n]; | |
70 if (sum_squared <= sum_squared_silence) { | |
71 for (size_t n = 0; n < frame->samples_per_channel_; n++) | |
72 frame_data[n] = (rand() & 0xF) - 8; // NOLINT: ignore non-threadsafe. | |
73 } | |
74 } | |
75 | |
76 class AgcStat { | |
77 public: | |
78 AgcStat() | |
79 : video_index_(0), | |
80 activity_threshold_(kDefaultActivityThreshold), | |
81 audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)), | |
82 audio_processing_(new VadAudioProc()), | |
83 vad_(new PitchBasedVad()), | |
84 standalone_vad_(StandaloneVad::Create()), | |
85 audio_content_fid_(NULL) { | |
86 for (size_t n = 0; n < kMaxNumFrames; n++) | |
87 video_vad_[n] = 0.5; | |
88 } | |
89 | |
90 ~AgcStat() { | |
91 if (audio_content_fid_ != NULL) { | |
92 fclose(audio_content_fid_); | |
93 } | |
94 } | |
95 | |
96 void set_audio_content_file(FILE* audio_content_fid) { | |
97 audio_content_fid_ = audio_content_fid; | |
98 } | |
99 | |
100 int AddAudio(const AudioFrame& frame, double p_video, | |
101 int* combined_vad) { | |
102 if (frame.num_channels_ != 1 || | |
103 frame.samples_per_channel_ != | |
104 kSampleRateHz / 100 || | |
105 frame.sample_rate_hz_ != kSampleRateHz) | |
106 return -1; | |
107 video_vad_[video_index_++] = p_video; | |
108 AudioFeatures features; | |
109 const int16_t* frame_data = frame.data(); | |
110 audio_processing_->ExtractFeatures( | |
111 frame_data, frame.samples_per_channel_, &features); | |
112 if (FLAG_standalone_vad) { | |
113 standalone_vad_->AddAudio(frame_data, | |
114 frame.samples_per_channel_); | |
115 } | |
116 if (features.num_frames > 0) { | |
117 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; | |
118 if (FLAG_standalone_vad) { | |
119 standalone_vad_->GetActivity(p, kMaxNumFrames); | |
120 } | |
121 // TODO(turajs) combining and limiting are used in the source files as | |
122 // well they can be moved to utility. | |
123 // Combine Video and stand-alone VAD. | |
124 for (size_t n = 0; n < features.num_frames; n++) { | |
125 double p_active = p[n] * video_vad_[n]; | |
126 double p_passive = (1 - p[n]) * (1 - video_vad_[n]); | |
127 p[n] = rtc::SafeClamp(p_active / (p_active + p_passive), 0.01, 0.99); | |
128 } | |
129 if (vad_->VoicingProbability(features, p) < 0) | |
130 return -1; | |
131 for (size_t n = 0; n < features.num_frames; n++) { | |
132 audio_content_->Update(features.rms[n], p[n]); | |
133 double ac = audio_content_->AudioContent(); | |
134 if (audio_content_fid_ != NULL) { | |
135 fwrite(&ac, sizeof(ac), 1, audio_content_fid_); | |
136 } | |
137 if (ac > kAgcAnalWindowSamples * activity_threshold_) { | |
138 combined_vad[n] = 1; | |
139 } else { | |
140 combined_vad[n] = 0; | |
141 } | |
142 } | |
143 video_index_ = 0; | |
144 } | |
145 return static_cast<int>(features.num_frames); | |
146 } | |
147 | |
148 void Reset() { | |
149 audio_content_->Reset(); | |
150 } | |
151 | |
152 void SetActivityThreshold(double activity_threshold) { | |
153 activity_threshold_ = activity_threshold; | |
154 } | |
155 | |
156 private: | |
157 int video_index_; | |
158 double activity_threshold_; | |
159 double video_vad_[kMaxNumFrames]; | |
160 std::unique_ptr<LoudnessHistogram> audio_content_; | |
161 std::unique_ptr<VadAudioProc> audio_processing_; | |
162 std::unique_ptr<PitchBasedVad> vad_; | |
163 std::unique_ptr<StandaloneVad> standalone_vad_; | |
164 | |
165 FILE* audio_content_fid_; | |
166 }; | |
167 | |
168 | |
169 void void_main(int argc, char* argv[]) { | |
170 webrtc::AgcStat agc_stat; | |
171 | |
172 FILE* pcm_fid = fopen(argv[1], "rb"); | |
173 ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1]; | |
174 | |
175 if (argc < 2) { | |
176 fprintf(stderr, "\nNot Enough arguments\n"); | |
177 } | |
178 | |
179 FILE* true_vad_fid = NULL; | |
180 ASSERT_GT(strlen(FLAG_true_vad), 0u) << "Specify the file containing true " | |
181 "VADs using --true_vad flag."; | |
182 true_vad_fid = fopen(FLAG_true_vad, "rb"); | |
183 ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " << | |
184 FLAG_true_vad; | |
185 | |
186 FILE* results_fid = NULL; | |
187 if (strlen(FLAG_result) > 0) { | |
188 // True if this is the first time writing to this function and we add a | |
189 // header to the beginning of the file. | |
190 bool write_header; | |
191 // Open in the read mode. If it fails, the file doesn't exist and has to | |
192 // write a header for it. Otherwise no need to write a header. | |
193 results_fid = fopen(FLAG_result, "r"); | |
194 if (results_fid == NULL) { | |
195 write_header = true; | |
196 } else { | |
197 fclose(results_fid); | |
198 write_header = false; | |
199 } | |
200 // Open in append mode. | |
201 results_fid = fopen(FLAG_result, "a"); | |
202 ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " << | |
203 FLAG_result << ", to write the results."; | |
204 // Write the header if required. | |
205 if (write_header) { | |
206 fprintf(results_fid, "%% Total Active, Misdetection, " | |
207 "Total inactive, False Positive, On-sets, Missed segments, " | |
208 "Average response\n"); | |
209 } | |
210 } | |
211 | |
212 FILE* video_vad_fid = NULL; | |
213 if (strlen(FLAG_video_vad) > 0) { | |
214 video_vad_fid = fopen(FLAG_video_vad, "rb"); | |
215 ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " << | |
216 FLAG_video_vad << " to read video-based VAD decisions.\n"; | |
217 } | |
218 | |
219 // AgsStat will be the owner of this file and will close it at its | |
220 // destructor. | |
221 FILE* audio_content_fid = NULL; | |
222 if (strlen(FLAG_audio_content) > 0) { | |
223 audio_content_fid = fopen(FLAG_audio_content, "wb"); | |
224 ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " << | |
225 FLAG_audio_content << " to write audio-content.\n"; | |
226 agc_stat.set_audio_content_file(audio_content_fid); | |
227 } | |
228 | |
229 webrtc::AudioFrame frame; | |
230 frame.num_channels_ = 1; | |
231 frame.sample_rate_hz_ = 16000; | |
232 frame.samples_per_channel_ = frame.sample_rate_hz_ / 100; | |
233 const size_t kSamplesToRead = frame.num_channels_ * | |
234 frame.samples_per_channel_; | |
235 | |
236 agc_stat.SetActivityThreshold(FLAG_activity_threshold); | |
237 | |
238 int ret_val = 0; | |
239 int num_frames = 0; | |
240 int agc_vad[kMaxNumFrames]; | |
241 uint8_t true_vad[kMaxNumFrames]; | |
242 double p_video = 0.5; | |
243 int total_active = 0; | |
244 int total_passive = 0; | |
245 int total_false_positive = 0; | |
246 int total_missed_detection = 0; | |
247 int onset_adaptation = 0; | |
248 int num_onsets = 0; | |
249 bool onset = false; | |
250 uint8_t previous_true_vad = 0; | |
251 int num_not_adapted = 0; | |
252 size_t true_vad_index = 0; | |
253 bool in_false_positive_region = false; | |
254 int total_false_positive_duration = 0; | |
255 bool video_adapted = false; | |
256 while (kSamplesToRead == fread(frame.mutable_data(), sizeof(int16_t), | |
257 kSamplesToRead, pcm_fid)) { | |
258 assert(true_vad_index < kMaxNumFrames); | |
259 ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, | |
260 true_vad_fid)) | |
261 << "Size mismatch between True-VAD and the PCM file.\n"; | |
262 if (video_vad_fid != NULL) { | |
263 ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) << | |
264 "Not enough video-based VAD probabilities."; | |
265 } | |
266 | |
267 // Negative video activity indicates that the video-based VAD is not yet | |
268 // adapted. Disregards the learning phase in statistics. | |
269 if (p_video < 0) { | |
270 if (video_adapted) { | |
271 fprintf(stderr, "Negative video probabilities ONLY allowed at the " | |
272 "beginning of the sequence, not in the middle.\n"); | |
273 exit(1); | |
274 } | |
275 continue; | |
276 } else { | |
277 video_adapted = true; | |
278 } | |
279 | |
280 num_frames++; | |
281 uint8_t last_true_vad; | |
282 if (true_vad_index == 0) { | |
283 last_true_vad = previous_true_vad; | |
284 } else { | |
285 last_true_vad = true_vad[true_vad_index - 1]; | |
286 } | |
287 if (last_true_vad == 1 && true_vad[true_vad_index] == 0) { | |
288 agc_stat.Reset(); | |
289 } | |
290 true_vad_index++; | |
291 | |
292 DitherSilence(&frame); | |
293 | |
294 ret_val = agc_stat.AddAudio(frame, p_video, agc_vad); | |
295 ASSERT_GE(ret_val, 0); | |
296 | |
297 if (ret_val > 0) { | |
298 ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val)); | |
299 for (int n = 0; n < ret_val; n++) { | |
300 if (true_vad[n] == 1) { | |
301 total_active++; | |
302 if (previous_true_vad == 0) { | |
303 num_onsets++; | |
304 onset = true; | |
305 } | |
306 if (agc_vad[n] == 0) { | |
307 total_missed_detection++; | |
308 if (onset) | |
309 onset_adaptation++; | |
310 } else { | |
311 in_false_positive_region = false; | |
312 onset = false; | |
313 } | |
314 } else if (true_vad[n] == 0) { | |
315 // Check if |on_set| flag is still up. If so it means that we totally | |
316 // missed an active region | |
317 if (onset) | |
318 num_not_adapted++; | |
319 onset = false; | |
320 | |
321 total_passive++; | |
322 if (agc_vad[n] == 1) { | |
323 total_false_positive++; | |
324 in_false_positive_region = true; | |
325 } | |
326 if (in_false_positive_region) { | |
327 total_false_positive_duration++; | |
328 } | |
329 } else { | |
330 ASSERT_TRUE(false) << "Invalid value for true-VAD.\n"; | |
331 } | |
332 previous_true_vad = true_vad[n]; | |
333 } | |
334 true_vad_index = 0; | |
335 } | |
336 } | |
337 | |
338 if (results_fid != NULL) { | |
339 fprintf(results_fid, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", | |
340 total_active, | |
341 total_missed_detection, | |
342 total_passive, | |
343 total_false_positive, | |
344 num_onsets, | |
345 num_not_adapted, | |
346 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), | |
347 static_cast<float>(total_false_positive_duration) / | |
348 (total_passive + 1e-12)); | |
349 } | |
350 fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", | |
351 total_active, | |
352 total_missed_detection, | |
353 total_passive, | |
354 total_false_positive, | |
355 num_onsets, | |
356 num_not_adapted, | |
357 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), | |
358 static_cast<float>(total_false_positive_duration) / | |
359 (total_passive + 1e-12)); | |
360 | |
361 fclose(true_vad_fid); | |
362 fclose(pcm_fid); | |
363 if (video_vad_fid != NULL) { | |
364 fclose(video_vad_fid); | |
365 } | |
366 if (results_fid != NULL) { | |
367 fclose(results_fid); | |
368 } | |
369 } | |
370 | |
371 } // namespace webrtc | |
372 | |
373 int main(int argc, char* argv[]) { | |
374 if (argc == 1) { | |
375 // Print usage information. | |
376 std::cout << | |
377 "\nCompute the number of misdetected and false-positive frames. Not\n" | |
378 " that for each frame of audio (10 ms) there should be one true\n" | |
379 " activity. If any video-based activity is given, there should also be\n" | |
380 " one probability per frame.\n" | |
381 "Run with --help for more details on available flags.\n" | |
382 "\nUsage:\n\n" | |
383 "activity_metric input_pcm [options]\n" | |
384 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " | |
385 "format.\n\n"; | |
386 return 0; | |
387 } | |
388 rtc::FlagList::SetFlagsFromCommandLine(&argc, argv, true); | |
389 if (FLAG_help) { | |
390 rtc::FlagList::Print(nullptr, false); | |
391 return 0; | |
392 } | |
393 webrtc::void_main(argc, argv); | |
394 return 0; | |
395 } | |
OLD | NEW |