| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license | |
| 5 * that can be found in the LICENSE file in the root of the source | |
| 6 * tree. An additional intellectual property rights grant can be found | |
| 7 * in the file PATENTS. All contributing project authors may | |
| 8 * be found in the AUTHORS file in the root of the source tree. | |
| 9 */ | |
| 10 | |
| 11 | |
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <iostream>
#include <memory>

#include "webrtc/base/flags.h"
#include "webrtc/base/safe_minmax.h"
#include "webrtc/modules/audio_processing/agc/agc.h"
#include "webrtc/modules/audio_processing/agc/loudness_histogram.h"
#include "webrtc/modules/audio_processing/agc/utility.h"
#include "webrtc/modules/audio_processing/vad/common.h"
#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
#include "webrtc/modules/include/module_common_types.h"
#include "webrtc/test/gtest.h"
| 30 | |
// Length of the AGC analysis window handed to LoudnessHistogram::Create().
// NOTE(review): presumably in 10 ms frames (100 => 1 second) — confirm
// against LoudnessHistogram.
static const int kAgcAnalWindowSamples = 100;
// Default threshold on the audio-content measure (as a fraction of the
// analysis window) above which a frame is declared active.
static const float kDefaultActivityThreshold = 0.3f;

// Command-line flags, parsed in main() via rtc::FlagList.
DEFINE_bool(standalone_vad, true, "enable stand-alone VAD");
DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'"
              " format");
DEFINE_string(video_vad, "", "name of a file containing video VAD (activity"
              " probabilities) in double format. One activity per 10ms is"
              " required. If no file is given the video information is not"
              " incorporated. Negative activity is interpreted as video is"
              " not adapted and the statistics are not computed during"
              " the learning phase. Note that the negative video activities"
              " are ONLY allowed at the beginning.");
DEFINE_string(result, "", "name of a file to write the results. The results"
              " will be appended to the end of the file. This is optional.");
DEFINE_string(audio_content, "", "name of a file where audio content is written"
              " to, in double format.");
DEFINE_float(activity_threshold, kDefaultActivityThreshold,
             "Activity threshold");
DEFINE_bool(help, false, "prints this message");
| 51 | |
| 52 namespace webrtc { | |
| 53 | |
| 54 // TODO(turajs) A new CL will be committed soon where ExtractFeatures will | |
| 55 // notify the caller of "silence" input, instead of bailing out. We would not | |
| 56 // need the following function when such a change is made. | |
| 57 | |
| 58 // Add some dither to quiet frames. This avoids the ExtractFeatures skip a | |
| 59 // silence frame. Otherwise true VAD would drift with respect to the audio. | |
| 60 // We only consider mono inputs. | |
| 61 static void DitherSilence(AudioFrame* frame) { | |
| 62 ASSERT_EQ(1u, frame->num_channels_); | |
| 63 const double kRmsSilence = 5; | |
| 64 const double sum_squared_silence = kRmsSilence * kRmsSilence * | |
| 65 frame->samples_per_channel_; | |
| 66 double sum_squared = 0; | |
| 67 int16_t* frame_data = frame->mutable_data(); | |
| 68 for (size_t n = 0; n < frame->samples_per_channel_; n++) | |
| 69 sum_squared += frame_data[n] * frame_data[n]; | |
| 70 if (sum_squared <= sum_squared_silence) { | |
| 71 for (size_t n = 0; n < frame->samples_per_channel_; n++) | |
| 72 frame_data[n] = (rand() & 0xF) - 8; // NOLINT: ignore non-threadsafe. | |
| 73 } | |
| 74 } | |
| 75 | |
| 76 class AgcStat { | |
| 77 public: | |
| 78 AgcStat() | |
| 79 : video_index_(0), | |
| 80 activity_threshold_(kDefaultActivityThreshold), | |
| 81 audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)), | |
| 82 audio_processing_(new VadAudioProc()), | |
| 83 vad_(new PitchBasedVad()), | |
| 84 standalone_vad_(StandaloneVad::Create()), | |
| 85 audio_content_fid_(NULL) { | |
| 86 for (size_t n = 0; n < kMaxNumFrames; n++) | |
| 87 video_vad_[n] = 0.5; | |
| 88 } | |
| 89 | |
| 90 ~AgcStat() { | |
| 91 if (audio_content_fid_ != NULL) { | |
| 92 fclose(audio_content_fid_); | |
| 93 } | |
| 94 } | |
| 95 | |
| 96 void set_audio_content_file(FILE* audio_content_fid) { | |
| 97 audio_content_fid_ = audio_content_fid; | |
| 98 } | |
| 99 | |
| 100 int AddAudio(const AudioFrame& frame, double p_video, | |
| 101 int* combined_vad) { | |
| 102 if (frame.num_channels_ != 1 || | |
| 103 frame.samples_per_channel_ != | |
| 104 kSampleRateHz / 100 || | |
| 105 frame.sample_rate_hz_ != kSampleRateHz) | |
| 106 return -1; | |
| 107 video_vad_[video_index_++] = p_video; | |
| 108 AudioFeatures features; | |
| 109 const int16_t* frame_data = frame.data(); | |
| 110 audio_processing_->ExtractFeatures( | |
| 111 frame_data, frame.samples_per_channel_, &features); | |
| 112 if (FLAG_standalone_vad) { | |
| 113 standalone_vad_->AddAudio(frame_data, | |
| 114 frame.samples_per_channel_); | |
| 115 } | |
| 116 if (features.num_frames > 0) { | |
| 117 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; | |
| 118 if (FLAG_standalone_vad) { | |
| 119 standalone_vad_->GetActivity(p, kMaxNumFrames); | |
| 120 } | |
| 121 // TODO(turajs) combining and limiting are used in the source files as | |
| 122 // well they can be moved to utility. | |
| 123 // Combine Video and stand-alone VAD. | |
| 124 for (size_t n = 0; n < features.num_frames; n++) { | |
| 125 double p_active = p[n] * video_vad_[n]; | |
| 126 double p_passive = (1 - p[n]) * (1 - video_vad_[n]); | |
| 127 p[n] = rtc::SafeClamp(p_active / (p_active + p_passive), 0.01, 0.99); | |
| 128 } | |
| 129 if (vad_->VoicingProbability(features, p) < 0) | |
| 130 return -1; | |
| 131 for (size_t n = 0; n < features.num_frames; n++) { | |
| 132 audio_content_->Update(features.rms[n], p[n]); | |
| 133 double ac = audio_content_->AudioContent(); | |
| 134 if (audio_content_fid_ != NULL) { | |
| 135 fwrite(&ac, sizeof(ac), 1, audio_content_fid_); | |
| 136 } | |
| 137 if (ac > kAgcAnalWindowSamples * activity_threshold_) { | |
| 138 combined_vad[n] = 1; | |
| 139 } else { | |
| 140 combined_vad[n] = 0; | |
| 141 } | |
| 142 } | |
| 143 video_index_ = 0; | |
| 144 } | |
| 145 return static_cast<int>(features.num_frames); | |
| 146 } | |
| 147 | |
| 148 void Reset() { | |
| 149 audio_content_->Reset(); | |
| 150 } | |
| 151 | |
| 152 void SetActivityThreshold(double activity_threshold) { | |
| 153 activity_threshold_ = activity_threshold; | |
| 154 } | |
| 155 | |
| 156 private: | |
| 157 int video_index_; | |
| 158 double activity_threshold_; | |
| 159 double video_vad_[kMaxNumFrames]; | |
| 160 std::unique_ptr<LoudnessHistogram> audio_content_; | |
| 161 std::unique_ptr<VadAudioProc> audio_processing_; | |
| 162 std::unique_ptr<PitchBasedVad> vad_; | |
| 163 std::unique_ptr<StandaloneVad> standalone_vad_; | |
| 164 | |
| 165 FILE* audio_content_fid_; | |
| 166 }; | |
| 167 | |
| 168 | |
| 169 void void_main(int argc, char* argv[]) { | |
| 170 webrtc::AgcStat agc_stat; | |
| 171 | |
| 172 FILE* pcm_fid = fopen(argv[1], "rb"); | |
| 173 ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1]; | |
| 174 | |
| 175 if (argc < 2) { | |
| 176 fprintf(stderr, "\nNot Enough arguments\n"); | |
| 177 } | |
| 178 | |
| 179 FILE* true_vad_fid = NULL; | |
| 180 ASSERT_GT(strlen(FLAG_true_vad), 0u) << "Specify the file containing true " | |
| 181 "VADs using --true_vad flag."; | |
| 182 true_vad_fid = fopen(FLAG_true_vad, "rb"); | |
| 183 ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " << | |
| 184 FLAG_true_vad; | |
| 185 | |
| 186 FILE* results_fid = NULL; | |
| 187 if (strlen(FLAG_result) > 0) { | |
| 188 // True if this is the first time writing to this function and we add a | |
| 189 // header to the beginning of the file. | |
| 190 bool write_header; | |
| 191 // Open in the read mode. If it fails, the file doesn't exist and has to | |
| 192 // write a header for it. Otherwise no need to write a header. | |
| 193 results_fid = fopen(FLAG_result, "r"); | |
| 194 if (results_fid == NULL) { | |
| 195 write_header = true; | |
| 196 } else { | |
| 197 fclose(results_fid); | |
| 198 write_header = false; | |
| 199 } | |
| 200 // Open in append mode. | |
| 201 results_fid = fopen(FLAG_result, "a"); | |
| 202 ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " << | |
| 203 FLAG_result << ", to write the results."; | |
| 204 // Write the header if required. | |
| 205 if (write_header) { | |
| 206 fprintf(results_fid, "%% Total Active, Misdetection, " | |
| 207 "Total inactive, False Positive, On-sets, Missed segments, " | |
| 208 "Average response\n"); | |
| 209 } | |
| 210 } | |
| 211 | |
| 212 FILE* video_vad_fid = NULL; | |
| 213 if (strlen(FLAG_video_vad) > 0) { | |
| 214 video_vad_fid = fopen(FLAG_video_vad, "rb"); | |
| 215 ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " << | |
| 216 FLAG_video_vad << " to read video-based VAD decisions.\n"; | |
| 217 } | |
| 218 | |
| 219 // AgsStat will be the owner of this file and will close it at its | |
| 220 // destructor. | |
| 221 FILE* audio_content_fid = NULL; | |
| 222 if (strlen(FLAG_audio_content) > 0) { | |
| 223 audio_content_fid = fopen(FLAG_audio_content, "wb"); | |
| 224 ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " << | |
| 225 FLAG_audio_content << " to write audio-content.\n"; | |
| 226 agc_stat.set_audio_content_file(audio_content_fid); | |
| 227 } | |
| 228 | |
| 229 webrtc::AudioFrame frame; | |
| 230 frame.num_channels_ = 1; | |
| 231 frame.sample_rate_hz_ = 16000; | |
| 232 frame.samples_per_channel_ = frame.sample_rate_hz_ / 100; | |
| 233 const size_t kSamplesToRead = frame.num_channels_ * | |
| 234 frame.samples_per_channel_; | |
| 235 | |
| 236 agc_stat.SetActivityThreshold(FLAG_activity_threshold); | |
| 237 | |
| 238 int ret_val = 0; | |
| 239 int num_frames = 0; | |
| 240 int agc_vad[kMaxNumFrames]; | |
| 241 uint8_t true_vad[kMaxNumFrames]; | |
| 242 double p_video = 0.5; | |
| 243 int total_active = 0; | |
| 244 int total_passive = 0; | |
| 245 int total_false_positive = 0; | |
| 246 int total_missed_detection = 0; | |
| 247 int onset_adaptation = 0; | |
| 248 int num_onsets = 0; | |
| 249 bool onset = false; | |
| 250 uint8_t previous_true_vad = 0; | |
| 251 int num_not_adapted = 0; | |
| 252 size_t true_vad_index = 0; | |
| 253 bool in_false_positive_region = false; | |
| 254 int total_false_positive_duration = 0; | |
| 255 bool video_adapted = false; | |
| 256 while (kSamplesToRead == fread(frame.mutable_data(), sizeof(int16_t), | |
| 257 kSamplesToRead, pcm_fid)) { | |
| 258 assert(true_vad_index < kMaxNumFrames); | |
| 259 ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, | |
| 260 true_vad_fid)) | |
| 261 << "Size mismatch between True-VAD and the PCM file.\n"; | |
| 262 if (video_vad_fid != NULL) { | |
| 263 ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) << | |
| 264 "Not enough video-based VAD probabilities."; | |
| 265 } | |
| 266 | |
| 267 // Negative video activity indicates that the video-based VAD is not yet | |
| 268 // adapted. Disregards the learning phase in statistics. | |
| 269 if (p_video < 0) { | |
| 270 if (video_adapted) { | |
| 271 fprintf(stderr, "Negative video probabilities ONLY allowed at the " | |
| 272 "beginning of the sequence, not in the middle.\n"); | |
| 273 exit(1); | |
| 274 } | |
| 275 continue; | |
| 276 } else { | |
| 277 video_adapted = true; | |
| 278 } | |
| 279 | |
| 280 num_frames++; | |
| 281 uint8_t last_true_vad; | |
| 282 if (true_vad_index == 0) { | |
| 283 last_true_vad = previous_true_vad; | |
| 284 } else { | |
| 285 last_true_vad = true_vad[true_vad_index - 1]; | |
| 286 } | |
| 287 if (last_true_vad == 1 && true_vad[true_vad_index] == 0) { | |
| 288 agc_stat.Reset(); | |
| 289 } | |
| 290 true_vad_index++; | |
| 291 | |
| 292 DitherSilence(&frame); | |
| 293 | |
| 294 ret_val = agc_stat.AddAudio(frame, p_video, agc_vad); | |
| 295 ASSERT_GE(ret_val, 0); | |
| 296 | |
| 297 if (ret_val > 0) { | |
| 298 ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val)); | |
| 299 for (int n = 0; n < ret_val; n++) { | |
| 300 if (true_vad[n] == 1) { | |
| 301 total_active++; | |
| 302 if (previous_true_vad == 0) { | |
| 303 num_onsets++; | |
| 304 onset = true; | |
| 305 } | |
| 306 if (agc_vad[n] == 0) { | |
| 307 total_missed_detection++; | |
| 308 if (onset) | |
| 309 onset_adaptation++; | |
| 310 } else { | |
| 311 in_false_positive_region = false; | |
| 312 onset = false; | |
| 313 } | |
| 314 } else if (true_vad[n] == 0) { | |
| 315 // Check if |on_set| flag is still up. If so it means that we totally | |
| 316 // missed an active region | |
| 317 if (onset) | |
| 318 num_not_adapted++; | |
| 319 onset = false; | |
| 320 | |
| 321 total_passive++; | |
| 322 if (agc_vad[n] == 1) { | |
| 323 total_false_positive++; | |
| 324 in_false_positive_region = true; | |
| 325 } | |
| 326 if (in_false_positive_region) { | |
| 327 total_false_positive_duration++; | |
| 328 } | |
| 329 } else { | |
| 330 ASSERT_TRUE(false) << "Invalid value for true-VAD.\n"; | |
| 331 } | |
| 332 previous_true_vad = true_vad[n]; | |
| 333 } | |
| 334 true_vad_index = 0; | |
| 335 } | |
| 336 } | |
| 337 | |
| 338 if (results_fid != NULL) { | |
| 339 fprintf(results_fid, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", | |
| 340 total_active, | |
| 341 total_missed_detection, | |
| 342 total_passive, | |
| 343 total_false_positive, | |
| 344 num_onsets, | |
| 345 num_not_adapted, | |
| 346 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), | |
| 347 static_cast<float>(total_false_positive_duration) / | |
| 348 (total_passive + 1e-12)); | |
| 349 } | |
| 350 fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", | |
| 351 total_active, | |
| 352 total_missed_detection, | |
| 353 total_passive, | |
| 354 total_false_positive, | |
| 355 num_onsets, | |
| 356 num_not_adapted, | |
| 357 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), | |
| 358 static_cast<float>(total_false_positive_duration) / | |
| 359 (total_passive + 1e-12)); | |
| 360 | |
| 361 fclose(true_vad_fid); | |
| 362 fclose(pcm_fid); | |
| 363 if (video_vad_fid != NULL) { | |
| 364 fclose(video_vad_fid); | |
| 365 } | |
| 366 if (results_fid != NULL) { | |
| 367 fclose(results_fid); | |
| 368 } | |
| 369 } | |
| 370 | |
| 371 } // namespace webrtc | |
| 372 | |
| 373 int main(int argc, char* argv[]) { | |
| 374 if (argc == 1) { | |
| 375 // Print usage information. | |
| 376 std::cout << | |
| 377 "\nCompute the number of misdetected and false-positive frames. Not\n" | |
| 378 " that for each frame of audio (10 ms) there should be one true\n" | |
| 379 " activity. If any video-based activity is given, there should also be\n" | |
| 380 " one probability per frame.\n" | |
| 381 "Run with --help for more details on available flags.\n" | |
| 382 "\nUsage:\n\n" | |
| 383 "activity_metric input_pcm [options]\n" | |
| 384 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " | |
| 385 "format.\n\n"; | |
| 386 return 0; | |
| 387 } | |
| 388 rtc::FlagList::SetFlagsFromCommandLine(&argc, argv, true); | |
| 389 if (FLAG_help) { | |
| 390 rtc::FlagList::Print(nullptr, false); | |
| 391 return 0; | |
| 392 } | |
| 393 webrtc::void_main(argc, argv); | |
| 394 return 0; | |
| 395 } | |
| OLD | NEW |