Index: webrtc/tools/agc/activity_metric.cc |
diff --git a/webrtc/tools/agc/activity_metric.cc b/webrtc/tools/agc/activity_metric.cc |
deleted file mode 100644 |
index 8ea193913cb43a736dcb2adbc9f85b6c422ffa1d..0000000000000000000000000000000000000000 |
--- a/webrtc/tools/agc/activity_metric.cc |
+++ /dev/null |
@@ -1,395 +0,0 @@ |
-/* |
- * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
- * |
- * Use of this source code is governed by a BSD-style license |
- * that can be found in the LICENSE file in the root of the source |
- * tree. An additional intellectual property rights grant can be found |
- * in the file PATENTS. All contributing project authors may |
- * be found in the AUTHORS file in the root of the source tree. |
- */ |
- |
- |
-#include <math.h> |
-#include <stdio.h> |
-#include <stdlib.h> |
- |
-#include <algorithm> |
-#include <memory> |
- |
-#include "webrtc/base/flags.h" |
-#include "webrtc/base/safe_minmax.h" |
-#include "webrtc/modules/audio_processing/agc/agc.h" |
-#include "webrtc/modules/audio_processing/agc/loudness_histogram.h" |
-#include "webrtc/modules/audio_processing/agc/utility.h" |
-#include "webrtc/modules/audio_processing/vad/common.h" |
-#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h" |
-#include "webrtc/modules/audio_processing/vad/standalone_vad.h" |
-#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" |
-#include "webrtc/modules/include/module_common_types.h" |
-#include "webrtc/test/gtest.h" |
- |
-static const int kAgcAnalWindowSamples = 100; |
-static const float kDefaultActivityThreshold = 0.3f; |
- |
-DEFINE_bool(standalone_vad, true, "enable stand-alone VAD"); |
-DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'" |
- " format"); |
-DEFINE_string(video_vad, "", "name of a file containing video VAD (activity" |
- " probabilities) in double format. One activity per 10ms is" |
- " required. If no file is given the video information is not" |
- " incorporated. Negative activity is interpreted as video is" |
- " not adapted and the statistics are not computed during" |
- " the learning phase. Note that the negative video activities" |
- " are ONLY allowed at the beginning."); |
-DEFINE_string(result, "", "name of a file to write the results. The results" |
- " will be appended to the end of the file. This is optional."); |
-DEFINE_string(audio_content, "", "name of a file where audio content is written" |
- " to, in double format."); |
-DEFINE_float(activity_threshold, kDefaultActivityThreshold, |
- "Activity threshold"); |
-DEFINE_bool(help, false, "prints this message"); |
- |
-namespace webrtc { |
- |
-// TODO(turajs) A new CL will be committed soon where ExtractFeatures will |
-// notify the caller of "silence" input, instead of bailing out. We would not |
-// need the following function when such a change is made. |
- |
-// Add some dither to quiet frames. This avoids the ExtractFeatures skip a |
-// silence frame. Otherwise true VAD would drift with respect to the audio. |
-// We only consider mono inputs. |
-static void DitherSilence(AudioFrame* frame) { |
- ASSERT_EQ(1u, frame->num_channels_); |
- const double kRmsSilence = 5; |
- const double sum_squared_silence = kRmsSilence * kRmsSilence * |
- frame->samples_per_channel_; |
- double sum_squared = 0; |
- int16_t* frame_data = frame->mutable_data(); |
- for (size_t n = 0; n < frame->samples_per_channel_; n++) |
- sum_squared += frame_data[n] * frame_data[n]; |
- if (sum_squared <= sum_squared_silence) { |
- for (size_t n = 0; n < frame->samples_per_channel_; n++) |
- frame_data[n] = (rand() & 0xF) - 8; // NOLINT: ignore non-threadsafe. |
- } |
-} |
- |
-class AgcStat { |
- public: |
- AgcStat() |
- : video_index_(0), |
- activity_threshold_(kDefaultActivityThreshold), |
- audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)), |
- audio_processing_(new VadAudioProc()), |
- vad_(new PitchBasedVad()), |
- standalone_vad_(StandaloneVad::Create()), |
- audio_content_fid_(NULL) { |
- for (size_t n = 0; n < kMaxNumFrames; n++) |
- video_vad_[n] = 0.5; |
- } |
- |
- ~AgcStat() { |
- if (audio_content_fid_ != NULL) { |
- fclose(audio_content_fid_); |
- } |
- } |
- |
- void set_audio_content_file(FILE* audio_content_fid) { |
- audio_content_fid_ = audio_content_fid; |
- } |
- |
- int AddAudio(const AudioFrame& frame, double p_video, |
- int* combined_vad) { |
- if (frame.num_channels_ != 1 || |
- frame.samples_per_channel_ != |
- kSampleRateHz / 100 || |
- frame.sample_rate_hz_ != kSampleRateHz) |
- return -1; |
- video_vad_[video_index_++] = p_video; |
- AudioFeatures features; |
- const int16_t* frame_data = frame.data(); |
- audio_processing_->ExtractFeatures( |
- frame_data, frame.samples_per_channel_, &features); |
- if (FLAG_standalone_vad) { |
- standalone_vad_->AddAudio(frame_data, |
- frame.samples_per_channel_); |
- } |
- if (features.num_frames > 0) { |
- double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; |
- if (FLAG_standalone_vad) { |
- standalone_vad_->GetActivity(p, kMaxNumFrames); |
- } |
- // TODO(turajs) combining and limiting are used in the source files as |
- // well they can be moved to utility. |
- // Combine Video and stand-alone VAD. |
- for (size_t n = 0; n < features.num_frames; n++) { |
- double p_active = p[n] * video_vad_[n]; |
- double p_passive = (1 - p[n]) * (1 - video_vad_[n]); |
- p[n] = rtc::SafeClamp(p_active / (p_active + p_passive), 0.01, 0.99); |
- } |
- if (vad_->VoicingProbability(features, p) < 0) |
- return -1; |
- for (size_t n = 0; n < features.num_frames; n++) { |
- audio_content_->Update(features.rms[n], p[n]); |
- double ac = audio_content_->AudioContent(); |
- if (audio_content_fid_ != NULL) { |
- fwrite(&ac, sizeof(ac), 1, audio_content_fid_); |
- } |
- if (ac > kAgcAnalWindowSamples * activity_threshold_) { |
- combined_vad[n] = 1; |
- } else { |
- combined_vad[n] = 0; |
- } |
- } |
- video_index_ = 0; |
- } |
- return static_cast<int>(features.num_frames); |
- } |
- |
- void Reset() { |
- audio_content_->Reset(); |
- } |
- |
- void SetActivityThreshold(double activity_threshold) { |
- activity_threshold_ = activity_threshold; |
- } |
- |
- private: |
- int video_index_; |
- double activity_threshold_; |
- double video_vad_[kMaxNumFrames]; |
- std::unique_ptr<LoudnessHistogram> audio_content_; |
- std::unique_ptr<VadAudioProc> audio_processing_; |
- std::unique_ptr<PitchBasedVad> vad_; |
- std::unique_ptr<StandaloneVad> standalone_vad_; |
- |
- FILE* audio_content_fid_; |
-}; |
- |
- |
-void void_main(int argc, char* argv[]) { |
- webrtc::AgcStat agc_stat; |
- |
- FILE* pcm_fid = fopen(argv[1], "rb"); |
- ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1]; |
- |
- if (argc < 2) { |
- fprintf(stderr, "\nNot Enough arguments\n"); |
- } |
- |
- FILE* true_vad_fid = NULL; |
- ASSERT_GT(strlen(FLAG_true_vad), 0u) << "Specify the file containing true " |
- "VADs using --true_vad flag."; |
- true_vad_fid = fopen(FLAG_true_vad, "rb"); |
- ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " << |
- FLAG_true_vad; |
- |
- FILE* results_fid = NULL; |
- if (strlen(FLAG_result) > 0) { |
- // True if this is the first time writing to this function and we add a |
- // header to the beginning of the file. |
- bool write_header; |
- // Open in the read mode. If it fails, the file doesn't exist and has to |
- // write a header for it. Otherwise no need to write a header. |
- results_fid = fopen(FLAG_result, "r"); |
- if (results_fid == NULL) { |
- write_header = true; |
- } else { |
- fclose(results_fid); |
- write_header = false; |
- } |
- // Open in append mode. |
- results_fid = fopen(FLAG_result, "a"); |
- ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " << |
- FLAG_result << ", to write the results."; |
- // Write the header if required. |
- if (write_header) { |
- fprintf(results_fid, "%% Total Active, Misdetection, " |
- "Total inactive, False Positive, On-sets, Missed segments, " |
- "Average response\n"); |
- } |
- } |
- |
- FILE* video_vad_fid = NULL; |
- if (strlen(FLAG_video_vad) > 0) { |
- video_vad_fid = fopen(FLAG_video_vad, "rb"); |
- ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " << |
- FLAG_video_vad << " to read video-based VAD decisions.\n"; |
- } |
- |
- // AgsStat will be the owner of this file and will close it at its |
- // destructor. |
- FILE* audio_content_fid = NULL; |
- if (strlen(FLAG_audio_content) > 0) { |
- audio_content_fid = fopen(FLAG_audio_content, "wb"); |
- ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " << |
- FLAG_audio_content << " to write audio-content.\n"; |
- agc_stat.set_audio_content_file(audio_content_fid); |
- } |
- |
- webrtc::AudioFrame frame; |
- frame.num_channels_ = 1; |
- frame.sample_rate_hz_ = 16000; |
- frame.samples_per_channel_ = frame.sample_rate_hz_ / 100; |
- const size_t kSamplesToRead = frame.num_channels_ * |
- frame.samples_per_channel_; |
- |
- agc_stat.SetActivityThreshold(FLAG_activity_threshold); |
- |
- int ret_val = 0; |
- int num_frames = 0; |
- int agc_vad[kMaxNumFrames]; |
- uint8_t true_vad[kMaxNumFrames]; |
- double p_video = 0.5; |
- int total_active = 0; |
- int total_passive = 0; |
- int total_false_positive = 0; |
- int total_missed_detection = 0; |
- int onset_adaptation = 0; |
- int num_onsets = 0; |
- bool onset = false; |
- uint8_t previous_true_vad = 0; |
- int num_not_adapted = 0; |
- size_t true_vad_index = 0; |
- bool in_false_positive_region = false; |
- int total_false_positive_duration = 0; |
- bool video_adapted = false; |
- while (kSamplesToRead == fread(frame.mutable_data(), sizeof(int16_t), |
- kSamplesToRead, pcm_fid)) { |
- assert(true_vad_index < kMaxNumFrames); |
- ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, |
- true_vad_fid)) |
- << "Size mismatch between True-VAD and the PCM file.\n"; |
- if (video_vad_fid != NULL) { |
- ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) << |
- "Not enough video-based VAD probabilities."; |
- } |
- |
- // Negative video activity indicates that the video-based VAD is not yet |
- // adapted. Disregards the learning phase in statistics. |
- if (p_video < 0) { |
- if (video_adapted) { |
- fprintf(stderr, "Negative video probabilities ONLY allowed at the " |
- "beginning of the sequence, not in the middle.\n"); |
- exit(1); |
- } |
- continue; |
- } else { |
- video_adapted = true; |
- } |
- |
- num_frames++; |
- uint8_t last_true_vad; |
- if (true_vad_index == 0) { |
- last_true_vad = previous_true_vad; |
- } else { |
- last_true_vad = true_vad[true_vad_index - 1]; |
- } |
- if (last_true_vad == 1 && true_vad[true_vad_index] == 0) { |
- agc_stat.Reset(); |
- } |
- true_vad_index++; |
- |
- DitherSilence(&frame); |
- |
- ret_val = agc_stat.AddAudio(frame, p_video, agc_vad); |
- ASSERT_GE(ret_val, 0); |
- |
- if (ret_val > 0) { |
- ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val)); |
- for (int n = 0; n < ret_val; n++) { |
- if (true_vad[n] == 1) { |
- total_active++; |
- if (previous_true_vad == 0) { |
- num_onsets++; |
- onset = true; |
- } |
- if (agc_vad[n] == 0) { |
- total_missed_detection++; |
- if (onset) |
- onset_adaptation++; |
- } else { |
- in_false_positive_region = false; |
- onset = false; |
- } |
- } else if (true_vad[n] == 0) { |
- // Check if |on_set| flag is still up. If so it means that we totally |
- // missed an active region |
- if (onset) |
- num_not_adapted++; |
- onset = false; |
- |
- total_passive++; |
- if (agc_vad[n] == 1) { |
- total_false_positive++; |
- in_false_positive_region = true; |
- } |
- if (in_false_positive_region) { |
- total_false_positive_duration++; |
- } |
- } else { |
- ASSERT_TRUE(false) << "Invalid value for true-VAD.\n"; |
- } |
- previous_true_vad = true_vad[n]; |
- } |
- true_vad_index = 0; |
- } |
- } |
- |
- if (results_fid != NULL) { |
- fprintf(results_fid, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", |
- total_active, |
- total_missed_detection, |
- total_passive, |
- total_false_positive, |
- num_onsets, |
- num_not_adapted, |
- static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), |
- static_cast<float>(total_false_positive_duration) / |
- (total_passive + 1e-12)); |
- } |
- fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", |
- total_active, |
- total_missed_detection, |
- total_passive, |
- total_false_positive, |
- num_onsets, |
- num_not_adapted, |
- static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), |
- static_cast<float>(total_false_positive_duration) / |
- (total_passive + 1e-12)); |
- |
- fclose(true_vad_fid); |
- fclose(pcm_fid); |
- if (video_vad_fid != NULL) { |
- fclose(video_vad_fid); |
- } |
- if (results_fid != NULL) { |
- fclose(results_fid); |
- } |
-} |
- |
-} // namespace webrtc |
- |
-int main(int argc, char* argv[]) { |
- if (argc == 1) { |
- // Print usage information. |
- std::cout << |
- "\nCompute the number of misdetected and false-positive frames. Not\n" |
- " that for each frame of audio (10 ms) there should be one true\n" |
- " activity. If any video-based activity is given, there should also be\n" |
- " one probability per frame.\n" |
- "Run with --help for more details on available flags.\n" |
- "\nUsage:\n\n" |
- "activity_metric input_pcm [options]\n" |
- "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " |
- "format.\n\n"; |
- return 0; |
- } |
- rtc::FlagList::SetFlagsFromCommandLine(&argc, argv, true); |
- if (FLAG_help) { |
- rtc::FlagList::Print(nullptr, false); |
- return 0; |
- } |
- webrtc::void_main(argc, argv); |
- return 0; |
-} |