| Index: webrtc/tools/agc/activity_metric.cc
|
| diff --git a/webrtc/tools/agc/activity_metric.cc b/webrtc/tools/agc/activity_metric.cc
|
| deleted file mode 100644
|
| index 8ea193913cb43a736dcb2adbc9f85b6c422ffa1d..0000000000000000000000000000000000000000
|
| --- a/webrtc/tools/agc/activity_metric.cc
|
| +++ /dev/null
|
| @@ -1,395 +0,0 @@
|
| -/*
|
| - * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
| - *
|
| - * Use of this source code is governed by a BSD-style license
|
| - * that can be found in the LICENSE file in the root of the source
|
| - * tree. An additional intellectual property rights grant can be found
|
| - * in the file PATENTS. All contributing project authors may
|
| - * be found in the AUTHORS file in the root of the source tree.
|
| - */
|
| -
|
| -
|
| -#include <math.h>
|
| -#include <stdio.h>
|
| -#include <stdlib.h>
|
| -
|
| -#include <algorithm>
|
| -#include <memory>
|
| -
|
| -#include "webrtc/base/flags.h"
|
| -#include "webrtc/base/safe_minmax.h"
|
| -#include "webrtc/modules/audio_processing/agc/agc.h"
|
| -#include "webrtc/modules/audio_processing/agc/loudness_histogram.h"
|
| -#include "webrtc/modules/audio_processing/agc/utility.h"
|
| -#include "webrtc/modules/audio_processing/vad/common.h"
|
| -#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
|
| -#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
|
| -#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
|
| -#include "webrtc/modules/include/module_common_types.h"
|
| -#include "webrtc/test/gtest.h"
|
| -
|
| -static const int kAgcAnalWindowSamples = 100;
|
| -static const float kDefaultActivityThreshold = 0.3f;
|
| -
|
| -DEFINE_bool(standalone_vad, true, "enable stand-alone VAD");
|
| -DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'"
|
| - " format");
|
| -DEFINE_string(video_vad, "", "name of a file containing video VAD (activity"
|
| - " probabilities) in double format. One activity per 10ms is"
|
| - " required. If no file is given the video information is not"
|
| - " incorporated. Negative activity is interpreted as video is"
|
| - " not adapted and the statistics are not computed during"
|
| - " the learning phase. Note that the negative video activities"
|
| - " are ONLY allowed at the beginning.");
|
| -DEFINE_string(result, "", "name of a file to write the results. The results"
|
| - " will be appended to the end of the file. This is optional.");
|
| -DEFINE_string(audio_content, "", "name of a file where audio content is written"
|
| - " to, in double format.");
|
| -DEFINE_float(activity_threshold, kDefaultActivityThreshold,
|
| - "Activity threshold");
|
| -DEFINE_bool(help, false, "prints this message");
|
| -
|
| -namespace webrtc {
|
| -
|
| -// TODO(turajs) A new CL will be committed soon where ExtractFeatures will
|
| -// notify the caller of "silence" input, instead of bailing out. We would not
|
| -// need the following function when such a change is made.
|
| -
|
| -// Add some dither to quiet frames. This avoids the ExtractFeatures skip a
|
| -// silence frame. Otherwise true VAD would drift with respect to the audio.
|
| -// We only consider mono inputs.
|
| -static void DitherSilence(AudioFrame* frame) {
|
| - ASSERT_EQ(1u, frame->num_channels_);
|
| - const double kRmsSilence = 5;
|
| - const double sum_squared_silence = kRmsSilence * kRmsSilence *
|
| - frame->samples_per_channel_;
|
| - double sum_squared = 0;
|
| - int16_t* frame_data = frame->mutable_data();
|
| - for (size_t n = 0; n < frame->samples_per_channel_; n++)
|
| - sum_squared += frame_data[n] * frame_data[n];
|
| - if (sum_squared <= sum_squared_silence) {
|
| - for (size_t n = 0; n < frame->samples_per_channel_; n++)
|
| - frame_data[n] = (rand() & 0xF) - 8; // NOLINT: ignore non-threadsafe.
|
| - }
|
| -}
|
| -
|
| -class AgcStat {
|
| - public:
|
| - AgcStat()
|
| - : video_index_(0),
|
| - activity_threshold_(kDefaultActivityThreshold),
|
| - audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)),
|
| - audio_processing_(new VadAudioProc()),
|
| - vad_(new PitchBasedVad()),
|
| - standalone_vad_(StandaloneVad::Create()),
|
| - audio_content_fid_(NULL) {
|
| - for (size_t n = 0; n < kMaxNumFrames; n++)
|
| - video_vad_[n] = 0.5;
|
| - }
|
| -
|
| - ~AgcStat() {
|
| - if (audio_content_fid_ != NULL) {
|
| - fclose(audio_content_fid_);
|
| - }
|
| - }
|
| -
|
| - void set_audio_content_file(FILE* audio_content_fid) {
|
| - audio_content_fid_ = audio_content_fid;
|
| - }
|
| -
|
| - int AddAudio(const AudioFrame& frame, double p_video,
|
| - int* combined_vad) {
|
| - if (frame.num_channels_ != 1 ||
|
| - frame.samples_per_channel_ !=
|
| - kSampleRateHz / 100 ||
|
| - frame.sample_rate_hz_ != kSampleRateHz)
|
| - return -1;
|
| - video_vad_[video_index_++] = p_video;
|
| - AudioFeatures features;
|
| - const int16_t* frame_data = frame.data();
|
| - audio_processing_->ExtractFeatures(
|
| - frame_data, frame.samples_per_channel_, &features);
|
| - if (FLAG_standalone_vad) {
|
| - standalone_vad_->AddAudio(frame_data,
|
| - frame.samples_per_channel_);
|
| - }
|
| - if (features.num_frames > 0) {
|
| - double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5};
|
| - if (FLAG_standalone_vad) {
|
| - standalone_vad_->GetActivity(p, kMaxNumFrames);
|
| - }
|
| - // TODO(turajs) combining and limiting are used in the source files as
|
| - // well they can be moved to utility.
|
| - // Combine Video and stand-alone VAD.
|
| - for (size_t n = 0; n < features.num_frames; n++) {
|
| - double p_active = p[n] * video_vad_[n];
|
| - double p_passive = (1 - p[n]) * (1 - video_vad_[n]);
|
| - p[n] = rtc::SafeClamp(p_active / (p_active + p_passive), 0.01, 0.99);
|
| - }
|
| - if (vad_->VoicingProbability(features, p) < 0)
|
| - return -1;
|
| - for (size_t n = 0; n < features.num_frames; n++) {
|
| - audio_content_->Update(features.rms[n], p[n]);
|
| - double ac = audio_content_->AudioContent();
|
| - if (audio_content_fid_ != NULL) {
|
| - fwrite(&ac, sizeof(ac), 1, audio_content_fid_);
|
| - }
|
| - if (ac > kAgcAnalWindowSamples * activity_threshold_) {
|
| - combined_vad[n] = 1;
|
| - } else {
|
| - combined_vad[n] = 0;
|
| - }
|
| - }
|
| - video_index_ = 0;
|
| - }
|
| - return static_cast<int>(features.num_frames);
|
| - }
|
| -
|
| - void Reset() {
|
| - audio_content_->Reset();
|
| - }
|
| -
|
| - void SetActivityThreshold(double activity_threshold) {
|
| - activity_threshold_ = activity_threshold;
|
| - }
|
| -
|
| - private:
|
| - int video_index_;
|
| - double activity_threshold_;
|
| - double video_vad_[kMaxNumFrames];
|
| - std::unique_ptr<LoudnessHistogram> audio_content_;
|
| - std::unique_ptr<VadAudioProc> audio_processing_;
|
| - std::unique_ptr<PitchBasedVad> vad_;
|
| - std::unique_ptr<StandaloneVad> standalone_vad_;
|
| -
|
| - FILE* audio_content_fid_;
|
| -};
|
| -
|
| -
|
| -void void_main(int argc, char* argv[]) {
|
| - webrtc::AgcStat agc_stat;
|
| -
|
| - FILE* pcm_fid = fopen(argv[1], "rb");
|
| - ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1];
|
| -
|
| - if (argc < 2) {
|
| - fprintf(stderr, "\nNot Enough arguments\n");
|
| - }
|
| -
|
| - FILE* true_vad_fid = NULL;
|
| - ASSERT_GT(strlen(FLAG_true_vad), 0u) << "Specify the file containing true "
|
| - "VADs using --true_vad flag.";
|
| - true_vad_fid = fopen(FLAG_true_vad, "rb");
|
| - ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " <<
|
| - FLAG_true_vad;
|
| -
|
| - FILE* results_fid = NULL;
|
| - if (strlen(FLAG_result) > 0) {
|
| - // True if this is the first time writing to this function and we add a
|
| - // header to the beginning of the file.
|
| - bool write_header;
|
| - // Open in the read mode. If it fails, the file doesn't exist and has to
|
| - // write a header for it. Otherwise no need to write a header.
|
| - results_fid = fopen(FLAG_result, "r");
|
| - if (results_fid == NULL) {
|
| - write_header = true;
|
| - } else {
|
| - fclose(results_fid);
|
| - write_header = false;
|
| - }
|
| - // Open in append mode.
|
| - results_fid = fopen(FLAG_result, "a");
|
| - ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " <<
|
| - FLAG_result << ", to write the results.";
|
| - // Write the header if required.
|
| - if (write_header) {
|
| - fprintf(results_fid, "%% Total Active, Misdetection, "
|
| - "Total inactive, False Positive, On-sets, Missed segments, "
|
| - "Average response\n");
|
| - }
|
| - }
|
| -
|
| - FILE* video_vad_fid = NULL;
|
| - if (strlen(FLAG_video_vad) > 0) {
|
| - video_vad_fid = fopen(FLAG_video_vad, "rb");
|
| - ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " <<
|
| - FLAG_video_vad << " to read video-based VAD decisions.\n";
|
| - }
|
| -
|
| - // AgsStat will be the owner of this file and will close it at its
|
| - // destructor.
|
| - FILE* audio_content_fid = NULL;
|
| - if (strlen(FLAG_audio_content) > 0) {
|
| - audio_content_fid = fopen(FLAG_audio_content, "wb");
|
| - ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " <<
|
| - FLAG_audio_content << " to write audio-content.\n";
|
| - agc_stat.set_audio_content_file(audio_content_fid);
|
| - }
|
| -
|
| - webrtc::AudioFrame frame;
|
| - frame.num_channels_ = 1;
|
| - frame.sample_rate_hz_ = 16000;
|
| - frame.samples_per_channel_ = frame.sample_rate_hz_ / 100;
|
| - const size_t kSamplesToRead = frame.num_channels_ *
|
| - frame.samples_per_channel_;
|
| -
|
| - agc_stat.SetActivityThreshold(FLAG_activity_threshold);
|
| -
|
| - int ret_val = 0;
|
| - int num_frames = 0;
|
| - int agc_vad[kMaxNumFrames];
|
| - uint8_t true_vad[kMaxNumFrames];
|
| - double p_video = 0.5;
|
| - int total_active = 0;
|
| - int total_passive = 0;
|
| - int total_false_positive = 0;
|
| - int total_missed_detection = 0;
|
| - int onset_adaptation = 0;
|
| - int num_onsets = 0;
|
| - bool onset = false;
|
| - uint8_t previous_true_vad = 0;
|
| - int num_not_adapted = 0;
|
| - size_t true_vad_index = 0;
|
| - bool in_false_positive_region = false;
|
| - int total_false_positive_duration = 0;
|
| - bool video_adapted = false;
|
| - while (kSamplesToRead == fread(frame.mutable_data(), sizeof(int16_t),
|
| - kSamplesToRead, pcm_fid)) {
|
| - assert(true_vad_index < kMaxNumFrames);
|
| - ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1,
|
| - true_vad_fid))
|
| - << "Size mismatch between True-VAD and the PCM file.\n";
|
| - if (video_vad_fid != NULL) {
|
| - ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) <<
|
| - "Not enough video-based VAD probabilities.";
|
| - }
|
| -
|
| - // Negative video activity indicates that the video-based VAD is not yet
|
| - // adapted. Disregards the learning phase in statistics.
|
| - if (p_video < 0) {
|
| - if (video_adapted) {
|
| - fprintf(stderr, "Negative video probabilities ONLY allowed at the "
|
| - "beginning of the sequence, not in the middle.\n");
|
| - exit(1);
|
| - }
|
| - continue;
|
| - } else {
|
| - video_adapted = true;
|
| - }
|
| -
|
| - num_frames++;
|
| - uint8_t last_true_vad;
|
| - if (true_vad_index == 0) {
|
| - last_true_vad = previous_true_vad;
|
| - } else {
|
| - last_true_vad = true_vad[true_vad_index - 1];
|
| - }
|
| - if (last_true_vad == 1 && true_vad[true_vad_index] == 0) {
|
| - agc_stat.Reset();
|
| - }
|
| - true_vad_index++;
|
| -
|
| - DitherSilence(&frame);
|
| -
|
| - ret_val = agc_stat.AddAudio(frame, p_video, agc_vad);
|
| - ASSERT_GE(ret_val, 0);
|
| -
|
| - if (ret_val > 0) {
|
| - ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val));
|
| - for (int n = 0; n < ret_val; n++) {
|
| - if (true_vad[n] == 1) {
|
| - total_active++;
|
| - if (previous_true_vad == 0) {
|
| - num_onsets++;
|
| - onset = true;
|
| - }
|
| - if (agc_vad[n] == 0) {
|
| - total_missed_detection++;
|
| - if (onset)
|
| - onset_adaptation++;
|
| - } else {
|
| - in_false_positive_region = false;
|
| - onset = false;
|
| - }
|
| - } else if (true_vad[n] == 0) {
|
| - // Check if |on_set| flag is still up. If so it means that we totally
|
| - // missed an active region
|
| - if (onset)
|
| - num_not_adapted++;
|
| - onset = false;
|
| -
|
| - total_passive++;
|
| - if (agc_vad[n] == 1) {
|
| - total_false_positive++;
|
| - in_false_positive_region = true;
|
| - }
|
| - if (in_false_positive_region) {
|
| - total_false_positive_duration++;
|
| - }
|
| - } else {
|
| - ASSERT_TRUE(false) << "Invalid value for true-VAD.\n";
|
| - }
|
| - previous_true_vad = true_vad[n];
|
| - }
|
| - true_vad_index = 0;
|
| - }
|
| - }
|
| -
|
| - if (results_fid != NULL) {
|
| - fprintf(results_fid, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n",
|
| - total_active,
|
| - total_missed_detection,
|
| - total_passive,
|
| - total_false_positive,
|
| - num_onsets,
|
| - num_not_adapted,
|
| - static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
|
| - static_cast<float>(total_false_positive_duration) /
|
| - (total_passive + 1e-12));
|
| - }
|
| - fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n",
|
| - total_active,
|
| - total_missed_detection,
|
| - total_passive,
|
| - total_false_positive,
|
| - num_onsets,
|
| - num_not_adapted,
|
| - static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
|
| - static_cast<float>(total_false_positive_duration) /
|
| - (total_passive + 1e-12));
|
| -
|
| - fclose(true_vad_fid);
|
| - fclose(pcm_fid);
|
| - if (video_vad_fid != NULL) {
|
| - fclose(video_vad_fid);
|
| - }
|
| - if (results_fid != NULL) {
|
| - fclose(results_fid);
|
| - }
|
| -}
|
| -
|
| -} // namespace webrtc
|
| -
|
| -int main(int argc, char* argv[]) {
|
| - if (argc == 1) {
|
| - // Print usage information.
|
| - std::cout <<
|
| - "\nCompute the number of misdetected and false-positive frames. Not\n"
|
| - " that for each frame of audio (10 ms) there should be one true\n"
|
| - " activity. If any video-based activity is given, there should also be\n"
|
| - " one probability per frame.\n"
|
| - "Run with --help for more details on available flags.\n"
|
| - "\nUsage:\n\n"
|
| - "activity_metric input_pcm [options]\n"
|
| - "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits "
|
| - "format.\n\n";
|
| - return 0;
|
| - }
|
| - rtc::FlagList::SetFlagsFromCommandLine(&argc, argv, true);
|
| - if (FLAG_help) {
|
| - rtc::FlagList::Print(nullptr, false);
|
| - return 0;
|
| - }
|
| - webrtc::void_main(argc, argv);
|
| - return 0;
|
| -}
|
|
|