| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license | |
| 5 * that can be found in the LICENSE file in the root of the source | |
| 6 * tree. An additional intellectual property rights grant can be found | |
| 7 * in the file PATENTS. All contributing project authors may | |
| 8 * be found in the AUTHORS file in the root of the source tree. | |
| 9 */ | |
| 10 | |
| 11 | |
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <iostream>
#include <memory>

#include "webrtc/base/flags.h"
#include "webrtc/base/safe_minmax.h"
#include "webrtc/modules/audio_processing/agc/agc.h"
#include "webrtc/modules/audio_processing/agc/loudness_histogram.h"
#include "webrtc/modules/audio_processing/agc/utility.h"
#include "webrtc/modules/audio_processing/vad/common.h"
#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
#include "webrtc/modules/include/module_common_types.h"
#include "webrtc/test/gtest.h"
| 30 | |
// Length of the AGC analysis window handed to LoudnessHistogram::Create().
// NOTE(review): presumably in 10 ms frames (100 => 1 second) — confirm
// against LoudnessHistogram.
static const int kAgcAnalWindowSamples = 100;
// Default threshold on the audio-content measure (as a fraction of the
// analysis window) above which a frame is declared active.
static const float kDefaultActivityThreshold = 0.3f;

// Command-line flags, parsed in main() via rtc::FlagList.
DEFINE_bool(standalone_vad, true, "enable stand-alone VAD");
DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'"
              " format");
DEFINE_string(video_vad, "", "name of a file containing video VAD (activity"
              " probabilities) in double format. One activity per 10ms is"
              " required. If no file is given the video information is not"
              " incorporated. Negative activity is interpreted as video is"
              " not adapted and the statistics are not computed during"
              " the learning phase. Note that the negative video activities"
              " are ONLY allowed at the beginning.");
DEFINE_string(result, "", "name of a file to write the results. The results"
              " will be appended to the end of the file. This is optional.");
DEFINE_string(audio_content, "", "name of a file where audio content is written"
              " to, in double format.");
DEFINE_float(activity_threshold, kDefaultActivityThreshold,
             "Activity threshold");
DEFINE_bool(help, false, "prints this message");
| 51 | |
| 52 namespace webrtc { | |
| 53 | |
| 54 // TODO(turajs) A new CL will be committed soon where ExtractFeatures will | |
| 55 // notify the caller of "silence" input, instead of bailing out. We would not | |
| 56 // need the following function when such a change is made. | |
| 57 | |
| 58 // Add some dither to quiet frames. This avoids the ExtractFeatures skip a | |
| 59 // silence frame. Otherwise true VAD would drift with respect to the audio. | |
| 60 // We only consider mono inputs. | |
| 61 static void DitherSilence(AudioFrame* frame) { | |
| 62 ASSERT_EQ(1u, frame->num_channels_); | |
| 63 const double kRmsSilence = 5; | |
| 64 const double sum_squared_silence = kRmsSilence * kRmsSilence * | |
| 65 frame->samples_per_channel_; | |
| 66 double sum_squared = 0; | |
| 67 int16_t* frame_data = frame->mutable_data(); | |
| 68 for (size_t n = 0; n < frame->samples_per_channel_; n++) | |
| 69 sum_squared += frame_data[n] * frame_data[n]; | |
| 70 if (sum_squared <= sum_squared_silence) { | |
| 71 for (size_t n = 0; n < frame->samples_per_channel_; n++) | |
| 72 frame_data[n] = (rand() & 0xF) - 8; // NOLINT: ignore non-threadsafe. | |
| 73 } | |
| 74 } | |
| 75 | |
| 76 class AgcStat { | |
| 77 public: | |
| 78 AgcStat() | |
| 79 : video_index_(0), | |
| 80 activity_threshold_(kDefaultActivityThreshold), | |
| 81 audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)), | |
| 82 audio_processing_(new VadAudioProc()), | |
| 83 vad_(new PitchBasedVad()), | |
| 84 standalone_vad_(StandaloneVad::Create()), | |
| 85 audio_content_fid_(NULL) { | |
| 86 for (size_t n = 0; n < kMaxNumFrames; n++) | |
| 87 video_vad_[n] = 0.5; | |
| 88 } | |
| 89 | |
| 90 ~AgcStat() { | |
| 91 if (audio_content_fid_ != NULL) { | |
| 92 fclose(audio_content_fid_); | |
| 93 } | |
| 94 } | |
| 95 | |
| 96 void set_audio_content_file(FILE* audio_content_fid) { | |
| 97 audio_content_fid_ = audio_content_fid; | |
| 98 } | |
| 99 | |
| 100 int AddAudio(const AudioFrame& frame, double p_video, | |
| 101 int* combined_vad) { | |
| 102 if (frame.num_channels_ != 1 || | |
| 103 frame.samples_per_channel_ != | |
| 104 kSampleRateHz / 100 || | |
| 105 frame.sample_rate_hz_ != kSampleRateHz) | |
| 106 return -1; | |
| 107 video_vad_[video_index_++] = p_video; | |
| 108 AudioFeatures features; | |
| 109 const int16_t* frame_data = frame.data(); | |
| 110 audio_processing_->ExtractFeatures( | |
| 111 frame_data, frame.samples_per_channel_, &features); | |
| 112 if (FLAG_standalone_vad) { | |
| 113 standalone_vad_->AddAudio(frame_data, | |
| 114 frame.samples_per_channel_); | |
| 115 } | |
| 116 if (features.num_frames > 0) { | |
| 117 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; | |
| 118 if (FLAG_standalone_vad) { | |
| 119 standalone_vad_->GetActivity(p, kMaxNumFrames); | |
| 120 } | |
| 121 // TODO(turajs) combining and limiting are used in the source files as | |
| 122 // well they can be moved to utility. | |
| 123 // Combine Video and stand-alone VAD. | |
| 124 for (size_t n = 0; n < features.num_frames; n++) { | |
| 125 double p_active = p[n] * video_vad_[n]; | |
| 126 double p_passive = (1 - p[n]) * (1 - video_vad_[n]); | |
| 127 p[n] = rtc::SafeClamp(p_active / (p_active + p_passive), 0.01, 0.99); | |
| 128 } | |
| 129 if (vad_->VoicingProbability(features, p) < 0) | |
| 130 return -1; | |
| 131 for (size_t n = 0; n < features.num_frames; n++) { | |
| 132 audio_content_->Update(features.rms[n], p[n]); | |
| 133 double ac = audio_content_->AudioContent(); | |
| 134 if (audio_content_fid_ != NULL) { | |
| 135 fwrite(&ac, sizeof(ac), 1, audio_content_fid_); | |
| 136 } | |
| 137 if (ac > kAgcAnalWindowSamples * activity_threshold_) { | |
| 138 combined_vad[n] = 1; | |
| 139 } else { | |
| 140 combined_vad[n] = 0; | |
| 141 } | |
| 142 } | |
| 143 video_index_ = 0; | |
| 144 } | |
| 145 return static_cast<int>(features.num_frames); | |
| 146 } | |
| 147 | |
| 148 void Reset() { | |
| 149 audio_content_->Reset(); | |
| 150 } | |
| 151 | |
| 152 void SetActivityThreshold(double activity_threshold) { | |
| 153 activity_threshold_ = activity_threshold; | |
| 154 } | |
| 155 | |
| 156 private: | |
| 157 int video_index_; | |
| 158 double activity_threshold_; | |
| 159 double video_vad_[kMaxNumFrames]; | |
| 160 std::unique_ptr<LoudnessHistogram> audio_content_; | |
| 161 std::unique_ptr<VadAudioProc> audio_processing_; | |
| 162 std::unique_ptr<PitchBasedVad> vad_; | |
| 163 std::unique_ptr<StandaloneVad> standalone_vad_; | |
| 164 | |
| 165 FILE* audio_content_fid_; | |
| 166 }; | |
| 167 | |
| 168 | |
| 169 void void_main(int argc, char* argv[]) { | |
| 170 webrtc::AgcStat agc_stat; | |
| 171 | |
| 172 FILE* pcm_fid = fopen(argv[1], "rb"); | |
| 173 ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1]; | |
| 174 | |
| 175 if (argc < 2) { | |
| 176 fprintf(stderr, "\nNot Enough arguments\n"); | |
| 177 } | |
| 178 | |
| 179 FILE* true_vad_fid = NULL; | |
| 180 ASSERT_GT(strlen(FLAG_true_vad), 0u) << "Specify the file containing true " | |
| 181 "VADs using --true_vad flag."; | |
| 182 true_vad_fid = fopen(FLAG_true_vad, "rb"); | |
| 183 ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " << | |
| 184 FLAG_true_vad; | |
| 185 | |
| 186 FILE* results_fid = NULL; | |
| 187 if (strlen(FLAG_result) > 0) { | |
| 188 // True if this is the first time writing to this function and we add a | |
| 189 // header to the beginning of the file. | |
| 190 bool write_header; | |
| 191 // Open in the read mode. If it fails, the file doesn't exist and has to | |
| 192 // write a header for it. Otherwise no need to write a header. | |
| 193 results_fid = fopen(FLAG_result, "r"); | |
| 194 if (results_fid == NULL) { | |
| 195 write_header = true; | |
| 196 } else { | |
| 197 fclose(results_fid); | |
| 198 write_header = false; | |
| 199 } | |
| 200 // Open in append mode. | |
| 201 results_fid = fopen(FLAG_result, "a"); | |
| 202 ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " << | |
| 203 FLAG_result << ", to write the results."; | |
| 204 // Write the header if required. | |
| 205 if (write_header) { | |
| 206 fprintf(results_fid, "%% Total Active, Misdetection, " | |
| 207 "Total inactive, False Positive, On-sets, Missed segments, " | |
| 208 "Average response\n"); | |
| 209 } | |
| 210 } | |
| 211 | |
| 212 FILE* video_vad_fid = NULL; | |
| 213 if (strlen(FLAG_video_vad) > 0) { | |
| 214 video_vad_fid = fopen(FLAG_video_vad, "rb"); | |
| 215 ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " << | |
| 216 FLAG_video_vad << " to read video-based VAD decisions.\n"; | |
| 217 } | |
| 218 | |
| 219 // AgsStat will be the owner of this file and will close it at its | |
| 220 // destructor. | |
| 221 FILE* audio_content_fid = NULL; | |
| 222 if (strlen(FLAG_audio_content) > 0) { | |
| 223 audio_content_fid = fopen(FLAG_audio_content, "wb"); | |
| 224 ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " << | |
| 225 FLAG_audio_content << " to write audio-content.\n"; | |
| 226 agc_stat.set_audio_content_file(audio_content_fid); | |
| 227 } | |
| 228 | |
| 229 webrtc::AudioFrame frame; | |
| 230 frame.num_channels_ = 1; | |
| 231 frame.sample_rate_hz_ = 16000; | |
| 232 frame.samples_per_channel_ = frame.sample_rate_hz_ / 100; | |
| 233 const size_t kSamplesToRead = frame.num_channels_ * | |
| 234 frame.samples_per_channel_; | |
| 235 | |
| 236 agc_stat.SetActivityThreshold(FLAG_activity_threshold); | |
| 237 | |
| 238 int ret_val = 0; | |
| 239 int num_frames = 0; | |
| 240 int agc_vad[kMaxNumFrames]; | |
| 241 uint8_t true_vad[kMaxNumFrames]; | |
| 242 double p_video = 0.5; | |
| 243 int total_active = 0; | |
| 244 int total_passive = 0; | |
| 245 int total_false_positive = 0; | |
| 246 int total_missed_detection = 0; | |
| 247 int onset_adaptation = 0; | |
| 248 int num_onsets = 0; | |
| 249 bool onset = false; | |
| 250 uint8_t previous_true_vad = 0; | |
| 251 int num_not_adapted = 0; | |
| 252 size_t true_vad_index = 0; | |
| 253 bool in_false_positive_region = false; | |
| 254 int total_false_positive_duration = 0; | |
| 255 bool video_adapted = false; | |
| 256 while (kSamplesToRead == fread(frame.mutable_data(), sizeof(int16_t), | |
| 257 kSamplesToRead, pcm_fid)) { | |
| 258 assert(true_vad_index < kMaxNumFrames); | |
| 259 ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, | |
| 260 true_vad_fid)) | |
| 261 << "Size mismatch between True-VAD and the PCM file.\n"; | |
| 262 if (video_vad_fid != NULL) { | |
| 263 ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) << | |
| 264 "Not enough video-based VAD probabilities."; | |
| 265 } | |
| 266 | |
| 267 // Negative video activity indicates that the video-based VAD is not yet | |
| 268 // adapted. Disregards the learning phase in statistics. | |
| 269 if (p_video < 0) { | |
| 270 if (video_adapted) { | |
| 271 fprintf(stderr, "Negative video probabilities ONLY allowed at the " | |
| 272 "beginning of the sequence, not in the middle.\n"); | |
| 273 exit(1); | |
| 274 } | |
| 275 continue; | |
| 276 } else { | |
| 277 video_adapted = true; | |
| 278 } | |
| 279 | |
| 280 num_frames++; | |
| 281 uint8_t last_true_vad; | |
| 282 if (true_vad_index == 0) { | |
| 283 last_true_vad = previous_true_vad; | |
| 284 } else { | |
| 285 last_true_vad = true_vad[true_vad_index - 1]; | |
| 286 } | |
| 287 if (last_true_vad == 1 && true_vad[true_vad_index] == 0) { | |
| 288 agc_stat.Reset(); | |
| 289 } | |
| 290 true_vad_index++; | |
| 291 | |
| 292 DitherSilence(&frame); | |
| 293 | |
| 294 ret_val = agc_stat.AddAudio(frame, p_video, agc_vad); | |
| 295 ASSERT_GE(ret_val, 0); | |
| 296 | |
| 297 if (ret_val > 0) { | |
| 298 ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val)); | |
| 299 for (int n = 0; n < ret_val; n++) { | |
| 300 if (true_vad[n] == 1) { | |
| 301 total_active++; | |
| 302 if (previous_true_vad == 0) { | |
| 303 num_onsets++; | |
| 304 onset = true; | |
| 305 } | |
| 306 if (agc_vad[n] == 0) { | |
| 307 total_missed_detection++; | |
| 308 if (onset) | |
| 309 onset_adaptation++; | |
| 310 } else { | |
| 311 in_false_positive_region = false; | |
| 312 onset = false; | |
| 313 } | |
| 314 } else if (true_vad[n] == 0) { | |
| 315 // Check if |on_set| flag is still up. If so it means that we totally | |
| 316 // missed an active region | |
| 317 if (onset) | |
| 318 num_not_adapted++; | |
| 319 onset = false; | |
| 320 | |
| 321 total_passive++; | |
| 322 if (agc_vad[n] == 1) { | |
| 323 total_false_positive++; | |
| 324 in_false_positive_region = true; | |
| 325 } | |
| 326 if (in_false_positive_region) { | |
| 327 total_false_positive_duration++; | |
| 328 } | |
| 329 } else { | |
| 330 ASSERT_TRUE(false) << "Invalid value for true-VAD.\n"; | |
| 331 } | |
| 332 previous_true_vad = true_vad[n]; | |
| 333 } | |
| 334 true_vad_index = 0; | |
| 335 } | |
| 336 } | |
| 337 | |
| 338 if (results_fid != NULL) { | |
| 339 fprintf(results_fid, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", | |
| 340 total_active, | |
| 341 total_missed_detection, | |
| 342 total_passive, | |
| 343 total_false_positive, | |
| 344 num_onsets, | |
| 345 num_not_adapted, | |
| 346 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), | |
| 347 static_cast<float>(total_false_positive_duration) / | |
| 348 (total_passive + 1e-12)); | |
| 349 } | |
| 350 fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", | |
| 351 total_active, | |
| 352 total_missed_detection, | |
| 353 total_passive, | |
| 354 total_false_positive, | |
| 355 num_onsets, | |
| 356 num_not_adapted, | |
| 357 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), | |
| 358 static_cast<float>(total_false_positive_duration) / | |
| 359 (total_passive + 1e-12)); | |
| 360 | |
| 361 fclose(true_vad_fid); | |
| 362 fclose(pcm_fid); | |
| 363 if (video_vad_fid != NULL) { | |
| 364 fclose(video_vad_fid); | |
| 365 } | |
| 366 if (results_fid != NULL) { | |
| 367 fclose(results_fid); | |
| 368 } | |
| 369 } | |
| 370 | |
| 371 } // namespace webrtc | |
| 372 | |
| 373 int main(int argc, char* argv[]) { | |
| 374 if (argc == 1) { | |
| 375 // Print usage information. | |
| 376 std::cout << | |
| 377 "\nCompute the number of misdetected and false-positive frames. Not\n" | |
| 378 " that for each frame of audio (10 ms) there should be one true\n" | |
| 379 " activity. If any video-based activity is given, there should also be\n" | |
| 380 " one probability per frame.\n" | |
| 381 "Run with --help for more details on available flags.\n" | |
| 382 "\nUsage:\n\n" | |
| 383 "activity_metric input_pcm [options]\n" | |
| 384 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " | |
| 385 "format.\n\n"; | |
| 386 return 0; | |
| 387 } | |
| 388 rtc::FlagList::SetFlagsFromCommandLine(&argc, argv, true); | |
| 389 if (FLAG_help) { | |
| 390 rtc::FlagList::Print(nullptr, false); | |
| 391 return 0; | |
| 392 } | |
| 393 webrtc::void_main(argc, argv); | |
| 394 return 0; | |
| 395 } | |
| OLD | NEW |