| Index: webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
|
| diff --git a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
|
| index f16aa753fa427a42b7def1298c145a52f3154afd..ba36514e779e916108b43b5c8bde4bb413bd6b08 100644
|
| --- a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
|
| +++ b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
|
| @@ -10,8 +10,7 @@
|
|
|
| #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
|
|
|
| -#include <utility>
|
| -
|
| +#include "webrtc/base/logging.h"
|
| #include "webrtc/base/pathutils.h"
|
|
|
| namespace webrtc {
|
| @@ -25,7 +24,7 @@ MultiEndCall::MultiEndCall(
|
| wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {
|
| FindSpeakerNames();
|
| CreateAudioTrackReaders();
|
| - CheckTiming();
|
| + valid_ = CheckTiming();
|
| }
|
|
|
| MultiEndCall::~MultiEndCall() = default;
|
| @@ -39,6 +38,10 @@ const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
|
| return audiotrack_readers_;
|
| }
|
|
|
| +bool MultiEndCall::valid() {
|
| + return valid_;
|
| +}
|
| +
|
| void MultiEndCall::FindSpeakerNames() {
|
| RTC_DCHECK(speaker_names_.empty());
|
| for (const Turn& turn : timing_) {
|
| @@ -65,9 +68,106 @@ void MultiEndCall::CreateAudioTrackReaders() {
|
| }
|
| }
|
|
|
| -void MultiEndCall::CheckTiming() {
|
| - // TODO(alessiob): use audiotrack lengths and offset to check whether the
|
| - // timing is valid.
|
| +bool MultiEndCall::CheckTiming() const {
|
| + std::size_t number_of_turns = timing_.size();
|
| + auto millisecond_to_samples = [](int ms, int sr) -> int {
|
| + return ms * sr / 1000;
|
| + };
|
| + auto in_interval = [](std::size_t value, const Interval& interval) {
|
| + return interval.first <= value && value < interval.second;
|
| + };
|
| +
|
| + // Begin and end timestamps for the last two turns (unit: number of samples).
|
| + Interval second_last_turn = {0, 0};
|
| + Interval last_turn = {0, 0};
|
| +
|
| + // Initialize map to store turn intervals of each speaker (used to detect self
|
| + // cross-talk).
|
| + std::map<std::string, std::unique_ptr<IntervalsVector>> speakers_intervals;
|
| + for (const std::string& speaker_name : speaker_names_) {
|
| + // Initialize a vector.
|
| + speakers_intervals.insert(std::make_pair(
|
| + speaker_name, std::unique_ptr<IntervalsVector>(
|
| + new IntervalsVector())));
|
| + LOG(LS_VERBOSE) << "speaker_intervals vector for <" << speaker_name
|
| + << "> preallocated (capacity: "
|
| + << speakers_intervals[speaker_name]->capacity() << ")";
|
| + }
|
| +
|
| + // Parse turns.
|
| + for (std::size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {
|
| + const Turn& turn = timing_[turn_index];
|
| + auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
|
| + RTC_CHECK(it != audiotrack_readers_.end())
|
| + << "Audio track reader not created";
|
| +
|
| + // Begin and end timestamps for the current turn.
|
| + int offset_samples = millisecond_to_samples(
|
| + turn.offset, it->second->sample_rate());
|
| + std::size_t begin_timestamp = last_turn.second + offset_samples;
|
| + std::size_t end_timestamp = begin_timestamp + it->second->num_samples();
|
| + LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp
|
| + << "-" << end_timestamp << " ms";
|
| +
|
| + // The order is invalid if the offset is negative and its absolute value is
|
| + // larger than the duration of the previous turn.
|
| + if (offset_samples < 0 && -offset_samples > int(
|
| + last_turn.second - last_turn.first)) {
|
| + LOG(LS_ERROR) << "invalid order";
|
| + return false;
|
| + }
|
| +
|
| + // Cross-talk with 3 or more speakers occurs when the beginning of the
|
| + // current interval falls within both of the last two turns.
|
| + if (turn_index > 1 && in_interval(begin_timestamp, last_turn)
|
| + && in_interval(begin_timestamp, second_last_turn)) {
|
| + LOG(LS_ERROR) << "cross-talk with 3+ speakers";
|
| + return false;
|
| + }
|
| +
|
| + // Save speaker turn interval.
|
| + Interval current_turn = {begin_timestamp, end_timestamp};
|
| + speakers_intervals[turn.speaker_name]->push_back(current_turn);
|
| +
|
| + // Update and continue with next turn.
|
| + second_last_turn = last_turn;
|
| + last_turn = current_turn;
|
| + }
|
| +
|
| + // Detect self cross-talk.
|
| + for (const std::string& speaker_name : speaker_names_) {
|
| + LOG(LS_INFO) << "checking self cross-talk for <"
|
| + << speaker_name << ">";
|
| + if (DetectSelfCrossTalk(speakers_intervals[speaker_name].get())) {
|
| + LOG(LS_ERROR) << "Self cross-talk detected";
|
| + return false;
|
| + }
|
| + }
|
| +
|
| + return true;
|
| +}
|
| +
|
| +bool MultiEndCall::DetectSelfCrossTalk(IntervalsVector* speaker_intervals)
|
| + const {
|
| + Interval previous_interval = speaker_intervals->at(0);
|
| + LOG(LS_VERBOSE) << "#0" << ": " << previous_interval.first << " "
|
| + << previous_interval.second;
|
| + for (std::size_t index = 1; index < speaker_intervals->size(); ++index) {
|
| + auto interval = speaker_intervals->at(index);
|
| + LOG(LS_VERBOSE) << "#" << index << ": " << interval.first << " "
|
| + << interval.second;
|
| +
|
| + // Check if there is overlap with the previous interval.
|
| + // This is a sufficient condition for self cross-talk since the intervals
|
| + // are sorted by begin timestamp.
|
| + if (previous_interval.second > interval.first) {
|
| + return true;
|
| + }
|
| +
|
| + // Update and continue with next turn.
|
| + previous_interval = interval;
|
| + }
|
| + return false;
|
| }
|
|
|
| } // namespace conversational_speech
|
|
|