| Index: webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
|
| diff --git a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
|
| index f16aa753fa427a42b7def1298c145a52f3154afd..ad1d9a0c87e90d9dc4c398bc21d120673379c366 100644
|
| --- a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
|
| +++ b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
|
| @@ -10,8 +10,10 @@
|
|
|
| #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
|
|
|
| -#include <utility>
|
| +#include <algorithm>
|
| +#include <iterator>
|
|
|
| +#include "webrtc/base/logging.h"
|
| #include "webrtc/base/pathutils.h"
|
|
|
| namespace webrtc {
|
| @@ -25,7 +27,7 @@ MultiEndCall::MultiEndCall(
|
| wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {
|
| FindSpeakerNames();
|
| CreateAudioTrackReaders();
|
| - CheckTiming();
|
| + valid_ = CheckTiming();
|
| }
|
|
|
| MultiEndCall::~MultiEndCall() = default;
|
| @@ -39,10 +41,23 @@ const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
|
| return audiotrack_readers_;
|
| }
|
|
|
| +bool MultiEndCall::valid() const {
|
| + return valid_;
|
| +}
|
| +
|
| +size_t MultiEndCall::total_duration_samples() const {
|
| + return total_duration_samples_;
|
| +}
|
| +
|
| +const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns()
|
| + const {
|
| + return speaking_turns_;
|
| +}
|
| +
|
| void MultiEndCall::FindSpeakerNames() {
|
| RTC_DCHECK(speaker_names_.empty());
|
| for (const Turn& turn : timing_) {
|
| - speaker_names_.insert(turn.speaker_name);
|
| + speaker_names_.emplace(turn.speaker_name);
|
| }
|
| }
|
|
|
| @@ -60,14 +75,119 @@ void MultiEndCall::CreateAudioTrackReaders() {
|
| // Map the audiotrack file name to a new instance of WavReaderInterface.
|
| std::unique_ptr<WavReaderInterface> wavreader =
|
| wavreader_abstract_factory_->Create(audiotrack_file_path.pathname());
|
| - audiotrack_readers_.insert(std::make_pair(
|
| - turn.audiotrack_file_name, std::move(wavreader)));
|
| + audiotrack_readers_.emplace(
|
| + turn.audiotrack_file_name, std::move(wavreader));
|
| }
|
| }
|
|
|
| -void MultiEndCall::CheckTiming() {
|
| - // TODO(alessiob): use audiotrack lengths and offset to check whether the
|
| - // timing is valid.
|
| +bool MultiEndCall::CheckTiming() {
|
| + struct Interval {
|
| + size_t begin;
|
| + size_t end;
|
| + };
|
| + size_t number_of_turns = timing_.size();
|
| + auto millisecond_to_samples = [](int ms, int sr) -> int {
|
| + // Truncation may happen if the sampling rate is not an integer multiple
|
| + // of 1000 (e.g., 44100).
|
| + return ms * sr / 1000;
|
| + };
|
| + auto in_interval = [](size_t value, const Interval& interval) {
|
| + return interval.begin <= value && value < interval.end;
|
| + };
|
| + total_duration_samples_ = 0;
|
| + speaking_turns_.clear();
|
| +
|
| + // Begin and end timestamps for the last two turns (unit: number of samples).
|
| + Interval second_last_turn = {0, 0};
|
| + Interval last_turn = {0, 0};
|
| +
|
| + // Initialize map to store speaking turn indices of each speaker (used to
|
| + // detect self cross-talk).
|
| + std::map<std::string, std::vector<size_t>> speaking_turn_indices;
|
| + for (const std::string& speaker_name : speaker_names_) {
|
| + speaking_turn_indices.emplace(
|
| + std::piecewise_construct,
|
| + std::forward_as_tuple(speaker_name),
|
| + std::forward_as_tuple());
|
| + }
|
| +
|
| + // Parse turns.
|
| + for (size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {
|
| + const Turn& turn = timing_[turn_index];
|
| + auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
|
| + RTC_CHECK(it != audiotrack_readers_.end())
|
| + << "Audio track reader not created";
|
| +
|
| + // Begin and end timestamps for the current turn.
|
| + int offset_samples = millisecond_to_samples(
|
| + turn.offset, it->second->sample_rate());
|
| + size_t begin_timestamp = last_turn.end + offset_samples;
|
| + size_t end_timestamp = begin_timestamp + it->second->num_samples();
|
| + LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp
|
| + << "-" << end_timestamp << " ms";
|
| +
|
| + // The order is invalid if the offset is negative and its absolute value is
|
| +    // larger than the duration of the previous turn.
|
| + if (offset_samples < 0 && -offset_samples > static_cast<int>(
|
| + last_turn.end - last_turn.begin)) {
|
| + LOG(LS_ERROR) << "invalid order";
|
| + return false;
|
| + }
|
| +
|
| + // Cross-talk with 3 or more speakers occurs when the beginning of the
|
| +    // current interval falls within both of the last two turns.
|
| + if (turn_index > 1 && in_interval(begin_timestamp, last_turn)
|
| + && in_interval(begin_timestamp, second_last_turn)) {
|
| + LOG(LS_ERROR) << "cross-talk with 3+ speakers";
|
| + return false;
|
| + }
|
| +
|
| + // Append turn.
|
| + speaking_turns_.emplace_back(
|
| + turn.speaker_name, turn.audiotrack_file_name,
|
| + begin_timestamp, end_timestamp);
|
| +
|
| + // Save speaking turn index for self cross-talk detection.
|
| + RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
|
| + speaking_turn_indices[turn.speaker_name].push_back(turn_index);
|
| +
|
| +    // Update total duration of the conversational speech.
|
| + if (total_duration_samples_ < end_timestamp)
|
| + total_duration_samples_ = end_timestamp;
|
| +
|
| + // Update and continue with next turn.
|
| + second_last_turn = last_turn;
|
| + last_turn.begin = begin_timestamp;
|
| + last_turn.end = end_timestamp;
|
| + }
|
| +
|
| + // Detect self cross-talk.
|
| + for (const std::string& speaker_name : speaker_names_) {
|
| + LOG(LS_INFO) << "checking self cross-talk for <"
|
| + << speaker_name << ">";
|
| +
|
| + // Copy all turns for this speaker to new vector.
|
| + std::vector<SpeakingTurn> speaking_turns_for_name;
|
| + std::copy_if(speaking_turns_.begin(), speaking_turns_.end(),
|
| + std::back_inserter(speaking_turns_for_name),
|
| + [&speaker_name](const SpeakingTurn& st){
|
| + return st.speaker_name == speaker_name; });
|
| +
|
| + // Check for overlap between adjacent elements.
|
| + // This is a sufficient condition for self cross-talk since the intervals
|
| + // are sorted by begin timestamp.
|
| + auto overlap = std::adjacent_find(
|
| + speaking_turns_for_name.begin(), speaking_turns_for_name.end(),
|
| + [](const SpeakingTurn& a, const SpeakingTurn& b) {
|
| + return a.end > b.begin; });
|
| +
|
| + if (overlap != speaking_turns_for_name.end()) {
|
| + LOG(LS_ERROR) << "Self cross-talk detected";
|
| + return false;
|
| + }
|
| + }
|
| +
|
| + return true;
|
| }
|
|
|
| } // namespace conversational_speech
|
|
|