Index: webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
index f16aa753fa427a42b7def1298c145a52f3154afd..ba36514e779e916108b43b5c8bde4bb413bd6b08 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
@@ -10,8 +10,7 @@
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
-#include <utility>
-
+#include "webrtc/base/logging.h"
#include "webrtc/base/pathutils.h"
namespace webrtc {
@@ -25,7 +24,7 @@ MultiEndCall::MultiEndCall(
wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {
FindSpeakerNames();
CreateAudioTrackReaders();
-  CheckTiming();
+  valid_ = CheckTiming();
}
MultiEndCall::~MultiEndCall() = default;
@@ -39,6 +38,10 @@ const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
return audiotrack_readers_;
}
+bool MultiEndCall::valid() {
+  return valid_;
+}
+
void MultiEndCall::FindSpeakerNames() {
RTC_DCHECK(speaker_names_.empty());
for (const Turn& turn : timing_) {
@@ -65,9 +68,113 @@ void MultiEndCall::CreateAudioTrackReaders() {
}
}
-void MultiEndCall::CheckTiming() {
-  // TODO(alessiob): use audiotrack lengths and offset to check whether the
-  // timing is valid.
+bool MultiEndCall::CheckTiming() const {
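+  // The timing is valid if the turns are correctly ordered, no more than two
+  // speakers talk at the same time, and no speaker overlaps with him/herself.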
+  std::size_t number_of_turns = timing_.size();
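+  // Converts an offset in milliseconds to samples at the given sample rate.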
+  auto millisecond_to_samples = [](int ms, int sr) -> int {
+    return ms * sr / 1000;
+  };
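+  // Checks whether a timestamp lies in the half-open interval [begin, end).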
+  auto in_interval = [](std::size_t value, const Interval& interval) {
+    return interval.first <= value && value < interval.second;
+  };
+
+  // Begin and end timestamps for the last two turns (unit: number of samples).
+  Interval second_last_turn = {0, 0};
+  Interval last_turn = {0, 0};
+
+  // Initialize map to store turn intervals of each speaker (used to detect self
+  // cross-talk).
+  std::map<std::string, std::unique_ptr<IntervalsVector>> speakers_intervals;
+  for (const std::string& speaker_name : speaker_names_) {
+    // Create an interval vector for this speaker.
+    speakers_intervals.insert(std::make_pair(
+        speaker_name, std::unique_ptr<IntervalsVector>(
+            new IntervalsVector())));
+    LOG(LS_VERBOSE) << "speaker_intervals vector for <" << speaker_name
+                    << "> created (capacity: "
+                    << speakers_intervals[speaker_name]->capacity() << ")";
+  }
+
+  // Parse turns.
+  for (std::size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {
+    const Turn& turn = timing_[turn_index];
+    auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
+    RTC_CHECK(it != audiotrack_readers_.end())
+        << "Audio track reader not created";
+
+    // Begin and end timestamps for the current turn.
+    int offset_samples = millisecond_to_samples(
+        turn.offset, it->second->sample_rate());
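+    // A negative offset makes the current turn overlap the previous one.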
+    std::size_t begin_timestamp = last_turn.second + offset_samples;
+    std::size_t end_timestamp = begin_timestamp + it->second->num_samples();
+    LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp
+                 << "-" << end_timestamp << " samples";
+
+    // The order is invalid if the offset is negative and its absolute value is
+    // larger than the duration of the previous turn.
+    if (offset_samples < 0 && -offset_samples > static_cast<int>(
+        last_turn.second - last_turn.first)) {
+      LOG(LS_ERROR) << "invalid order";
+      return false;
+    }
+
+    // Cross-talk with 3 or more speakers occurs when the beginning of the
+    // current turn falls within both of the two previous turns.
+    if (turn_index > 1 && in_interval(begin_timestamp, last_turn) &&
+        in_interval(begin_timestamp, second_last_turn)) {
+      LOG(LS_ERROR) << "cross-talk with 3+ speakers";
+      return false;
+    }
+
+    // Save speaker turn interval.
+    Interval current_turn = {begin_timestamp, end_timestamp};
+    speakers_intervals[turn.speaker_name]->push_back(current_turn);
+
+    // Update and continue with next turn.
+    second_last_turn = last_turn;
+    last_turn = current_turn;
+  }
+
+  // Detect self cross-talk.
+  for (const std::string& speaker_name : speaker_names_) {
+    LOG(LS_INFO) << "checking self cross-talk for <"
+        << speaker_name << ">";
+    if (DetectSelfCrossTalk(speakers_intervals[speaker_name].get())) {
+      LOG(LS_ERROR) << "Self cross-talk detected";
+      return false;
+    }
+  }
+
+  return true;
+}
+
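+// Returns true if two consecutive turns of the same speaker overlap in time.
+// The intervals are expected to be sorted by begin timestamp.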
+bool MultiEndCall::DetectSelfCrossTalk(IntervalsVector* speaker_intervals)
+    const {
+  Interval previous_interval = speaker_intervals->at(0);
+  LOG(LS_VERBOSE) << "#0: " << previous_interval.first << " "
+      << previous_interval.second;
+  for (std::size_t index = 1; index < speaker_intervals->size(); ++index) {
+    auto interval = speaker_intervals->at(index);
+    LOG(LS_VERBOSE) << "#" << index << ": " << interval.first << " "
+        << interval.second;
+
+    // Check if there is overlap with the previous interval. Since the
+    // intervals are sorted by begin timestamp, checking consecutive pairs is
+    // sufficient to detect any self cross-talk.
+    if (previous_interval.second > interval.first) {
+      return true;
+    }
+
+    // Update and continue with the next interval.
+    previous_interval = interval;
+  }
+  return false;
} |
} // namespace conversational_speech |