Index: webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc |
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc |
index f16aa753fa427a42b7def1298c145a52f3154afd..8fe43aa1dbd290507039a788d42504723dd3fba6 100644 |
--- a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc |
+++ b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc |
@@ -10,8 +10,7 @@ |
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h" |
-#include <utility> |
- |
+#include "webrtc/base/logging.h" |
#include "webrtc/base/pathutils.h" |
namespace webrtc { |
@@ -25,7 +24,7 @@ MultiEndCall::MultiEndCall( |
wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) { |
FindSpeakerNames(); |
CreateAudioTrackReaders(); |
- CheckTiming(); |
+ valid_ = CheckTiming(); |
} |
MultiEndCall::~MultiEndCall() = default; |
@@ -39,10 +38,23 @@ const std::map<std::string, std::unique_ptr<WavReaderInterface>>& |
return audiotrack_readers_; |
} |
+bool MultiEndCall::valid() const { |
+ return valid_; |
+} |
+ |
+std::size_t MultiEndCall::total_duration_samples() const { |
+ return total_duration_samples_; |
+} |
+ |
+const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns() |
+ const { |
+ return speaking_turns_; |
+} |
+ |
void MultiEndCall::FindSpeakerNames() { |
RTC_DCHECK(speaker_names_.empty()); |
for (const Turn& turn : timing_) { |
- speaker_names_.insert(turn.speaker_name); |
+ speaker_names_.emplace(turn.speaker_name); |
} |
} |
@@ -60,14 +72,118 @@ void MultiEndCall::CreateAudioTrackReaders() { |
// Map the audiotrack file name to a new instance of WavReaderInterface. |
std::unique_ptr<WavReaderInterface> wavreader = |
wavreader_abstract_factory_->Create(audiotrack_file_path.pathname()); |
- audiotrack_readers_.insert(std::make_pair( |
- turn.audiotrack_file_name, std::move(wavreader))); |
+ audiotrack_readers_.emplace( |
+ turn.audiotrack_file_name, std::move(wavreader)); |
} |
} |
-void MultiEndCall::CheckTiming() { |
- // TODO(alessiob): use audiotrack lengths and offset to check whether the |
- // timing is valid. |
+bool MultiEndCall::CheckTiming() { |
+ struct Interval { |
+ std::size_t begin; |
hlundin-webrtc
2017/04/06 08:10:04
size_t
Here and below.
AleBzk
2017/04/06 16:42:42
Done.
|
+ std::size_t end; |
+ }; |
+ std::size_t number_of_turns = timing_.size(); |
+ auto millisecond_to_samples = [](int ms, int sr) -> int { |
+ return ms * sr / 1000; |
hlundin-webrtc
2017/04/06 08:10:04
I'd recommend rtc::CheckedDivExact(sr, 1000)
AleBzk
2017/04/06 16:42:42
If I do that, the tool won't work if the sampling
hlundin-webrtc
2017/04/07 10:24:09
Oh, the tool should be able to handle other rates
AleBzk
2017/04/07 11:37:06
Done.
|
+ }; |
+ auto in_interval = [](std::size_t value, const Interval& interval) { |
+ return interval.begin <= value && value < interval.end; |
+ }; |
+ total_duration_samples_ = 0; |
+ |
+ // Begin and end timestamps for the last two turns (unit: number of samples). |
+ Interval second_last_turn = {0, 0}; |
+ Interval last_turn = {0, 0}; |
+ |
+ // Initialize map to store speaking turn indices of each speaker (used to |
+ // detect self cross-talk). |
+ std::map<std::string, std::vector<std::size_t>> speaking_turn_indices; |
+ for (const std::string& speaker_name : speaker_names_) { |
+ speaking_turn_indices.emplace( |
+ std::piecewise_construct, |
+ std::forward_as_tuple(speaker_name), |
+ std::forward_as_tuple()); |
+ } |
+ |
+ // Parse turns. |
+ for (std::size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) { |
+ const Turn& turn = timing_[turn_index]; |
+ auto it = audiotrack_readers_.find(turn.audiotrack_file_name); |
+ RTC_CHECK(it != audiotrack_readers_.end()) |
hlundin-webrtc
2017/04/06 08:10:04
RTC_CHECK_NE
AleBzk
2017/04/06 16:42:42
RTC_CHECK_NE(it, audiotrack_readers_.end()) raises
hlundin-webrtc
2017/04/07 10:24:09
Hmm. Boring. Keep this as is then.
AleBzk
2017/04/07 11:37:06
Acknowledged.
|
+ << "Audio track reader not created"; |
+ |
+ // Begin and end timestamps for the current turn. |
+ int offset_samples = millisecond_to_samples( |
+ turn.offset, it->second->sample_rate()); |
+ std::size_t begin_timestamp = last_turn.end + offset_samples; |
+ std::size_t end_timestamp = begin_timestamp + it->second->num_samples(); |
+    LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp |
+        << "-" << end_timestamp << " samples"; |
+ |
+ // The order is invalid if the offset is negative and its absolute value is |
+    // larger than the duration of the previous turn. |
+ if (offset_samples < 0 && -offset_samples > int( |
hlundin-webrtc
2017/04/06 08:10:04
static_cast<int>(last_turn.end - last_turn.begin)
AleBzk
2017/04/06 16:42:42
Done.
|
+ last_turn.end - last_turn.begin)) { |
+ LOG(LS_ERROR) << "invalid order"; |
+ return false; |
+ } |
+ |
+ // Cross-talk with 3 or more speakers occurs when the beginning of the |
+ // current interval falls in the last two turns. |
+ if (turn_index > 1 && in_interval(begin_timestamp, last_turn) |
+ && in_interval(begin_timestamp, second_last_turn)) { |
+ LOG(LS_ERROR) << "cross-talk with 3+ speakers"; |
+ return false; |
+ } |
+ |
+ // Append turn. |
+ speaking_turns_.emplace_back( |
+ turn.speaker_name, turn.audiotrack_file_name, |
+ begin_timestamp, end_timestamp); |
+ |
+ // Save speaking turn index for self cross-talk detection. |
+ speaking_turn_indices[turn.speaker_name].push_back(turn_index); |
hlundin-webrtc
2017/04/06 08:10:04
You are relying on an implicit assumption that tur
AleBzk
2017/04/06 16:42:42
Done.
|
+ |
+    // Update total duration of the conversational speech. |
+ if (total_duration_samples_ < end_timestamp) |
+ total_duration_samples_ = end_timestamp; |
+ |
+ // Update and continue with next turn. |
+ second_last_turn = last_turn; |
+ last_turn.begin = begin_timestamp; |
+ last_turn.end = end_timestamp; |
+ } |
+ |
+ // Detect self cross-talk. |
+ for (const std::string& speaker_name : speaker_names_) { |
hlundin-webrtc
2017/04/06 08:10:04
The speaking_turn_indices variable is only used fo
AleBzk
2017/04/06 16:42:42
Cool! Happy to learn about std::copy_if and std::a
|
+ LOG(LS_INFO) << "checking self cross-talk for <" |
+ << speaker_name << ">"; |
+ if (DetectSelfCrossTalk(speaking_turn_indices[speaker_name])) { |
hlundin-webrtc
2017/04/06 08:10:04
It is a bit tricky to use the map::[] operator her
AleBzk
2017/04/06 16:42:42
I'll go the way you suggested in your previous com
|
+ LOG(LS_ERROR) << "Self cross-talk detected"; |
+ return false; |
+ } |
+ } |
+ |
+ return true; |
+} |
+ |
+bool MultiEndCall::DetectSelfCrossTalk( |
+ const std::vector<std::size_t>& speaking_turn_indices) const { |
+ // Compare adjacent speaking turn pairs. |
+ for (std::size_t index = 1; index < speaking_turn_indices.size(); ++index) { |
+ const SpeakingTurn& previous_interval = speaking_turns_[ |
+ speaking_turn_indices[index - 1]]; |
+ const SpeakingTurn& interval = speaking_turns_[ |
+ speaking_turn_indices[index]]; |
+ |
+ // Check if there is overlap with the previous interval. |
+ // This is a sufficient condition for self cross-talk since the intervals |
+ // are sorted by begin timestamp. |
+ if (previous_interval.end > interval.begin) { |
+ return true; |
+ } |
+ } |
+ return false; |
} |
} // namespace conversational_speech |