Index: webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
index f16aa753fa427a42b7def1298c145a52f3154afd..ad1d9a0c87e90d9dc4c398bc21d120673379c366 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
@@ -10,8 +10,10 @@
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
-#include <utility>
+#include <algorithm>
+#include <iterator>
+#include "webrtc/base/logging.h"
#include "webrtc/base/pathutils.h"
namespace webrtc {
@@ -25,7 +27,7 @@ MultiEndCall::MultiEndCall(
      wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {
  FindSpeakerNames();
  CreateAudioTrackReaders();
-  CheckTiming();
+  valid_ = CheckTiming();
}
MultiEndCall::~MultiEndCall() = default;
@@ -39,10 +41,23 @@ const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
  return audiotrack_readers_;
}
+bool MultiEndCall::valid() const {
+  return valid_;
+}
+
+size_t MultiEndCall::total_duration_samples() const {
+  return total_duration_samples_;
+}
+
+const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns()
+    const {
+  return speaking_turns_;
+}
+
void MultiEndCall::FindSpeakerNames() {
  RTC_DCHECK(speaker_names_.empty());
  for (const Turn& turn : timing_) {
-    speaker_names_.insert(turn.speaker_name);
+    speaker_names_.emplace(turn.speaker_name);
  }
}
@@ -60,14 +75,119 @@ void MultiEndCall::CreateAudioTrackReaders() {
    // Map the audiotrack file name to a new instance of WavReaderInterface.
    std::unique_ptr<WavReaderInterface> wavreader =
        wavreader_abstract_factory_->Create(audiotrack_file_path.pathname());
-    audiotrack_readers_.insert(std::make_pair(
-        turn.audiotrack_file_name, std::move(wavreader)));
+    audiotrack_readers_.emplace(
+        turn.audiotrack_file_name, std::move(wavreader));
  }
}
-void MultiEndCall::CheckTiming() {
-  // TODO(alessiob): use audiotrack lengths and offset to check whether the
-  // timing is valid.
+bool MultiEndCall::CheckTiming() {
+  struct Interval {
+    size_t begin;
+    size_t end;
+  };
+  size_t number_of_turns = timing_.size();
+  auto millisecond_to_samples = [](int ms, int sr) -> int {
+    // Truncation may happen if the sampling rate is not an integer multiple
+    // of 1000 (e.g., 44100).
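+    // For instance (illustrative values): 15 ms at 44100 Hz is 661.5 samples,
+    // which the integer division below truncates to 661.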
+    return ms * sr / 1000;
+  };
+  auto in_interval = [](size_t value, const Interval& interval) {
+    return interval.begin <= value && value < interval.end;
+  };
+  total_duration_samples_ = 0;
+  speaking_turns_.clear();
+
+  // Begin and end timestamps for the last two turns (unit: number of samples).
+  Interval second_last_turn = {0, 0};
+  Interval last_turn = {0, 0};
+
+  // Initialize map to store speaking turn indices of each speaker (used to
+  // detect self cross-talk).
+  std::map<std::string, std::vector<size_t>> speaking_turn_indices;
+  for (const std::string& speaker_name : speaker_names_) {
+    speaking_turn_indices.emplace(
+        std::piecewise_construct,
+        std::forward_as_tuple(speaker_name),
+        std::forward_as_tuple());
+  }
+
+  // Parse turns.
+  for (size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {
+    const Turn& turn = timing_[turn_index];
+    auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
+    RTC_CHECK(it != audiotrack_readers_.end())
+        << "Audio track reader not created";
+
+    // Begin and end timestamps for the current turn.
+    int offset_samples = millisecond_to_samples(
+        turn.offset, it->second->sample_rate());
+    size_t begin_timestamp = last_turn.end + offset_samples;
+    size_t end_timestamp = begin_timestamp + it->second->num_samples();
+ LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp |
+ << "-" << end_timestamp << " ms"; |
+ |
+ // The order is invalid if the offset is negative and its absolute value is |
+ // larger then the duration of the previous turn. |
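+    // For example (illustrative values): if the previous turn lasted 480
+    // samples, an offset of -500 samples would place the current turn's start
+    // before the previous turn's start, so the order is rejected.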
+    if (offset_samples < 0 && -offset_samples > static_cast<int>(
+        last_turn.end - last_turn.begin)) {
+      LOG(LS_ERROR) << "invalid order";
+      return false;
+    }
+
+    // Cross-talk with 3 or more speakers occurs when the beginning of the
+    // current turn falls within both of the two previous turns.
+    if (turn_index > 1 && in_interval(begin_timestamp, last_turn)
+        && in_interval(begin_timestamp, second_last_turn)) {
+      LOG(LS_ERROR) << "cross-talk with 3+ speakers";
+      return false;
+    }
+
+    // Append turn.
+    speaking_turns_.emplace_back(
+        turn.speaker_name, turn.audiotrack_file_name,
+        begin_timestamp, end_timestamp);
+
+    // Save speaking turn index for self cross-talk detection.
+    RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
+    speaking_turn_indices[turn.speaker_name].push_back(turn_index);
+
+    // Update total duration of the conversational speech.
+    if (total_duration_samples_ < end_timestamp)
+      total_duration_samples_ = end_timestamp;
+
+    // Update and continue with next turn.
+    second_last_turn = last_turn;
+    last_turn.begin = begin_timestamp;
+    last_turn.end = end_timestamp;
+  }
+
+  // Detect self cross-talk.
+  for (const std::string& speaker_name : speaker_names_) {
+    LOG(LS_INFO) << "checking self cross-talk for <"
+        << speaker_name << ">";
+
+    // Copy all turns for this speaker to new vector.
+    std::vector<SpeakingTurn> speaking_turns_for_name;
+    std::copy_if(speaking_turns_.begin(), speaking_turns_.end(),
+                 std::back_inserter(speaking_turns_for_name),
+                 [&speaker_name](const SpeakingTurn& st) {
+                   return st.speaker_name == speaker_name; });
+
+    // Check for overlap between adjacent elements.
+    // This is a sufficient condition for self cross-talk since the intervals
+    // are sorted by begin timestamp.
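+    // E.g., turns [0, 100) and [80, 160) by the same speaker are adjacent in
+    // the sorted copy, and 100 > 80 flags the 20-sample overlap.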
+    auto overlap = std::adjacent_find(
+        speaking_turns_for_name.begin(), speaking_turns_for_name.end(),
+        [](const SpeakingTurn& a, const SpeakingTurn& b) {
+          return a.end > b.begin; });
+
+    if (overlap != speaking_turns_for_name.end()) {
+      LOG(LS_ERROR) << "Self cross-talk detected";
+      return false;
+    }
+  }
+
+  return true;
}
} // namespace conversational_speech