OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h" | 11 #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h" |
12 | 12 |
13 #include <utility> | 13 #include <algorithm> |
| 14 #include <iterator> |
14 | 15 |
| 16 #include "webrtc/base/logging.h" |
15 #include "webrtc/base/pathutils.h" | 17 #include "webrtc/base/pathutils.h" |
16 | 18 |
17 namespace webrtc { | 19 namespace webrtc { |
18 namespace test { | 20 namespace test { |
19 namespace conversational_speech { | 21 namespace conversational_speech { |
20 | 22 |
21 MultiEndCall::MultiEndCall( | 23 MultiEndCall::MultiEndCall( |
22 rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path, | 24 rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path, |
23 std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory) | 25 std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory) |
24 : timing_(timing), audiotracks_path_(audiotracks_path), | 26 : timing_(timing), audiotracks_path_(audiotracks_path), |
25 wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) { | 27 wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) { |
26 FindSpeakerNames(); | 28 FindSpeakerNames(); |
27 CreateAudioTrackReaders(); | 29 CreateAudioTrackReaders(); |
28 CheckTiming(); | 30 valid_ = CheckTiming(); |
29 } | 31 } |
30 | 32 |
31 MultiEndCall::~MultiEndCall() = default; | 33 MultiEndCall::~MultiEndCall() = default; |
32 | 34 |
33 const std::set<std::string>& MultiEndCall::speaker_names() const { | 35 const std::set<std::string>& MultiEndCall::speaker_names() const { |
34 return speaker_names_; | 36 return speaker_names_; |
35 } | 37 } |
36 | 38 |
37 const std::map<std::string, std::unique_ptr<WavReaderInterface>>& | 39 const std::map<std::string, std::unique_ptr<WavReaderInterface>>& |
38 MultiEndCall::audiotrack_readers() const { | 40 MultiEndCall::audiotrack_readers() const { |
39 return audiotrack_readers_; | 41 return audiotrack_readers_; |
40 } | 42 } |
41 | 43 |
| 44 bool MultiEndCall::valid() const { |
| 45 return valid_; |
| 46 } |
| 47 |
| 48 size_t MultiEndCall::total_duration_samples() const { |
| 49 return total_duration_samples_; |
| 50 } |
| 51 |
| 52 const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns() |
| 53 const { |
| 54 return speaking_turns_; |
| 55 } |
| 56 |
42 void MultiEndCall::FindSpeakerNames() { | 57 void MultiEndCall::FindSpeakerNames() { |
43 RTC_DCHECK(speaker_names_.empty()); | 58 RTC_DCHECK(speaker_names_.empty()); |
44 for (const Turn& turn : timing_) { | 59 for (const Turn& turn : timing_) { |
45 speaker_names_.insert(turn.speaker_name); | 60 speaker_names_.emplace(turn.speaker_name); |
46 } | 61 } |
47 } | 62 } |
48 | 63 |
49 void MultiEndCall::CreateAudioTrackReaders() { | 64 void MultiEndCall::CreateAudioTrackReaders() { |
50 RTC_DCHECK(audiotrack_readers_.empty()); | 65 RTC_DCHECK(audiotrack_readers_.empty()); |
51 for (const Turn& turn : timing_) { | 66 for (const Turn& turn : timing_) { |
52 auto it = audiotrack_readers_.find(turn.audiotrack_file_name); | 67 auto it = audiotrack_readers_.find(turn.audiotrack_file_name); |
53 if (it != audiotrack_readers_.end()) | 68 if (it != audiotrack_readers_.end()) |
54 continue; | 69 continue; |
55 | 70 |
56 // Instance Pathname to retrieve the full path to the audiotrack file. | 71 // Instance Pathname to retrieve the full path to the audiotrack file. |
57 const rtc::Pathname audiotrack_file_path( | 72 const rtc::Pathname audiotrack_file_path( |
58 audiotracks_path_, turn.audiotrack_file_name); | 73 audiotracks_path_, turn.audiotrack_file_name); |
59 | 74 |
60 // Map the audiotrack file name to a new instance of WavReaderInterface. | 75 // Map the audiotrack file name to a new instance of WavReaderInterface. |
61 std::unique_ptr<WavReaderInterface> wavreader = | 76 std::unique_ptr<WavReaderInterface> wavreader = |
62 wavreader_abstract_factory_->Create(audiotrack_file_path.pathname()); | 77 wavreader_abstract_factory_->Create(audiotrack_file_path.pathname()); |
63 audiotrack_readers_.insert(std::make_pair( | 78 audiotrack_readers_.emplace( |
64 turn.audiotrack_file_name, std::move(wavreader))); | 79 turn.audiotrack_file_name, std::move(wavreader)); |
65 } | 80 } |
66 } | 81 } |
67 | 82 |
68 void MultiEndCall::CheckTiming() { | 83 bool MultiEndCall::CheckTiming() { |
69 // TODO(alessiob): use audiotrack lengths and offset to check whether the | 84 struct Interval { |
70 // timing is valid. | 85 size_t begin; |
| 86 size_t end; |
| 87 }; |
| 88 size_t number_of_turns = timing_.size(); |
| 89 auto millisecond_to_samples = [](int ms, int sr) -> int { |
| 90 // Truncation may happen if the sampling rate is not an integer multiple |
| 91 // of 1000 (e.g., 44100). |
| 92 return ms * sr / 1000; |
| 93 }; |
| 94 auto in_interval = [](size_t value, const Interval& interval) { |
| 95 return interval.begin <= value && value < interval.end; |
| 96 }; |
| 97 total_duration_samples_ = 0; |
| 98 speaking_turns_.clear(); |
| 99 |
| 100 // Begin and end timestamps for the last two turns (unit: number of samples). |
| 101 Interval second_last_turn = {0, 0}; |
| 102 Interval last_turn = {0, 0}; |
| 103 |
| 104 // Initialize map to store speaking turn indices of each speaker (used to |
| 105 // detect self cross-talk). |
| 106 std::map<std::string, std::vector<size_t>> speaking_turn_indices; |
| 107 for (const std::string& speaker_name : speaker_names_) { |
| 108 speaking_turn_indices.emplace( |
| 109 std::piecewise_construct, |
| 110 std::forward_as_tuple(speaker_name), |
| 111 std::forward_as_tuple()); |
| 112 } |
| 113 |
| 114 // Parse turns. |
| 115 for (size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) { |
| 116 const Turn& turn = timing_[turn_index]; |
| 117 auto it = audiotrack_readers_.find(turn.audiotrack_file_name); |
| 118 RTC_CHECK(it != audiotrack_readers_.end()) |
| 119 << "Audio track reader not created"; |
| 120 |
| 121 // Begin and end timestamps for the current turn. |
| 122 int offset_samples = millisecond_to_samples( |
| 123 turn.offset, it->second->sample_rate()); |
| 124 size_t begin_timestamp = last_turn.end + offset_samples; |
| 125 size_t end_timestamp = begin_timestamp + it->second->num_samples(); |
| 126 LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp |
| 127 << "-" << end_timestamp << " ms"; |
| 128 |
| 129 // The order is invalid if the offset is negative and its absolute value is |
| 130 // larger than the duration of the previous turn. |
| 131 if (offset_samples < 0 && -offset_samples > static_cast<int>( |
| 132 last_turn.end - last_turn.begin)) { |
| 133 LOG(LS_ERROR) << "invalid order"; |
| 134 return false; |
| 135 } |
| 136 |
| 137 // Cross-talk with 3 or more speakers occurs when the beginning of the |
| 138 // current interval falls in the last two turns. |
| 139 if (turn_index > 1 && in_interval(begin_timestamp, last_turn) |
| 140 && in_interval(begin_timestamp, second_last_turn)) { |
| 141 LOG(LS_ERROR) << "cross-talk with 3+ speakers"; |
| 142 return false; |
| 143 } |
| 144 |
| 145 // Append turn. |
| 146 speaking_turns_.emplace_back( |
| 147 turn.speaker_name, turn.audiotrack_file_name, |
| 148 begin_timestamp, end_timestamp); |
| 149 |
| 150 // Save speaking turn index for self cross-talk detection. |
| 151 RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1); |
| 152 speaking_turn_indices[turn.speaker_name].push_back(turn_index); |
| 153 |
| 154 // Update total duration of the conversational speech. |
| 155 if (total_duration_samples_ < end_timestamp) |
| 156 total_duration_samples_ = end_timestamp; |
| 157 |
| 158 // Update and continue with next turn. |
| 159 second_last_turn = last_turn; |
| 160 last_turn.begin = begin_timestamp; |
| 161 last_turn.end = end_timestamp; |
| 162 } |
| 163 |
| 164 // Detect self cross-talk. |
| 165 for (const std::string& speaker_name : speaker_names_) { |
| 166 LOG(LS_INFO) << "checking self cross-talk for <" |
| 167 << speaker_name << ">"; |
| 168 |
| 169 // Copy all turns for this speaker to new vector. |
| 170 std::vector<SpeakingTurn> speaking_turns_for_name; |
| 171 std::copy_if(speaking_turns_.begin(), speaking_turns_.end(), |
| 172 std::back_inserter(speaking_turns_for_name), |
| 173 [&speaker_name](const SpeakingTurn& st){ |
| 174 return st.speaker_name == speaker_name; }); |
| 175 |
| 176 // Check for overlap between adjacent elements. |
| 177 // This is a sufficient condition for self cross-talk since the intervals |
| 178 // are sorted by begin timestamp. |
| 179 auto overlap = std::adjacent_find( |
| 180 speaking_turns_for_name.begin(), speaking_turns_for_name.end(), |
| 181 [](const SpeakingTurn& a, const SpeakingTurn& b) { |
| 182 return a.end > b.begin; }); |
| 183 |
| 184 if (overlap != speaking_turns_for_name.end()) { |
| 185 LOG(LS_ERROR) << "Self cross-talk detected"; |
| 186 return false; |
| 187 } |
| 188 } |
| 189 |
| 190 return true; |
71 } | 191 } |
72 | 192 |
73 } // namespace conversational_speech | 193 } // namespace conversational_speech |
74 } // namespace test | 194 } // namespace test |
75 } // namespace webrtc | 195 } // namespace webrtc |
OLD | NEW |