Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h" | 11 #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h" |
| 12 | 12 |
| 13 #include <utility> | 13 #include "webrtc/base/logging.h" |
| 14 | |
| 15 #include "webrtc/base/pathutils.h" | 14 #include "webrtc/base/pathutils.h" |
| 16 | 15 |
| 17 namespace webrtc { | 16 namespace webrtc { |
| 18 namespace test { | 17 namespace test { |
| 19 namespace conversational_speech { | 18 namespace conversational_speech { |
| 20 | 19 |
| 21 MultiEndCall::MultiEndCall( | 20 MultiEndCall::MultiEndCall( |
| 22 rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path, | 21 rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path, |
| 23 std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory) | 22 std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory) |
| 24 : timing_(timing), audiotracks_path_(audiotracks_path), | 23 : timing_(timing), audiotracks_path_(audiotracks_path), |
| 25 wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) { | 24 wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) { |
| 26 FindSpeakerNames(); | 25 FindSpeakerNames(); |
| 27 CreateAudioTrackReaders(); | 26 CreateAudioTrackReaders(); |
| 28 CheckTiming(); | 27 valid_ = CheckTiming(); |
| 29 } | 28 } |
| 30 | 29 |
| 31 MultiEndCall::~MultiEndCall() = default; | 30 MultiEndCall::~MultiEndCall() = default; |
| 32 | 31 |
| 33 const std::set<std::string>& MultiEndCall::speaker_names() const { | 32 const std::set<std::string>& MultiEndCall::speaker_names() const { |
| 34 return speaker_names_; | 33 return speaker_names_; |
| 35 } | 34 } |
| 36 | 35 |
| 37 const std::map<std::string, std::unique_ptr<WavReaderInterface>>& | 36 const std::map<std::string, std::unique_ptr<WavReaderInterface>>& |
| 38 MultiEndCall::audiotrack_readers() const { | 37 MultiEndCall::audiotrack_readers() const { |
| 39 return audiotrack_readers_; | 38 return audiotrack_readers_; |
| 40 } | 39 } |
| 41 | 40 |
| 41 bool MultiEndCall::valid() const { | |
| 42 return valid_; | |
| 43 } | |
| 44 | |
| 45 std::size_t MultiEndCall::total_duration_samples() const { | |
| 46 return total_duration_samples_; | |
| 47 } | |
| 48 | |
| 49 const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns() | |
| 50 const { | |
| 51 return speaking_turns_; | |
| 52 } | |
| 53 | |
| 42 void MultiEndCall::FindSpeakerNames() { | 54 void MultiEndCall::FindSpeakerNames() { |
| 43 RTC_DCHECK(speaker_names_.empty()); | 55 RTC_DCHECK(speaker_names_.empty()); |
| 44 for (const Turn& turn : timing_) { | 56 for (const Turn& turn : timing_) { |
| 45 speaker_names_.insert(turn.speaker_name); | 57 speaker_names_.emplace(turn.speaker_name); |
| 46 } | 58 } |
| 47 } | 59 } |
| 48 | 60 |
| 49 void MultiEndCall::CreateAudioTrackReaders() { | 61 void MultiEndCall::CreateAudioTrackReaders() { |
| 50 RTC_DCHECK(audiotrack_readers_.empty()); | 62 RTC_DCHECK(audiotrack_readers_.empty()); |
| 51 for (const Turn& turn : timing_) { | 63 for (const Turn& turn : timing_) { |
| 52 auto it = audiotrack_readers_.find(turn.audiotrack_file_name); | 64 auto it = audiotrack_readers_.find(turn.audiotrack_file_name); |
| 53 if (it != audiotrack_readers_.end()) | 65 if (it != audiotrack_readers_.end()) |
| 54 continue; | 66 continue; |
| 55 | 67 |
| 56 // Instance Pathname to retrieve the full path to the audiotrack file. | 68 // Instance Pathname to retrieve the full path to the audiotrack file. |
| 57 const rtc::Pathname audiotrack_file_path( | 69 const rtc::Pathname audiotrack_file_path( |
| 58 audiotracks_path_, turn.audiotrack_file_name); | 70 audiotracks_path_, turn.audiotrack_file_name); |
| 59 | 71 |
| 60 // Map the audiotrack file name to a new instance of WavReaderInterface. | 72 // Map the audiotrack file name to a new instance of WavReaderInterface. |
| 61 std::unique_ptr<WavReaderInterface> wavreader = | 73 std::unique_ptr<WavReaderInterface> wavreader = |
| 62 wavreader_abstract_factory_->Create(audiotrack_file_path.pathname()); | 74 wavreader_abstract_factory_->Create(audiotrack_file_path.pathname()); |
| 63 audiotrack_readers_.insert(std::make_pair( | 75 audiotrack_readers_.emplace( |
| 64 turn.audiotrack_file_name, std::move(wavreader))); | 76 turn.audiotrack_file_name, std::move(wavreader)); |
| 65 } | 77 } |
| 66 } | 78 } |
| 67 | 79 |
| 68 void MultiEndCall::CheckTiming() { | 80 bool MultiEndCall::CheckTiming() { |
| 69 // TODO(alessiob): use audiotrack lengths and offset to check whether the | 81 struct Interval { |
| 70 // timing is valid. | 82 std::size_t begin; |
|
hlundin-webrtc
2017/04/06 08:10:04
size_t
Here and below.
AleBzk
2017/04/06 16:42:42
Done.
| |
| 83 std::size_t end; | |
| 84 }; | |
| 85 std::size_t number_of_turns = timing_.size(); | |
| 86 auto millisecond_to_samples = [](int ms, int sr) -> int { | |
| 87 return ms * sr / 1000; | |
|
hlundin-webrtc
2017/04/06 08:10:04
I'd recommend rtc::CheckedDivExact(sr, 1000)
AleBzk
2017/04/06 16:42:42
If I do that, the tool won't work if the sampling
hlundin-webrtc
2017/04/07 10:24:09
Oh, the tool should be able to handle other rates
AleBzk
2017/04/07 11:37:06
Done.
| |
| 88 }; | |
| 89 auto in_interval = [](std::size_t value, const Interval& interval) { | |
| 90 return interval.begin <= value && value < interval.end; | |
| 91 }; | |
| 92 total_duration_samples_ = 0; | |
| 93 | |
| 94 // Begin and end timestamps for the last two turns (unit: number of samples). | |
| 95 Interval second_last_turn = {0, 0}; | |
| 96 Interval last_turn = {0, 0}; | |
| 97 | |
| 98 // Initialize map to store speaking turn indices of each speaker (used to | |
| 99 // detect self cross-talk). | |
| 100 std::map<std::string, std::vector<std::size_t>> speaking_turn_indices; | |
| 101 for (const std::string& speaker_name : speaker_names_) { | |
| 102 speaking_turn_indices.emplace( | |
| 103 std::piecewise_construct, | |
| 104 std::forward_as_tuple(speaker_name), | |
| 105 std::forward_as_tuple()); | |
| 106 } | |
| 107 | |
| 108 // Parse turns. | |
| 109 for (std::size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) { | |
| 110 const Turn& turn = timing_[turn_index]; | |
| 111 auto it = audiotrack_readers_.find(turn.audiotrack_file_name); | |
| 112 RTC_CHECK(it != audiotrack_readers_.end()) | |
|
hlundin-webrtc
2017/04/06 08:10:04
RTC_CHECK_NE
AleBzk
2017/04/06 16:42:42
RTC_CHECK_NE(it, audiotrack_readers_.end()) raises
hlundin-webrtc
2017/04/07 10:24:09
Hmm. Boring. Keep this as is then.
AleBzk
2017/04/07 11:37:06
Acknowledged.
| |
| 113 << "Audio track reader not created"; | |
| 114 | |
| 115 // Begin and end timestamps for the current turn. | |
| 116 int offset_samples = millisecond_to_samples( | |
| 117 turn.offset, it->second->sample_rate()); | |
| 118 std::size_t begin_timestamp = last_turn.end + offset_samples; | |
| 119 std::size_t end_timestamp = begin_timestamp + it->second->num_samples(); | |
| 120 LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp | |
| 121 << "-" << end_timestamp << " ms"; | |
| 122 | |
| 123 // The order is invalid if the offset is negative and its absolute value is | |
| 124 // larger than the duration of the previous turn. | |
| 125 if (offset_samples < 0 && -offset_samples > int( | |
|
hlundin-webrtc
2017/04/06 08:10:04
static_cast<int>(last_turn.end - last_turn.begin)
AleBzk
2017/04/06 16:42:42
Done.
| |
| 126 last_turn.end - last_turn.begin)) { | |
| 127 LOG(LS_ERROR) << "invalid order"; | |
| 128 return false; | |
| 129 } | |
| 130 | |
| 131 // Cross-talk with 3 or more speakers occurs when the beginning of the | |
| 132 // current interval falls in the last two turns. | |
| 133 if (turn_index > 1 && in_interval(begin_timestamp, last_turn) | |
| 134 && in_interval(begin_timestamp, second_last_turn)) { | |
| 135 LOG(LS_ERROR) << "cross-talk with 3+ speakers"; | |
| 136 return false; | |
| 137 } | |
| 138 | |
| 139 // Append turn. | |
| 140 speaking_turns_.emplace_back( | |
| 141 turn.speaker_name, turn.audiotrack_file_name, | |
| 142 begin_timestamp, end_timestamp); | |
| 143 | |
| 144 // Save speaking turn index for self cross-talk detection. | |
| 145 speaking_turn_indices[turn.speaker_name].push_back(turn_index); | |
|
hlundin-webrtc
2017/04/06 08:10:04
You are relying on an implicit assumption that tur
AleBzk
2017/04/06 16:42:42
Done.
| |
| 146 | |
| 147 // Update total duration of the conversational speech. | |
| 148 if (total_duration_samples_ < end_timestamp) | |
| 149 total_duration_samples_ = end_timestamp; | |
| 150 | |
| 151 // Update and continue with next turn. | |
| 152 second_last_turn = last_turn; | |
| 153 last_turn.begin = begin_timestamp; | |
| 154 last_turn.end = end_timestamp; | |
| 155 } | |
| 156 | |
| 157 // Detect self cross-talk. | |
| 158 for (const std::string& speaker_name : speaker_names_) { | |
|
hlundin-webrtc
2017/04/06 08:10:04
The speaking_turn_indices variable is only used fo
AleBzk
2017/04/06 16:42:42
Cool! Happy to learn about std::copy_if and std::a
| |
| 159 LOG(LS_INFO) << "checking self cross-talk for <" | |
| 160 << speaker_name << ">"; | |
| 161 if (DetectSelfCrossTalk(speaking_turn_indices[speaker_name])) { | |
|
hlundin-webrtc
2017/04/06 08:10:04
It is a bit tricky to use the map::[] operator her
AleBzk
2017/04/06 16:42:42
I'll go the way you suggested in your previous com
| |
| 162 LOG(LS_ERROR) << "Self cross-talk detected"; | |
| 163 return false; | |
| 164 } | |
| 165 } | |
| 166 | |
| 167 return true; | |
| 168 } | |
| 169 | |
| 170 bool MultiEndCall::DetectSelfCrossTalk( | |
| 171 const std::vector<std::size_t>& speaking_turn_indices) const { | |
| 172 // Compare adjacent speaking turn pairs. | |
| 173 for (std::size_t index = 1; index < speaking_turn_indices.size(); ++index) { | |
| 174 const SpeakingTurn& previous_interval = speaking_turns_[ | |
| 175 speaking_turn_indices[index - 1]]; | |
| 176 const SpeakingTurn& interval = speaking_turns_[ | |
| 177 speaking_turn_indices[index]]; | |
| 178 | |
| 179 // Check if there is overlap with the previous interval. | |
| 180 // This is a sufficient condition for self cross-talk since the intervals | |
| 181 // are sorted by begin timestamp. | |
| 182 if (previous_interval.end > interval.begin) { | |
| 183 return true; | |
| 184 } | |
| 185 } | |
| 186 return false; | |
| 71 } | 187 } |
| 72 | 188 |
| 73 } // namespace conversational_speech | 189 } // namespace conversational_speech |
| 74 } // namespace test | 190 } // namespace test |
| 75 } // namespace webrtc | 191 } // namespace webrtc |
| OLD | NEW |