webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc - Issue 2781573002: Conversational Speech tool, MultiEndCall::CheckTiming() and tests

Side by Side Diff: webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc

Issue 2781573002: Conversational Speech tool, MultiEndCall::CheckTiming() and tests (Closed)

Patch Set: final refactoring Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h ('K') | « webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"	11 #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"

12	12

13 #include <utility>	13 #include "webrtc/base/logging.h"

14

15 #include "webrtc/base/pathutils.h"	14 #include "webrtc/base/pathutils.h"

16	15

17 namespace webrtc {	16 namespace webrtc {

18 namespace test {	17 namespace test {

19 namespace conversational_speech {	18 namespace conversational_speech {

20	19

21 MultiEndCall::MultiEndCall(	20 MultiEndCall::MultiEndCall(

22 rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path,	21 rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path,

23 std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory)	22 std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory)

24 : timing_(timing), audiotracks_path_(audiotracks_path),	23 : timing_(timing), audiotracks_path_(audiotracks_path),

25 wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {	24 wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {

26 FindSpeakerNames();	25 FindSpeakerNames();

27 CreateAudioTrackReaders();	26 CreateAudioTrackReaders();

28 CheckTiming();	27 valid_ = CheckTiming();

29 }	28 }

30	29

31 MultiEndCall::~MultiEndCall() = default;	30 MultiEndCall::~MultiEndCall() = default;

32	31

33 const std::set<std::string>& MultiEndCall::speaker_names() const {	32 const std::set<std::string>& MultiEndCall::speaker_names() const {

34 return speaker_names_;	33 return speaker_names_;

35 }	34 }

36	35

37 const std::map<std::string, std::unique_ptr<WavReaderInterface>>&	36 const std::map<std::string, std::unique_ptr<WavReaderInterface>>&

38 MultiEndCall::audiotrack_readers() const {	37 MultiEndCall::audiotrack_readers() const {

39 return audiotrack_readers_;	38 return audiotrack_readers_;

40 }	39 }

41	40

	41 bool MultiEndCall::valid() const {

	42 return valid_;

	43 }

	44

	45 std::size_t MultiEndCall::total_duration_samples() const {

	46 return total_duration_samples_;

	47 }

	48

	49 const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns()

	50 const {

	51 return speaking_turns_;

	52 }

	53

42 void MultiEndCall::FindSpeakerNames() {	54 void MultiEndCall::FindSpeakerNames() {

43 RTC_DCHECK(speaker_names_.empty());	55 RTC_DCHECK(speaker_names_.empty());

44 for (const Turn& turn : timing_) {	56 for (const Turn& turn : timing_) {

45 speaker_names_.insert(turn.speaker_name);	57 speaker_names_.emplace(turn.speaker_name);

46 }	58 }

47 }	59 }

48	60

49 void MultiEndCall::CreateAudioTrackReaders() {	61 void MultiEndCall::CreateAudioTrackReaders() {

50 RTC_DCHECK(audiotrack_readers_.empty());	62 RTC_DCHECK(audiotrack_readers_.empty());

51 for (const Turn& turn : timing_) {	63 for (const Turn& turn : timing_) {

52 auto it = audiotrack_readers_.find(turn.audiotrack_file_name);	64 auto it = audiotrack_readers_.find(turn.audiotrack_file_name);

53 if (it != audiotrack_readers_.end())	65 if (it != audiotrack_readers_.end())

54 continue;	66 continue;

55	67

56 // Instance Pathname to retrieve the full path to the audiotrack file.	68 // Instance Pathname to retrieve the full path to the audiotrack file.

57 const rtc::Pathname audiotrack_file_path(	69 const rtc::Pathname audiotrack_file_path(

58 audiotracks_path_, turn.audiotrack_file_name);	70 audiotracks_path_, turn.audiotrack_file_name);

59	71

60 // Map the audiotrack file name to a new instance of WavReaderInterface.	72 // Map the audiotrack file name to a new instance of WavReaderInterface.

61 std::unique_ptr<WavReaderInterface> wavreader =	73 std::unique_ptr<WavReaderInterface> wavreader =

62 wavreader_abstract_factory_->Create(audiotrack_file_path.pathname());	74 wavreader_abstract_factory_->Create(audiotrack_file_path.pathname());

63 audiotrack_readers_.insert(std::make_pair(	75 audiotrack_readers_.emplace(

64 turn.audiotrack_file_name, std::move(wavreader)));	76 turn.audiotrack_file_name, std::move(wavreader));

65 }	77 }

66 }	78 }

67	79

68 void MultiEndCall::CheckTiming() {	80 bool MultiEndCall::CheckTiming() {

69 // TODO(alessiob): use audiotrack lengths and offset to check whether the	81 struct Interval {

70 // timing is valid.	82 std::size_t begin;
	hlundin-webrtc 2017/04/06 08:10:04 size_t Here and below. size_t Here and below. AleBzk 2017/04/06 16:42:42 Done. Show quoted text On 2017/04/06 08:10:04, hlundin-webrtc wrote: > size_t > Here and below. Done.
	83 std::size_t end;

	84 };

	85 std::size_t number_of_turns = timing_.size();

	86 auto millisecond_to_samples = [](int ms, int sr) -> int {

	87 return ms * sr / 1000;
	hlundin-webrtc 2017/04/06 08:10:04 I'd recommend rtc::CheckedDivExact(sr, 1000) I'd recommend rtc::CheckedDivExact(sr, 1000) AleBzk 2017/04/06 16:42:42 If I do that, the tool won't work if the sampling Show quoted text On 2017/04/06 08:10:04, hlundin-webrtc wrote: > I'd recommend rtc::CheckedDivExact(sr, 1000) If I do that, the tool won't work if the sampling rate is not an integer multiple of 1k (e.g., it'd crash with 22050 and 44100 Hz). Ceiling, floor or rounding are all fine here; so implicit casting is ok. WDYT? hlundin-webrtc 2017/04/07 10:24:09 Oh, the tool should be able to handle other rates Show quoted text On 2017/04/06 16:42:42, AleBzk wrote: > On 2017/04/06 08:10:04, hlundin-webrtc wrote: > > I'd recommend rtc::CheckedDivExact(sr, 1000) > > If I do that, the tool won't work if the sampling rate is not an integer > multiple of 1k (e.g., it'd crash with 22050 and 44100 Hz). Ceiling, floor or > rounding are all fine here; so implicit casting is ok. > > WDYT? Oh, the tool should be able to handle other rates than 8, 16, 32, 48? That changes everything. Keep this as is, then, if you are convinced that the truncation is fine. Please, add a comment that truncation may happen. AleBzk 2017/04/07 11:37:06 Done. Show quoted text On 2017/04/07 10:24:09, hlundin-webrtc wrote: > On 2017/04/06 16:42:42, AleBzk wrote: > > On 2017/04/06 08:10:04, hlundin-webrtc wrote: > > > I'd recommend rtc::CheckedDivExact(sr, 1000) > > > > If I do that, the tool won't work if the sampling rate is not an integer > > multiple of 1k (e.g., it'd crash with 22050 and 44100 Hz). Ceiling, floor or > > rounding are all fine here; so implicit casting is ok. > > > > WDYT? > > Oh, the tool should be able to handle other rates than 8, 16, 32, 48? That > changes everything. Keep this as is, then, if you are convinced that the > truncation is fine. Please, add a comment that truncation may happen. Done.
	88 };

	89 auto in_interval = [](std::size_t value, const Interval& interval) {

	90 return interval.begin <= value && value < interval.end;

	91 };

	92 total_duration_samples_ = 0;

	93

	94 // Begin and end timestamps for the last two turns (unit: number of samples).

	95 Interval second_last_turn = {0, 0};

	96 Interval last_turn = {0, 0};

	97

	98 // Initialize map to store speaking turn indices of each speaker (used to

	99 // detect self cross-talk).

	100 std::map<std::string, std::vector<std::size_t>> speaking_turn_indices;

	101 for (const std::string& speaker_name : speaker_names_) {

	102 speaking_turn_indices.emplace(

	103 std::piecewise_construct,

	104 std::forward_as_tuple(speaker_name),

	105 std::forward_as_tuple());

	106 }

	107

	108 // Parse turns.

	109 for (std::size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {

	110 const Turn& turn = timing_[turn_index];

	111 auto it = audiotrack_readers_.find(turn.audiotrack_file_name);

	112 RTC_CHECK(it != audiotrack_readers_.end())
	hlundin-webrtc 2017/04/06 08:10:04 RTC_CHECK_NE RTC_CHECK_NE AleBzk 2017/04/06 16:42:42 RTC_CHECK_NE(it, audiotrack_readers_.end()) raises Show quoted text On 2017/04/06 08:10:04, hlundin-webrtc wrote: > RTC_CHECK_NE RTC_CHECK_NE(it, audiotrack_readers_.end()) raises tons of compiling errors. I guess that can be fixed by casting, but IMHO I'd find that less readable. hlundin-webrtc 2017/04/07 10:24:09 Hmm. Boring. Keep this as is then. Show quoted text On 2017/04/06 16:42:42, AleBzk wrote: > On 2017/04/06 08:10:04, hlundin-webrtc wrote: > > RTC_CHECK_NE > > RTC_CHECK_NE(it, audiotrack_readers_.end()) raises tons of compiling errors. I > guess that can be fixed by casting, but IMHO I'd find that less readable. Hmm. Boring. Keep this as is then. AleBzk 2017/04/07 11:37:06 Acknowledged. Show quoted text On 2017/04/07 10:24:09, hlundin-webrtc wrote: > On 2017/04/06 16:42:42, AleBzk wrote: > > On 2017/04/06 08:10:04, hlundin-webrtc wrote: > > > RTC_CHECK_NE > > > > RTC_CHECK_NE(it, audiotrack_readers_.end()) raises tons of compiling errors. I > > guess that can be fixed by casting, but IMHO I'd find that less readable. > > Hmm. Boring. Keep this as is then. Acknowledged.
	113 << "Audio track reader not created";

	114

	115 // Begin and end timestamps for the current turn.

	116 int offset_samples = millisecond_to_samples(

	117 turn.offset, it->second->sample_rate());

	118 std::size_t begin_timestamp = last_turn.end + offset_samples;

	119 std::size_t end_timestamp = begin_timestamp + it->second->num_samples();

	120 LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp

	121 << "-" << end_timestamp << " ms";

	122

	123 // The order is invalid if the offset is negative and its absolute value is

	124 // larger than the duration of the previous turn.

	125 if (offset_samples < 0 && -offset_samples > int(
	hlundin-webrtc 2017/04/06 08:10:04 static_cast<int>(last_turn.end - last_turn.begin) static_cast<int>(last_turn.end - last_turn.begin) AleBzk 2017/04/06 16:42:42 Done. Show quoted text On 2017/04/06 08:10:04, hlundin-webrtc wrote: > static_cast<int>(last_turn.end - last_turn.begin) Done.
	126 last_turn.end - last_turn.begin)) {

	127 LOG(LS_ERROR) << "invalid order";

	128 return false;

	129 }

	130

	131 // Cross-talk with 3 or more speakers occurs when the beginning of the

	132 // current interval falls in the last two turns.

	133 if (turn_index > 1 && in_interval(begin_timestamp, last_turn)

	134 && in_interval(begin_timestamp, second_last_turn)) {

	135 LOG(LS_ERROR) << "cross-talk with 3+ speakers";

	136 return false;

	137 }

	138

	139 // Append turn.

	140 speaking_turns_.emplace_back(

	141 turn.speaker_name, turn.audiotrack_file_name,

	142 begin_timestamp, end_timestamp);

	143

	144 // Save speaking turn index for self cross-talk detection.

	145 speaking_turn_indices[turn.speaker_name].push_back(turn_index);
	hlundin-webrtc 2017/04/06 08:10:04 You are relying on an implicit assumption that tur You are relying on an implicit assumption that turn_index is equal to the current length of speaking_turns_, right? DCHECK that. AleBzk 2017/04/06 16:42:42 Done. Show quoted text On 2017/04/06 08:10:04, hlundin-webrtc wrote: > You are relying on an implicit assumption that turn_index is equal to the > current length of speaking_turns_, right? DCHECK that. Done.
	146

	147 // Update total duration of the conversational speech.

	148 if (total_duration_samples_ < end_timestamp)

	149 total_duration_samples_ = end_timestamp;

	150

	151 // Update and continue with next turn.

	152 second_last_turn = last_turn;

	153 last_turn.begin = begin_timestamp;

	154 last_turn.end = end_timestamp;

	155 }

	156

	157 // Detect self cross-talk.

	158 for (const std::string& speaker_name : speaker_names_) {
	hlundin-webrtc 2017/04/06 08:10:04 The speaking_turn_indices variable is only used fo The speaking_turn_indices variable is only used for detecting self cross-talk. I think you should be able to do that without speaking_turn_indices. for (const std::string& speaker_name : speaker_names_) { std::vector<SpeakingTurn> speaking_turns_for_name; // Copy all turns for this speaker to new vector. std::copy_if(speaking_turns_.begin(), speaking_turns_.end(), speaking_turns_for_name.begin(), [&speaker_name](const SpeakingTurn& st){ return st.speaker_name == speaker_name; }); // Check for overlap between adjacent elements. auto overlap = std::adjacent_find(speaking_turns_for_name.begin(), speaking_turns_for_name.end(), [](const SpeakingTurn& a, const SpeakingTurn& b) { return a.end > b.begin; }); if (overlap != speaking_turns_for_name.end()) { LOG()...; return false; } } AleBzk 2017/04/06 16:42:42 Cool! Happy to learn about std::copy_if and std::a Show quoted text On 2017/04/06 08:10:04, hlundin-webrtc wrote: > The speaking_turn_indices variable is only used for detecting self cross-talk. I > think you should be able to do that without speaking_turn_indices. > > for (const std::string& speaker_name : speaker_names_) { > std::vector<SpeakingTurn> speaking_turns_for_name; > // Copy all turns for this speaker to new vector. > std::copy_if(speaking_turns_.begin(), speaking_turns_.end(), > speaking_turns_for_name.begin(), > [&speaker_name](const SpeakingTurn& st){ > return st.speaker_name == speaker_name; }); > // Check for overlap between adjacent elements. > auto overlap = std::adjacent_find(speaking_turns_for_name.begin(), > speaking_turns_for_name.end(), > [](const SpeakingTurn& a, const SpeakingTurn& b) { > return a.end > b.begin; }); > if (overlap != speaking_turns_for_name.end()) { > LOG()...; > return false; > } > } Cool! Happy to learn about std::copy_if and std::adjacent_find. And great usage of lambdas. Thanks! 
I did something similar initially, but then I preferred to avoid waste of memory. That's why I eventually went for speaking_turn_indices (more lightweight). Since this tool will never handle millions of speaking turns, I will stick to your snippet (more readable).
	159 LOG(LS_INFO) << "checking self cross-talk for <"

	160 << speaker_name << ">";

	161 if (DetectSelfCrossTalk(speaking_turn_indices[speaker_name])) {
	hlundin-webrtc 2017/04/06 08:10:04 It is a bit tricky to use the map::[] operator her It is a bit tricky to use the map::[] operator here. In case something went wrong previously, and speaking_turn_indices contains no entry with key speaker_name, the [] operator will create such an entry. I'd rather see the code crash and burn in that case. Either you can use the map::at() method, but that is defined to throw an exception if the entry is not found; we don't use exceptions. Or, you can use the map::find() method, and DCHECK you didn't get past-the-end iterator as answer. AleBzk 2017/04/06 16:42:42 I'll go the way you suggested in your previous com Show quoted text On 2017/04/06 08:10:04, hlundin-webrtc wrote: > It is a bit tricky to use the map::[] operator here. In case something went > wrong previously, and speaking_turn_indices contains no entry with key > speaker_name, the [] operator will create such an entry. I'd rather see the code > crash and burn in that case. > > Either you can use the map::at() method, but that is defined to throw an > exception if the entry is not found; we don't use exceptions. Or, you can use > the map::find() method, and DCHECK you didn't get past-the-end iterator as > answer. I'll go the way you suggested in your previous comment, but thanks anyway for this comment.
	162 LOG(LS_ERROR) << "Self cross-talk detected";

	163 return false;

	164 }

	165 }

	166

	167 return true;

	168 }

	169

	170 bool MultiEndCall::DetectSelfCrossTalk(

	171 const std::vector<std::size_t>& speaking_turn_indices) const {

	172 // Compare adjacent speaking turn pairs.

	173 for (std::size_t index = 1; index < speaking_turn_indices.size(); ++index) {

	174 const SpeakingTurn& previous_interval = speaking_turns_[

	175 speaking_turn_indices[index - 1]];

	176 const SpeakingTurn& interval = speaking_turns_[

	177 speaking_turn_indices[index]];

	178

	179 // Check if there is overlap with the previous interval.

	180 // This is a sufficient condition for self cross-talk since the intervals

	181 // are sorted by begin timestamp.

	182 if (previous_interval.end > interval.begin) {

	183 return true;

	184 }

	185 }

	186 return false;

71 }	187 }

72	188

73 } // namespace conversational_speech	189 } // namespace conversational_speech

74 } // namespace test	190 } // namespace test

75 } // namespace webrtc	191 } // namespace webrtc

OLD	NEW