| Index: webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
|
| diff --git a/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc b/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
|
| index 59454d9d47d26c864b5b1350c0c0ddde2b5f7d58..406d95cf211ebe3ba55eb343b72854555574dfec 100644
|
| --- a/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
|
| +++ b/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
|
| @@ -8,9 +8,36 @@
|
| * be found in the AUTHORS file in the root of the source tree.
|
| */
|
|
|
| +// This file consists of unit tests for webrtc::test::conversational_speech
|
| +// members. Some of them focus on accepting or rejecting different
|
| +// conversational speech setups. A setup is defined by a set of audio tracks and
|
| +// timing information.
|
| +// The docstring at the beginning of each TEST_F(ConversationalSpeechTest,
|
| +// MultiEndCallSetup*) function looks like the drawing below and indicates which
|
| +// setup is tested.
|
| +//
|
| +// Accept:
|
| +// A 0****.....
|
| +// B .....1****
|
| +//
|
| +// The drawing indicates the following:
|
| +// - the illustrated setup should be accepted,
|
| +// - there are two speakers (namely, A and B),
|
| +// - A is the first speaker, B is the second one,
|
| +// - each character after the speaker's letter indicates a time unit (e.g., 100
|
| +// ms),
|
| +// - "*" indicates speaking, "." listening,
|
| +// - numbers indicate the turn index in std::vector<Turn>.
|
| +//
|
| +// Note that the same speaker can appear in multiple lines in order to depict
|
| +// cases in which there are wrong offsets leading to self cross-talk (which is
|
| +// rejected).
|
| +
|
| #include <stdio.h>
|
| +#include <map>
|
| #include <memory>
|
|
|
| +#include "webrtc/base/logging.h"
|
| #include "webrtc/modules/audio_processing/test/conversational_speech/config.h"
|
| #include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
|
| #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
|
| @@ -44,9 +71,38 @@ const std::vector<Turn> expected_timing = {
|
| };
|
| const std::size_t kNumberOfTurns = expected_timing.size();
|
|
|
| +// Default arguments for MockWavReaderFactory ctor.
|
| +// Fake audio track parameters.
|
| +constexpr int kDefaultSampleRate = 48000;
|
| +const std::map<std::string, const MockWavReaderFactory::Params>
|
| + kDefaultMockWavReaderFactoryParamsMap = {
|
| + {"t300", {kDefaultSampleRate, 1u, 14400u}}, // 0.3 seconds.
|
| + {"t500", {kDefaultSampleRate, 1u, 24000u}}, // 0.5 seconds.
|
| + {"t1000", {kDefaultSampleRate, 1u, 48000u}}, // 1.0 seconds.
|
| +};
|
| +const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
|
| + kDefaultMockWavReaderFactoryParamsMap.at("t500");
|
| +
|
| +std::unique_ptr<MockWavReaderFactory> CreateMockWavReaderFactory() {
|
| + return std::unique_ptr<MockWavReaderFactory>(
|
| + new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
|
| + kDefaultMockWavReaderFactoryParamsMap));
|
| +}
|
| +
|
| } // namespace
|
|
|
| -TEST(ConversationalSpeechTest, Settings) {
|
| +using testing::_;
|
| +
|
| +// TODO(alessiob): Remove fixture once conversational_speech fully implemented
|
| +// and replace TEST_F with TEST.
|
| +class ConversationalSpeechTest : public testing::Test {
|
| + public:
|
| + ConversationalSpeechTest() {
|
| + rtc::LogMessage::LogToDebug(rtc::LS_VERBOSE);
|
| + }
|
| +};
|
| +
|
| +TEST_F(ConversationalSpeechTest, Settings) {
|
| const conversational_speech::Config config(
|
| audiotracks_path, timing_filepath, output_path);
|
|
|
| @@ -56,7 +112,7 @@ TEST(ConversationalSpeechTest, Settings) {
|
| EXPECT_EQ(output_path, config.output_path());
|
| }
|
|
|
| -TEST(ConversationalSpeechTest, TimingSaveLoad) {
|
| +TEST_F(ConversationalSpeechTest, TimingSaveLoad) {
|
| // Save test timing.
|
| const std::string temporary_filepath = webrtc::test::TempFilename(
|
| webrtc::test::OutputPath(), "TempTimingTestFile");
|
| @@ -76,20 +132,359 @@ TEST(ConversationalSpeechTest, TimingSaveLoad) {
|
| }
|
| }
|
|
|
| -TEST(ConversationalSpeechTest, MultiEndCallCreate) {
|
| - auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
|
| - new MockWavReaderFactory());
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallCreate) {
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
|
|
| // There are 5 unique audio tracks to read.
|
| - EXPECT_CALL(*mock_wavreader_factory, Create(testing::_)).Times(5);
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(5);
|
|
|
| // Inject the mock wav reader factory.
|
| conversational_speech::MultiEndCall multiend_call(
|
| expected_timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_TRUE(multiend_call.valid());
|
|
|
| // Test.
|
| EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
| EXPECT_EQ(5u, multiend_call.audiotrack_readers().size());
|
| + EXPECT_EQ(6u, multiend_call.speaking_turns().size());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t500", -100},
|
| + {"B", "t500", 0},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There is one unique audio track to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_FALSE(multiend_call.valid());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupSimple) {
|
| + // Accept:
|
| + // A 0****.....
|
| + // B .....1****
|
| + constexpr std::size_t expected_duration = kDefaultSampleRate;
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t500", 0},
|
| + {"B", "t500", 0},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There is one unique audio track to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_TRUE(multiend_call.valid());
|
| +
|
| + // Test.
|
| + EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
| + EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
|
| + EXPECT_EQ(2u, multiend_call.speaking_turns().size());
|
| + EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupPause) {
|
| + // Accept:
|
| + // A 0****.......
|
| + // B .......1****
|
| + constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t500", 0},
|
| + {"B", "t500", 200},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There is one unique audio track to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_TRUE(multiend_call.valid());
|
| +
|
| + // Test.
|
| + EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
| + EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
|
| + EXPECT_EQ(2u, multiend_call.speaking_turns().size());
|
| + EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
|
| + // Accept:
|
| + // A 0****....
|
| + // B ....1****
|
| + constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9;
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t500", 0},
|
| + {"B", "t500", -100},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There is one unique audio track to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_TRUE(multiend_call.valid());
|
| +
|
| + // Test.
|
| + EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
| + EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
|
| + EXPECT_EQ(2u, multiend_call.speaking_turns().size());
|
| + EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
|
| + // Reject:
|
| + // A ..0****
|
| + // B .1****. The n-th turn cannot start before the (n-1)-th one.
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t500", 200},
|
| + {"B", "t500", -600},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There is one unique audio track to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_FALSE(multiend_call.valid());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
|
| + // Accept:
|
| + // A 0****2****...
|
| + // B ...1*********
|
| + constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3;
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t500", 0},
|
| + {"B", "t1000", -200},
|
| + {"A", "t500", -800},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There are two unique audio tracks to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_TRUE(multiend_call.valid());
|
| +
|
| + // Test.
|
| + EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
| + EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
|
| + EXPECT_EQ(3u, multiend_call.speaking_turns().size());
|
| + EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
|
| + // Reject:
|
| + // A 0****......
|
| + // A ...1****...
|
| + // B ......2****
|
| + // ^ Turn #1 overlaps with #0 which is from the same speaker.
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t500", 0},
|
| + {"A", "t500", -200},
|
| + {"B", "t500", -200},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There is one unique audio track to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_FALSE(multiend_call.valid());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
|
| + // Reject:
|
| + // A 0*********
|
| + // B 1**.......
|
| + // C ...2**....
|
| + // A ......3**.
|
| + // ^ Turn #3 overlaps with #0 which is from the same speaker.
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t1000", 0},
|
| + {"B", "t300", -1000},
|
| + {"C", "t300", 0},
|
| + {"A", "t300", 0},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There are two unique audio tracks to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_FALSE(multiend_call.valid());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
|
| + // Accept:
|
| + // A 0*********..
|
| + // B ..1****.....
|
| + // C .......2****
|
| + constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t1000", 0},
|
| + {"B", "t500", -800},
|
| + {"C", "t500", 0},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There are two unique audio tracks to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_TRUE(multiend_call.valid());
|
| +
|
| + // Test.
|
| + EXPECT_EQ(3u, multiend_call.speaker_names().size());
|
| + EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
|
| + EXPECT_EQ(3u, multiend_call.speaking_turns().size());
|
| + EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
|
| + // Reject:
|
| + // A 0*********
|
| + // B ..1****...
|
| + // C ....2****.
|
| + // ^ Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
|
| + // not permitted).
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t1000", 0},
|
| + {"B", "t500", -800},
|
| + {"C", "t500", -300},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There are two unique audio tracks to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_FALSE(multiend_call.valid());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
|
| + // Accept:
|
| + // A 0*********..
|
| + // B .1****......
|
| + // C .......2****
|
| + constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t1000", 0},
|
| + {"B", "t500", -900},
|
| + {"C", "t500", 100},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There are two unique audio tracks to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_TRUE(multiend_call.valid());
|
| +
|
| + // Test.
|
| + EXPECT_EQ(3u, multiend_call.speaker_names().size());
|
| + EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
|
| + EXPECT_EQ(3u, multiend_call.speaking_turns().size());
|
| + EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
|
| + // Accept:
|
| + // A 0****
|
| + // B 1****
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t500", 0},
|
| + {"B", "t500", -500},
|
| + };
|
| + auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
| +
|
| + // There is one unique audio track to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_TRUE(multiend_call.valid());
|
| +
|
| + // Test.
|
| + EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
| + EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
|
| + EXPECT_EQ(2u, multiend_call.speaking_turns().size());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
|
| + // Accept:
|
| + // A 0****....3****.5**.
|
| + // B .....1****...4**...
|
| + // C ......2**.......6**
|
| + constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t500", 0},
|
| + {"B", "t500", 0},
|
| + {"C", "t300", -400},
|
| + {"A", "t500", 0},
|
| + {"B", "t300", -100},
|
| + {"A", "t300", -100},
|
| + {"C", "t300", -200},
|
| + };
|
| + auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
|
| + new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
|
| + kDefaultMockWavReaderFactoryParamsMap));
|
| +
|
| + // There are two unique audio tracks to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_TRUE(multiend_call.valid());
|
| +
|
| + // Test.
|
| + EXPECT_EQ(3u, multiend_call.speaker_names().size());
|
| + EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
|
| + EXPECT_EQ(7u, multiend_call.speaking_turns().size());
|
| + EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
| +}
|
| +
|
| +TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
|
| + // Reject:
|
| + // A 0****....3****5**.
|
| + // B .....1****...4**..
|
| + // C ......2**......6**
|
| + // ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
|
| + // speakers not permitted).
|
| + const std::vector<Turn> timing = {
|
| + {"A", "t500", 0},
|
| + {"B", "t500", 0},
|
| + {"C", "t300", -400},
|
| + {"A", "t500", 0},
|
| + {"B", "t300", -100},
|
| + {"A", "t300", -200},
|
| + {"C", "t300", -200},
|
| + };
|
| + auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
|
| + new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
|
| + kDefaultMockWavReaderFactoryParamsMap));
|
| +
|
| + // There are two unique audio tracks to read.
|
| + EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
| +
|
| + conversational_speech::MultiEndCall multiend_call(
|
| + timing, audiotracks_path, std::move(mock_wavreader_factory));
|
| + EXPECT_FALSE(multiend_call.valid());
|
| }
|
|
|
| } // namespace test
|
|
|