| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 98 matching lines...) | |
| 109 Beamforming() | 109 Beamforming() |
| 110 : enabled(false), | 110 : enabled(false), |
| 111 array_geometry() {} | 111 array_geometry() {} |
| 112 Beamforming(bool enabled, const std::vector<Point>& array_geometry) | 112 Beamforming(bool enabled, const std::vector<Point>& array_geometry) |
| 113 : enabled(enabled), | 113 : enabled(enabled), |
| 114 array_geometry(array_geometry) {} | 114 array_geometry(array_geometry) {} |
| 115 const bool enabled; | 115 const bool enabled; |
| 116 const std::vector<Point> array_geometry; | 116 const std::vector<Point> array_geometry; |
| 117 }; | 117 }; |
| 118 | 118 |
| 119 // Use to enable intelligibility enhancer in audio processing. Must be provided | |
| 120 // through the constructor. It will have no impact if used with | |
| 121 // AudioProcessing::SetExtraOptions(). | |
| 122 // | |
| 123 // Note: If enabled and the reverse stream has more than one output channel, | |
| 124 // the reverse stream will become an upmixed mono signal. | |
| 125 struct Intelligibility { | |
| 126 Intelligibility() : enabled(false) {} | |
| 127 explicit Intelligibility(bool enabled) : enabled(enabled) {} | |
| 128 bool enabled; | |
| 129 }; | |
| 130 | |
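For orientation, here is a minimal sketch of how these constructor-time options might be supplied. It assumes the webrtc::Config Set<T>() mechanism, the AudioProcessing::Create(const Config&) factory, and a Point(x, y, z) constructor, none of which is shown in this diff.

```cpp
// Sketch only: enabling Beamforming and Intelligibility at construction time.
// webrtc::Config, Config::Set<T>(), AudioProcessing::Create(const Config&) and
// Point(x, y, z) are assumed APIs; they are not part of this diff.
#include <vector>

#include "webrtc/modules/audio_processing/include/audio_processing.h"

webrtc::AudioProcessing* CreateApmWithRenderSideEnhancements() {
  webrtc::Config config;

  // Two-mic linear array, 5 cm spacing, coordinates in meters (illustrative).
  std::vector<webrtc::Point> geometry;
  geometry.push_back(webrtc::Point(-0.025f, 0.f, 0.f));
  geometry.push_back(webrtc::Point(0.025f, 0.f, 0.f));
  config.Set<webrtc::Beamforming>(new webrtc::Beamforming(true, geometry));

  // Per the comment above, Intelligibility must be set here;
  // SetExtraOptions() would have no effect.
  config.Set<webrtc::Intelligibility>(new webrtc::Intelligibility(true));

  return webrtc::AudioProcessing::Create(config);  // caller takes ownership
}
```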
| 119 static const int kAudioProcMaxNativeSampleRateHz = 32000; | 131 static const int kAudioProcMaxNativeSampleRateHz = 32000; |
| 120 | 132 |
| 121 // The Audio Processing Module (APM) provides a collection of voice processing | 133 // The Audio Processing Module (APM) provides a collection of voice processing |
| 122 // components designed for real-time communications software. | 134 // components designed for real-time communications software. |
| 123 // | 135 // |
| 124 // APM operates on two audio streams on a frame-by-frame basis. Frames of the | 136 // APM operates on two audio streams on a frame-by-frame basis. Frames of the |
| 125 // primary stream, on which all processing is applied, are passed to | 137 // primary stream, on which all processing is applied, are passed to |
| 126 // |ProcessStream()|. Frames of the reverse direction stream, which are used for | 138 // |ProcessStream()|. Frames of the reverse direction stream, which are used for |
| 127 // analysis by some components, are passed to |AnalyzeReverseStream()|. On the | 139 // analysis by some components, are passed to |AnalyzeReverseStream()|. On the |
| 128 // client-side, this will typically be the near-end (capture) and far-end | 140 // client-side, this will typically be the near-end (capture) and far-end |
| (...skipping 197 matching lines...) | |
| 326 // reverse stream forms the echo reference signal. It is recommended, but not | 338 // reverse stream forms the echo reference signal. It is recommended, but not |
| 327 // necessary, to provide if gain control is enabled. On the server-side this | 339 // necessary, to provide if gain control is enabled. On the server-side this |
| 328 // typically will not be used. If you're not sure what to pass in here, | 340 // typically will not be used. If you're not sure what to pass in here, |
| 329 // chances are you don't need to use it. | 341 // chances are you don't need to use it. |
| 330 // | 342 // |
| 331 // The |sample_rate_hz_|, |num_channels_|, and |samples_per_channel_| | 343 // The |sample_rate_hz_|, |num_channels_|, and |samples_per_channel_| |
| 332 // members of |frame| must be valid. |sample_rate_hz_| must correspond to | 344 // members of |frame| must be valid. |sample_rate_hz_| must correspond to |
| 333 // |input_sample_rate_hz()| | 345 // |input_sample_rate_hz()| |
| 334 // | 346 // |
| 335 // TODO(ajm): add const to input; requires an implementation fix. | 347 // TODO(ajm): add const to input; requires an implementation fix. |
| 348 // DEPRECATED: Use |ProcessReverseStream| instead. | |
| 349 // TODO(ekm): Remove once all users have updated to |ProcessReverseStream|. | |
| 336 virtual int AnalyzeReverseStream(AudioFrame* frame) = 0; | 350 virtual int AnalyzeReverseStream(AudioFrame* frame) = 0; |
| 337 | 351 |
| 352 // Same as |AnalyzeReverseStream|, but may modify |data| if intelligibility | |
Andrew MacDonald (2015/07/30 18:48:53): modify |frame|
ekm (2015/07/30 22:38:44): Done.
| 353 // is enabled. | |
| 354 virtual int ProcessReverseStream(AudioFrame* frame) = 0; | |
| 355 | |
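A small sketch of the render-side AudioFrame path, contrasting the deprecated call with its replacement; the AudioFrame type and the kNoError error code are assumed to be available through this header, as they are not shown in this excerpt.

```cpp
// Sketch: feeding a 10 ms far-end (render) frame to an initialized APM.
// The deprecated AnalyzeReverseStream() call is kept as a comment for contrast.
#include "webrtc/modules/audio_processing/include/audio_processing.h"

int FeedRenderFrame(webrtc::AudioProcessing* apm, webrtc::AudioFrame* frame) {
  // Old, analysis-only path (deprecated above):
  //   return apm->AnalyzeReverseStream(frame);

  // New path: same analysis, but |frame| may be modified in place when the
  // intelligibility enhancer is enabled.
  int err = apm->ProcessReverseStream(frame);
  if (err != webrtc::AudioProcessing::kNoError) {
    // Handle or log the error; codes are defined elsewhere in this header.
  }
  return err;
}
```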
| 338 // Accepts deinterleaved float audio with the range [-1, 1]. Each element | 356 // Accepts deinterleaved float audio with the range [-1, 1]. Each element |
| 339 // of |data| points to a channel buffer, arranged according to |layout|. | 357 // of |data| points to a channel buffer, arranged according to |layout|. |
| 340 // | |
| 341 // TODO(mgraczyk): Remove once clients are updated to use the new interface. | 358 // TODO(mgraczyk): Remove once clients are updated to use the new interface. |
| 342 virtual int AnalyzeReverseStream(const float* const* data, | 359 virtual int AnalyzeReverseStream(const float* const* data, |
| 343 int samples_per_channel, | 360 int samples_per_channel, |
| 344 int sample_rate_hz, | 361 int rev_sample_rate_hz, |
| 345 ChannelLayout layout) = 0; | 362 ChannelLayout layout) = 0; |
| 346 | 363 |
| 347 // Accepts deinterleaved float audio with the range [-1, 1]. Each element of | 364 // Accepts deinterleaved float audio with the range [-1, 1]. Each element of |
| 348 // |data| points to a channel buffer, arranged according to |reverse_config|. | 365 // |data| points to a channel buffer, arranged according to |reverse_config|. |
| 349 virtual int AnalyzeReverseStream(const float* const* data, | 366 virtual int ProcessReverseStream(const float* const* src, |
| 350 const StreamConfig& reverse_config) = 0; | 367 const StreamConfig& reverse_input_config, |
| 368 const StreamConfig& reverse_output_config, | |
Andrew MacDonald (2015/07/30 18:48:53): I'm not sure we want to do this. The way you have…
aluebs-webrtc (2015/07/30 18:56:34): You bring a good point here. I think it makes sens…
ekm (2015/07/30 21:23:50): Are we talking about converting reverse to capture…
aluebs-webrtc (2015/07/30 23:09:51): I would prefer to avoid adding yet another interfa…
ekm (2015/07/30 23:20:17): Ok. Why are the two configs in the ProcessStream i…
aluebs-webrtc (2015/07/30 23:23:38): So that the APM knows what the user expects as out…
| 369 float* const* dest) = 0; | |
| 351 | 370 |
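A hedged sketch of the float-based render call using the new signature, assuming 10 ms chunks (sample rate / 100 frames per channel, matching the StreamConfig convention further down) and a stereo 48 kHz reverse stream chosen purely for illustration.

```cpp
// Sketch: render-side processing with deinterleaved float buffers.
// Buffer sizing (10 ms chunks) and the stereo/48 kHz choice are assumptions
// made for illustration only.
#include "webrtc/modules/audio_processing/include/audio_processing.h"

int FeedRenderFloat(webrtc::AudioProcessing* apm) {
  const int kSampleRateHz = 48000;
  const int kNumFrames = kSampleRateHz / 100;  // one 10 ms chunk per channel

  float in_left[kNumFrames] = {0.f};   // samples in the range [-1, 1]
  float in_right[kNumFrames] = {0.f};
  float out_left[kNumFrames] = {0.f};
  float out_right[kNumFrames] = {0.f};
  const float* const src[] = {in_left, in_right};
  float* const dest[] = {out_left, out_right};

  webrtc::StreamConfig reverse_input_config(kSampleRateHz, 2);
  webrtc::StreamConfig reverse_output_config(kSampleRateHz, 2);

  return apm->ProcessReverseStream(src, reverse_input_config,
                                   reverse_output_config, dest);
}
```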
| 352 // This must be called if and only if echo processing is enabled. | 371 // This must be called if and only if echo processing is enabled. |
| 353 // | 372 // |
| 354 // Sets the |delay| in ms between AnalyzeReverseStream() receiving a far-end | 373 // Sets the |delay| in ms between AnalyzeReverseStream() receiving a far-end |
| 355 // frame and ProcessStream() receiving a near-end frame containing the | 374 // frame and ProcessStream() receiving a near-end frame containing the |
| 356 // corresponding echo. On the client-side this can be expressed as | 375 // corresponding echo. On the client-side this can be expressed as |
| 357 // delay = (t_render - t_analyze) + (t_process - t_capture) | 376 // delay = (t_render - t_analyze) + (t_process - t_capture) |
| 358 // where, | 377 // where, |
| 359 // - t_analyze is the time a frame is passed to AnalyzeReverseStream() and | 378 // - t_analyze is the time a frame is passed to AnalyzeReverseStream() and |
| 360 // t_render is the time the first sample of the same frame is rendered by | 379 // t_render is the time the first sample of the same frame is rendered by |
| (...skipping 101 matching lines...) | |
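The remainder of the stream-delay comment, and the setter it documents, falls in the elided lines above. As a worked illustration of the formula only, a sketch follows; set_stream_delay_ms() is assumed to be the corresponding setter, since its declaration is not visible here.

```cpp
// Sketch: applying delay = (t_render - t_analyze) + (t_process - t_capture).
// All timestamps are in milliseconds; set_stream_delay_ms() is assumed, since
// its declaration is not visible in this excerpt.
#include "webrtc/modules/audio_processing/include/audio_processing.h"

void ReportStreamDelay(webrtc::AudioProcessing* apm,
                       int t_analyze_ms, int t_render_ms,
                       int t_capture_ms, int t_process_ms) {
  const int delay_ms =
      (t_render_ms - t_analyze_ms) + (t_process_ms - t_capture_ms);
  apm->set_stream_delay_ms(delay_ms);
}
```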
| 462 // num_channels: The number of audio channels in the stream, excluding the | 481 // num_channels: The number of audio channels in the stream, excluding the |
| 463 // keyboard channel if it is present. When passing a | 482 // keyboard channel if it is present. When passing a |
| 464 // StreamConfig with an array of arrays T*[N], | 483 // StreamConfig with an array of arrays T*[N], |
| 465 // | 484 // |
| 466 // N == {num_channels + 1 if has_keyboard | 485 // N == {num_channels + 1 if has_keyboard |
| 467 // {num_channels if !has_keyboard | 486 // {num_channels if !has_keyboard |
| 468 // | 487 // |
| 469 // has_keyboard: True if the stream has a keyboard channel. When has_keyboard | 488 // has_keyboard: True if the stream has a keyboard channel. When has_keyboard |
| 470 // is true, the last channel in any corresponding list of | 489 // is true, the last channel in any corresponding list of |
| 471 // channels is the keyboard channel. | 490 // channels is the keyboard channel. |
| 472 StreamConfig(int sample_rate_hz = 0, | 491 StreamConfig(int sample_rate_hz = 16000, |
| 473 int num_channels = 0, | 492 int num_channels = 1, |
aluebs-webrtc (2015/07/30 15:28:07): I think this zero-initialization here is on purpos…
Andrew MacDonald (2015/07/30 18:48:53): Agreed.
ekm (2015/07/30 22:38:44): That makes sense. I thought it'd be a nice way to…
| 474 bool has_keyboard = false) | 493 bool has_keyboard = false) |
| 475 : sample_rate_hz_(sample_rate_hz), | 494 : sample_rate_hz_(sample_rate_hz), |
| 476 num_channels_(num_channels), | 495 num_channels_(num_channels), |
| 477 has_keyboard_(has_keyboard), | 496 has_keyboard_(has_keyboard), |
| 478 num_frames_(calculate_frames(sample_rate_hz)) {} | 497 num_frames_(calculate_frames(sample_rate_hz)) {} |
| 479 | 498 |
| 480 void set_sample_rate_hz(int value) { | 499 void set_sample_rate_hz(int value) { |
| 481 sample_rate_hz_ = value; | 500 sample_rate_hz_ = value; |
| 482 num_frames_ = calculate_frames(value); | 501 num_frames_ = calculate_frames(value); |
| 483 } | 502 } |
| (...skipping 26 matching lines...) | |
| 510 int num_channels_; | 529 int num_channels_; |
| 511 bool has_keyboard_; | 530 bool has_keyboard_; |
| 512 int num_frames_; | 531 int num_frames_; |
| 513 }; | 532 }; |
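A brief sketch of constructing a StreamConfig for a capture stream with a keyboard channel; the expectation that num_frames() equals one 10 ms chunk (sample_rate_hz / 100) is an assumption about calculate_frames(), which is not shown here.

```cpp
// Sketch: a 48 kHz stereo capture stream that also carries a keyboard channel.
// When passing the matching T*[N] array to APM, N == num_channels + 1 and the
// keyboard channel is last, per the comment above.
#include "webrtc/modules/audio_processing/include/audio_processing.h"

webrtc::StreamConfig MakeCaptureConfig() {
  const int kSampleRateHz = 48000;
  const int kNumChannels = 2;   // excludes the keyboard channel
  const bool kHasKeyboard = true;

  webrtc::StreamConfig config(kSampleRateHz, kNumChannels, kHasKeyboard);
  // num_frames() is expected to be 480 here (one 10 ms chunk at 48 kHz); this
  // is an assumption about calculate_frames().
  return config;
}
```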
| 514 | 533 |
| 515 class ProcessingConfig { | 534 class ProcessingConfig { |
| 516 public: | 535 public: |
| 517 enum StreamName { | 536 enum StreamName { |
| 518 kInputStream, | 537 kInputStream, |
| 519 kOutputStream, | 538 kOutputStream, |
| 520 kReverseStream, | 539 kReverseInputStream, |
| 540 kReverseOutputStream, | |
| 521 kNumStreamNames, | 541 kNumStreamNames, |
| 522 }; | 542 }; |
| 523 | 543 |
| 524 const StreamConfig& input_stream() const { | 544 const StreamConfig& input_stream() const { |
| 525 return streams[StreamName::kInputStream]; | 545 return streams[StreamName::kInputStream]; |
| 526 } | 546 } |
| 527 const StreamConfig& output_stream() const { | 547 const StreamConfig& output_stream() const { |
| 528 return streams[StreamName::kOutputStream]; | 548 return streams[StreamName::kOutputStream]; |
| 529 } | 549 } |
| 530 const StreamConfig& reverse_stream() const { | 550 const StreamConfig& reverse_input_stream() const { |
| 531 return streams[StreamName::kReverseStream]; | 551 return streams[StreamName::kReverseInputStream]; |
| 552 } | |
| 553 const StreamConfig& reverse_output_stream() const { | |
| 554 return streams[StreamName::kReverseOutputStream]; | |
| 532 } | 555 } |
| 533 | 556 |
| 534 StreamConfig& input_stream() { return streams[StreamName::kInputStream]; } | 557 StreamConfig& input_stream() { return streams[StreamName::kInputStream]; } |
| 535 StreamConfig& output_stream() { return streams[StreamName::kOutputStream]; } | 558 StreamConfig& output_stream() { return streams[StreamName::kOutputStream]; } |
| 536 StreamConfig& reverse_stream() { return streams[StreamName::kReverseStream]; } | 559 StreamConfig& reverse_input_stream() { |
| 560 return streams[StreamName::kReverseInputStream]; | |
| 561 } | |
| 562 StreamConfig& reverse_output_stream() { | |
| 563 return streams[StreamName::kReverseOutputStream]; | |
| 564 } | |
| 537 | 565 |
| 538 bool operator==(const ProcessingConfig& other) const { | 566 bool operator==(const ProcessingConfig& other) const { |
| 539 for (int i = 0; i < StreamName::kNumStreamNames; ++i) { | 567 for (int i = 0; i < StreamName::kNumStreamNames; ++i) { |
| 540 if (this->streams[i] != other.streams[i]) { | 568 if (this->streams[i] != other.streams[i]) { |
| 541 return false; | 569 return false; |
| 542 } | 570 } |
| 543 } | 571 } |
| 544 return true; | 572 return true; |
| 545 } | 573 } |
| 546 | 574 |
| (...skipping 359 matching lines...) | |
| 906 // This does not impact the size of frames passed to |ProcessStream()|. | 934 // This does not impact the size of frames passed to |ProcessStream()|. |
| 907 virtual int set_frame_size_ms(int size) = 0; | 935 virtual int set_frame_size_ms(int size) = 0; |
| 908 virtual int frame_size_ms() const = 0; | 936 virtual int frame_size_ms() const = 0; |
| 909 | 937 |
| 910 protected: | 938 protected: |
| 911 virtual ~VoiceDetection() {} | 939 virtual ~VoiceDetection() {} |
| 912 }; | 940 }; |
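Finally, a small sketch of adjusting the VAD decision interval; the voice_detection() component accessor and Enable() follow the usual APM component pattern and are assumed here, since neither appears in this excerpt.

```cpp
// Sketch: request a voice-activity decision every 10 ms. This does not change
// the size of the frames passed to ProcessStream(), per the comment above.
#include "webrtc/modules/audio_processing/include/audio_processing.h"

void EnableVad(webrtc::AudioProcessing* apm) {
  apm->voice_detection()->Enable(true);           // accessor/Enable() assumed
  apm->voice_detection()->set_frame_size_ms(10);  // declared in this class
}
```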
| 913 } // namespace webrtc | 941 } // namespace webrtc |
| 914 | 942 |
| 915 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_ | 943 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_ |