webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc - Issue 2110593003: Pull out the PostFilter to its own NonlinearBeamformer API

Unified Diff: webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc

Issue 2110593003: Pull out the PostFilter to its own NonlinearBeamformer API (Closed) Base URL: https://chromium.googlesource.com/external/webrtc.git@master

Patch Set: Keep old interfaces to not break dependencies Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.h ('K') | « webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.h ('k') | webrtc/modules/audio_processing/beamformer/nonlinear_beamformer_test.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc

diff --git a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc

index f5bdd6a3c2d0308bea77074301f6d8f688c89ead..c7cf48f8773cb521e282ff4677ba75c284f4162c 100644

--- a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc

+++ b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc

@@ -122,18 +122,6 @@ size_t Round(float x) {

return static_cast<size_t>(std::floor(x + 0.5f));

}

-// Calculates the sum of absolute values of a complex matrix.

-float SumAbs(const ComplexMatrix<float>& mat) {

- float sum_abs = 0.f;

- const complex<float>* const* mat_els = mat.elements();

- for (size_t i = 0; i < mat.num_rows(); ++i) {

- for (size_t j = 0; j < mat.num_columns(); ++j) {

- sum_abs += std::abs(mat_els[i][j]);

- }

- return sum_abs;

// Calculates the sum of squares of a complex matrix.

float SumSquares(const ComplexMatrix<float>& mat) {

float sum_squares = 0.f;

@@ -183,10 +171,46 @@ const float NonlinearBeamformer::kHalfBeamWidthRadians = DegreesToRadians(20.f);

// static

const size_t NonlinearBeamformer::kNumFreqBins;

+PostFilterTransform::PostFilterTransform(size_t num_channels,

+ size_t chunk_length,

+ float* window,

+ size_t fft_size)

+ : transform_(num_channels,

+ num_channels,

+ chunk_length,

+ window,

+ fft_size,

+ fft_size / 2,

+ this),

+ num_freq_bins_(fft_size / 2 + 1) {}

+void PostFilterTransform::ProcessChunk(float* const* data, float* final_mask) {

+ final_mask_ = final_mask;

+ transform_.ProcessChunk(data, data);

+void PostFilterTransform::ProcessAudioBlock(const complex<float>* const* input,

+ size_t num_input_channels,

+ size_t num_freq_bins,

+ size_t num_output_channels,

+ complex<float>* const* output) {

+ RTC_DCHECK_EQ(num_freq_bins_, num_freq_bins);

+ RTC_DCHECK_EQ(num_input_channels, num_output_channels);

+ for (size_t ch = 0; ch < num_input_channels; ++ch) {

+ for (size_t f_ix = 0; f_ix < num_freq_bins_; ++f_ix) {

+ output[ch][f_ix] =

+ kCompensationGain * final_mask_[f_ix] * input[ch][f_ix];

+ }

NonlinearBeamformer::NonlinearBeamformer(

const std::vector<Point>& array_geometry,

+ size_t num_postfilter_channels,

SphericalPointf target_direction)

: num_input_channels_(array_geometry.size()),

+ num_postfilter_channels_(num_postfilter_channels),

array_geometry_(GetCenteredArray(array_geometry)),

array_normal_(GetArrayNormalIfExists(array_geometry)),

min_mic_spacing_(GetMinimumSpacing(array_geometry)),

@@ -208,18 +232,21 @@ void NonlinearBeamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {

hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize;

interference_blocks_count_ = hold_target_blocks_;

- lapped_transform_.reset(new LappedTransform(num_input_channels_,

- 1,

- chunk_length_,

- window_,

- kFftSize,

- kFftSize / 2,

- this));

+ process_transform_.reset(new LappedTransform(num_input_channels_,

+ 0u,

+ chunk_length_,

+ window_,

+ kFftSize,

+ kFftSize / 2,

+ this));

+ postfilter_transform_.reset(new PostFilterTransform(

+ num_postfilter_channels_, chunk_length_, window_, kFftSize));

+ const float wave_number_step =

+ (2.f * M_PI * sample_rate_hz_) / (kFftSize * kSpeedOfSoundMeterSeconds);

for (size_t i = 0; i < kNumFreqBins; ++i) {

time_smooth_mask_[i] = 1.f;

final_mask_[i] = 1.f;

- float freq_hz = (static_cast<float>(i) / kFftSize) * sample_rate_hz_;

- wave_numbers_[i] = 2 * M_PI * freq_hz / kSpeedOfSoundMeterSeconds;

+ wave_numbers_[i] = i * wave_number_step;

}

InitLowFrequencyCorrectionRanges();

@@ -306,9 +333,6 @@ void NonlinearBeamformer::InitDelaySumMasks() {

complex_f norm_factor = sqrt(

ConjugateDotProduct(delay_sum_masks_[f_ix], delay_sum_masks_[f_ix]));

delay_sum_masks_[f_ix].Scale(1.f / norm_factor);

- normalized_delay_sum_masks_[f_ix].CopyFrom(delay_sum_masks_[f_ix]);

- normalized_delay_sum_masks_[f_ix].Scale(1.f / SumAbs(

- normalized_delay_sum_masks_[f_ix]));

}

@@ -366,30 +390,47 @@ void NonlinearBeamformer::NormalizeCovMats() {

}

-void NonlinearBeamformer::ProcessChunk(const ChannelBuffer<float>& input,

- ChannelBuffer<float>* output) {

- RTC_DCHECK_EQ(input.num_channels(), num_input_channels_);

- RTC_DCHECK_EQ(input.num_frames_per_band(), chunk_length_);

+void NonlinearBeamformer::AnalyzeChunk(const ChannelBuffer<float>& data) {

+ RTC_DCHECK_EQ(data.num_channels(), num_input_channels_);

+ RTC_DCHECK_EQ(data.num_frames_per_band(), chunk_length_);

- float old_high_pass_mask = high_pass_postfilter_mask_;

- lapped_transform_->ProcessChunk(input.channels(0), output->channels(0));

- // Ramp up/down for smoothing. 1 mask per 10ms results in audible

- // discontinuities.

+ old_high_pass_mask_ = high_pass_postfilter_mask_;

+ process_transform_->ProcessChunk(data.channels(0), nullptr);

+void NonlinearBeamformer::PostFilter(ChannelBuffer<float>* data) {

+ RTC_DCHECK_EQ(data->num_frames_per_band(), chunk_length_);

+ // TODO(aluebs): Change to RTC_CHECK_EQ once the ChannelBuffer is updated.

+ RTC_DCHECK_GE(data->num_channels(), num_postfilter_channels_);

+ postfilter_transform_->ProcessChunk(data->channels(0), final_mask_);

+ // Ramp up/down for smoothing is needed in order to avoid discontinuities in

+ // the transitions between 10 ms frames.

const float ramp_increment =

- (high_pass_postfilter_mask_ - old_high_pass_mask) /

- input.num_frames_per_band();

- // Apply the smoothed high-pass mask to the first channel of each band.

- // This can be done because the effect of the linear beamformer is negligible

- // compared to the post-filter.

- for (size_t i = 1; i < input.num_bands(); ++i) {

- float smoothed_mask = old_high_pass_mask;

- for (size_t j = 0; j < input.num_frames_per_band(); ++j) {

+ (high_pass_postfilter_mask_ - old_high_pass_mask_) /

+ data->num_frames_per_band();

+ for (size_t i = 1; i < data->num_bands(); ++i) {

+ float smoothed_mask = old_high_pass_mask_;

+ for (size_t j = 0; j < data->num_frames_per_band(); ++j) {

smoothed_mask += ramp_increment;

- output->channels(i)[0][j] = input.channels(i)[0][j] * smoothed_mask;

+ for (size_t k = 0; k < num_postfilter_channels_; ++k) {

+ data->channels(i)[k][j] *= smoothed_mask;

+ }

}

+void NonlinearBeamformer::ProcessChunk(const ChannelBuffer<float>& input,

+ ChannelBuffer<float>* output) {

+ AnalyzeChunk(input);

+ for (size_t i = 0u; i < input.num_bands(); ++i) {

peah-webrtc 2016/06/29 08:38:20 Would it be possible to handle the output number o

aluebs-webrtc 2016/06/29 23:19:55 To be compatible with how the beamformer was used

peah-webrtc 2016/06/30 05:25:05 I know this is how it is used to be, but it is qui

I know this is how it is used to be, but it is quite error-prone and that was why I wanted to see whether it could be changed. By only copying the left channel to the output, you are in fact enforcing the number of channels in the output to 1, as any other channels than the left one will not be updated. Refraining to set the number of channels to 1 in this part of the code does not change that. I don't really see why it would be a problem to change the output number of channels when the input and the output refer to the same ChannelBuffer (via an in-place call). The memcopy operation is valid regardless of that, and the const-ness of input does not impose any restrictions on output just because they happen to point to the same ChannelBuffer. To me, since you anyway here copied only the left channel, you might as well also set the number of channels to 1 as that is what you really do when you only copy the left channel but I'm probably missing something.

aluebs-webrtc 2016/06/30 23:37:09 This is a temporary interface to be able to migrat

On 2016/06/30 05:25:05, peah-webrtc wrote: > On 2016/06/29 23:19:55, aluebs-webrtc wrote: > > On 2016/06/29 08:38:20, peah-webrtc wrote: > > > Would it be possible to handle the output number of channels in a less > > > error-prone way. > > > As it is now, there is an unnecessary dependency between how the user of the > > > method handles the output. > > > > > > Would it be possible to either: > > > 1) DCHECK on that the number of channels in output is 1? > > > 2) Set the number of channels in output to 1. > > > 3) Change the memcpy so that the input is downmixed/upmixed to the number of > > > channels in output. > > > > > > > To be compatible with how the beamformer was used before this CL (the whole > > point of re-adding this interface) we: > > 1) can't enforce that the number of channels of the output is 1, since it > could > > be the same channel_buffer than the input. > > 2) can't change the number of channels of the output, since that wasn't how it > > behaved and again it could be the same channel_buffer than the input. > > > > On the old bf interface, the behavior was exactly like this and copying the > > number of channels of the output (which can be the same as the input) will add > > unecessary complexity. But I guess it ensures it can be used with a different > > number of output channels (not supported in the old interface), so I did that. > > I know this is how it is used to be, but it is quite error-prone and that was > why I wanted to see whether it could be changed. > > By only copying the left channel to the output, you are in fact enforcing the > number of channels in the output to 1, as any other channels than the left one > will not be updated. Refraining to set the number of channels to 1 in this part > of the code does not change that. > > I don't really see why it would be a problem to change the output number of > channels when the input and the output refer to the same ChannelBuffer (via an > in-place call). The memcopy operation is valid regardless of that, and the > const-ness of input does not impose any restrictions on output just because they > happen to point to the same ChannelBuffer. > > To me, since you anyway here copied only the left channel, you might as well > also set the number of channels to 1 as that is what you really do when you only > copy the left channel but I'm probably missing something. >

This is a temporary interface to be able to migrate the dependencies without breaking them and will be removed as soon as we migrate the dependencies. We shouldn't change the behavior, since that could break the dependencies that were not expecting the BF to change the channels of its ChannelBuffer.

+ std::memcpy(output->channels(i)[0], input.channels(i)[0],

+ sizeof(input.channels(0)[0][0]) * input.num_frames_per_band());

+ }

+ PostFilter(output);

void NonlinearBeamformer::AimAt(const SphericalPointf& target_direction) {

target_angle_radians_ = target_direction.azimuth();

InitHighFrequencyCorrectionRanges();

@@ -414,7 +455,7 @@ void NonlinearBeamformer::ProcessAudioBlock(const complex_f* const* input,

complex_f* const* output) {

RTC_CHECK_EQ(kNumFreqBins, num_freq_bins);

RTC_CHECK_EQ(num_input_channels_, num_input_channels);

- RTC_CHECK_EQ(1u, num_output_channels);

+ RTC_CHECK_EQ(0u, num_output_channels);

// Calculating the post-filter masks. Note that we need two for each

// frequency bin to account for the positive and negative interferer

@@ -456,7 +497,6 @@ void NonlinearBeamformer::ProcessAudioBlock(const complex_f* const* input,

ApplyLowFrequencyCorrection();

ApplyHighFrequencyCorrection();

ApplyMaskFrequencySmoothing();

- ApplyMasks(input, output);

}

float NonlinearBeamformer::CalculatePostfilterMask(

@@ -484,22 +524,6 @@ float NonlinearBeamformer::CalculatePostfilterMask(

return numerator / denominator;

}

-void NonlinearBeamformer::ApplyMasks(const complex_f* const* input,

- complex_f* const* output) {

- complex_f* output_channel = output[0];

- for (size_t f_ix = 0; f_ix < kNumFreqBins; ++f_ix) {

- output_channel[f_ix] = complex_f(0.f, 0.f);

- const complex_f* delay_sum_mask_els =

- normalized_delay_sum_masks_[f_ix].elements()[0];

- for (size_t c_ix = 0; c_ix < num_input_channels_; ++c_ix) {

- output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix];

- }

- output_channel[f_ix] *= kCompensationGain * final_mask_[f_ix];

- }

// Smooth new_mask_ into time_smooth_mask_.

void NonlinearBeamformer::ApplyMaskTimeSmoothing() {

for (size_t i = low_mean_start_bin_; i <= high_mean_end_bin_; ++i) {