| Index: webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc |
| diff --git a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc |
| index f5bdd6a3c2d0308bea77074301f6d8f688c89ead..b5d5197edc0e6218c548368e49d27e3cddaabbe4 100644 |
| --- a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc |
| +++ b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc |
| @@ -183,6 +183,30 @@ const float NonlinearBeamformer::kHalfBeamWidthRadians = DegreesToRadians(20.f); |
| // static |
|
peah-webrtc
2016/05/22 21:06:48
Please correct this comment as well while you are
aluebs-webrtc
2016/05/26 01:04:45
What is wrong with it?
peah-webrtc
2016/05/26 08:48:52
It is not a proper sentence, and not terminated by
aluebs-webrtc
2016/05/28 03:00:00
I think the static definition is clearer like this
peah-webrtc
2016/05/30 11:49:25
I think the guidelines should be applied regardles
aluebs-webrtc
2016/06/01 00:16:34
Acknowledged.
|
| const size_t NonlinearBeamformer::kNumFreqBins; |
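For context on the "// static" line under discussion: it marks the out-of-line definition of a static const integral member that is declared and initialized in the header. Pre-C++17, such a member still needs exactly one definition in a .cc file once it is odr-used (for example, bound to a reference inside RTC_CHECK_EQ). A minimal sketch, with the in-class initializer value assumed here for illustration:

    // nonlinear_beamformer.h (initializer value assumed for illustration):
    class NonlinearBeamformer /* ... */ {
     public:
      static const size_t kNumFreqBins = kFftSize / 2 + 1;  // declaration + in-class initializer
      // ...
    };

    // nonlinear_beamformer.cc:
    // static
    const size_t NonlinearBeamformer::kNumFreqBins;  // definition; initializer not repeated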
| +class PostFilterTransform : public LappedTransform::Callback { |
|
peah-webrtc
2016/05/22 21:06:48
This class is a way to be able to use the callback
aluebs-webrtc
2016/05/26 01:04:45
I don't see how this simplifies the code, but I ag
peah-webrtc
2016/05/26 08:48:52
The simplification is separation of concerns and d
aluebs-webrtc
2016/05/28 03:00:00
I think the separation/encapsulation is almost the
peah-webrtc
2016/05/30 11:49:25
Acknowledged.
|
| + public: |
| + explicit PostFilterTransform(NonlinearBeamformer* beamformer) |
| + : beamformer_(beamformer) {} |
| + |
| + protected: |
| + // Process one frequency-domain block of audio. This is where the fun |
|
peah-webrtc
2016/05/22 21:06:48
Please describe this more thoroughly. I'm not sure
aluebs-webrtc
2016/05/26 01:04:45
Removed comment. It was just to be consistent with
peah-webrtc
2016/05/30 11:49:25
Acknowledged.
|
| + // happens. Implements LappedTransform::Callback. |
| + void ProcessAudioBlock(const complex<float>* const* input, |
| + size_t num_input_channels, |
| + size_t num_freq_bins, |
| + size_t num_output_channels, |
| + complex<float>* const* output) override { |
| + RTC_CHECK_EQ(NonlinearBeamformer::kNumFreqBins, num_freq_bins); |
| + RTC_CHECK_EQ(1u, num_input_channels); |
| + RTC_CHECK_EQ(1u, num_output_channels); |
| + |
| + beamformer_->ApplyPostFilter(input[0], output[0]); |
| + } |
| + |
| + private: |
| + NonlinearBeamformer* beamformer_; |
| +}; |
| + |
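A minimal sketch of how this callback class gets driven, based only on the LappedTransform constructor and ProcessChunk() calls that appear later in this diff (the local variable names are illustrative, not part of the CL):

    PostFilterTransform post_filter(this);
    LappedTransform transform(1u, 1u, chunk_length_, window_, kFftSize,
                              kFftSize / 2, &post_filter);
    // ProcessChunk() windows the input into overlapping kFftSize blocks,
    // transforms each block, and calls post_filter.ProcessAudioBlock() with
    // kNumFreqBins frequency-domain samples per block, then overlap-adds the
    // inverse transform into the output.
    transform.ProcessChunk(time_domain_in, time_domain_out);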
| NonlinearBeamformer::NonlinearBeamformer( |
| const std::vector<Point>& array_geometry, |
| SphericalPointf target_direction) |
| @@ -208,13 +232,16 @@ void NonlinearBeamformer::Initialize(int chunk_size_ms, int sample_rate_hz) { |
| hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize; |
| interference_blocks_count_ = hold_target_blocks_; |
| - lapped_transform_.reset(new LappedTransform(num_input_channels_, |
| - 1, |
| - chunk_length_, |
| - window_, |
| - kFftSize, |
| - kFftSize / 2, |
| - this)); |
| + process_transform_.reset(new LappedTransform(num_input_channels_, |
| + 1u, |
| + chunk_length_, |
| + window_, |
| + kFftSize, |
| + kFftSize / 2, |
| + this)); |
| + postfilter_transform_.reset(new LappedTransform( |
| + 1u, 1u, chunk_length_, window_, kFftSize, kFftSize / 2, |
| + new PostFilterTransform(this))); |
| for (size_t i = 0; i < kNumFreqBins; ++i) { |
| time_smooth_mask_[i] = 1.f; |
| final_mask_[i] = 1.f; |
| @@ -371,18 +398,31 @@ void NonlinearBeamformer::ProcessChunk(const ChannelBuffer<float>& input, |
| RTC_DCHECK_EQ(input.num_channels(), num_input_channels_); |
| RTC_DCHECK_EQ(input.num_frames_per_band(), chunk_length_); |
| - float old_high_pass_mask = high_pass_postfilter_mask_; |
| - lapped_transform_->ProcessChunk(input.channels(0), output->channels(0)); |
| + old_high_pass_mask_ = high_pass_postfilter_mask_; |
| + process_transform_->ProcessChunk(input.channels(0), output->channels(0)); |
| + // Copy over only the first channel of each band. |
|
peah-webrtc
2016/05/22 21:06:48
Have you checked the impact on the signal when thi
aluebs-webrtc
2016/05/26 01:04:45
This is no longer relevant, since we decided offli
peah-webrtc
2016/05/26 08:48:52
I think the perfect reconstruction may actually be
aluebs-webrtc
2016/05/28 03:00:00
I meant that other components are already non-line
peah-webrtc
2016/05/30 11:49:25
Acknowledged.
|
| + // This can be done because the effect of the linear beamformer is negligible |
| + // compared to the post-filter. |
| + for (size_t i = 1; i < input.num_bands(); ++i) { |
| + memcpy(output->channels(i)[0], |
|
peah-webrtc
2016/05/22 21:06:48
what happens if the output is dual channel? Since
aluebs-webrtc
2016/05/26 01:04:45
This can't be done as is, since the input and outp
peah-webrtc
2016/05/30 11:49:25
So this means that the input could have 2 channels
aluebs-webrtc
2016/06/01 00:16:34
I don't think that adding an additional interface
peah-webrtc
2016/06/01 14:51:01
It actually does not add code complexity, as it se
aluebs-webrtc
2016/06/01 22:13:20
Interface removed and changed to take input only.
|
| + input.channels(i)[0], |
| + input.num_frames_per_band() * sizeof(output->channels(i)[0][0])); |
| + } |
| +} |
| + |
| +void NonlinearBeamformer::PostFilter(const ChannelBuffer<float>& input, |
| + ChannelBuffer<float>* output) { |
| + RTC_DCHECK_EQ(input.num_frames_per_band(), chunk_length_); |
| + |
| + postfilter_transform_->ProcessChunk(input.channels(0), output->channels(0)); |
| + |
| // Ramp up/down for smoothing. 1 mask per 10ms results in audible |
|
peah-webrtc
2016/05/22 21:06:48
I guess, what you mean is that smoothing is needed
aluebs-webrtc
2016/05/26 01:04:45
Adds unrelated changes to the CL, but if you think
peah-webrtc
2016/05/30 11:49:25
Sounds awesome!
|
| // discontinuities. |
| const float ramp_increment = |
| - (high_pass_postfilter_mask_ - old_high_pass_mask) / |
| + (high_pass_postfilter_mask_ - old_high_pass_mask_) / |
| input.num_frames_per_band(); |
| - // Apply the smoothed high-pass mask to the first channel of each band. |
| - // This can be done because the effect of the linear beamformer is negligible |
| - // compared to the post-filter. |
| for (size_t i = 1; i < input.num_bands(); ++i) { |
| - float smoothed_mask = old_high_pass_mask; |
| + float smoothed_mask = old_high_pass_mask_; |
| for (size_t j = 0; j < input.num_frames_per_band(); ++j) { |
| smoothed_mask += ramp_increment; |
| output->channels(i)[0][j] = input.channels(i)[0][j] * smoothed_mask; |
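For reference, the ramp applied above is equivalent to the following per-sample gain (a standalone sketch with a hypothetical helper name, not code from the CL):

    // gain(j) = old_mask + (j + 1) * (new_mask - old_mask) / num_frames,
    // i.e. a linear fade from the previous chunk's high-pass mask to the
    // current one across one band, instead of a single step every 10 ms.
    float RampedHighPassMask(float old_mask, float new_mask, size_t j,
                             size_t num_frames) {
      const float ramp_increment = (new_mask - old_mask) / num_frames;
      return old_mask + (j + 1) * ramp_increment;
    }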
| @@ -456,7 +496,7 @@ void NonlinearBeamformer::ProcessAudioBlock(const complex_f* const* input, |
| ApplyLowFrequencyCorrection(); |
| ApplyHighFrequencyCorrection(); |
| ApplyMaskFrequencySmoothing(); |
| - ApplyMasks(input, output); |
| + ApplyDelayAndSum(input, output); |
| } |
| float NonlinearBeamformer::CalculatePostfilterMask( |
| @@ -484,8 +524,8 @@ float NonlinearBeamformer::CalculatePostfilterMask( |
| return numerator / denominator; |
| } |
| -void NonlinearBeamformer::ApplyMasks(const complex_f* const* input, |
| - complex_f* const* output) { |
| +void NonlinearBeamformer::ApplyDelayAndSum(const complex_f* const* input, |
| + complex_f* const* output) { |
| complex_f* output_channel = output[0]; |
| for (size_t f_ix = 0; f_ix < kNumFreqBins; ++f_ix) { |
| output_channel[f_ix] = complex_f(0.f, 0.f); |
| @@ -495,8 +535,13 @@ void NonlinearBeamformer::ApplyMasks(const complex_f* const* input, |
| for (size_t c_ix = 0; c_ix < num_input_channels_; ++c_ix) { |
| output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix]; |
| } |
| + } |
| +} |
| - output_channel[f_ix] *= kCompensationGain * final_mask_[f_ix]; |
| +void NonlinearBeamformer::ApplyPostFilter(const complex_f* input, |
| + complex_f* output) { |
| + for (size_t f_ix = 0; f_ix < kNumFreqBins; ++f_ix) { |
| + output[f_ix] = kCompensationGain * final_mask_[f_ix] * input[f_ix]; |
| } |
| } |
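Net effect of splitting the old ApplyMasks() in two, summarized per frequency bin f (a summary sketch using the names visible in this diff; the delay-and-sum weights are written generically):

    // First transform (ProcessAudioBlock -> ApplyDelayAndSum):
    //   beamformed[f] = sum over channels c of input[c][f] * w[c][f],
    //   where w[c][f] are the per-channel delay-and-sum weights for bin f.
    // Second transform (PostFilterTransform -> ApplyPostFilter):
    //   output[f] = kCompensationGain * final_mask_[f] * beamformed[f]
    // Previously both steps ran in one pass inside a single LappedTransform;
    // the split lets the post-filter run in its own single-channel transform
    // over the beamformed signal.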