Index: webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc |
diff --git a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc |
index da7ad0da59c1d663175e6ed878ab1a8aea06407d..40e738c6d1fc6a4735763c178dc3259124435d17 100644 |
--- a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc |
+++ b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc |
@@ -27,34 +27,23 @@ namespace { |
// Alpha for the Kaiser Bessel Derived window. |
const float kKbdAlpha = 1.5f; |
-// The minimum value a post-processing mask can take. |
-const float kMaskMinimum = 0.01f; |
- |
const float kSpeedOfSoundMeterSeconds = 343; |
// For both target and interference angles, PI / 2 is perpendicular to the |
// microphone array, facing forwards. The positive direction goes |
// counterclockwise. |
// The angle at which we amplify sound. |
+// TODO(aluebs): Make the target angle dynamically settable. |
const float kTargetAngleRadians = static_cast<float>(M_PI) / 2.f; |
-// The angle at which we suppress sound. Suppression is symmetric around PI / 2 |
-// radians, so sound is suppressed at both +|kInterfAngleRadians| and |
-// PI - |kInterfAngleRadians|. Since the beamformer is robust, this should |
-// suppress sound coming from close angles as well. |
-const float kInterfAngleRadians = static_cast<float>(M_PI) / 4.f; |
- |
// When calculating the interference covariance matrix, this is the weight for |
// the weighted average between the uniform covariance matrix and the angled |
// covariance matrix. |
// Rpsi = Rpsi_angled * kBalance + Rpsi_uniform * (1 - kBalance) |
-const float kBalance = 0.4f; |
+const float kBalance = 0.95f; |
const float kHalfBeamWidthRadians = static_cast<float>(M_PI) * 20.f / 180.f; |
-// TODO(claguna): need comment here. |
-const float kBeamwidthConstant = 0.00002f; |
- |
// Alpha coefficients for mask smoothing. |
const float kMaskTimeSmoothAlpha = 0.2f; |
const float kMaskFrequencySmoothAlpha = 0.6f; |
@@ -64,17 +53,33 @@ const float kMaskFrequencySmoothAlpha = 0.6f; |
const int kLowMeanStartHz = 200; |
const int kLowMeanEndHz = 400; |
+// TODO(aluebs): Make the high frequency correction range depend on the target |
+// angle. |
const int kHighMeanStartHz = 3000; |
const int kHighMeanEndHz = 5000; |
+// Range limiter for subtractive terms in the nominator and denominator of the |
+// postfilter expression. It handles the scenario mismatch between the true and |
+// model sources (target and interference). |
+const float kCutOffConstant = 0.9999f; |
+ |
// Quantile of mask values which is used to estimate target presence. |
const float kMaskQuantile = 0.7f; |
// Mask threshold over which the data is considered signal and not interference. |
-const float kMaskTargetThreshold = 0.3f; |
+// It has to be updated every time the postfilter calculation is changed |
+// significantly. |
+// TODO(aluebs): Write a tool to tune the target threshold automatically based |
+// on files annotated with target and interference ground truth. |
+const float kMaskTargetThreshold = 0.01f; |
// Time in seconds after which the data is considered interference if the mask |
// does not pass |kMaskTargetThreshold|. |
const float kHoldTargetSeconds = 0.25f; |
+// To compensate for the attenuation this algorithm introduces to the target |
+// signal. It was estimated empirically from a low-noise low-reverberation |
+// recording from broadside. |
+const float kCompensationGain = 2.f; |
+ |
// Does conjugate(|norm_mat|) * |mat| * transpose(|norm_mat|). No extra space is |
// used; to accomplish this, we compute both multiplications in the same loop. |
// The returned norm is clamped to be non-negative. |
@@ -218,7 +223,6 @@ void NonlinearBeamformer::Initialize(int chunk_size_ms, int sample_rate_hz) { |
hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize; |
interference_blocks_count_ = hold_target_blocks_; |
- |
lapped_transform_.reset(new LappedTransform(num_input_channels_, |
1, |
chunk_length_, |
@@ -231,24 +235,34 @@ void NonlinearBeamformer::Initialize(int chunk_size_ms, int sample_rate_hz) { |
final_mask_[i] = 1.f; |
float freq_hz = (static_cast<float>(i) / kFftSize) * sample_rate_hz_; |
wave_numbers_[i] = 2 * M_PI * freq_hz / kSpeedOfSoundMeterSeconds; |
- mask_thresholds_[i] = num_input_channels_ * num_input_channels_ * |
- kBeamwidthConstant * wave_numbers_[i] * |
- wave_numbers_[i]; |
} |
// Initialize all nonadaptive values before looping through the frames. |
+ InitInterfAngles(); |
InitDelaySumMasks(); |
InitTargetCovMats(); |
InitInterfCovMats(); |
for (size_t i = 0; i < kNumFreqBins; ++i) { |
rxiws_[i] = Norm(target_cov_mats_[i], delay_sum_masks_[i]); |
- rpsiws_[i] = Norm(interf_cov_mats_[i], delay_sum_masks_[i]); |
- reflected_rpsiws_[i] = |
- Norm(reflected_interf_cov_mats_[i], delay_sum_masks_[i]); |
+ rpsiws_[i].clear(); |
+ for (size_t j = 0; j < interf_angles_radians_.size(); ++j) { |
+ rpsiws_[i].push_back(Norm(*interf_cov_mats_[i][j], delay_sum_masks_[i])); |
+ } |
} |
} |
+void NonlinearBeamformer::InitInterfAngles() { |
+ // TODO(aluebs): Make kAwayRadians dependent on the mic spacing. |
+ const float kAwayRadians = 0.5; |
+ |
+ interf_angles_radians_.clear(); |
+ // TODO(aluebs): When the target angle is settable, make sure the interferer |
+ // scenarios aren't reflected over the target one for linear geometries. |
+ interf_angles_radians_.push_back(kTargetAngleRadians - kAwayRadians); |
+ interf_angles_radians_.push_back(kTargetAngleRadians + kAwayRadians); |
+} |
+ |
void NonlinearBeamformer::InitDelaySumMasks() { |
for (size_t f_ix = 0; f_ix < kNumFreqBins; ++f_ix) { |
delay_sum_masks_[f_ix].Resize(1, num_input_channels_); |
@@ -273,40 +287,39 @@ void NonlinearBeamformer::InitTargetCovMats() { |
for (size_t i = 0; i < kNumFreqBins; ++i) { |
target_cov_mats_[i].Resize(num_input_channels_, num_input_channels_); |
TransposedConjugatedProduct(delay_sum_masks_[i], &target_cov_mats_[i]); |
- complex_f normalization_factor = target_cov_mats_[i].Trace(); |
- target_cov_mats_[i].Scale(1.f / normalization_factor); |
} |
} |
void NonlinearBeamformer::InitInterfCovMats() { |
for (size_t i = 0; i < kNumFreqBins; ++i) { |
- interf_cov_mats_[i].Resize(num_input_channels_, num_input_channels_); |
ComplexMatrixF uniform_cov_mat(num_input_channels_, num_input_channels_); |
- ComplexMatrixF angled_cov_mat(num_input_channels_, num_input_channels_); |
- |
CovarianceMatrixGenerator::UniformCovarianceMatrix(wave_numbers_[i], |
array_geometry_, |
&uniform_cov_mat); |
- |
- CovarianceMatrixGenerator::AngledCovarianceMatrix(kSpeedOfSoundMeterSeconds, |
- kInterfAngleRadians, |
- i, |
- kFftSize, |
- kNumFreqBins, |
- sample_rate_hz_, |
- array_geometry_, |
- &angled_cov_mat); |
- // Normalize matrices before averaging them. |
- complex_f normalization_factor = uniform_cov_mat.Trace(); |
+ complex_f normalization_factor = uniform_cov_mat.elements()[0][0]; |
uniform_cov_mat.Scale(1.f / normalization_factor); |
- normalization_factor = angled_cov_mat.Trace(); |
- angled_cov_mat.Scale(1.f / normalization_factor); |
- |
- // Average matrices. |
uniform_cov_mat.Scale(1 - kBalance); |
- angled_cov_mat.Scale(kBalance); |
- interf_cov_mats_[i].Add(uniform_cov_mat, angled_cov_mat); |
- reflected_interf_cov_mats_[i].PointwiseConjugate(interf_cov_mats_[i]); |
+ interf_cov_mats_[i].clear(); |
+ for (size_t j = 0; j < interf_angles_radians_.size(); ++j) { |
+ interf_cov_mats_[i].push_back(new ComplexMatrixF(num_input_channels_, |
+ num_input_channels_)); |
+ ComplexMatrixF angled_cov_mat(num_input_channels_, num_input_channels_); |
+ CovarianceMatrixGenerator::AngledCovarianceMatrix( |
+ kSpeedOfSoundMeterSeconds, |
+ interf_angles_radians_[j], |
+ i, |
+ kFftSize, |
+ kNumFreqBins, |
+ sample_rate_hz_, |
+ array_geometry_, |
+ &angled_cov_mat); |
+ // Normalize matrices before averaging them. |
+ normalization_factor = angled_cov_mat.elements()[0][0]; |
+ angled_cov_mat.Scale(1.f / normalization_factor); |
+ // Weighted average of matrices. |
+ angled_cov_mat.Scale(kBalance); |
+ interf_cov_mats_[i][j]->Add(uniform_cov_mat, angled_cov_mat); |
+ } |
} |
} |
@@ -376,17 +389,19 @@ void NonlinearBeamformer::ProcessAudioBlock(const complex_f* const* input, |
rmw *= rmw; |
float rmw_r = rmw.real(); |
- new_mask_[i] = CalculatePostfilterMask(interf_cov_mats_[i], |
- rpsiws_[i], |
+ new_mask_[i] = CalculatePostfilterMask(*interf_cov_mats_[i][0], |
+ rpsiws_[i][0], |
ratio_rxiw_rxim, |
- rmw_r, |
- mask_thresholds_[i]); |
- |
- new_mask_[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i], |
- reflected_rpsiws_[i], |
- ratio_rxiw_rxim, |
- rmw_r, |
- mask_thresholds_[i]); |
+ rmw_r); |
+ for (size_t j = 1; j < interf_angles_radians_.size(); ++j) { |
+ float tmp_mask = CalculatePostfilterMask(*interf_cov_mats_[i][j], |
+ rpsiws_[i][j], |
+ ratio_rxiw_rxim, |
+ rmw_r); |
+ if (tmp_mask < new_mask_[i]) { |
+ new_mask_[i] = tmp_mask; |
+ } |
+ } |
} |
ApplyMaskTimeSmoothing(); |
@@ -401,24 +416,16 @@ float NonlinearBeamformer::CalculatePostfilterMask( |
const ComplexMatrixF& interf_cov_mat, |
float rpsiw, |
float ratio_rxiw_rxim, |
- float rmw_r, |
- float mask_threshold) { |
+ float rmw_r) { |
float rpsim = Norm(interf_cov_mat, eig_m_); |
- // Find lambda. |
float ratio = 0.f; |
if (rpsim > 0.f) { |
ratio = rpsiw / rpsim; |
} |
- float numerator = rmw_r - ratio; |
- float denominator = ratio_rxiw_rxim - ratio; |
- float mask = 1.f; |
- if (denominator > mask_threshold) { |
- float lambda = numerator / denominator; |
- mask = std::max(lambda * ratio_rxiw_rxim / rmw_r, kMaskMinimum); |
- } |
- return mask; |
+ return (1.f - std::min(kCutOffConstant, ratio / rmw_r)) / |
+ (1.f - std::min(kCutOffConstant, ratio / ratio_rxiw_rxim)); |
} |
void NonlinearBeamformer::ApplyMasks(const complex_f* const* input, |
@@ -433,7 +440,7 @@ void NonlinearBeamformer::ApplyMasks(const complex_f* const* input, |
output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix]; |
} |
- output_channel[f_ix] *= final_mask_[f_ix]; |
+ output_channel[f_ix] *= kCompensationGain * final_mask_[f_ix]; |
} |
} |