| Index: webrtc/modules/audio_processing/aec3/matched_filter.cc | 
| diff --git a/webrtc/modules/audio_processing/aec3/matched_filter.cc b/webrtc/modules/audio_processing/aec3/matched_filter.cc | 
| index 7bb5778999879f91012641f82f72c491b1df11c4..4c6e0d70525ad9eba85cb52043d49b82751af999 100644 | 
| --- a/webrtc/modules/audio_processing/aec3/matched_filter.cc | 
| +++ b/webrtc/modules/audio_processing/aec3/matched_filter.cc | 
| @@ -9,6 +9,9 @@ | 
| */ | 
| #include "webrtc/modules/audio_processing/aec3/matched_filter.h" | 
|  | 
| +#if defined(WEBRTC_HAS_NEON) | 
| +#include <arm_neon.h> | 
| +#endif | 
| #include "webrtc/typedefs.h" | 
| #if defined(WEBRTC_ARCH_X86_FAMILY) | 
| #include <emmintrin.h> | 
| @@ -22,6 +25,114 @@ | 
| namespace webrtc { | 
| namespace aec3 { | 
|  | 
| +#if defined(WEBRTC_HAS_NEON) | 
| + | 
| +void MatchedFilterCore_NEON(size_t x_start_index, | 
| +                            float x2_sum_threshold, | 
| +                            rtc::ArrayView<const float> x, | 
| +                            rtc::ArrayView<const float> y, | 
| +                            rtc::ArrayView<float> h, | 
| +                            bool* filters_updated, | 
| +                            float* error_sum) { | 
| +  const int h_size = static_cast<int>(h.size()); | 
| +  const int x_size = static_cast<int>(x.size()); | 
| +  RTC_DCHECK_EQ(0, h_size % 4); | 
| + | 
| +  // Process for all samples in the sub-block. | 
| +  for (size_t i = 0; i < kSubBlockSize; ++i) { | 
| +    // Apply the matched filter as filter * x, and compute x * x. | 
| + | 
| +    RTC_DCHECK_GT(x_size, x_start_index); | 
| +    const float* x_p = &x[x_start_index]; | 
| +    const float* h_p = &h[0]; | 
| + | 
| +    // Initialize values for the accumulation. | 
| +    float32x4_t s_128 = vdupq_n_f32(0); | 
| +    float32x4_t x2_sum_128 = vdupq_n_f32(0); | 
| +    float x2_sum = 0.f; | 
| +    float s = 0; | 
| + | 
| +    // Compute loop chunk sizes until, and after, the wraparound of the circular | 
| +    // buffer for x. | 
| +    const int chunk1 = | 
| +        std::min(h_size, static_cast<int>(x_size - x_start_index)); | 
| + | 
| +    // Perform the loop in two chunks. | 
| +    const int chunk2 = h_size - chunk1; | 
| +    for (int limit : {chunk1, chunk2}) { | 
| +      // Perform 128 bit vector operations. | 
| +      const int limit_by_4 = limit >> 2; | 
| +      for (int k = limit_by_4; k > 0; --k, h_p += 4, x_p += 4) { | 
| +        // Load the data into 128 bit vectors. | 
| +        const float32x4_t x_k = vld1q_f32(x_p); | 
| +        const float32x4_t h_k = vld1q_f32(h_p); | 
| +        // Compute and accumulate x * x and h * x. | 
| +        x2_sum_128 = vmlaq_f32(x2_sum_128, x_k, x_k); | 
| +        s_128 = vmlaq_f32(s_128, h_k, x_k); | 
| +      } | 
| + | 
| +      // Perform non-vector operations for any remaining items. | 
| +      for (int k = limit - limit_by_4 * 4; k > 0; --k, ++h_p, ++x_p) { | 
| +        const float x_k = *x_p; | 
| +        x2_sum += x_k * x_k; | 
| +        s += *h_p * x_k; | 
| +      } | 
| + | 
| +      x_p = &x[0]; | 
| +    } | 
| + | 
| +    // Combine the accumulated vector and scalar values. | 
| +    float* v = reinterpret_cast<float*>(&x2_sum_128); | 
| +    x2_sum += v[0] + v[1] + v[2] + v[3]; | 
| +    v = reinterpret_cast<float*>(&s_128); | 
| +    s += v[0] + v[1] + v[2] + v[3]; | 
| + | 
| +    // Compute the matched filter error. | 
| +    const float e = std::min(32767.f, std::max(-32768.f, y[i] - s)); | 
| +    *error_sum += e * e; | 
| + | 
| +    // Update the matched filter estimate in an NLMS manner. | 
| +    if (x2_sum > x2_sum_threshold) { | 
| +      RTC_DCHECK_LT(0.f, x2_sum); | 
| +      const float alpha = 0.7f * e / x2_sum; | 
| +      const float32x4_t alpha_128 = vmovq_n_f32(alpha); | 
| + | 
| +      // filter = filter + 0.7 * (y - filter * x) / x * x. | 
| +      float* h_p = &h[0]; | 
| +      x_p = &x[x_start_index]; | 
| + | 
| +      // Perform the loop in two chunks. | 
| +      for (int limit : {chunk1, chunk2}) { | 
| +        // Perform 128 bit vector operations. | 
| +        const int limit_by_4 = limit >> 2; | 
| +        for (int k = limit_by_4; k > 0; --k, h_p += 4, x_p += 4) { | 
| +          // Load the data into 128 bit vectors. | 
| +          float32x4_t h_k = vld1q_f32(h_p); | 
| +          const float32x4_t x_k = vld1q_f32(x_p); | 
| +          // Compute h = h + alpha * x. | 
| +          h_k = vmlaq_f32(h_k, alpha_128, x_k); | 
| + | 
| +          // Store the result. | 
| +          vst1q_f32(h_p, h_k); | 
| +        } | 
| + | 
| +        // Perform non-vector operations for any remaining items. | 
| +        for (int k = limit - limit_by_4 * 4; k > 0; --k, ++h_p, ++x_p) { | 
| +          *h_p += alpha * *x_p; | 
| +        } | 
| + | 
| +        x_p = &x[0]; | 
| +      } | 
| + | 
| +      *filters_updated = true; | 
| +    } | 
| + | 
| +    x_start_index = x_start_index > 0 ? x_start_index - 1 : x_size - 1; | 
| +  } | 
| +} | 
| + | 
| +#endif | 
| + | 
| #if defined(WEBRTC_ARCH_X86_FAMILY) | 
|  | 
| void MatchedFilterCore_SSE2(size_t x_start_index, | 
| @@ -227,6 +338,13 @@ void MatchedFilter::Update(const DownsampledRenderBuffer& render_buffer, | 
| &filters_updated, &error_sum); | 
| break; | 
| #endif | 
| +#if defined(WEBRTC_HAS_NEON) | 
| +      case Aec3Optimization::kNeon: | 
| +        aec3::MatchedFilterCore_NEON(x_start_index, x2_sum_threshold, | 
| +                                     render_buffer.buffer, y, filters_[n], | 
| +                                     &filters_updated, &error_sum); | 
| +        break; | 
| +#endif | 
| default: | 
| aec3::MatchedFilterCore(x_start_index, x2_sum_threshold, | 
| render_buffer.buffer, y, filters_[n], | 
|  |