webrtc/modules/audio_processing/aec3/vector_math.h - Issue 2862573002: Reland of Added ARM Neon SIMD optimizations for AEC3

Side by Side Diff: webrtc/modules/audio_processing/aec3/vector_math.h

Issue 2862573002: Reland of Added ARM Neon SIMD optimizations for AEC3 (Closed)

Patch Set: Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « webrtc/modules/audio_processing/aec3/matched_filter_unittest.cc ('k') | webrtc/modules/audio_processing/aec3/vector_math_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.	2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_	11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_

12 #define WEBRTC_MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_	12 #define WEBRTC_MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_

13	13

14 #include "webrtc/typedefs.h"	14 #include "webrtc/typedefs.h"

	15 #if defined(WEBRTC_HAS_NEON)

	16 #include <arm_neon.h>

	17 #endif

15 #if defined(WEBRTC_ARCH_X86_FAMILY)	18 #if defined(WEBRTC_ARCH_X86_FAMILY)

16 #include <emmintrin.h>	19 #include <emmintrin.h>

17 #endif	20 #endif

18 #include <math.h>	21 #include <math.h>

19 #include <algorithm>	22 #include <algorithm>

20 #include <array>	23 #include <array>

21 #include <functional>	24 #include <functional>

22	25

23 #include "webrtc/base/array_view.h"	26 #include "webrtc/base/array_view.h"

24 #include "webrtc/base/checks.h"	27 #include "webrtc/base/checks.h"

(...skipping 21 matching lines...) Expand all Loading...
46 __m128 g = _mm_loadu_ps(&x[j]);	49 __m128 g = _mm_loadu_ps(&x[j]);

47 g = _mm_sqrt_ps(g);	50 g = _mm_sqrt_ps(g);

48 _mm_storeu_ps(&x[j], g);	51 _mm_storeu_ps(&x[j], g);

49 }	52 }

50	53

51 for (; j < x_size; ++j) {	54 for (; j < x_size; ++j) {

52 x[j] = sqrtf(x[j]);	55 x[j] = sqrtf(x[j]);

53 }	56 }

54 } break;	57 } break;

55 #endif	58 #endif

	59 #if defined(WEBRTC_HAS_NEON)

	60 case Aec3Optimization::kNeon: {

	61 const int x_size = static_cast<int>(x.size());

	62 const int vector_limit = x_size >> 2;

	63

	64 int j = 0;

	65 for (; j < vector_limit * 4; j += 4) {

	66 float32x4_t g = vld1q_f32(&x[j]);

	67 #if !defined(WEBRTC_ARCH_ARM64)

	68 float32x4_t y = vrsqrteq_f32(g);

	69

	70 // Code to handle sqrt(0).

	71 // If the input to sqrtf() is zero, a zero will be returned.

	72 // If the input to vrsqrteq_f32() is zero, positive infinity is

	73 // returned.

	74 const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);

	75 // check for divide by zero

	76 const uint32x4_t div_by_zero =

	77 vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(y));

	78 // zero out the positive infinity results

	79 y = vreinterpretq_f32_u32(

	80 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(y)));

	81 // from arm documentation

	82 // The Newton-Raphson iteration:

	83 // y[n+1] = y[n] * (3 - d * (y[n] * y[n])) / 2

	84 // converges to (1/√d) if y0 is the result of VRSQRTE applied to d.

	85 //

	86 // Note: The precision did not improve after 2 iterations.

	87 for (int i = 0; i < 2; i++) {

	88 y = vmulq_f32(vrsqrtsq_f32(vmulq_f32(y, y), g), y);

	89 }

	90 // sqrt(g) = g * 1/sqrt(g)

	91 g = vmulq_f32(g, y);

	92 #else

	93 g = vsqrtq_f32(g);

	94 #endif

	95 vst1q_f32(&x[j], g);

	96 }

	97

	98 for (; j < x_size; ++j) {

	99 x[j] = sqrtf(x[j]);

	100 }

	101 }

	102 #endif

	103 break;

56 default:	104 default:

57 std::for_each(x.begin(), x.end(), [](float& a) { a = sqrtf(a); });	105 std::for_each(x.begin(), x.end(), [](float& a) { a = sqrtf(a); });

58 }	106 }

59 }	107 }

60	108

61 // Elementwise vector multiplication z = x * y.	109 // Elementwise vector multiplication z = x * y.

62 void Multiply(rtc::ArrayView<const float> x,	110 void Multiply(rtc::ArrayView<const float> x,

63 rtc::ArrayView<const float> y,	111 rtc::ArrayView<const float> y,

64 rtc::ArrayView<float> z) {	112 rtc::ArrayView<float> z) {

65 RTC_DCHECK_EQ(z.size(), x.size());	113 RTC_DCHECK_EQ(z.size(), x.size());

(...skipping 10 matching lines...) Expand all Loading...
76 const __m128 y_j = _mm_loadu_ps(&y[j]);	124 const __m128 y_j = _mm_loadu_ps(&y[j]);

77 const __m128 z_j = _mm_mul_ps(x_j, y_j);	125 const __m128 z_j = _mm_mul_ps(x_j, y_j);

78 _mm_storeu_ps(&z[j], z_j);	126 _mm_storeu_ps(&z[j], z_j);

79 }	127 }

80	128

81 for (; j < x_size; ++j) {	129 for (; j < x_size; ++j) {

82 z[j] = x[j] * y[j];	130 z[j] = x[j] * y[j];

83 }	131 }

84 } break;	132 } break;

85 #endif	133 #endif

	134 #if defined(WEBRTC_HAS_NEON)

	135 case Aec3Optimization::kNeon: {

	136 const int x_size = static_cast<int>(x.size());

	137 const int vector_limit = x_size >> 2;

	138

	139 int j = 0;

	140 for (; j < vector_limit * 4; j += 4) {

	141 const float32x4_t x_j = vld1q_f32(&x[j]);

	142 const float32x4_t y_j = vld1q_f32(&y[j]);

	143 const float32x4_t z_j = vmulq_f32(x_j, y_j);

	144 vst1q_f32(&z[j], z_j);

	145 }

	146

	147 for (; j < x_size; ++j) {

	148 z[j] = x[j] * y[j];

	149 }

	150 } break;

	151 #endif

86 default:	152 default:

87 std::transform(x.begin(), x.end(), y.begin(), z.begin(),	153 std::transform(x.begin(), x.end(), y.begin(), z.begin(),

88 std::multiplies<float>());	154 std::multiplies<float>());

89 }	155 }

90 }	156 }

91	157

92 // Elementwise vector accumulation z += x.	158 // Elementwise vector accumulation z += x.

93 void Accumulate(rtc::ArrayView<const float> x, rtc::ArrayView<float> z) {	159 void Accumulate(rtc::ArrayView<const float> x, rtc::ArrayView<float> z) {

94 RTC_DCHECK_EQ(z.size(), x.size());	160 RTC_DCHECK_EQ(z.size(), x.size());

95 switch (optimization_) {	161 switch (optimization_) {

96 #if defined(WEBRTC_ARCH_X86_FAMILY)	162 #if defined(WEBRTC_ARCH_X86_FAMILY)

97 case Aec3Optimization::kSse2: {	163 case Aec3Optimization::kSse2: {

98 const int x_size = static_cast<int>(x.size());	164 const int x_size = static_cast<int>(x.size());

99 const int vector_limit = x_size >> 2;	165 const int vector_limit = x_size >> 2;

100	166

101 int j = 0;	167 int j = 0;

102 for (; j < vector_limit * 4; j += 4) {	168 for (; j < vector_limit * 4; j += 4) {

103 const __m128 x_j = _mm_loadu_ps(&x[j]);	169 const __m128 x_j = _mm_loadu_ps(&x[j]);

104 __m128 z_j = _mm_loadu_ps(&z[j]);	170 __m128 z_j = _mm_loadu_ps(&z[j]);

105 z_j = _mm_add_ps(x_j, z_j);	171 z_j = _mm_add_ps(x_j, z_j);

106 _mm_storeu_ps(&z[j], z_j);	172 _mm_storeu_ps(&z[j], z_j);

107 }	173 }

108	174

109 for (; j < x_size; ++j) {	175 for (; j < x_size; ++j) {

110 z[j] += x[j];	176 z[j] += x[j];

111 }	177 }

112 } break;	178 } break;

113 #endif	179 #endif

	180 #if defined(WEBRTC_HAS_NEON)

	181 case Aec3Optimization::kNeon: {

	182 const int x_size = static_cast<int>(x.size());

	183 const int vector_limit = x_size >> 2;

	184

	185 int j = 0;

	186 for (; j < vector_limit * 4; j += 4) {

	187 const float32x4_t x_j = vld1q_f32(&x[j]);

	188 float32x4_t z_j = vld1q_f32(&z[j]);

	189 z_j = vaddq_f32(z_j, x_j);

	190 vst1q_f32(&z[j], z_j);

	191 }

	192

	193 for (; j < x_size; ++j) {

	194 z[j] += x[j];

	195 }

	196 } break;

	197 #endif

114 default:	198 default:

115 std::transform(x.begin(), x.end(), z.begin(), z.begin(),	199 std::transform(x.begin(), x.end(), z.begin(), z.begin(),

116 std::plus<float>());	200 std::plus<float>());

117 }	201 }

118 }	202 }

119	203

120 private:	204 private:

121 Aec3Optimization optimization_;	205 Aec3Optimization optimization_;

122 };	206 };

123	207

124 } // namespace aec3	208 } // namespace aec3

125	209

126 } // namespace webrtc	210 } // namespace webrtc

127	211

128 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_	212 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_

OLD	NEW