| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 11 #include "webrtc/modules/audio_processing/utility/ooura_fft.h" |
| 12 | 12 |
| 13 #include <emmintrin.h> | 13 #include <emmintrin.h> |
| 14 | 14 |
| 15 static const ALIGN16_BEG float ALIGN16_END | 15 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h" |
| 16 k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f}; | 16 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h" |
| 17 | 17 |
| 18 static void cft1st_128_SSE2(float* a) { | 18 namespace webrtc { |
| 19 |
| 20 #if defined(WEBRTC_ARCH_X86_FAMILY) |
| 21 |
| 22 namespace { |
| 23 // These intrinsics were unavailable before VS 2008. |
| 24 // TODO(andrew): move to a common file. |
| 25 #if defined(_MSC_VER) && _MSC_VER < 1500 |
| 26 static __inline __m128 _mm_castsi128_ps(__m128i a) { |
| 27 return *(__m128*)&a; |
| 28 } |
| 29 static __inline __m128i _mm_castps_si128(__m128 a) { |
| 30 return *(__m128i*)&a; |
| 31 } |
| 32 #endif |
| 33 |
| 34 } // namespace |
| 35 |
| 36 void cft1st_128_SSE2(float* a) { |
| 19 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | 37 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); |
| 20 int j, k2; | 38 int j, k2; |
| 21 | 39 |
| 22 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { | 40 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { |
| 23 __m128 a00v = _mm_loadu_ps(&a[j + 0]); | 41 __m128 a00v = _mm_loadu_ps(&a[j + 0]); |
| 24 __m128 a04v = _mm_loadu_ps(&a[j + 4]); | 42 __m128 a04v = _mm_loadu_ps(&a[j + 4]); |
| 25 __m128 a08v = _mm_loadu_ps(&a[j + 8]); | 43 __m128 a08v = _mm_loadu_ps(&a[j + 8]); |
| 26 __m128 a12v = _mm_loadu_ps(&a[j + 12]); | 44 __m128 a12v = _mm_loadu_ps(&a[j + 12]); |
| 27 __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0)); | 45 __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0)); |
| 28 __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2)); | 46 __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2)); |
| (...skipping 42 matching lines...) |
| 71 a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0)); | 89 a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0)); |
| 72 a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2)); | 90 a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2)); |
| 73 a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2)); | 91 a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2)); |
| 74 _mm_storeu_ps(&a[j + 0], a00v); | 92 _mm_storeu_ps(&a[j + 0], a00v); |
| 75 _mm_storeu_ps(&a[j + 4], a04v); | 93 _mm_storeu_ps(&a[j + 4], a04v); |
| 76 _mm_storeu_ps(&a[j + 8], a08v); | 94 _mm_storeu_ps(&a[j + 8], a08v); |
| 77 _mm_storeu_ps(&a[j + 12], a12v); | 95 _mm_storeu_ps(&a[j + 12], a12v); |
| 78 } | 96 } |
| 79 } | 97 } |
| 80 | 98 |
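Context for readers new to the Ooura FFT kernels being moved here: cft1st_128_SSE2 vectorizes the first radix-4 stage of the 128-point complex FFT, de-interleaving pairs of {re, im} values with _mm_shuffle_ps so that two butterflies are processed per iteration. Below is a minimal scalar sketch of one twiddled radix-4 butterfly of this general kind; the indexing, output ordering, and sign convention are illustrative only and may differ in detail from the actual Ooura scalar reference.

// Illustrative scalar radix-4 butterfly on interleaved {re, im} data.
// a[] holds four complex values x0..x3; w1, w2, w3 are twiddle factors
// (the vector code loads these from the rdft_wk* tables instead).
static void Radix4Butterfly(float* a, float w1r, float w1i, float w2r,
                            float w2i, float w3r, float w3i) {
  const float t0r = a[0] + a[4], t0i = a[1] + a[5];  // x0 + x2
  const float t1r = a[0] - a[4], t1i = a[1] - a[5];  // x0 - x2
  const float t2r = a[2] + a[6], t2i = a[3] + a[7];  // x1 + x3
  const float t3r = a[2] - a[6], t3i = a[3] - a[7];  // x1 - x3
  a[0] = t0r + t2r;                                  // y0 = t0 + t2
  a[1] = t0i + t2i;
  const float u2r = t0r - t2r, u2i = t0i - t2i;      // y2 = (t0 - t2) * w2
  a[4] = w2r * u2r - w2i * u2i;
  a[5] = w2r * u2i + w2i * u2r;
  const float u1r = t1r - t3i, u1i = t1i + t3r;      // y1 = (t1 + i*t3) * w1
  a[2] = w1r * u1r - w1i * u1i;
  a[3] = w1r * u1i + w1i * u1r;
  const float u3r = t1r + t3i, u3i = t1i - t3r;      // y3 = (t1 - i*t3) * w3
  a[6] = w3r * u3r - w3i * u3i;
  a[7] = w3r * u3i + w3i * u3r;
}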
| 81 static void cftmdl_128_SSE2(float* a) { | 99 void cftmdl_128_SSE2(float* a) { |
| 82 const int l = 8; | 100 const int l = 8; |
| 83 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | 101 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); |
| 84 int j0; | 102 int j0; |
| 85 | 103 |
| 86 __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); | 104 __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); |
| 87 for (j0 = 0; j0 < l; j0 += 2) { | 105 for (j0 = 0; j0 < l; j0 += 2) { |
| 88 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); | 106 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); |
| 89 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); | 107 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); |
| 90 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | 108 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); |
| 91 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | 109 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); |
| 92 const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), | 110 const __m128 a_00_32 = |
| 93 _mm_castsi128_ps(a_32), | 111 _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), |
| 94 _MM_SHUFFLE(1, 0, 1, 0)); | 112 _MM_SHUFFLE(1, 0, 1, 0)); |
| 95 const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), | 113 const __m128 a_08_40 = |
| 96 _mm_castsi128_ps(a_40), | 114 _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), |
| 97 _MM_SHUFFLE(1, 0, 1, 0)); | 115 _MM_SHUFFLE(1, 0, 1, 0)); |
| 98 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | 116 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); |
| 99 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | 117 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); |
| 100 | 118 |
| 101 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | 119 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); |
| 102 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | 120 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); |
| 103 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | 121 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); |
| 104 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | 122 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); |
| 105 const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), | 123 const __m128 a_16_48 = |
| 106 _mm_castsi128_ps(a_48), | 124 _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), |
| 107 _MM_SHUFFLE(1, 0, 1, 0)); | 125 _MM_SHUFFLE(1, 0, 1, 0)); |
| 108 const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), | 126 const __m128 a_24_56 = |
| 109 _mm_castsi128_ps(a_56), | 127 _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), |
| 110 _MM_SHUFFLE(1, 0, 1, 0)); | 128 _MM_SHUFFLE(1, 0, 1, 0)); |
| 111 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | 129 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); |
| 112 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | 130 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); |
| 113 | 131 |
| 114 const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 132 const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
| 115 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 133 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
| 116 | 134 |
| 117 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( | 135 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( |
| 118 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); | 136 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); |
| 119 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | 137 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); |
| 120 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 138 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); |
| (...skipping 35 matching lines...) |
| 156 const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); | 174 const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); |
| 157 const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); | 175 const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); |
| 158 const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); | 176 const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); |
| 159 const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); | 177 const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); |
| 160 wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); | 178 wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); |
| 161 for (j0 = k; j0 < l + k; j0 += 2) { | 179 for (j0 = k; j0 < l + k; j0 += 2) { |
| 162 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); | 180 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); |
| 163 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); | 181 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); |
| 164 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | 182 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); |
| 165 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | 183 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); |
| 166 const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), | 184 const __m128 a_00_32 = |
| 167 _mm_castsi128_ps(a_32), | 185 _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), |
| 168 _MM_SHUFFLE(1, 0, 1, 0)); | 186 _MM_SHUFFLE(1, 0, 1, 0)); |
| 169 const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), | 187 const __m128 a_08_40 = |
| 170 _mm_castsi128_ps(a_40), | 188 _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), |
| 171 _MM_SHUFFLE(1, 0, 1, 0)); | 189 _MM_SHUFFLE(1, 0, 1, 0)); |
| 172 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | 190 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); |
| 173 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | 191 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); |
| 174 | 192 |
| 175 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | 193 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); |
| 176 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | 194 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); |
| 177 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | 195 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); |
| 178 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | 196 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); |
| 179 const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), | 197 const __m128 a_16_48 = |
| 180 _mm_castsi128_ps(a_48), | 198 _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), |
| 181 _MM_SHUFFLE(1, 0, 1, 0)); | 199 _MM_SHUFFLE(1, 0, 1, 0)); |
| 182 const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), | 200 const __m128 a_24_56 = |
| 183 _mm_castsi128_ps(a_56), | 201 _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), |
| 184 _MM_SHUFFLE(1, 0, 1, 0)); | 202 _MM_SHUFFLE(1, 0, 1, 0)); |
| 185 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | 203 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); |
| 186 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | 204 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); |
| 187 | 205 |
| 188 const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 206 const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
| 189 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 207 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
| 190 const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); | 208 const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); |
| 191 const __m128 xx3 = | 209 const __m128 xx3 = _mm_mul_ps( |
| 192 _mm_mul_ps(wk2iv, | 210 wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1), |
| 193 _mm_castsi128_ps(_mm_shuffle_epi32( | 211 _MM_SHUFFLE(2, 3, 0, 1)))); |
| 194 _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1)))); | |
| 195 const __m128 xx4 = _mm_add_ps(xx2, xx3); | 212 const __m128 xx4 = _mm_add_ps(xx2, xx3); |
| 196 | 213 |
| 197 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( | 214 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( |
| 198 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); | 215 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); |
| 199 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | 216 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); |
| 200 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 217 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); |
| 201 const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 218 const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); |
| 202 | 219 |
| 203 const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); | 220 const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); |
| 204 const __m128 xx11 = _mm_mul_ps( | 221 const __m128 xx11 = _mm_mul_ps( |
| 205 wk1iv, | 222 wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), |
| 206 _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), | 223 _MM_SHUFFLE(2, 3, 0, 1)))); |
| 207 _MM_SHUFFLE(2, 3, 0, 1)))); | |
| 208 const __m128 xx12 = _mm_add_ps(xx10, xx11); | 224 const __m128 xx12 = _mm_add_ps(xx10, xx11); |
| 209 | 225 |
| 210 const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); | 226 const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); |
| 211 const __m128 xx21 = _mm_mul_ps( | 227 const __m128 xx21 = _mm_mul_ps( |
| 212 wk3iv, | 228 wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), |
| 213 _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), | 229 _MM_SHUFFLE(2, 3, 0, 1)))); |
| 214 _MM_SHUFFLE(2, 3, 0, 1)))); | |
| 215 const __m128 xx22 = _mm_add_ps(xx20, xx21); | 230 const __m128 xx22 = _mm_add_ps(xx20, xx21); |
| 216 | 231 |
| 217 _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); | 232 _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); |
| 218 _mm_storel_epi64( | 233 _mm_storel_epi64( |
| 219 (__m128i*)&a[j0 + 32], | 234 (__m128i*)&a[j0 + 32], |
| 220 _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); | 235 _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); |
| 221 | 236 |
| 222 _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); | 237 _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); |
| 223 _mm_storel_epi64( | 238 _mm_storel_epi64( |
| 224 (__m128i*)&a[j0 + 48], | 239 (__m128i*)&a[j0 + 48], |
| 225 _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); | 240 _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); |
| 226 | 241 |
| 227 _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); | 242 _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); |
| 228 _mm_storel_epi64( | 243 _mm_storel_epi64( |
| 229 (__m128i*)&a[j0 + 40], | 244 (__m128i*)&a[j0 + 40], |
| 230 _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); | 245 _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); |
| 231 | 246 |
| 232 _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); | 247 _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); |
| 233 _mm_storel_epi64( | 248 _mm_storel_epi64( |
| 234 (__m128i*)&a[j0 + 56], | 249 (__m128i*)&a[j0 + 56], |
| 235 _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); | 250 _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); |
| 236 } | 251 } |
| 237 } | 252 } |
| 238 } | 253 } |
| 239 | 254 |
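The k_swap_sign constant {-1.f, 1.f, -1.f, 1.f} loaded at the top of both functions implements multiplication of packed complex values by i on interleaved data: each re/im pair is swapped with _mm_shuffle_epi32(..., _MM_SHUFFLE(2, 3, 0, 1)) and alternate lanes are negated. A small self-contained sketch of the idiom in isolation (the helper name MulByImaginary is hypothetical, not part of this change):

#include <emmintrin.h>

// Multiplies two packed complex numbers {re0, im0, re1, im1} by i:
// i * (re + i*im) = -im + i*re, i.e. swap the pair and negate the new real.
static __m128 MulByImaginary(__m128 x) {
  // Same lane values as the k_swap_sign[] table: {-1, 1, -1, 1}.
  const __m128 kSwapSign = _mm_set_ps(1.f, -1.f, 1.f, -1.f);
  const __m128 swapped = _mm_castsi128_ps(
      _mm_shuffle_epi32(_mm_castps_si128(x), _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_mul_ps(kSwapSign, swapped);
}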
| 240 static void rftfsub_128_SSE2(float* a) { | 255 void rftfsub_128_SSE2(float* a) { |
| 241 const float* c = rdft_w + 32; | 256 const float* c = rdft_w + 32; |
| 242 int j1, j2, k1, k2; | 257 int j1, j2, k1, k2; |
| 243 float wkr, wki, xr, xi, yr, yi; | 258 float wkr, wki, xr, xi, yr, yi; |
| 244 | 259 |
| 245 static const ALIGN16_BEG float ALIGN16_END | 260 static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, |
| 246 k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; | 261 0.5f}; |
| 247 const __m128 mm_half = _mm_load_ps(k_half); | 262 const __m128 mm_half = _mm_load_ps(k_half); |
| 248 | 263 |
| 249 // Vectorized code (four at once). | 264 // Vectorized code (four at once). |
| 250 // Note: commented numbers are indexes for the first iteration of the loop. | 265 // Note: commented numbers are indexes for the first iteration of the loop. |
| 251 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 266 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { |
| 252 // Load 'wk'. | 267 // Load 'wk'. |
| 253 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, | 268 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, |
| 254 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, | 269 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, |
| 255 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, | 270 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, |
| 256 const __m128 wkr_ = | 271 const __m128 wkr_ = |
| (...skipping 63 matching lines...) |
| 320 xi = a[j2 + 1] + a[k2 + 1]; | 335 xi = a[j2 + 1] + a[k2 + 1]; |
| 321 yr = wkr * xr - wki * xi; | 336 yr = wkr * xr - wki * xi; |
| 322 yi = wkr * xi + wki * xr; | 337 yi = wkr * xi + wki * xr; |
| 323 a[j2 + 0] -= yr; | 338 a[j2 + 0] -= yr; |
| 324 a[j2 + 1] -= yi; | 339 a[j2 + 1] -= yi; |
| 325 a[k2 + 0] += yr; | 340 a[k2 + 0] += yr; |
| 326 a[k2 + 1] -= yi; | 341 a[k2 + 1] -= yi; |
| 327 } | 342 } |
| 328 } | 343 } |
| 329 | 344 |
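rftfsub_128_SSE2 is the post-processing step that turns the 128-point complex FFT into a 128-point real FFT: the vector loop handles four (j2, k2 = 128 - j2) pairs per iteration, and the scalar tail shown above covers the remaining elements. A scalar sketch of the full loop, assuming the usual index mapping k1 = 32 - j1, k2 = 128 - j2 and weights wkr = 0.5f - c[k1], wki = c[j1]; the loop body matches the scalar remainder visible in the diff.

// Scalar form of the post-processing that rftfsub_128_SSE2 vectorizes
// four pairs at a time. c is expected to point at rdft_w + 32.
static void Rftfsub128Scalar(float* a, const float* c) {
  for (int j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr - wki * xi;
    const float yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
}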
| 330 static void rftbsub_128_SSE2(float* a) { | 345 void rftbsub_128_SSE2(float* a) { |
| 331 const float* c = rdft_w + 32; | 346 const float* c = rdft_w + 32; |
| 332 int j1, j2, k1, k2; | 347 int j1, j2, k1, k2; |
| 333 float wkr, wki, xr, xi, yr, yi; | 348 float wkr, wki, xr, xi, yr, yi; |
| 334 | 349 |
| 335 static const ALIGN16_BEG float ALIGN16_END | 350 static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, |
| 336 k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; | 351 0.5f}; |
| 337 const __m128 mm_half = _mm_load_ps(k_half); | 352 const __m128 mm_half = _mm_load_ps(k_half); |
| 338 | 353 |
| 339 a[1] = -a[1]; | 354 a[1] = -a[1]; |
| 340 // Vectorized code (four at once). | 355 // Vectorized code (four at once). |
| 341 // Note: commented numbers are indexes for the first iteration of the loop. | 356 // Note: commented numbers are indexes for the first iteration of the loop. |
| 342 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 357 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { |
| 343 // Load 'wk'. | 358 // Load 'wk'. |
| 344 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, | 359 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, |
| 345 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, | 360 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, |
| 346 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, | 361 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, |
| (...skipping 64 matching lines...) |
| 411 xi = a[j2 + 1] + a[k2 + 1]; | 426 xi = a[j2 + 1] + a[k2 + 1]; |
| 412 yr = wkr * xr + wki * xi; | 427 yr = wkr * xr + wki * xi; |
| 413 yi = wkr * xi - wki * xr; | 428 yi = wkr * xi - wki * xr; |
| 414 a[j2 + 0] = a[j2 + 0] - yr; | 429 a[j2 + 0] = a[j2 + 0] - yr; |
| 415 a[j2 + 1] = yi - a[j2 + 1]; | 430 a[j2 + 1] = yi - a[j2 + 1]; |
| 416 a[k2 + 0] = yr + a[k2 + 0]; | 431 a[k2 + 0] = yr + a[k2 + 0]; |
| 417 a[k2 + 1] = yi - a[k2 + 1]; | 432 a[k2 + 1] = yi - a[k2 + 1]; |
| 418 } | 433 } |
| 419 a[65] = -a[65]; | 434 a[65] = -a[65]; |
| 420 } | 435 } |
| 436 #endif |
| 421 | 437 |
| 422 void aec_rdft_init_sse2(void) { | 438 } // namespace webrtc |
| 423 cft1st_128 = cft1st_128_SSE2; | |
| 424 cftmdl_128 = cftmdl_128_SSE2; | |
| 425 rftfsub_128 = rftfsub_128_SSE2; | |
| 426 rftbsub_128 = rftbsub_128_SSE2; | |
| 427 } | |
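The removed aec_rdft_init_sse2() installed these kernels through module-level function pointers at initialization time; in the new layout the SSE2 variants are plain functions in namespace webrtc behind WEBRTC_ARCH_X86_FAMILY, with selection presumably done at a higher level (the new ooura_fft.h include suggests the dispatch now lives there). For reference, an illustrative shape of the old function-pointer dispatch, approximated from the assignments above; the real declarations lived in the old aec_rdft.h and are not shown in this diff.

// Approximate declarations the old init function assigned into.
typedef void (*rft_sub_128_t)(float* a);
extern rft_sub_128_t cft1st_128;
extern rft_sub_128_t cftmdl_128;
extern rft_sub_128_t rftfsub_128;
extern rft_sub_128_t rftbsub_128;

void aec_rdft_init_sse2(void) {
  cft1st_128 = cft1st_128_SSE2;    // radix-4 first stage
  cftmdl_128 = cftmdl_128_SSE2;    // middle stages
  rftfsub_128 = rftfsub_128_SSE2;  // forward real-FFT post-processing
  rftbsub_128 = rftbsub_128_SSE2;  // inverse real-FFT pre-processing
}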