OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 11 #include "webrtc/modules/audio_processing/utility/ooura_fft.h" |
12 | 12 |
13 #include <emmintrin.h> | 13 #include <emmintrin.h> |
14 | 14 |
15 static const ALIGN16_BEG float ALIGN16_END | 15 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h" |
16 k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f}; | 16 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h" |
17 | 17 |
18 static void cft1st_128_SSE2(float* a) { | 18 namespace webrtc { |
| 19 |
| 20 #if defined(WEBRTC_ARCH_X86_FAMILY) |
| 21 |
| 22 namespace { |
| 23 // These intrinsics were unavailable before VS 2008. |
| 24 // TODO(andrew): move to a common file. |
| 25 #if defined(_MSC_VER) && _MSC_VER < 1500 |
| 26 static __inline __m128 _mm_castsi128_ps(__m128i a) { |
| 27 return *(__m128*)&a; |
| 28 } |
| 29 static __inline __m128i _mm_castps_si128(__m128 a) { |
| 30 return *(__m128i*)&a; |
| 31 } |
| 32 #endif |
| 33 |
| 34 } // namespace |
| 35 |
| 36 void cft1st_128_SSE2(float* a) { |
19 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | 37 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); |
20 int j, k2; | 38 int j, k2; |
21 | 39 |
22 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { | 40 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { |
23 __m128 a00v = _mm_loadu_ps(&a[j + 0]); | 41 __m128 a00v = _mm_loadu_ps(&a[j + 0]); |
24 __m128 a04v = _mm_loadu_ps(&a[j + 4]); | 42 __m128 a04v = _mm_loadu_ps(&a[j + 4]); |
25 __m128 a08v = _mm_loadu_ps(&a[j + 8]); | 43 __m128 a08v = _mm_loadu_ps(&a[j + 8]); |
26 __m128 a12v = _mm_loadu_ps(&a[j + 12]); | 44 __m128 a12v = _mm_loadu_ps(&a[j + 12]); |
27 __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0)); | 45 __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0)); |
28 __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2)); | 46 __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2)); |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
71 a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0)); | 89 a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0)); |
72 a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2)); | 90 a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2)); |
73 a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2)); | 91 a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2)); |
74 _mm_storeu_ps(&a[j + 0], a00v); | 92 _mm_storeu_ps(&a[j + 0], a00v); |
75 _mm_storeu_ps(&a[j + 4], a04v); | 93 _mm_storeu_ps(&a[j + 4], a04v); |
76 _mm_storeu_ps(&a[j + 8], a08v); | 94 _mm_storeu_ps(&a[j + 8], a08v); |
77 _mm_storeu_ps(&a[j + 12], a12v); | 95 _mm_storeu_ps(&a[j + 12], a12v); |
78 } | 96 } |
79 } | 97 } |
80 | 98 |
81 static void cftmdl_128_SSE2(float* a) { | 99 void cftmdl_128_SSE2(float* a) { |
82 const int l = 8; | 100 const int l = 8; |
83 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | 101 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); |
84 int j0; | 102 int j0; |
85 | 103 |
86 __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); | 104 __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); |
87 for (j0 = 0; j0 < l; j0 += 2) { | 105 for (j0 = 0; j0 < l; j0 += 2) { |
88 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); | 106 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); |
89 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); | 107 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); |
90 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | 108 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); |
91 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | 109 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); |
92 const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), | 110 const __m128 a_00_32 = |
93 _mm_castsi128_ps(a_32), | 111 _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), |
94 _MM_SHUFFLE(1, 0, 1, 0)); | 112 _MM_SHUFFLE(1, 0, 1, 0)); |
95 const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), | 113 const __m128 a_08_40 = |
96 _mm_castsi128_ps(a_40), | 114 _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), |
97 _MM_SHUFFLE(1, 0, 1, 0)); | 115 _MM_SHUFFLE(1, 0, 1, 0)); |
98 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | 116 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); |
99 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | 117 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); |
100 | 118 |
101 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | 119 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); |
102 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | 120 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); |
103 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | 121 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); |
104 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | 122 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); |
105 const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), | 123 const __m128 a_16_48 = |
106 _mm_castsi128_ps(a_48), | 124 _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), |
107 _MM_SHUFFLE(1, 0, 1, 0)); | 125 _MM_SHUFFLE(1, 0, 1, 0)); |
108 const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), | 126 const __m128 a_24_56 = |
109 _mm_castsi128_ps(a_56), | 127 _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), |
110 _MM_SHUFFLE(1, 0, 1, 0)); | 128 _MM_SHUFFLE(1, 0, 1, 0)); |
111 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | 129 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); |
112 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | 130 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); |
113 | 131 |
114 const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 132 const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
115 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 133 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
116 | 134 |
117 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( | 135 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( |
118 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); | 136 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); |
119 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | 137 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); |
120 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 138 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
156 const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); | 174 const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); |
157 const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); | 175 const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); |
158 const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); | 176 const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); |
159 const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); | 177 const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); |
160 wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); | 178 wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); |
161 for (j0 = k; j0 < l + k; j0 += 2) { | 179 for (j0 = k; j0 < l + k; j0 += 2) { |
162 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); | 180 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); |
163 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); | 181 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); |
164 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | 182 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); |
165 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | 183 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); |
166 const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), | 184 const __m128 a_00_32 = |
167 _mm_castsi128_ps(a_32), | 185 _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), |
168 _MM_SHUFFLE(1, 0, 1, 0)); | 186 _MM_SHUFFLE(1, 0, 1, 0)); |
169 const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), | 187 const __m128 a_08_40 = |
170 _mm_castsi128_ps(a_40), | 188 _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), |
171 _MM_SHUFFLE(1, 0, 1, 0)); | 189 _MM_SHUFFLE(1, 0, 1, 0)); |
172 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | 190 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); |
173 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | 191 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); |
174 | 192 |
175 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | 193 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); |
176 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | 194 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); |
177 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | 195 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); |
178 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | 196 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); |
179 const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), | 197 const __m128 a_16_48 = |
180 _mm_castsi128_ps(a_48), | 198 _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), |
181 _MM_SHUFFLE(1, 0, 1, 0)); | 199 _MM_SHUFFLE(1, 0, 1, 0)); |
182 const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), | 200 const __m128 a_24_56 = |
183 _mm_castsi128_ps(a_56), | 201 _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), |
184 _MM_SHUFFLE(1, 0, 1, 0)); | 202 _MM_SHUFFLE(1, 0, 1, 0)); |
185 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | 203 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); |
186 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | 204 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); |
187 | 205 |
188 const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 206 const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
189 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 207 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
190 const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); | 208 const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); |
191 const __m128 xx3 = | 209 const __m128 xx3 = _mm_mul_ps( |
192 _mm_mul_ps(wk2iv, | 210 wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1), |
193 _mm_castsi128_ps(_mm_shuffle_epi32( | 211 _MM_SHUFFLE(2, 3, 0, 1)))); |
194 _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1)))); | |
195 const __m128 xx4 = _mm_add_ps(xx2, xx3); | 212 const __m128 xx4 = _mm_add_ps(xx2, xx3); |
196 | 213 |
197 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( | 214 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( |
198 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); | 215 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); |
199 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | 216 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); |
200 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 217 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); |
201 const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 218 const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); |
202 | 219 |
203 const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); | 220 const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); |
204 const __m128 xx11 = _mm_mul_ps( | 221 const __m128 xx11 = _mm_mul_ps( |
205 wk1iv, | 222 wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), |
206 _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), | 223 _MM_SHUFFLE(2, 3, 0, 1)))); |
207 _MM_SHUFFLE(2, 3, 0, 1)))); | |
208 const __m128 xx12 = _mm_add_ps(xx10, xx11); | 224 const __m128 xx12 = _mm_add_ps(xx10, xx11); |
209 | 225 |
210 const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); | 226 const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); |
211 const __m128 xx21 = _mm_mul_ps( | 227 const __m128 xx21 = _mm_mul_ps( |
212 wk3iv, | 228 wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), |
213 _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), | 229 _MM_SHUFFLE(2, 3, 0, 1)))); |
214 _MM_SHUFFLE(2, 3, 0, 1)))); | |
215 const __m128 xx22 = _mm_add_ps(xx20, xx21); | 230 const __m128 xx22 = _mm_add_ps(xx20, xx21); |
216 | 231 |
217 _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); | 232 _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); |
218 _mm_storel_epi64( | 233 _mm_storel_epi64( |
219 (__m128i*)&a[j0 + 32], | 234 (__m128i*)&a[j0 + 32], |
220 _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); | 235 _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); |
221 | 236 |
222 _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); | 237 _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); |
223 _mm_storel_epi64( | 238 _mm_storel_epi64( |
224 (__m128i*)&a[j0 + 48], | 239 (__m128i*)&a[j0 + 48], |
225 _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); | 240 _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); |
226 | 241 |
227 _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); | 242 _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); |
228 _mm_storel_epi64( | 243 _mm_storel_epi64( |
229 (__m128i*)&a[j0 + 40], | 244 (__m128i*)&a[j0 + 40], |
230 _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); | 245 _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); |
231 | 246 |
232 _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); | 247 _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); |
233 _mm_storel_epi64( | 248 _mm_storel_epi64( |
234 (__m128i*)&a[j0 + 56], | 249 (__m128i*)&a[j0 + 56], |
235 _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); | 250 _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); |
236 } | 251 } |
237 } | 252 } |
238 } | 253 } |
239 | 254 |
240 static void rftfsub_128_SSE2(float* a) { | 255 void rftfsub_128_SSE2(float* a) { |
241 const float* c = rdft_w + 32; | 256 const float* c = rdft_w + 32; |
242 int j1, j2, k1, k2; | 257 int j1, j2, k1, k2; |
243 float wkr, wki, xr, xi, yr, yi; | 258 float wkr, wki, xr, xi, yr, yi; |
244 | 259 |
245 static const ALIGN16_BEG float ALIGN16_END | 260 static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, |
246 k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; | 261 0.5f}; |
247 const __m128 mm_half = _mm_load_ps(k_half); | 262 const __m128 mm_half = _mm_load_ps(k_half); |
248 | 263 |
249 // Vectorized code (four at once). | 264 // Vectorized code (four at once). |
250 // Note: commented number are indexes for the first iteration of the loop. | 265 // Note: commented number are indexes for the first iteration of the loop. |
251 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 266 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { |
252 // Load 'wk'. | 267 // Load 'wk'. |
253 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, | 268 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, |
254 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, | 269 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, |
255 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, | 270 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, |
256 const __m128 wkr_ = | 271 const __m128 wkr_ = |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
320 xi = a[j2 + 1] + a[k2 + 1]; | 335 xi = a[j2 + 1] + a[k2 + 1]; |
321 yr = wkr * xr - wki * xi; | 336 yr = wkr * xr - wki * xi; |
322 yi = wkr * xi + wki * xr; | 337 yi = wkr * xi + wki * xr; |
323 a[j2 + 0] -= yr; | 338 a[j2 + 0] -= yr; |
324 a[j2 + 1] -= yi; | 339 a[j2 + 1] -= yi; |
325 a[k2 + 0] += yr; | 340 a[k2 + 0] += yr; |
326 a[k2 + 1] -= yi; | 341 a[k2 + 1] -= yi; |
327 } | 342 } |
328 } | 343 } |
329 | 344 |
330 static void rftbsub_128_SSE2(float* a) { | 345 void rftbsub_128_SSE2(float* a) { |
331 const float* c = rdft_w + 32; | 346 const float* c = rdft_w + 32; |
332 int j1, j2, k1, k2; | 347 int j1, j2, k1, k2; |
333 float wkr, wki, xr, xi, yr, yi; | 348 float wkr, wki, xr, xi, yr, yi; |
334 | 349 |
335 static const ALIGN16_BEG float ALIGN16_END | 350 static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, |
336 k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; | 351 0.5f}; |
337 const __m128 mm_half = _mm_load_ps(k_half); | 352 const __m128 mm_half = _mm_load_ps(k_half); |
338 | 353 |
339 a[1] = -a[1]; | 354 a[1] = -a[1]; |
340 // Vectorized code (four at once). | 355 // Vectorized code (four at once). |
341 // Note: commented number are indexes for the first iteration of the loop. | 356 // Note: commented number are indexes for the first iteration of the loop. |
342 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 357 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { |
343 // Load 'wk'. | 358 // Load 'wk'. |
344 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, | 359 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, |
345 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, | 360 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, |
346 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, | 361 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, |
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
411 xi = a[j2 + 1] + a[k2 + 1]; | 426 xi = a[j2 + 1] + a[k2 + 1]; |
412 yr = wkr * xr + wki * xi; | 427 yr = wkr * xr + wki * xi; |
413 yi = wkr * xi - wki * xr; | 428 yi = wkr * xi - wki * xr; |
414 a[j2 + 0] = a[j2 + 0] - yr; | 429 a[j2 + 0] = a[j2 + 0] - yr; |
415 a[j2 + 1] = yi - a[j2 + 1]; | 430 a[j2 + 1] = yi - a[j2 + 1]; |
416 a[k2 + 0] = yr + a[k2 + 0]; | 431 a[k2 + 0] = yr + a[k2 + 0]; |
417 a[k2 + 1] = yi - a[k2 + 1]; | 432 a[k2 + 1] = yi - a[k2 + 1]; |
418 } | 433 } |
419 a[65] = -a[65]; | 434 a[65] = -a[65]; |
420 } | 435 } |
| 436 #endif |
421 | 437 |
422 void aec_rdft_init_sse2(void) { | 438 } // namespace webrtc |
423 cft1st_128 = cft1st_128_SSE2; | |
424 cftmdl_128 = cftmdl_128_SSE2; | |
425 rftfsub_128 = rftfsub_128_SSE2; | |
426 rftbsub_128 = rftbsub_128_SSE2; | |
427 } | |
OLD | NEW |