| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 11 #include "webrtc/modules/audio_processing/utility/ooura_fft.h" |
| 12 | 12 |
| 13 #include <emmintrin.h> | 13 #include <emmintrin.h> |
| 14 | 14 |
| 15 static const ALIGN16_BEG float ALIGN16_END | 15 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h" |
| 16 k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f}; | 16 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h" |
| 17 | 17 |
| 18 static void cft1st_128_SSE2(float* a) { | 18 namespace webrtc { |
| 19 |
| 20 #if defined(WEBRTC_ARCH_X86_FAMILY) |
| 21 |
| 22 namespace { |
| 23 // These intrinsics were unavailable before VS 2008. |
| 24 // TODO(andrew): move to a common file. |
| 25 #if defined(_MSC_VER) && _MSC_VER < 1500 |
| 26 static __inline __m128 _mm_castsi128_ps(__m128i a) { |
| 27 return *(__m128*)&a; |
| 28 } |
| 29 static __inline __m128i _mm_castps_si128(__m128 a) { |
| 30 return *(__m128i*)&a; |
| 31 } |
| 32 #endif |
| 33 |
| 34 } // namespace |
| 35 |
| 36 void cft1st_128_SSE2(float* a) { |
| 19 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | 37 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); |
| 20 int j, k2; | 38 int j, k2; |
| 21 | 39 |
| 22 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { | 40 for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { |
| 23 __m128 a00v = _mm_loadu_ps(&a[j + 0]); | 41 __m128 a00v = _mm_loadu_ps(&a[j + 0]); |
| 24 __m128 a04v = _mm_loadu_ps(&a[j + 4]); | 42 __m128 a04v = _mm_loadu_ps(&a[j + 4]); |
| 25 __m128 a08v = _mm_loadu_ps(&a[j + 8]); | 43 __m128 a08v = _mm_loadu_ps(&a[j + 8]); |
| 26 __m128 a12v = _mm_loadu_ps(&a[j + 12]); | 44 __m128 a12v = _mm_loadu_ps(&a[j + 12]); |
| 27 __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0)); | 45 __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0)); |
| 28 __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2)); | 46 __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2)); |
| (...skipping 42 matching lines...) |
| 71 a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0)); | 89 a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0)); |
| 72 a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2)); | 90 a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2)); |
| 73 a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2)); | 91 a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2)); |
| 74 _mm_storeu_ps(&a[j + 0], a00v); | 92 _mm_storeu_ps(&a[j + 0], a00v); |
| 75 _mm_storeu_ps(&a[j + 4], a04v); | 93 _mm_storeu_ps(&a[j + 4], a04v); |
| 76 _mm_storeu_ps(&a[j + 8], a08v); | 94 _mm_storeu_ps(&a[j + 8], a08v); |
| 77 _mm_storeu_ps(&a[j + 12], a12v); | 95 _mm_storeu_ps(&a[j + 12], a12v); |
| 78 } | 96 } |
| 79 } | 97 } |
| 80 | 98 |
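Context for readers new to the Ooura FFT kernels being moved here: cft1st_128_SSE2 vectorizes the first radix-4 stage of the 128-point complex FFT, de-interleaving pairs of {re, im} values with _mm_shuffle_ps so that two butterflies are processed per iteration. Below is a minimal scalar sketch of one twiddled radix-4 butterfly of this general kind; the indexing, output ordering, and sign convention are illustrative only and may differ in detail from the actual Ooura scalar reference.

// Illustrative scalar radix-4 butterfly on interleaved {re, im} data.
// a[] holds four complex values x0..x3; w1, w2, w3 are twiddle factors
// (the vector code loads these from the rdft_wk* tables instead).
static void Radix4Butterfly(float* a, float w1r, float w1i, float w2r,
                            float w2i, float w3r, float w3i) {
  const float t0r = a[0] + a[4], t0i = a[1] + a[5];  // x0 + x2
  const float t1r = a[0] - a[4], t1i = a[1] - a[5];  // x0 - x2
  const float t2r = a[2] + a[6], t2i = a[3] + a[7];  // x1 + x3
  const float t3r = a[2] - a[6], t3i = a[3] - a[7];  // x1 - x3
  a[0] = t0r + t2r;                                  // y0 = t0 + t2
  a[1] = t0i + t2i;
  const float u2r = t0r - t2r, u2i = t0i - t2i;      // y2 = (t0 - t2) * w2
  a[4] = w2r * u2r - w2i * u2i;
  a[5] = w2r * u2i + w2i * u2r;
  const float u1r = t1r - t3i, u1i = t1i + t3r;      // y1 = (t1 + i*t3) * w1
  a[2] = w1r * u1r - w1i * u1i;
  a[3] = w1r * u1i + w1i * u1r;
  const float u3r = t1r + t3i, u3i = t1i - t3r;      // y3 = (t1 - i*t3) * w3
  a[6] = w3r * u3r - w3i * u3i;
  a[7] = w3r * u3i + w3i * u3r;
}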
| 81 static void cftmdl_128_SSE2(float* a) { | 99 void cftmdl_128_SSE2(float* a) { |
| 82 const int l = 8; | 100 const int l = 8; |
| 83 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | 101 const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); |
| 84 int j0; | 102 int j0; |
| 85 | 103 |
| 86 __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); | 104 __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); |
| 87 for (j0 = 0; j0 < l; j0 += 2) { | 105 for (j0 = 0; j0 < l; j0 += 2) { |
| 88 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); | 106 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); |
| 89 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); | 107 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); |
| 90 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | 108 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); |
| 91 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | 109 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); |
| 92 const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), | 110 const __m128 a_00_32 = |
| 93 _mm_castsi128_ps(a_32), | 111 _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), |
| 94 _MM_SHUFFLE(1, 0, 1, 0)); | 112 _MM_SHUFFLE(1, 0, 1, 0)); |
| 95 const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), | 113 const __m128 a_08_40 = |
| 96 _mm_castsi128_ps(a_40), | 114 _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), |
| 97 _MM_SHUFFLE(1, 0, 1, 0)); | 115 _MM_SHUFFLE(1, 0, 1, 0)); |
| 98 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | 116 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); |
| 99 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | 117 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); |
| 100 | 118 |
| 101 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | 119 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); |
| 102 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | 120 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); |
| 103 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | 121 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); |
| 104 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | 122 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); |
| 105 const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), | 123 const __m128 a_16_48 = |
| 106 _mm_castsi128_ps(a_48), | 124 _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), |
| 107 _MM_SHUFFLE(1, 0, 1, 0)); | 125 _MM_SHUFFLE(1, 0, 1, 0)); |
| 108 const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), | 126 const __m128 a_24_56 = |
| 109 _mm_castsi128_ps(a_56), | 127 _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), |
| 110 _MM_SHUFFLE(1, 0, 1, 0)); | 128 _MM_SHUFFLE(1, 0, 1, 0)); |
| 111 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | 129 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); |
| 112 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | 130 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); |
| 113 | 131 |
| 114 const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 132 const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
| 115 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 133 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
| 116 | 134 |
| 117 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( | 135 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( |
| 118 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); | 136 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); |
| 119 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | 137 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); |
| 120 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 138 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); |
| (...skipping 35 matching lines...) |
| 156 const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); | 174 const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); |
| 157 const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); | 175 const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); |
| 158 const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); | 176 const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); |
| 159 const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); | 177 const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); |
| 160 wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); | 178 wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); |
| 161 for (j0 = k; j0 < l + k; j0 += 2) { | 179 for (j0 = k; j0 < l + k; j0 += 2) { |
| 162 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); | 180 const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); |
| 163 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); | 181 const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); |
| 164 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | 182 const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); |
| 165 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | 183 const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); |
| 166 const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), | 184 const __m128 a_00_32 = |
| 167 _mm_castsi128_ps(a_32), | 185 _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), |
| 168 _MM_SHUFFLE(1, 0, 1, 0)); | 186 _MM_SHUFFLE(1, 0, 1, 0)); |
| 169 const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), | 187 const __m128 a_08_40 = |
| 170 _mm_castsi128_ps(a_40), | 188 _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), |
| 171 _MM_SHUFFLE(1, 0, 1, 0)); | 189 _MM_SHUFFLE(1, 0, 1, 0)); |
| 172 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | 190 __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); |
| 173 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | 191 const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); |
| 174 | 192 |
| 175 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | 193 const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); |
| 176 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | 194 const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); |
| 177 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | 195 const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); |
| 178 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | 196 const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); |
| 179 const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), | 197 const __m128 a_16_48 = |
| 180 _mm_castsi128_ps(a_48), | 198 _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), |
| 181 _MM_SHUFFLE(1, 0, 1, 0)); | 199 _MM_SHUFFLE(1, 0, 1, 0)); |
| 182 const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), | 200 const __m128 a_24_56 = |
| 183 _mm_castsi128_ps(a_56), | 201 _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), |
| 184 _MM_SHUFFLE(1, 0, 1, 0)); | 202 _MM_SHUFFLE(1, 0, 1, 0)); |
| 185 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | 203 const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); |
| 186 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | 204 const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); |
| 187 | 205 |
| 188 const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 206 const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
| 189 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 207 const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); |
| 190 const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); | 208 const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); |
| 191 const __m128 xx3 = | 209 const __m128 xx3 = _mm_mul_ps( |
| 192 _mm_mul_ps(wk2iv, | 210 wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1), |
| 193 _mm_castsi128_ps(_mm_shuffle_epi32( | 211 _MM_SHUFFLE(2, 3, 0, 1)))); |
| 194 _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1)))); | |
| 195 const __m128 xx4 = _mm_add_ps(xx2, xx3); | 212 const __m128 xx4 = _mm_add_ps(xx2, xx3); |
| 196 | 213 |
| 197 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( | 214 const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( |
| 198 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); | 215 _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); |
| 199 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | 216 const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); |
| 200 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 217 const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); |
| 201 const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 218 const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); |
| 202 | 219 |
| 203 const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); | 220 const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); |
| 204 const __m128 xx11 = _mm_mul_ps( | 221 const __m128 xx11 = _mm_mul_ps( |
| 205 wk1iv, | 222 wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), |
| 206 _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), | 223 _MM_SHUFFLE(2, 3, 0, 1)))); |
| 207 _MM_SHUFFLE(2, 3, 0, 1)))); | |
| 208 const __m128 xx12 = _mm_add_ps(xx10, xx11); | 224 const __m128 xx12 = _mm_add_ps(xx10, xx11); |
| 209 | 225 |
| 210 const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); | 226 const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); |
| 211 const __m128 xx21 = _mm_mul_ps( | 227 const __m128 xx21 = _mm_mul_ps( |
| 212 wk3iv, | 228 wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), |
| 213 _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), | 229 _MM_SHUFFLE(2, 3, 0, 1)))); |
| 214 _MM_SHUFFLE(2, 3, 0, 1)))); | |
| 215 const __m128 xx22 = _mm_add_ps(xx20, xx21); | 230 const __m128 xx22 = _mm_add_ps(xx20, xx21); |
| 216 | 231 |
| 217 _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); | 232 _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); |
| 218 _mm_storel_epi64( | 233 _mm_storel_epi64( |
| 219 (__m128i*)&a[j0 + 32], | 234 (__m128i*)&a[j0 + 32], |
| 220 _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); | 235 _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); |
| 221 | 236 |
| 222 _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); | 237 _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); |
| 223 _mm_storel_epi64( | 238 _mm_storel_epi64( |
| 224 (__m128i*)&a[j0 + 48], | 239 (__m128i*)&a[j0 + 48], |
| 225 _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); | 240 _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); |
| 226 | 241 |
| 227 _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); | 242 _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); |
| 228 _mm_storel_epi64( | 243 _mm_storel_epi64( |
| 229 (__m128i*)&a[j0 + 40], | 244 (__m128i*)&a[j0 + 40], |
| 230 _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); | 245 _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); |
| 231 | 246 |
| 232 _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); | 247 _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); |
| 233 _mm_storel_epi64( | 248 _mm_storel_epi64( |
| 234 (__m128i*)&a[j0 + 56], | 249 (__m128i*)&a[j0 + 56], |
| 235 _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); | 250 _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); |
| 236 } | 251 } |
| 237 } | 252 } |
| 238 } | 253 } |
| 239 | 254 |
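The k_swap_sign constant {-1.f, 1.f, -1.f, 1.f} loaded at the top of both functions implements multiplication of packed complex values by i on interleaved data: each re/im pair is swapped with _mm_shuffle_epi32(..., _MM_SHUFFLE(2, 3, 0, 1)) and alternate lanes are negated. A small self-contained sketch of the idiom in isolation (the helper name MulByImaginary is hypothetical, not part of this change):

#include <emmintrin.h>

// Multiplies two packed complex numbers {re0, im0, re1, im1} by i:
// i * (re + i*im) = -im + i*re, i.e. swap the pair and negate the new real.
static __m128 MulByImaginary(__m128 x) {
  // Same lane values as the k_swap_sign[] table: {-1, 1, -1, 1}.
  const __m128 kSwapSign = _mm_set_ps(1.f, -1.f, 1.f, -1.f);
  const __m128 swapped = _mm_castsi128_ps(
      _mm_shuffle_epi32(_mm_castps_si128(x), _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_mul_ps(kSwapSign, swapped);
}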
| 240 static void rftfsub_128_SSE2(float* a) { | 255 void rftfsub_128_SSE2(float* a) { |
| 241 const float* c = rdft_w + 32; | 256 const float* c = rdft_w + 32; |
| 242 int j1, j2, k1, k2; | 257 int j1, j2, k1, k2; |
| 243 float wkr, wki, xr, xi, yr, yi; | 258 float wkr, wki, xr, xi, yr, yi; |
| 244 | 259 |
| 245 static const ALIGN16_BEG float ALIGN16_END | 260 static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, |
| 246 k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; | 261 0.5f}; |
| 247 const __m128 mm_half = _mm_load_ps(k_half); | 262 const __m128 mm_half = _mm_load_ps(k_half); |
| 248 | 263 |
| 249 // Vectorized code (four at once). | 264 // Vectorized code (four at once). |
| 250 // Note: commented numbers are indexes for the first iteration of the loop. | 265 // Note: commented numbers are indexes for the first iteration of the loop. |
| 251 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 266 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { |
| 252 // Load 'wk'. | 267 // Load 'wk'. |
| 253 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, | 268 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, |
| 254 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, | 269 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, |
| 255 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, | 270 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, |
| 256 const __m128 wkr_ = | 271 const __m128 wkr_ = |
| (...skipping 63 matching lines...) |
| 320 xi = a[j2 + 1] + a[k2 + 1]; | 335 xi = a[j2 + 1] + a[k2 + 1]; |
| 321 yr = wkr * xr - wki * xi; | 336 yr = wkr * xr - wki * xi; |
| 322 yi = wkr * xi + wki * xr; | 337 yi = wkr * xi + wki * xr; |
| 323 a[j2 + 0] -= yr; | 338 a[j2 + 0] -= yr; |
| 324 a[j2 + 1] -= yi; | 339 a[j2 + 1] -= yi; |
| 325 a[k2 + 0] += yr; | 340 a[k2 + 0] += yr; |
| 326 a[k2 + 1] -= yi; | 341 a[k2 + 1] -= yi; |
| 327 } | 342 } |
| 328 } | 343 } |
| 329 | 344 |
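rftfsub_128_SSE2 is the post-processing step that turns the 128-point complex FFT into a 128-point real FFT: the vector loop handles four (j2, k2 = 128 - j2) pairs per iteration, and the scalar tail shown above covers the remaining elements. A scalar sketch of the full loop, assuming the usual index mapping k1 = 32 - j1, k2 = 128 - j2 and weights wkr = 0.5f - c[k1], wki = c[j1]; the loop body matches the scalar remainder visible in the diff.

// Scalar form of the post-processing that rftfsub_128_SSE2 vectorizes
// four pairs at a time. c is expected to point at rdft_w + 32.
static void Rftfsub128Scalar(float* a, const float* c) {
  for (int j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr - wki * xi;
    const float yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
}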
| 330 static void rftbsub_128_SSE2(float* a) { | 345 void rftbsub_128_SSE2(float* a) { |
| 331 const float* c = rdft_w + 32; | 346 const float* c = rdft_w + 32; |
| 332 int j1, j2, k1, k2; | 347 int j1, j2, k1, k2; |
| 333 float wkr, wki, xr, xi, yr, yi; | 348 float wkr, wki, xr, xi, yr, yi; |
| 334 | 349 |
| 335 static const ALIGN16_BEG float ALIGN16_END | 350 static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, |
| 336 k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; | 351 0.5f}; |
| 337 const __m128 mm_half = _mm_load_ps(k_half); | 352 const __m128 mm_half = _mm_load_ps(k_half); |
| 338 | 353 |
| 339 a[1] = -a[1]; | 354 a[1] = -a[1]; |
| 340 // Vectorized code (four at once). | 355 // Vectorized code (four at once). |
| 341 // Note: commented numbers are indexes for the first iteration of the loop. | 356 // Note: commented numbers are indexes for the first iteration of the loop. |
| 342 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 357 for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { |
| 343 // Load 'wk'. | 358 // Load 'wk'. |
| 344 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, | 359 const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, |
| 345 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, | 360 const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, |
| 346 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, | 361 const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, |
| (...skipping 64 matching lines...) |
| 411 xi = a[j2 + 1] + a[k2 + 1]; | 426 xi = a[j2 + 1] + a[k2 + 1]; |
| 412 yr = wkr * xr + wki * xi; | 427 yr = wkr * xr + wki * xi; |
| 413 yi = wkr * xi - wki * xr; | 428 yi = wkr * xi - wki * xr; |
| 414 a[j2 + 0] = a[j2 + 0] - yr; | 429 a[j2 + 0] = a[j2 + 0] - yr; |
| 415 a[j2 + 1] = yi - a[j2 + 1]; | 430 a[j2 + 1] = yi - a[j2 + 1]; |
| 416 a[k2 + 0] = yr + a[k2 + 0]; | 431 a[k2 + 0] = yr + a[k2 + 0]; |
| 417 a[k2 + 1] = yi - a[k2 + 1]; | 432 a[k2 + 1] = yi - a[k2 + 1]; |
| 418 } | 433 } |
| 419 a[65] = -a[65]; | 434 a[65] = -a[65]; |
| 420 } | 435 } |
| 436 #endif |
| 421 | 437 |
| 422 void aec_rdft_init_sse2(void) { | 438 } // namespace webrtc |
| 423 cft1st_128 = cft1st_128_SSE2; | |
| 424 cftmdl_128 = cftmdl_128_SSE2; | |
| 425 rftfsub_128 = rftfsub_128_SSE2; | |
| 426 rftbsub_128 = rftbsub_128_SSE2; | |
| 427 } | |
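The removed aec_rdft_init_sse2() installed these kernels through module-level function pointers at initialization time; in the new layout the SSE2 variants are plain functions in namespace webrtc behind WEBRTC_ARCH_X86_FAMILY, with selection presumably done at a higher level (the new ooura_fft.h include suggests the dispatch now lives there). For reference, an illustrative shape of the old function-pointer dispatch, approximated from the assignments above; the real declarations lived in the old aec_rdft.h and are not shown in this diff.

// Approximate declarations the old init function assigned into.
typedef void (*rft_sub_128_t)(float* a);
extern rft_sub_128_t cft1st_128;
extern rft_sub_128_t cftmdl_128;
extern rft_sub_128_t rftfsub_128;
extern rft_sub_128_t rftbsub_128;

void aec_rdft_init_sse2(void) {
  cft1st_128 = cft1st_128_SSE2;    // radix-4 first stage
  cftmdl_128 = cftmdl_128_SSE2;    // middle stages
  rftfsub_128 = rftfsub_128_SSE2;  // forward real-FFT post-processing
  rftbsub_128 = rftbsub_128_SSE2;  // inverse real-FFT pre-processing
}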