| OLD | NEW | 
|---|---|
| 1 /* | 1 /* | 
| 2  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 
| 3  * | 3  * | 
| 4  *  Use of this source code is governed by a BSD-style license | 4  *  Use of this source code is governed by a BSD-style license | 
| 5  *  that can be found in the LICENSE file in the root of the source | 5  *  that can be found in the LICENSE file in the root of the source | 
| 6  *  tree. An additional intellectual property rights grant can be found | 6  *  tree. An additional intellectual property rights grant can be found | 
| 7  *  in the file PATENTS.  All contributing project authors may | 7  *  in the file PATENTS.  All contributing project authors may | 
| 8  *  be found in the AUTHORS file in the root of the source tree. | 8  *  be found in the AUTHORS file in the root of the source tree. | 
| 9  */ | 9  */ | 
| 10 | 10 | 
| 11 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 11 #include "webrtc/modules/audio_processing//utility/ooura_fft.h" | 
| 12 | 12 | 
| 13 #include <emmintrin.h> | 13 #include <emmintrin.h> | 
| 14 | 14 | 
| 15 static const ALIGN16_BEG float ALIGN16_END | 15 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h" | 
| 16     k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f}; | 16 #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h" | 
| 17 | 17 | 
| 18 static void cft1st_128_SSE2(float* a) { | 18 namespace webrtc { | 
|  | 19 | 
|  | 20 #if defined(WEBRTC_ARCH_X86_FAMILY) | 
|  | 21 | 
|  | 22 namespace { | 
|  | 23 // These intrinsics were unavailable before VS 2008. | 
|  | 24 // TODO(andrew): move to a common file. | 
|  | 25 #if defined(_MSC_VER) && _MSC_VER < 1500 | 
|  | 26 static __inline __m128 _mm_castsi128_ps(__m128i a) { | 
|  | 27   return *(__m128*)&a; | 
|  | 28 } | 
|  | 29 static __inline __m128i _mm_castps_si128(__m128 a) { | 
|  | 30   return *(__m128i*)&a; | 
|  | 31 } | 
|  | 32 #endif | 
|  | 33 | 
|  | 34 }  // namespace | 
|  | 35 | 
|  | 36 void cft1st_128_SSE2(float* a) { | 
| 19   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | 37   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | 
| 20   int j, k2; | 38   int j, k2; | 
| 21 | 39 | 
| 22   for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { | 40   for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { | 
| 23     __m128 a00v = _mm_loadu_ps(&a[j + 0]); | 41     __m128 a00v = _mm_loadu_ps(&a[j + 0]); | 
| 24     __m128 a04v = _mm_loadu_ps(&a[j + 4]); | 42     __m128 a04v = _mm_loadu_ps(&a[j + 4]); | 
| 25     __m128 a08v = _mm_loadu_ps(&a[j + 8]); | 43     __m128 a08v = _mm_loadu_ps(&a[j + 8]); | 
| 26     __m128 a12v = _mm_loadu_ps(&a[j + 12]); | 44     __m128 a12v = _mm_loadu_ps(&a[j + 12]); | 
| 27     __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0)); | 45     __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0)); | 
| 28     __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2)); | 46     __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2)); | 
| (...skipping 42 matching lines...) |
| 71     a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0)); | 89     a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0)); | 
| 72     a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2)); | 90     a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2)); | 
| 73     a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2)); | 91     a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2)); | 
| 74     _mm_storeu_ps(&a[j + 0], a00v); | 92     _mm_storeu_ps(&a[j + 0], a00v); | 
| 75     _mm_storeu_ps(&a[j + 4], a04v); | 93     _mm_storeu_ps(&a[j + 4], a04v); | 
| 76     _mm_storeu_ps(&a[j + 8], a08v); | 94     _mm_storeu_ps(&a[j + 8], a08v); | 
| 77     _mm_storeu_ps(&a[j + 12], a12v); | 95     _mm_storeu_ps(&a[j + 12], a12v); | 
| 78   } | 96   } | 
| 79 } | 97 } | 
| 80 | 98 | 
| 81 static void cftmdl_128_SSE2(float* a) { | 99 void cftmdl_128_SSE2(float* a) { | 
| 82   const int l = 8; | 100   const int l = 8; | 
| 83   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | 101   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | 
| 84   int j0; | 102   int j0; | 
| 85 | 103 | 
| 86   __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); | 104   __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); | 
| 87   for (j0 = 0; j0 < l; j0 += 2) { | 105   for (j0 = 0; j0 < l; j0 += 2) { | 
| 88     const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); | 106     const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); | 
| 89     const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); | 107     const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); | 
| 90     const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | 108     const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | 
| 91     const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | 109     const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | 
| 92     const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), | 110     const __m128 a_00_32 = | 
| 93                                           _mm_castsi128_ps(a_32), | 111         _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), | 
| 94                                           _MM_SHUFFLE(1, 0, 1, 0)); | 112                        _MM_SHUFFLE(1, 0, 1, 0)); | 
| 95     const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), | 113     const __m128 a_08_40 = | 
| 96                                           _mm_castsi128_ps(a_40), | 114         _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), | 
| 97                                           _MM_SHUFFLE(1, 0, 1, 0)); | 115                        _MM_SHUFFLE(1, 0, 1, 0)); | 
| 98     __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | 116     __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | 
| 99     const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | 117     const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | 
| 100 | 118 | 
| 101     const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | 119     const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | 
| 102     const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | 120     const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | 
| 103     const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | 121     const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | 
| 104     const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | 122     const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | 
| 105     const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), | 123     const __m128 a_16_48 = | 
| 106                                           _mm_castsi128_ps(a_48), | 124         _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), | 
| 107                                           _MM_SHUFFLE(1, 0, 1, 0)); | 125                        _MM_SHUFFLE(1, 0, 1, 0)); | 
| 108     const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), | 126     const __m128 a_24_56 = | 
| 109                                           _mm_castsi128_ps(a_56), | 127         _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), | 
| 110                                           _MM_SHUFFLE(1, 0, 1, 0)); | 128                        _MM_SHUFFLE(1, 0, 1, 0)); | 
| 111     const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | 129     const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | 
| 112     const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | 130     const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | 
| 113 | 131 | 
| 114     const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 132     const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 
| 115     const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 133     const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 
| 116 | 134 | 
| 117     const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( | 135     const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( | 
| 118         _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); | 136         _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); | 
| 119     const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | 137     const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | 
| 120     const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 138     const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 
| (...skipping 35 matching lines...) |
| 156     const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); | 174     const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); | 
| 157     const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); | 175     const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); | 
| 158     const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); | 176     const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); | 
| 159     const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); | 177     const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); | 
| 160     wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); | 178     wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); | 
| 161     for (j0 = k; j0 < l + k; j0 += 2) { | 179     for (j0 = k; j0 < l + k; j0 += 2) { | 
| 162       const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); | 180       const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); | 
| 163       const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); | 181       const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); | 
| 164       const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | 182       const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | 
| 165       const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | 183       const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | 
| 166       const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), | 184       const __m128 a_00_32 = | 
| 167                                             _mm_castsi128_ps(a_32), | 185           _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), | 
| 168                                             _MM_SHUFFLE(1, 0, 1, 0)); | 186                          _MM_SHUFFLE(1, 0, 1, 0)); | 
| 169       const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), | 187       const __m128 a_08_40 = | 
| 170                                             _mm_castsi128_ps(a_40), | 188           _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), | 
| 171                                             _MM_SHUFFLE(1, 0, 1, 0)); | 189                          _MM_SHUFFLE(1, 0, 1, 0)); | 
| 172       __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | 190       __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | 
| 173       const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | 191       const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | 
| 174 | 192 | 
| 175       const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | 193       const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | 
| 176       const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | 194       const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | 
| 177       const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | 195       const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | 
| 178       const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | 196       const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | 
| 179       const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), | 197       const __m128 a_16_48 = | 
| 180                                             _mm_castsi128_ps(a_48), | 198           _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), | 
| 181                                             _MM_SHUFFLE(1, 0, 1, 0)); | 199                          _MM_SHUFFLE(1, 0, 1, 0)); | 
| 182       const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), | 200       const __m128 a_24_56 = | 
| 183                                             _mm_castsi128_ps(a_56), | 201           _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), | 
| 184                                             _MM_SHUFFLE(1, 0, 1, 0)); | 202                          _MM_SHUFFLE(1, 0, 1, 0)); | 
| 185       const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | 203       const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | 
| 186       const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | 204       const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | 
| 187 | 205 | 
| 188       const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 206       const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 
| 189       const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 207       const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | 
| 190       const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); | 208       const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); | 
| 191       const __m128 xx3 = | 209       const __m128 xx3 = _mm_mul_ps( | 
| 192           _mm_mul_ps(wk2iv, | 210           wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1), | 
| 193                      _mm_castsi128_ps(_mm_shuffle_epi32( | 211                                                     _MM_SHUFFLE(2, 3, 0, 1)))); | 
| 194                          _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1)))); |  | 
| 195       const __m128 xx4 = _mm_add_ps(xx2, xx3); | 212       const __m128 xx4 = _mm_add_ps(xx2, xx3); | 
| 196 | 213 | 
| 197       const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( | 214       const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( | 
| 198           _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); | 215           _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); | 
| 199       const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | 216       const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | 
| 200       const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 217       const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 
| 201       const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 218       const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | 
| 202 | 219 | 
| 203       const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); | 220       const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); | 
| 204       const __m128 xx11 = _mm_mul_ps( | 221       const __m128 xx11 = _mm_mul_ps( | 
| 205           wk1iv, | 222           wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), | 
| 206           _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), | 223                                                     _MM_SHUFFLE(2, 3, 0, 1)))); | 
| 207                                              _MM_SHUFFLE(2, 3, 0, 1)))); |  | 
| 208       const __m128 xx12 = _mm_add_ps(xx10, xx11); | 224       const __m128 xx12 = _mm_add_ps(xx10, xx11); | 
| 209 | 225 | 
| 210       const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); | 226       const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); | 
| 211       const __m128 xx21 = _mm_mul_ps( | 227       const __m128 xx21 = _mm_mul_ps( | 
| 212           wk3iv, | 228           wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), | 
| 213           _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), | 229                                                     _MM_SHUFFLE(2, 3, 0, 1)))); | 
| 214                                              _MM_SHUFFLE(2, 3, 0, 1)))); |  | 
| 215       const __m128 xx22 = _mm_add_ps(xx20, xx21); | 230       const __m128 xx22 = _mm_add_ps(xx20, xx21); | 
| 216 | 231 | 
| 217       _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); | 232       _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); | 
| 218       _mm_storel_epi64( | 233       _mm_storel_epi64( | 
| 219           (__m128i*)&a[j0 + 32], | 234           (__m128i*)&a[j0 + 32], | 
| 220           _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); | 235           _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); | 
| 221 | 236 | 
| 222       _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); | 237       _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); | 
| 223       _mm_storel_epi64( | 238       _mm_storel_epi64( | 
| 224           (__m128i*)&a[j0 + 48], | 239           (__m128i*)&a[j0 + 48], | 
| 225           _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); | 240           _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); | 
| 226 | 241 | 
| 227       _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); | 242       _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); | 
| 228       _mm_storel_epi64( | 243       _mm_storel_epi64( | 
| 229           (__m128i*)&a[j0 + 40], | 244           (__m128i*)&a[j0 + 40], | 
| 230           _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); | 245           _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); | 
| 231 | 246 | 
| 232       _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); | 247       _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); | 
| 233       _mm_storel_epi64( | 248       _mm_storel_epi64( | 
| 234           (__m128i*)&a[j0 + 56], | 249           (__m128i*)&a[j0 + 56], | 
| 235           _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); | 250           _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); | 
| 236     } | 251     } | 
| 237   } | 252   } | 
| 238 } | 253 } | 
| 239 | 254 | 
| 240 static void rftfsub_128_SSE2(float* a) { | 255 void rftfsub_128_SSE2(float* a) { | 
| 241   const float* c = rdft_w + 32; | 256   const float* c = rdft_w + 32; | 
| 242   int j1, j2, k1, k2; | 257   int j1, j2, k1, k2; | 
| 243   float wkr, wki, xr, xi, yr, yi; | 258   float wkr, wki, xr, xi, yr, yi; | 
| 244 | 259 | 
| 245   static const ALIGN16_BEG float ALIGN16_END | 260   static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, | 
| 246       k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; | 261                                                           0.5f}; | 
| 247   const __m128 mm_half = _mm_load_ps(k_half); | 262   const __m128 mm_half = _mm_load_ps(k_half); | 
| 248 | 263 | 
| 249   // Vectorized code (four at once). | 264   // Vectorized code (four at once). | 
| 250 //    Note: commented numbers are indexes for the first iteration of the loop. | 265 //    Note: commented numbers are indexes for the first iteration of the loop. | 
| 251   for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 266   for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 
| 252     // Load 'wk'. | 267     // Load 'wk'. | 
| 253     const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4, | 268     const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4, | 
| 254     const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31, | 269     const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31, | 
| 255     const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31, | 270     const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31, | 
| 256     const __m128 wkr_ = | 271     const __m128 wkr_ = | 
| (...skipping 63 matching lines...) |
| 320     xi = a[j2 + 1] + a[k2 + 1]; | 335     xi = a[j2 + 1] + a[k2 + 1]; | 
| 321     yr = wkr * xr - wki * xi; | 336     yr = wkr * xr - wki * xi; | 
| 322     yi = wkr * xi + wki * xr; | 337     yi = wkr * xi + wki * xr; | 
| 323     a[j2 + 0] -= yr; | 338     a[j2 + 0] -= yr; | 
| 324     a[j2 + 1] -= yi; | 339     a[j2 + 1] -= yi; | 
| 325     a[k2 + 0] += yr; | 340     a[k2 + 0] += yr; | 
| 326     a[k2 + 1] -= yi; | 341     a[k2 + 1] -= yi; | 
| 327   } | 342   } | 
| 328 } | 343 } | 
| 329 | 344 | 
| 330 static void rftbsub_128_SSE2(float* a) { | 345 void rftbsub_128_SSE2(float* a) { | 
| 331   const float* c = rdft_w + 32; | 346   const float* c = rdft_w + 32; | 
| 332   int j1, j2, k1, k2; | 347   int j1, j2, k1, k2; | 
| 333   float wkr, wki, xr, xi, yr, yi; | 348   float wkr, wki, xr, xi, yr, yi; | 
| 334 | 349 | 
| 335   static const ALIGN16_BEG float ALIGN16_END | 350   static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, | 
| 336       k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; | 351                                                           0.5f}; | 
| 337   const __m128 mm_half = _mm_load_ps(k_half); | 352   const __m128 mm_half = _mm_load_ps(k_half); | 
| 338 | 353 | 
| 339   a[1] = -a[1]; | 354   a[1] = -a[1]; | 
| 340   // Vectorized code (four at once). | 355   // Vectorized code (four at once). | 
| 341 //    Note: commented numbers are indexes for the first iteration of the loop. | 356 //    Note: commented numbers are indexes for the first iteration of the loop. | 
| 342   for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 357   for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { | 
| 343     // Load 'wk'. | 358     // Load 'wk'. | 
| 344     const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4, | 359     const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4, | 
| 345     const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31, | 360     const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31, | 
| 346     const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31, | 361     const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31, | 
| (...skipping 64 matching lines...) |
| 411     xi = a[j2 + 1] + a[k2 + 1]; | 426     xi = a[j2 + 1] + a[k2 + 1]; | 
| 412     yr = wkr * xr + wki * xi; | 427     yr = wkr * xr + wki * xi; | 
| 413     yi = wkr * xi - wki * xr; | 428     yi = wkr * xi - wki * xr; | 
| 414     a[j2 + 0] = a[j2 + 0] - yr; | 429     a[j2 + 0] = a[j2 + 0] - yr; | 
| 415     a[j2 + 1] = yi - a[j2 + 1]; | 430     a[j2 + 1] = yi - a[j2 + 1]; | 
| 416     a[k2 + 0] = yr + a[k2 + 0]; | 431     a[k2 + 0] = yr + a[k2 + 0]; | 
| 417     a[k2 + 1] = yi - a[k2 + 1]; | 432     a[k2 + 1] = yi - a[k2 + 1]; | 
| 418   } | 433   } | 
| 419   a[65] = -a[65]; | 434   a[65] = -a[65]; | 
| 420 } | 435 } | 
|  | 436 #endif | 
| 421 | 437 | 
| 422 void aec_rdft_init_sse2(void) { | 438 }  // namespace webrtc | 
| 423   cft1st_128 = cft1st_128_SSE2; |  | 
| 424   cftmdl_128 = cftmdl_128_SSE2; |  | 
| 425   rftfsub_128 = rftfsub_128_SSE2; |  | 
| 426   rftbsub_128 = rftbsub_128_SSE2; |  | 
| 427 } |  | 
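The `k_swap_sign` constant works together with the `_MM_SHUFFLE(2, 3, 0, 1)` shuffles in `cft1st_128_SSE2` and `cftmdl_128_SSE2` to multiply packed complex values by i: the shuffle swaps the real and imaginary float within each 64-bit pair, and the multiply negates the element that lands in the real slot. A minimal standalone sketch (not part of the patch) demonstrating the identity on two complex values:

```c
#include <emmintrin.h>
#include <stdio.h>

static const float k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};

int main(void) {
  const float z[4] = {1.f, 2.f, 3.f, 4.f};  // (1 + 2i), (3 + 4i)
  const __m128 v = _mm_loadu_ps(z);
  // Swap re/im within each complex pair: {2, 1, 4, 3}.
  const __m128 swapped = _mm_castsi128_ps(
      _mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(2, 3, 0, 1)));
  // Negate the element now in each real slot: {-2, 1, -4, 3} == i*z.
  const __m128 r = _mm_mul_ps(_mm_loadu_ps(k_swap_sign), swapped);
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // -2 1 -4 3
  return 0;
}
```

The trick is identical on both sides of the diff; only the formatting of the surrounding calls changes.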
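The `_mm_loadl_epi64` / `_mm_shuffle_ps(..., _MM_SHUFFLE(1, 0, 1, 0))` pairs in `cftmdl_128_SSE2` each read one complex value (two floats) into the low half of a register and then concatenate two low halves, so every butterfly stage processes two complex values at once. A small sketch of that packing idiom, again standalone and not part of the patch:

```c
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  float a[34] = {0};
  a[0] = 1.f; a[1] = 2.f;    // complex value at offset 0
  a[32] = 5.f; a[33] = 6.f;  // complex value at offset 32
  // Each load fills the low 64 bits and zeroes the high 64 bits.
  const __m128i a_00 = _mm_loadl_epi64((const __m128i*)&a[0]);   // {1, 2, 0, 0}
  const __m128i a_32 = _mm_loadl_epi64((const __m128i*)&a[32]);  // {5, 6, 0, 0}
  // Take the low two floats of each operand: {1, 2, 5, 6}.
  const __m128 packed =
      _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
                     _MM_SHUFFLE(1, 0, 1, 0));
  float out[4];
  _mm_storeu_ps(out, packed);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 5 6
  return 0;
}
```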