OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 // Modified from the Chromium original: | 11 // Modified from the Chromium original: |
12 // src/media/base/simd/sinc_resampler_sse.cc | 12 // src/media/base/simd/sinc_resampler_sse.cc |
13 | 13 |
14 #include "webrtc/common_audio/resampler/sinc_resampler.h" | 14 #include "webrtc/common_audio/resampler/sinc_resampler.h" |
15 | 15 |
16 #include <xmmintrin.h> | 16 #include <xmmintrin.h> |
17 | 17 |
18 namespace webrtc { | 18 namespace webrtc { |
19 | 19 |
20 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, | 20 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, |
21 const float* k2, | 21 const float* k2, |
22 double kernel_interpolation_factor) { | 22 double kernel_interpolation_factor) { |
23 __m128 m_input; | 23 __m128 m_input; |
24 __m128 m_sums1 = _mm_setzero_ps(); | 24 __m128 m_sums1 = _mm_setzero_ps(); |
25 __m128 m_sums2 = _mm_setzero_ps(); | 25 __m128 m_sums2 = _mm_setzero_ps(); |
26 | 26 |
27 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling | 27 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling |
28 // these loops hurt performance in local testing. | 28 // these loops hurt performance in local testing. |
29 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { | 29 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { |
30 for (int i = 0; i < kKernelSize; i += 4) { | 30 for (size_t i = 0; i < kKernelSize; i += 4) { |
31 m_input = _mm_loadu_ps(input_ptr + i); | 31 m_input = _mm_loadu_ps(input_ptr + i); |
32 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | 32 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
33 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | 33 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
34 } | 34 } |
35 } else { | 35 } else { |
36 for (int i = 0; i < kKernelSize; i += 4) { | 36 for (size_t i = 0; i < kKernelSize; i += 4) { |
37 m_input = _mm_load_ps(input_ptr + i); | 37 m_input = _mm_load_ps(input_ptr + i); |
38 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | 38 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
39 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | 39 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
40 } | 40 } |
41 } | 41 } |
42 | 42 |
43 // Linearly interpolate the two "convolutions". | 43 // Linearly interpolate the two "convolutions". |
44 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1( | 44 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1( |
45 static_cast<float>(1.0 - kernel_interpolation_factor))); | 45 static_cast<float>(1.0 - kernel_interpolation_factor))); |
46 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1( | 46 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1( |
47 static_cast<float>(kernel_interpolation_factor))); | 47 static_cast<float>(kernel_interpolation_factor))); |
48 m_sums1 = _mm_add_ps(m_sums1, m_sums2); | 48 m_sums1 = _mm_add_ps(m_sums1, m_sums2); |
49 | 49 |
50 // Sum components together. | 50 // Sum components together. |
51 float result; | 51 float result; |
52 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | 52 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); |
53 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | 53 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( |
54 m_sums2, m_sums2, 1))); | 54 m_sums2, m_sums2, 1))); |
55 | 55 |
56 return result; | 56 return result; |
57 } | 57 } |
58 | 58 |
59 } // namespace webrtc | 59 } // namespace webrtc |
OLD | NEW |