| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 // Modified from the Chromium original: | 11 // Modified from the Chromium original: |
| 12 // src/media/base/simd/sinc_resampler_sse.cc | 12 // src/media/base/simd/sinc_resampler_sse.cc |
| 13 | 13 |
| 14 #include "webrtc/common_audio/resampler/sinc_resampler.h" | 14 #include "webrtc/common_audio/resampler/sinc_resampler.h" |
| 15 | 15 |
| 16 #include <xmmintrin.h> | 16 #include <xmmintrin.h> |
| 17 | 17 |
| 18 namespace webrtc { | 18 namespace webrtc { |
| 19 | 19 |
| 20 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, | 20 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, |
| 21 const float* k2, | 21 const float* k2, |
| 22 double kernel_interpolation_factor) { | 22 double kernel_interpolation_factor) { |
| 23 __m128 m_input; | 23 __m128 m_input; |
| 24 __m128 m_sums1 = _mm_setzero_ps(); | 24 __m128 m_sums1 = _mm_setzero_ps(); |
| 25 __m128 m_sums2 = _mm_setzero_ps(); | 25 __m128 m_sums2 = _mm_setzero_ps(); |
| 26 | 26 |
| 27 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling | 27 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling |
| 28 // these loops hurt performance in local testing. | 28 // these loops hurt performance in local testing. |
| 29 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { | 29 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { |
| 30 for (int i = 0; i < kKernelSize; i += 4) { | 30 for (size_t i = 0; i < kKernelSize; i += 4) { |
| 31 m_input = _mm_loadu_ps(input_ptr + i); | 31 m_input = _mm_loadu_ps(input_ptr + i); |
| 32 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | 32 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
| 33 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | 33 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
| 34 } | 34 } |
| 35 } else { | 35 } else { |
| 36 for (int i = 0; i < kKernelSize; i += 4) { | 36 for (size_t i = 0; i < kKernelSize; i += 4) { |
| 37 m_input = _mm_load_ps(input_ptr + i); | 37 m_input = _mm_load_ps(input_ptr + i); |
| 38 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | 38 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
| 39 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | 39 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
| 40 } | 40 } |
| 41 } | 41 } |
| 42 | 42 |
| 43 // Linearly interpolate the two "convolutions". | 43 // Linearly interpolate the two "convolutions". |
| 44 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1( | 44 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1( |
| 45 static_cast<float>(1.0 - kernel_interpolation_factor))); | 45 static_cast<float>(1.0 - kernel_interpolation_factor))); |
| 46 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1( | 46 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1( |
| 47 static_cast<float>(kernel_interpolation_factor))); | 47 static_cast<float>(kernel_interpolation_factor))); |
| 48 m_sums1 = _mm_add_ps(m_sums1, m_sums2); | 48 m_sums1 = _mm_add_ps(m_sums1, m_sums2); |
| 49 | 49 |
| 50 // Sum components together. | 50 // Sum components together. |
| 51 float result; | 51 float result; |
| 52 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | 52 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); |
| 53 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | 53 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( |
| 54 m_sums2, m_sums2, 1))); | 54 m_sums2, m_sums2, 1))); |
| 55 | 55 |
| 56 return result; | 56 return result; |
| 57 } | 57 } |
| 58 | 58 |
| 59 } // namespace webrtc | 59 } // namespace webrtc |
| OLD | NEW |