Index: webrtc/modules/audio_coding/codecs/opus/opus/src/celt/x86/pitch_sse.c
diff --git a/webrtc/modules/audio_coding/codecs/opus/opus/src/celt/x86/pitch_sse.c b/webrtc/modules/audio_coding/codecs/opus/opus/src/celt/x86/pitch_sse.c
new file mode 100644
index 0000000000000000000000000000000000000000..20e73126b623228f850e172c7b8fac01e92fd0cf
--- /dev/null
+++ b/webrtc/modules/audio_coding/codecs/opus/opus/src/celt/x86/pitch_sse.c
@@ -0,0 +1,203 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+
+#include <xmmintrin.h>
+#include "arch.h"
+
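+/* Compute four running correlations at once:
+   sum[k] += x[j]*y[j+k] for k = 0..3, accumulated over j = 0..len-1. */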
+void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+{
+   int j;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_loadu_ps(sum);
+   xsum2 = _mm_setzero_ps();
+
+   for (j = 0; j < len-3; j += 4)
+   {
+      __m128 x0 = _mm_loadu_ps(x+j);
+      __m128 yj = _mm_loadu_ps(y+j);
+      __m128 y3 = _mm_loadu_ps(y+j+3);
+
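+      /* Broadcast each lane of x0 and multiply by the matching 4-wide
+         window of y; the shuffles assemble the shifted windows starting
+         at y+j+1 and y+j+2 from yj and y3, avoiding two extra loads. */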
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
+                                          _mm_shuffle_ps(yj,y3,0x49)));
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
+                                          _mm_shuffle_ps(yj,y3,0x9e)));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
+   }
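+   /* Scalar tail: pick up at most three remaining x samples one at a time. */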
+   if (j < len)
+   {
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+      if (++j < len)
+      {
+         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         if (++j < len)
+         {
+            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         }
+      }
+   }
+   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
+}
+
+
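+/* Compute two inner products sharing the same x in a single pass:
+   xy1 = <x,y01> and xy2 = <x,y02>, each over N samples. */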
+void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_setzero_ps();
+   xsum2 = _mm_setzero_ps();
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 y1i = _mm_loadu_ps(y01+i);
+      __m128 y2i = _mm_loadu_ps(y02+i);
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
+   }
+   /* Horizontal sum */
+   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
+   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
+   _mm_store_ss(xy1, xsum1);
+   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
+   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
+   _mm_store_ss(xy2, xsum2);
+   for (;i<N;i++)
+   {
+      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
+      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
+   }
+}
+
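+/* Inner product <x,y> over N samples, 4-way vectorized with a scalar tail. */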
+opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+   int i;
+   float xy;
+   __m128 sum;
+   sum = _mm_setzero_ps();
+   /* FIXME: We should probably go 8-way and use 2 sums. */
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 yi = _mm_loadu_ps(y+i);
+      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
+   }
+   /* Horizontal sum */
+   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+   _mm_store_ss(&xy, sum);
+   for (;i<N;i++)
+   {
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
+
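+/* Constant-gain comb filter:
+   y[i] = x[i] + g10*x[i-T] + g11*(x[i-T+1]+x[i-T-1])
+               + g12*(x[i-T+2]+x[i-T-2]) */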
+void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10);
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
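+   /* Preload x[-T-2..-T+1]; x0v is carried across iterations (x0v=x4v
+      below), so each loop body issues only one new load of the delayed
+      signal. */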
+   x0v = _mm_loadu_ps(&x[-T-2]);
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2];
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4);
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
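+      /* Rebuild the three intermediate windows xp+1..xp+3 from x0v and
+         x4v with shuffles instead of three more unaligned loads. */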
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
+      x0v=x4v;
+      _mm_storeu_ps(y+i, yi);
+   }
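+   /* Scalar tail for the N%4 remainder; the guard suggests N is always a
+      multiple of four except with custom modes. */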
+#ifdef CUSTOM_MODES
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+             + MULT16_32_Q15(g10,x[i-T])
+             + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+             + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+#endif
+}
+
+
+#endif