OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include "webrtc/modules/desktop_capture/differ_vector_sse2.h" |
| 12 |
| 13 #if defined(_MSC_VER) |
| 14 #include <intrin.h> |
| 15 #else |
| 16 #include <mmintrin.h> |
| 17 #include <emmintrin.h> |
| 18 #endif |
| 19 |
| 20 namespace webrtc { |
| 21 |
// Returns true when the first 64 bytes at |image1| and |image2| are not
// identical, and false when they match byte for byte.
//
// The data is consumed as four unaligned 128-bit chunks per buffer, so neither
// pointer needs any particular alignment, but both must have at least 64
// readable bytes.
// NOTE(review): the W16 suffix presumably means 16 pixels at 4 bytes per
// pixel -- confirm against the caller.
extern bool VectorDifference_SSE2_W16(const uint8_t* image1,
                                      const uint8_t* image2) {
  const __m128i* lhs = reinterpret_cast<const __m128i*>(image1);
  const __m128i* rhs = reinterpret_cast<const __m128i*>(image2);

  // Running sum of absolute byte differences. _mm_sad_epu8 leaves one partial
  // sum in the low 16 bits of each 64-bit half, and those are accumulated here
  // with saturating 16-bit adds.
  __m128i total = _mm_setzero_si128();
  for (int chunk = 0; chunk < 4; ++chunk) {
    const __m128i left = _mm_loadu_si128(lhs + chunk);
    const __m128i right = _mm_loadu_si128(rhs + chunk);
    total = _mm_adds_epu16(total, _mm_sad_epu8(left, right));
  }

  // Shuffle mask 0xEE copies the upper 64 bits into the lower 64 bits, which
  // lets the two partial sums be folded together. Only the low 16 bits of the
  // result carry the combined sum.
  const __m128i upper_half = _mm_shuffle_epi32(total, 0xEE);
  const __m128i folded = _mm_adds_epu16(upper_half, total);
  return _mm_cvtsi128_si32(folded) != 0;
}
| 53 |
// Returns true when the first 128 bytes at |image1| and |image2| are not
// identical, and false when they match byte for byte.
//
// The data is consumed as eight unaligned 128-bit chunks per buffer, so
// neither pointer needs any particular alignment, but both must have at least
// 128 readable bytes.
// NOTE(review): the W32 suffix presumably means 32 pixels at 4 bytes per
// pixel -- confirm against the caller.
extern bool VectorDifference_SSE2_W32(const uint8_t* image1,
                                      const uint8_t* image2) {
  const __m128i* lhs = reinterpret_cast<const __m128i*>(image1);
  const __m128i* rhs = reinterpret_cast<const __m128i*>(image2);

  // Running sum of absolute byte differences. _mm_sad_epu8 leaves one partial
  // sum in the low 16 bits of each 64-bit half, and those are accumulated here
  // with saturating 16-bit adds.
  __m128i total = _mm_setzero_si128();
  for (int chunk = 0; chunk < 8; ++chunk) {
    const __m128i left = _mm_loadu_si128(lhs + chunk);
    const __m128i right = _mm_loadu_si128(rhs + chunk);
    total = _mm_adds_epu16(total, _mm_sad_epu8(left, right));
  }

  // Shuffle mask 0xEE copies the upper 64 bits into the lower 64 bits, which
  // lets the two partial sums be folded together. Only the low 16 bits of the
  // result carry the combined sum.
  const __m128i upper_half = _mm_shuffle_epi32(total, 0xEE);
  const __m128i folded = _mm_adds_epu16(upper_half, total);
  return _mm_cvtsi128_si32(folded) != 0;
}
| 101 |
| 102 } // namespace webrtc |
OLD | NEW |