OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "webrtc/modules/video_processing/content_analysis.h" | 11 #include "webrtc/modules/video_processing/content_analysis.h" |
12 | 12 |
13 #include <emmintrin.h> | 13 #include <emmintrin.h> |
14 #include <math.h> | 14 #include <math.h> |
15 | 15 |
16 namespace webrtc { | 16 namespace webrtc { |
17 | 17 |
18 int32_t VPMContentAnalysis::TemporalDiffMetric_SSE2() { | 18 int32_t VPMContentAnalysis::TemporalDiffMetric_SSE2() { |
19 uint32_t num_pixels = 0; // counter for # of pixels | 19 uint32_t num_pixels = 0; // counter for # of pixels |
20 const uint8_t* imgBufO = orig_frame_ + border_*width_ + border_; | 20 const uint8_t* imgBufO = orig_frame_ + border_ * width_ + border_; |
21 const uint8_t* imgBufP = prev_frame_ + border_*width_ + border_; | 21 const uint8_t* imgBufP = prev_frame_ + border_ * width_ + border_; |
22 | 22 |
23 const int32_t width_end = ((width_ - 2*border_) & -16) + border_; | 23 const int32_t width_end = ((width_ - 2 * border_) & -16) + border_; |
24 | 24 |
25 __m128i sad_64 = _mm_setzero_si128(); | 25 __m128i sad_64 = _mm_setzero_si128(); |
26 __m128i sum_64 = _mm_setzero_si128(); | 26 __m128i sum_64 = _mm_setzero_si128(); |
27 __m128i sqsum_64 = _mm_setzero_si128(); | 27 __m128i sqsum_64 = _mm_setzero_si128(); |
28 const __m128i z = _mm_setzero_si128(); | 28 const __m128i z = _mm_setzero_si128(); |
29 | 29 |
30 for (uint16_t i = 0; i < (height_ - 2*border_); i += skip_num_) { | 30 for (uint16_t i = 0; i < (height_ - 2 * border_); i += skip_num_) { |
31 __m128i sqsum_32 = _mm_setzero_si128(); | 31 __m128i sqsum_32 = _mm_setzero_si128(); |
32 | 32 |
33 const uint8_t *lineO = imgBufO; | 33 const uint8_t* lineO = imgBufO; |
34 const uint8_t *lineP = imgBufP; | 34 const uint8_t* lineP = imgBufP; |
35 | 35 |
36 // Work on 16 pixels at a time. For HD content with a width of 1920 | 36 // Work on 16 pixels at a time. For HD content with a width of 1920 |
37 // this loop will run ~120 times (depending on border). Maximum for | 37 // this loop will run ~120 times (depending on border). Maximum for |
38 // abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit | 38 // abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit |
39 // results which are then accumulated. There is no chance of | 39 // results which are then accumulated. There is no chance of |
40 // rollover for these two accumulators. | 40 // rollover for these two accumulators. |
41 // o*o will have a maximum of 255*255 = 65025. This will roll over | 41 // o*o will have a maximum of 255*255 = 65025. This will roll over |
42 // a 16 bit accumulator as 120*65025 > 65535, but will fit in a | 42 // a 16 bit accumulator as 120*65025 > 65535, but will fit in a |
43 // 32 bit accumulator. | 43 // 32 bit accumulator. |
44 for (uint16_t j = 0; j < width_end - border_; j += 16) { | 44 for (uint16_t j = 0; j < width_end - border_; j += 16) { |
45 const __m128i o = _mm_loadu_si128((__m128i*)(lineO)); | 45 const __m128i o = _mm_loadu_si128((__m128i*)(lineO)); |
46 const __m128i p = _mm_loadu_si128((__m128i*)(lineP)); | 46 const __m128i p = _mm_loadu_si128((__m128i*)(lineP)); |
47 | 47 |
48 lineO += 16; | 48 lineO += 16; |
49 lineP += 16; | 49 lineP += 16; |
50 | 50 |
51 // Abs pixel difference between frames. | 51 // Abs pixel difference between frames. |
52 sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p)); | 52 sad_64 = _mm_add_epi64(sad_64, _mm_sad_epu8(o, p)); |
53 | 53 |
54 // sum of all pixels in frame | 54 // sum of all pixels in frame |
55 sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z)); | 55 sum_64 = _mm_add_epi64(sum_64, _mm_sad_epu8(o, z)); |
56 | 56 |
57 // Squared sum of all pixels in frame. | 57 // Squared sum of all pixels in frame. |
58 const __m128i olo = _mm_unpacklo_epi8(o,z); | 58 const __m128i olo = _mm_unpacklo_epi8(o, z); |
59 const __m128i ohi = _mm_unpackhi_epi8(o,z); | 59 const __m128i ohi = _mm_unpackhi_epi8(o, z); |
60 | 60 |
61 const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo); | 61 const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo); |
62 const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi); | 62 const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi); |
63 | 63 |
64 sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo); | 64 sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo); |
65 sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi); | 65 sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi); |
66 } | 66 } |
67 | 67 |
68 // Add to 64 bit running sum so as not to roll over. | 68 // Add to 64 bit running sum so as not to roll over. |
69 sqsum_64 = _mm_add_epi64(sqsum_64, | 69 sqsum_64 = |
70 _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z), | 70 _mm_add_epi64(sqsum_64, _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32, z), |
71 _mm_unpacklo_epi32(sqsum_32,z))); | 71 _mm_unpacklo_epi32(sqsum_32, z))); |
72 | 72 |
73 imgBufO += width_ * skip_num_; | 73 imgBufO += width_ * skip_num_; |
74 imgBufP += width_ * skip_num_; | 74 imgBufP += width_ * skip_num_; |
75 num_pixels += (width_end - border_); | 75 num_pixels += (width_end - border_); |
76 } | 76 } |
77 | 77 |
78 __m128i sad_final_128; | 78 __m128i sad_final_128; |
79 __m128i sum_final_128; | 79 __m128i sum_final_128; |
80 __m128i sqsum_final_128; | 80 __m128i sqsum_final_128; |
81 | 81 |
82 // Bring sums out of vector registers and into integer register | 82 // Bring sums out of vector registers and into integer register |
83 // domain, summing them along the way. | 83 // domain, summing them along the way. |
84 _mm_store_si128 (&sad_final_128, sad_64); | 84 _mm_store_si128(&sad_final_128, sad_64); |
85 _mm_store_si128 (&sum_final_128, sum_64); | 85 _mm_store_si128(&sum_final_128, sum_64); |
86 _mm_store_si128 (&sqsum_final_128, sqsum_64); | 86 _mm_store_si128(&sqsum_final_128, sqsum_64); |
87 | 87 |
88 uint64_t *sad_final_64 = reinterpret_cast<uint64_t*>(&sad_final_128); | 88 uint64_t* sad_final_64 = reinterpret_cast<uint64_t*>(&sad_final_128); |
89 uint64_t *sum_final_64 = reinterpret_cast<uint64_t*>(&sum_final_128); | 89 uint64_t* sum_final_64 = reinterpret_cast<uint64_t*>(&sum_final_128); |
90 uint64_t *sqsum_final_64 = reinterpret_cast<uint64_t*>(&sqsum_final_128); | 90 uint64_t* sqsum_final_64 = reinterpret_cast<uint64_t*>(&sqsum_final_128); |
91 | 91 |
92 const uint32_t pixelSum = sum_final_64[0] + sum_final_64[1]; | 92 const uint32_t pixelSum = sum_final_64[0] + sum_final_64[1]; |
93 const uint64_t pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1]; | 93 const uint64_t pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1]; |
94 const uint32_t tempDiffSum = sad_final_64[0] + sad_final_64[1]; | 94 const uint32_t tempDiffSum = sad_final_64[0] + sad_final_64[1]; |
95 | 95 |
96 // Default. | 96 // Default. |
97 motion_magnitude_ = 0.0f; | 97 motion_magnitude_ = 0.0f; |
98 | 98 |
99 if (tempDiffSum == 0) return VPM_OK; | 99 if (tempDiffSum == 0) |
| 100 return VPM_OK; |
100 | 101 |
101 // Normalize over all pixels. | 102 // Normalize over all pixels. |
102 const float tempDiffAvg = (float)tempDiffSum / (float)(num_pixels); | 103 const float tempDiffAvg = (float)tempDiffSum / (float)(num_pixels); |
103 const float pixelSumAvg = (float)pixelSum / (float)(num_pixels); | 104 const float pixelSumAvg = (float)pixelSum / (float)(num_pixels); |
104 const float pixelSqSumAvg = (float)pixelSqSum / (float)(num_pixels); | 105 const float pixelSqSumAvg = (float)pixelSqSum / (float)(num_pixels); |
105 float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg); | 106 float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg); |
106 | 107 |
107 if (contrast > 0.0) { | 108 if (contrast > 0.0) { |
108 contrast = sqrt(contrast); | 109 contrast = sqrt(contrast); |
109 motion_magnitude_ = tempDiffAvg/contrast; | 110 motion_magnitude_ = tempDiffAvg / contrast; |
110 } | 111 } |
111 | 112 |
112 return VPM_OK; | 113 return VPM_OK; |
113 } | 114 } |
114 | 115 |
115 int32_t VPMContentAnalysis::ComputeSpatialMetrics_SSE2() { | 116 int32_t VPMContentAnalysis::ComputeSpatialMetrics_SSE2() { |
116 const uint8_t* imgBuf = orig_frame_ + border_*width_; | 117 const uint8_t* imgBuf = orig_frame_ + border_ * width_; |
117 const int32_t width_end = ((width_ - 2 * border_) & -16) + border_; | 118 const int32_t width_end = ((width_ - 2 * border_) & -16) + border_; |
118 | 119 |
119 __m128i se_32 = _mm_setzero_si128(); | 120 __m128i se_32 = _mm_setzero_si128(); |
120 __m128i sev_32 = _mm_setzero_si128(); | 121 __m128i sev_32 = _mm_setzero_si128(); |
121 __m128i seh_32 = _mm_setzero_si128(); | 122 __m128i seh_32 = _mm_setzero_si128(); |
122 __m128i msa_32 = _mm_setzero_si128(); | 123 __m128i msa_32 = _mm_setzero_si128(); |
123 const __m128i z = _mm_setzero_si128(); | 124 const __m128i z = _mm_setzero_si128(); |
124 | 125 |
125 // Error is accumulated as a 32 bit value. Looking at HD content with a | 126 // Error is accumulated as a 32 bit value. Looking at HD content with a |
126 // height of 1080 lines, or about 67 macro blocks. If the 16 bit row | 127 // height of 1080 lines, or about 67 macro blocks. If the 16 bit row |
127 // value is maxed out at 65535 for every row, 65535*1080 = 70777800, which | 128 // value is maxed out at 65535 for every row, 65535*1080 = 70777800, which |
128 // will not roll over a 32 bit accumulator. | 129 // will not roll over a 32 bit accumulator. |
129 // skip_num_ is also used to reduce the number of rows | 130 // skip_num_ is also used to reduce the number of rows |
130 for (int32_t i = 0; i < (height_ - 2*border_); i += skip_num_) { | 131 for (int32_t i = 0; i < (height_ - 2 * border_); i += skip_num_) { |
131 __m128i se_16 = _mm_setzero_si128(); | 132 __m128i se_16 = _mm_setzero_si128(); |
132 __m128i sev_16 = _mm_setzero_si128(); | 133 __m128i sev_16 = _mm_setzero_si128(); |
133 __m128i seh_16 = _mm_setzero_si128(); | 134 __m128i seh_16 = _mm_setzero_si128(); |
134 __m128i msa_16 = _mm_setzero_si128(); | 135 __m128i msa_16 = _mm_setzero_si128(); |
135 | 136 |
136 // Row error is accumulated as a 16 bit value. There are 8 | 137 // Row error is accumulated as a 16 bit value. There are 8 |
137 // accumulators. Max value of a 16 bit number is 65535. Looking | 138 // accumulators. Max value of a 16 bit number is 65535. Looking |
138 // at HD content, 1080p has a width of 1920, or 120 macro blocks. | 139 // at HD content, 1080p has a width of 1920, or 120 macro blocks. |
139 // One macro block is processed at a time. Absolute max error at | 140 // One macro block is processed at a time. Absolute max error at |
140 // a point would be abs(0 - (255+255+255+255)), which equals 1020. | 141 // a point would be abs(0 - (255+255+255+255)), which equals 1020. |
141 // 120*1020 = 122400. The probability of hitting this is quite low | 142 // 120*1020 = 122400. The probability of hitting this is quite low |
142 // on well behaved content. A specially crafted image could roll over. | 143 // on well behaved content. A specially crafted image could roll over. |
143 // border_ could also be adjusted to concentrate on just the center of | 144 // border_ could also be adjusted to concentrate on just the center of |
144 // the images for an HD capture in order to reduce the possibility of | 145 // the images for an HD capture in order to reduce the possibility of |
145 // rollover. | 146 // rollover. |
146 const uint8_t *lineTop = imgBuf - width_ + border_; | 147 const uint8_t* lineTop = imgBuf - width_ + border_; |
147 const uint8_t *lineCen = imgBuf + border_; | 148 const uint8_t* lineCen = imgBuf + border_; |
148 const uint8_t *lineBot = imgBuf + width_ + border_; | 149 const uint8_t* lineBot = imgBuf + width_ + border_; |
149 | 150 |
150 for (int32_t j = 0; j < width_end - border_; j += 16) { | 151 for (int32_t j = 0; j < width_end - border_; j += 16) { |
151 const __m128i t = _mm_loadu_si128((__m128i*)(lineTop)); | 152 const __m128i t = _mm_loadu_si128((__m128i*)(lineTop)); |
152 const __m128i l = _mm_loadu_si128((__m128i*)(lineCen - 1)); | 153 const __m128i l = _mm_loadu_si128((__m128i*)(lineCen - 1)); |
153 const __m128i c = _mm_loadu_si128((__m128i*)(lineCen)); | 154 const __m128i c = _mm_loadu_si128((__m128i*)(lineCen)); |
154 const __m128i r = _mm_loadu_si128((__m128i*)(lineCen + 1)); | 155 const __m128i r = _mm_loadu_si128((__m128i*)(lineCen + 1)); |
155 const __m128i b = _mm_loadu_si128((__m128i*)(lineBot)); | 156 const __m128i b = _mm_loadu_si128((__m128i*)(lineBot)); |
156 | 157 |
157 lineTop += 16; | 158 lineTop += 16; |
158 lineCen += 16; | 159 lineCen += 16; |
159 lineBot += 16; | 160 lineBot += 16; |
160 | 161 |
161 // center pixel unpacked | 162 // center pixel unpacked |
162 __m128i clo = _mm_unpacklo_epi8(c,z); | 163 __m128i clo = _mm_unpacklo_epi8(c, z); |
163 __m128i chi = _mm_unpackhi_epi8(c,z); | 164 __m128i chi = _mm_unpackhi_epi8(c, z); |
164 | 165 |
165 // left right pixels unpacked and added together | 166 // left right pixels unpacked and added together |
166 const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z), | 167 const __m128i lrlo = |
167 _mm_unpacklo_epi8(r,z)); | 168 _mm_add_epi16(_mm_unpacklo_epi8(l, z), _mm_unpacklo_epi8(r, z)); |
168 const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z), | 169 const __m128i lrhi = |
169 _mm_unpackhi_epi8(r,z)); | 170 _mm_add_epi16(_mm_unpackhi_epi8(l, z), _mm_unpackhi_epi8(r, z)); |
170 | 171 |
171 // top & bottom pixels unpacked and added together | 172 // top & bottom pixels unpacked and added together |
172 const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z), | 173 const __m128i tblo = |
173 _mm_unpacklo_epi8(b,z)); | 174 _mm_add_epi16(_mm_unpacklo_epi8(t, z), _mm_unpacklo_epi8(b, z)); |
174 const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z), | 175 const __m128i tbhi = |
175 _mm_unpackhi_epi8(b,z)); | 176 _mm_add_epi16(_mm_unpackhi_epi8(t, z), _mm_unpackhi_epi8(b, z)); |
176 | 177 |
177 // running sum of all pixels | 178 // running sum of all pixels |
178 msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo)); | 179 msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo)); |
179 | 180 |
180 clo = _mm_slli_epi16(clo, 1); | 181 clo = _mm_slli_epi16(clo, 1); |
181 chi = _mm_slli_epi16(chi, 1); | 182 chi = _mm_slli_epi16(chi, 1); |
182 const __m128i sevtlo = _mm_subs_epi16(clo, tblo); | 183 const __m128i sevtlo = _mm_subs_epi16(clo, tblo); |
183 const __m128i sevthi = _mm_subs_epi16(chi, tbhi); | 184 const __m128i sevthi = _mm_subs_epi16(chi, tbhi); |
184 const __m128i sehtlo = _mm_subs_epi16(clo, lrlo); | 185 const __m128i sehtlo = _mm_subs_epi16(clo, lrlo); |
185 const __m128i sehthi = _mm_subs_epi16(chi, lrhi); | 186 const __m128i sehthi = _mm_subs_epi16(chi, lrhi); |
186 | 187 |
187 clo = _mm_slli_epi16(clo, 1); | 188 clo = _mm_slli_epi16(clo, 1); |
188 chi = _mm_slli_epi16(chi, 1); | 189 chi = _mm_slli_epi16(chi, 1); |
189 const __m128i setlo = _mm_subs_epi16(clo, _mm_add_epi16(lrlo, tblo)); | 190 const __m128i setlo = _mm_subs_epi16(clo, _mm_add_epi16(lrlo, tblo)); |
190 const __m128i sethi = _mm_subs_epi16(chi, _mm_add_epi16(lrhi, tbhi)); | 191 const __m128i sethi = _mm_subs_epi16(chi, _mm_add_epi16(lrhi, tbhi)); |
191 | 192 |
192 // Add to 16 bit running sum | 193 // Add to 16 bit running sum |
193 se_16 = _mm_add_epi16(se_16, _mm_max_epi16(setlo, | 194 se_16 = |
194 _mm_subs_epi16(z, setlo))); | 195 _mm_add_epi16(se_16, _mm_max_epi16(setlo, _mm_subs_epi16(z, setlo))); |
195 se_16 = _mm_add_epi16(se_16, _mm_max_epi16(sethi, | 196 se_16 = |
196 _mm_subs_epi16(z, sethi))); | 197 _mm_add_epi16(se_16, _mm_max_epi16(sethi, _mm_subs_epi16(z, sethi))); |
197 sev_16 = _mm_add_epi16(sev_16, _mm_max_epi16(sevtlo, | 198 sev_16 = _mm_add_epi16(sev_16, |
198 _mm_subs_epi16(z, sevtlo))); | 199 _mm_max_epi16(sevtlo, _mm_subs_epi16(z, sevtlo))); |
199 sev_16 = _mm_add_epi16(sev_16, _mm_max_epi16(sevthi, | 200 sev_16 = _mm_add_epi16(sev_16, |
200 _mm_subs_epi16(z, sevthi))); | 201 _mm_max_epi16(sevthi, _mm_subs_epi16(z, sevthi))); |
201 seh_16 = _mm_add_epi16(seh_16, _mm_max_epi16(sehtlo, | 202 seh_16 = _mm_add_epi16(seh_16, |
202 _mm_subs_epi16(z, sehtlo))); | 203 _mm_max_epi16(sehtlo, _mm_subs_epi16(z, sehtlo))); |
203 seh_16 = _mm_add_epi16(seh_16, _mm_max_epi16(sehthi, | 204 seh_16 = _mm_add_epi16(seh_16, |
204 _mm_subs_epi16(z, sehthi))); | 205 _mm_max_epi16(sehthi, _mm_subs_epi16(z, sehthi))); |
205 } | 206 } |
206 | 207 |
207 // Add to 32 bit running sum so as not to roll over. | 208 // Add to 32 bit running sum so as not to roll over. |
208 se_32 = _mm_add_epi32(se_32, _mm_add_epi32(_mm_unpackhi_epi16(se_16,z), | 209 se_32 = _mm_add_epi32(se_32, _mm_add_epi32(_mm_unpackhi_epi16(se_16, z), |
209 _mm_unpacklo_epi16(se_16,z))); | 210 _mm_unpacklo_epi16(se_16, z))); |
210 sev_32 = _mm_add_epi32(sev_32, _mm_add_epi32(_mm_unpackhi_epi16(sev_16,z), | 211 sev_32 = |
211 _mm_unpacklo_epi16(sev_16,z))); | 212 _mm_add_epi32(sev_32, _mm_add_epi32(_mm_unpackhi_epi16(sev_16, z), |
212 seh_32 = _mm_add_epi32(seh_32, _mm_add_epi32(_mm_unpackhi_epi16(seh_16,z), | 213 _mm_unpacklo_epi16(sev_16, z))); |
213 _mm_unpacklo_epi16(seh_16,z))); | 214 seh_32 = |
214 msa_32 = _mm_add_epi32(msa_32, _mm_add_epi32(_mm_unpackhi_epi16(msa_16,z), | 215 _mm_add_epi32(seh_32, _mm_add_epi32(_mm_unpackhi_epi16(seh_16, z), |
215 _mm_unpacklo_epi16(msa_16,z))); | 216 _mm_unpacklo_epi16(seh_16, z))); |
| 217 msa_32 = |
| 218 _mm_add_epi32(msa_32, _mm_add_epi32(_mm_unpackhi_epi16(msa_16, z), |
| 219 _mm_unpacklo_epi16(msa_16, z))); |
216 | 220 |
217 imgBuf += width_ * skip_num_; | 221 imgBuf += width_ * skip_num_; |
218 } | 222 } |
219 | 223 |
220 __m128i se_128; | 224 __m128i se_128; |
221 __m128i sev_128; | 225 __m128i sev_128; |
222 __m128i seh_128; | 226 __m128i seh_128; |
223 __m128i msa_128; | 227 __m128i msa_128; |
224 | 228 |
225 // Bring sums out of vector registers and into integer register | 229 // Bring sums out of vector registers and into integer register |
226 // domain, summing them along the way. | 230 // domain, summing them along the way. |
227 _mm_store_si128 (&se_128, _mm_add_epi64(_mm_unpackhi_epi32(se_32,z), | 231 _mm_store_si128(&se_128, _mm_add_epi64(_mm_unpackhi_epi32(se_32, z), |
228 _mm_unpacklo_epi32(se_32,z))); | 232 _mm_unpacklo_epi32(se_32, z))); |
229 _mm_store_si128 (&sev_128, _mm_add_epi64(_mm_unpackhi_epi32(sev_32,z), | 233 _mm_store_si128(&sev_128, _mm_add_epi64(_mm_unpackhi_epi32(sev_32, z), |
230 _mm_unpacklo_epi32(sev_32,z))); | 234 _mm_unpacklo_epi32(sev_32, z))); |
231 _mm_store_si128 (&seh_128, _mm_add_epi64(_mm_unpackhi_epi32(seh_32,z), | 235 _mm_store_si128(&seh_128, _mm_add_epi64(_mm_unpackhi_epi32(seh_32, z), |
232 _mm_unpacklo_epi32(seh_32,z))); | 236 _mm_unpacklo_epi32(seh_32, z))); |
233 _mm_store_si128 (&msa_128, _mm_add_epi64(_mm_unpackhi_epi32(msa_32,z), | 237 _mm_store_si128(&msa_128, _mm_add_epi64(_mm_unpackhi_epi32(msa_32, z), |
234 _mm_unpacklo_epi32(msa_32,z))); | 238 _mm_unpacklo_epi32(msa_32, z))); |
235 | 239 |
236 uint64_t *se_64 = reinterpret_cast<uint64_t*>(&se_128); | 240 uint64_t* se_64 = reinterpret_cast<uint64_t*>(&se_128); |
237 uint64_t *sev_64 = reinterpret_cast<uint64_t*>(&sev_128); | 241 uint64_t* sev_64 = reinterpret_cast<uint64_t*>(&sev_128); |
238 uint64_t *seh_64 = reinterpret_cast<uint64_t*>(&seh_128); | 242 uint64_t* seh_64 = reinterpret_cast<uint64_t*>(&seh_128); |
239 uint64_t *msa_64 = reinterpret_cast<uint64_t*>(&msa_128); | 243 uint64_t* msa_64 = reinterpret_cast<uint64_t*>(&msa_128); |
240 | 244 |
241 const uint32_t spatialErrSum = se_64[0] + se_64[1]; | 245 const uint32_t spatialErrSum = se_64[0] + se_64[1]; |
242 const uint32_t spatialErrVSum = sev_64[0] + sev_64[1]; | 246 const uint32_t spatialErrVSum = sev_64[0] + sev_64[1]; |
243 const uint32_t spatialErrHSum = seh_64[0] + seh_64[1]; | 247 const uint32_t spatialErrHSum = seh_64[0] + seh_64[1]; |
244 const uint32_t pixelMSA = msa_64[0] + msa_64[1]; | 248 const uint32_t pixelMSA = msa_64[0] + msa_64[1]; |
245 | 249 |
246 // Normalize over all pixels. | 250 // Normalize over all pixels. |
247 const float spatialErr = (float)(spatialErrSum >> 2); | 251 const float spatialErr = (float)(spatialErrSum >> 2); |
248 const float spatialErrH = (float)(spatialErrHSum >> 1); | 252 const float spatialErrH = (float)(spatialErrHSum >> 1); |
249 const float spatialErrV = (float)(spatialErrVSum >> 1); | 253 const float spatialErrV = (float)(spatialErrVSum >> 1); |
250 const float norm = (float)pixelMSA; | 254 const float norm = (float)pixelMSA; |
251 | 255 |
252 // 2X2: | 256 // 2X2: |
253 spatial_pred_err_ = spatialErr / norm; | 257 spatial_pred_err_ = spatialErr / norm; |
254 | 258 |
255 // 1X2: | 259 // 1X2: |
256 spatial_pred_err_h_ = spatialErrH / norm; | 260 spatial_pred_err_h_ = spatialErrH / norm; |
257 | 261 |
258 // 2X1: | 262 // 2X1: |
259 spatial_pred_err_v_ = spatialErrV / norm; | 263 spatial_pred_err_v_ = spatialErrV / norm; |
260 | 264 |
261 return VPM_OK; | 265 return VPM_OK; |
262 } | 266 } |
263 | 267 |
264 } // namespace webrtc | 268 } // namespace webrtc |
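
For reference, the routine diffed above (TemporalDiffMetric_SSE2) computes motion_magnitude_ as the mean absolute difference between the current and previous frame, normalized by the contrast (standard deviation) of the current frame. Below is a minimal scalar sketch of the same arithmetic without intrinsics; it assumes the same frame layout and sampling parameters as the members used above (orig_frame_, prev_frame_, width_, height_, border_, skip_num_), and the free-function name and signature are illustrative only, not part of WebRTC. Unlike the SSE2 version it does not trim each row to a multiple of 16 pixels, so results may differ slightly near the right border.

#include <math.h>
#include <stdint.h>

// Scalar sketch of the temporal difference metric: mean |orig - prev| divided
// by the standard deviation (contrast) of the current frame. Parameter names
// mirror the members used by the SSE2 code above and are illustrative.
float ScalarTemporalDiffMetric(const uint8_t* orig_frame,
                               const uint8_t* prev_frame,
                               int width, int height, int border,
                               int skip_num) {
  uint64_t sad = 0;    // sum of |orig - prev|
  uint64_t sum = 0;    // sum of orig pixels
  uint64_t sqsum = 0;  // sum of orig pixels squared
  uint32_t num_pixels = 0;
  for (int i = border; i < height - border; i += skip_num) {
    for (int j = border; j < width - border; ++j) {
      const uint8_t o = orig_frame[i * width + j];
      const uint8_t p = prev_frame[i * width + j];
      sad += (o > p) ? (o - p) : (p - o);
      sum += o;
      sqsum += (uint32_t)o * o;
      ++num_pixels;
    }
  }
  if (sad == 0 || num_pixels == 0)
    return 0.0f;
  const float temp_diff_avg = (float)sad / (float)num_pixels;
  const float pixel_sum_avg = (float)sum / (float)num_pixels;
  const float pixel_sqsum_avg = (float)sqsum / (float)num_pixels;
  const float contrast = pixel_sqsum_avg - pixel_sum_avg * pixel_sum_avg;
  return (contrast > 0.0f) ? temp_diff_avg / sqrtf(contrast) : 0.0f;
}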
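
Likewise, ComputeSpatialMetrics_SSE2 accumulates, for each interior pixel c with left/right neighbors l, r and top/bottom neighbors t, b, the absolute prediction errors |4c - (l+r+t+b)|, |2c - (t+b)| and |2c - (l+r)|, divides the four-neighbor sum by 4 and the pair sums by 2, and normalizes by the sum of the center pixels. A scalar sketch under the same assumptions (member layout as above, border >= 1, illustrative free-function signature); the SSE2 code uses saturating 16-bit arithmetic, so this is a reference for the metric, not a bit-exact reimplementation:

#include <stdint.h>
#include <stdlib.h>

// Scalar sketch of the spatial prediction error metrics: 2x2 (four-neighbor),
// 2x1 (top/bottom) and 1x2 (left/right) errors, each normalized by the sum of
// the center pixels. Names mirror the SSE2 code above and are illustrative.
void ScalarSpatialMetrics(const uint8_t* frame, int width, int height,
                          int border, int skip_num,
                          float* err_2x2, float* err_v_2x1, float* err_h_1x2) {
  uint64_t se = 0, sev = 0, seh = 0, msa = 0;
  for (int i = border; i < height - border; i += skip_num) {
    for (int j = border; j < width - border; ++j) {
      const int c = frame[i * width + j];
      const int l = frame[i * width + j - 1];
      const int r = frame[i * width + j + 1];
      const int t = frame[(i - 1) * width + j];
      const int b = frame[(i + 1) * width + j];
      se += abs(4 * c - (l + r + t + b));  // 2x2 prediction error
      sev += abs(2 * c - (t + b));         // 2x1 (vertical) error
      seh += abs(2 * c - (l + r));         // 1x2 (horizontal) error
      msa += c;                            // normalization term
    }
  }
  if (msa == 0) {  // all-black region; the SSE2 code assumes a nonzero sum
    *err_2x2 = *err_v_2x1 = *err_h_1x2 = 0.0f;
    return;
  }
  const float norm = (float)msa;
  *err_2x2 = (float)(se >> 2) / norm;     // spatial_pred_err_
  *err_v_2x1 = (float)(sev >> 1) / norm;  // spatial_pred_err_v_
  *err_h_1x2 = (float)(seh >> 1) / norm;  // spatial_pred_err_h_
}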