OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 /* This file contains WebRtcIsacfix_MatrixProduct1Neon() and | 11 /* This file contains WebRtcIsacfix_MatrixProduct1Neon() and |
12 * WebRtcIsacfix_MatrixProduct2Neon() for ARM Neon platform. API's are in | 12 * WebRtcIsacfix_MatrixProduct2Neon() for ARM Neon platform. API's are in |
13 * entropy_coding.c. Results are bit exact with the c code for | 13 * entropy_coding.c. Results are bit exact with the c code for |
14 * generic platforms. | 14 * generic platforms. |
15 */ | 15 */ |
16 | 16 |
17 #include "entropy_coding.h" | 17 #include "entropy_coding.h" |
18 | 18 |
19 #include <arm_neon.h> | 19 #include <arm_neon.h> |
20 #include <assert.h> | |
21 #include <stddef.h> | 20 #include <stddef.h> |
22 | 21 |
23 #include "signal_processing_library.h" | 22 #include "signal_processing_library.h" |
| 23 #include "webrtc/base/checks.h" |
24 | 24 |
25 void WebRtcIsacfix_MatrixProduct1Neon(const int16_t matrix0[], | 25 void WebRtcIsacfix_MatrixProduct1Neon(const int16_t matrix0[], |
26 const int32_t matrix1[], | 26 const int32_t matrix1[], |
27 int32_t matrix_product[], | 27 int32_t matrix_product[], |
28 const int matrix1_index_factor1, | 28 const int matrix1_index_factor1, |
29 const int matrix0_index_factor1, | 29 const int matrix0_index_factor1, |
30 const int matrix1_index_init_case, | 30 const int matrix1_index_init_case, |
31 const int matrix1_index_step, | 31 const int matrix1_index_step, |
32 const int matrix0_index_step, | 32 const int matrix0_index_step, |
33 const int inner_loop_count, | 33 const int inner_loop_count, |
34 const int mid_loop_count, | 34 const int mid_loop_count, |
35 const int shift) { | 35 const int shift) { |
36 int j = 0, k = 0, n = 0; | 36 int j = 0, k = 0, n = 0; |
37 int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0; | 37 int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0; |
38 int* matrix1_index_factor2 = &j; | 38 int* matrix1_index_factor2 = &j; |
39 int* matrix0_index_factor2 = &k; | 39 int* matrix0_index_factor2 = &k; |
40 if (matrix1_index_init_case != 0) { | 40 if (matrix1_index_init_case != 0) { |
41 matrix1_index_factor2 = &k; | 41 matrix1_index_factor2 = &k; |
42 matrix0_index_factor2 = &j; | 42 matrix0_index_factor2 = &j; |
43 } | 43 } |
44 int32x4_t shift32x4 = vdupq_n_s32(shift); | 44 int32x4_t shift32x4 = vdupq_n_s32(shift); |
45 int32x2_t shift32x2 = vdup_n_s32(shift); | 45 int32x2_t shift32x2 = vdup_n_s32(shift); |
46 int32x4_t sum_32x4 = vdupq_n_s32(0); | 46 int32x4_t sum_32x4 = vdupq_n_s32(0); |
47 int32x2_t sum_32x2 = vdup_n_s32(0); | 47 int32x2_t sum_32x2 = vdup_n_s32(0); |
48 | 48 |
49 assert(inner_loop_count % 2 == 0); | 49 RTC_DCHECK_EQ(0, inner_loop_count % 2); |
50 assert(mid_loop_count % 2 == 0); | 50 RTC_DCHECK_EQ(0, mid_loop_count % 2); |
51 | 51 |
52 if (matrix1_index_init_case != 0 && matrix1_index_factor1 == 1) { | 52 if (matrix1_index_init_case != 0 && matrix1_index_factor1 == 1) { |
53 for (j = 0; j < SUBFRAMES; j++) { | 53 for (j = 0; j < SUBFRAMES; j++) { |
54 matrix_prod_index = mid_loop_count * j; | 54 matrix_prod_index = mid_loop_count * j; |
55 for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) { | 55 for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) { |
56 sum_32x4 = veorq_s32(sum_32x4, sum_32x4); // Initialize to zeros. | 56 sum_32x4 = veorq_s32(sum_32x4, sum_32x4); // Initialize to zeros. |
57 matrix1_index = k; | 57 matrix1_index = k; |
58 matrix0_index = matrix0_index_factor1 * j; | 58 matrix0_index = matrix0_index_factor1 * j; |
59 for (n = 0; n < inner_loop_count; n++) { | 59 for (n = 0; n < inner_loop_count; n++) { |
60 int32x4_t matrix0_32x4 = | 60 int32x4_t matrix0_32x4 = |
(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
209 int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2); | 209 int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2); |
210 sum_32x2 = vqadd_s32(sum_32x2, multi_32x2); | 210 sum_32x2 = vqadd_s32(sum_32x2, multi_32x2); |
211 matrix1_index += 2; | 211 matrix1_index += 2; |
212 matrix0_index += matrix0_index_step; | 212 matrix0_index += matrix0_index_step; |
213 } | 213 } |
214 sum_32x2 = vshr_n_s32(sum_32x2, 3); | 214 sum_32x2 = vshr_n_s32(sum_32x2, 3); |
215 vst1_s32(&matrix_product[matrix_prod_index], sum_32x2); | 215 vst1_s32(&matrix_product[matrix_prod_index], sum_32x2); |
216 matrix_prod_index += 2; | 216 matrix_prod_index += 2; |
217 } | 217 } |
218 } | 218 } |
OLD | NEW |