OLD | NEW |
---|---|
1 // | 1 // |
2 // Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. | 2 // Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. |
3 // | 3 // |
4 // Use of this source code is governed by a BSD-style license | 4 // Use of this source code is governed by a BSD-style license |
5 // that can be found in the LICENSE file in the root of the source | 5 // that can be found in the LICENSE file in the root of the source |
6 // tree. An additional intellectual property rights grant can be found | 6 // tree. An additional intellectual property rights grant can be found |
7 // in the file PATENTS. All contributing project authors may | 7 // in the file PATENTS. All contributing project authors may |
8 // be found in the AUTHORS file in the root of the source tree. | 8 // be found in the AUTHORS file in the root of the source tree. |
9 // | 9 // |
10 // This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s | 10 // This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s |
(...skipping 75 matching lines...) | (...skipping 75 matching lines...) |
86 #define dW1r v16.2s | 86 #define dW1r v16.2s |
87 #define dW1r8b v16.8b | 87 #define dW1r8b v16.8b |
88 #define dW1i v17.2s | 88 #define dW1i v17.2s |
89 #define dY0r v14.2s | 89 #define dY0r v14.2s |
90 #define dY0i v15.2s | 90 #define dY0i v15.2s |
91 #define dY1r v16.2s | 91 #define dY1r v16.2s |
92 #define dY1i v17.2s | 92 #define dY1i v17.2s |
93 #define qT2 v18.2s | 93 #define qT2 v18.2s |
94 #define qT3 v20.2s | 94 #define qT3 v20.2s |
95 | 95 |
96 #define half v0.2s | 96 #define half v0.s |
97 #define dZip v21.2s | 97 #define dZip v21.2s |
98 #define dZip8b v21.8b | 98 #define dZip8b v21.8b |
99 | 99 |
100 // Allocate stack memory required by the function | 100 // Allocate stack memory required by the function |
101 | 101 |
102 // Write function header | 102 // Write function header |
103 M_START ComplexToRealFixup,,d15 | 103 M_START ComplexToRealFixup,,d15 |
104 | 104 |
105 asr N, N, #1 | 105 asr N, N, #1 |
106 | 106 |
107 clz order, subFFTNum // N = 2^order | 107 clz order, subFFTNum // N = 2^order |
108 | 108 |
109 RSB order,order,#63 | 109 rsb order,order,#63 |
110 MOV subFFTSize,subFFTNum // subFFTSize = N/2 | 110 MOV subFFTSize,subFFTNum // subFFTSize = N/2 |
111 //MOV subFFTNum,N | 111 //MOV subFFTNum,N |
112 mov argDst, pDst | 112 mov argDst, pDst |
113 mov argTwiddle, pTwiddle | 113 mov argTwiddle, pTwiddle |
114 | 114 |
115 // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)] | 115 // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)] |
116 // 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)] | 116 // 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)] |
117 // 1/2[2a+j0] - j [0+j2b] | 117 // 1/2[2a+j0] - j [0+j2b] |
118 // (a+b, 0) | 118 // (a+b, 0) |
119 | 119 |
120 // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)] | 120 // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)] |
121 // 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)] | 121 // 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)] |
122 // 1/2[2a+j0] + j [0+j2b] | 122 // 1/2[2a+j0] + j [0+j2b] |
123 // (a-b, 0) | 123 // (a-b, 0) |
124 | 124 |
125 // F(0) and F(N/2) | 125 // F(0) and F(N/2) |
126 ld2 {dX0rs,dX0is}[0],[pSrc], #8 | 126 ld2 {dX0rs,dX0is}[0],[pSrc], #8 |
127 MOV zero,#0 | 127 MOV zero,#0 |
128 mov dX0rs[1],zero | 128 mov dX0rs[1],zero |
129 lsl step,subFFTSize, #3 // step = N/2 * 8 bytes | 129 lsl step,subFFTSize, #3 // step = N/2 * 8 bytes |
130 mov dX0i[1],zero | 130 mov dX0is[1],zero |
131 // twStep = 3N/8 * 8 bytes pointing to W^1 | 131 // twStep = 3N/8 * 8 bytes pointing to W^1 |
132 SUB twStep,step,subFFTSize,LSL #1 | 132 SUB twStep,step,subFFTSize,LSL #1 |
133 | 133 |
134 fadd dY0r,dX0r,dX0i // F(0) = ((Z0.r+Z0.i) , 0) | 134 fadd dY0r,dX0r,dX0i // F(0) = ((Z0.r+Z0.i) , 0) |
135 lsl step1,subFFTSize, #2 // step1 = N/2 * 4 bytes | 135 lsl step1,subFFTSize, #2 // step1 = N/2 * 4 bytes |
136 fsub dY0i,dX0r,dX0i // F(N/2) = ((Z0.r-Z0.i) , 0) | 136 fsub dY0i,dX0r,dX0i // F(N/2) = ((Z0.r-Z0.i) , 0) |
137 SUBS subFFTSize,subFFTSize,#2 | 137 SUBS subFFTSize,subFFTSize,#2 |
138 | 138 |
139 st1 {dY0r},[argDst],step | 139 st1 {dY0r},[argDst],step |
140 ADD pTwiddleTmp,argTwiddle,#8 // W^2 | 140 ADD pTwiddleTmp,argTwiddle,#8 // W^2 |
141 st1 {dY0i},[argDst], #8 | 141 st1 {dY0i},[argDst], #8 |
142 ADD argTwiddle,argTwiddle,twStep // W^1 | 142 ADD argTwiddle,argTwiddle,twStep // W^1 |
143 | 143 |
144 // dup dzero,zero | 144 // dup dzero,zero |
145 SUB argDst,argDst,step | 145 SUB argDst,argDst,step |
146 | 146 |
147 BLT End | 147 BLT End |
148 BEQ lastElement | 148 BEQ lastElement |
149 SUB step,step,#24 | 149 SUB step,step,#24 |
150 SUB step1,step1,#8 // (N/4-1)*8 bytes | 150 SUB step1,step1,#8 // (N/4-1)*8 bytes |
151 | 151 |
152 // F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)] | 152 // F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)] |
153 // Note: W^k is stored as negative values in the table | 153 // Note: W^k is stored as negative values in the table |
154 // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) | 154 // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) |
155 // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1) | 155 // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1) |
156 | 156 |
157 fmov half, #0.5 | 157 fmov v0.2s, #0.5 |
Raymond Toy (Google)
2015/10/30 22:38:46
I think you should change the definition of half i
Riku Voipio
2015/11/02 12:24:12
Ok, I'll define half = v0.2s, and halfs = v0.s to
| |
158 | 158 |
159 evenOddButterflyLoop: | 159 evenOddButterflyLoop: |
160 | 160 |
161 | 161 |
162 ld1 {dW0r},[argTwiddle],step1 | 162 ld1 {dW0r},[argTwiddle],step1 |
163 ld1 {dW1r},[argTwiddle], #8 | 163 ld1 {dW1r},[argTwiddle], #8 |
164 | 164 |
165 ld2 {dX0r,dX0i},[pSrc],step | 165 ld2 {dX0r,dX0i},[pSrc],step |
166 SUB argTwiddle,argTwiddle,step1 | 166 SUB argTwiddle,argTwiddle,step1 |
167 ld2 {dX1r,dX1i},[pSrc], #16 | 167 ld2 {dX1r,dX1i},[pSrc], #16 |
(...skipping 84 matching lines...) | (...skipping 84 matching lines...) |
252 | 252 |
253 st1 {dX0rs}[0],[argDst], #4 | 253 st1 {dX0rs}[0],[argDst], #4 |
254 fneg dX0r,dX0r | 254 fneg dX0r,dX0r |
255 st1 {dX0rs}[1],[argDst], #4 | 255 st1 {dX0rs}[1],[argDst], #4 |
256 End: | 256 End: |
257 | 257 |
258 // Write function tail | 258 // Write function tail |
259 M_END | 259 M_END |
260 | 260 |
261 .end | 261 .end |
OLD | NEW |