Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(143)

Side by Side Diff: dl/sp/src/arm/arm64/ComplexToRealFixup.S

Issue 1420973006: arm64: clang assembler compatibility (Closed) Base URL: https://chromium.googlesource.com/external/webrtc/deps/third_party/openmax@master
Patch Set: Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // 1 //
2 // Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. 2 // Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 // 3 //
4 // Use of this source code is governed by a BSD-style license 4 // Use of this source code is governed by a BSD-style license
5 // that can be found in the LICENSE file in the root of the source 5 // that can be found in the LICENSE file in the root of the source
6 // tree. An additional intellectual property rights grant can be found 6 // tree. An additional intellectual property rights grant can be found
7 // in the file PATENTS. All contributing project authors may 7 // in the file PATENTS. All contributing project authors may
8 // be found in the AUTHORS file in the root of the source tree. 8 // be found in the AUTHORS file in the root of the source tree.
9 // 9 //
10 // This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s 10 // This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after
86 #define dW1r v16.2s 86 #define dW1r v16.2s
87 #define dW1r8b v16.8b 87 #define dW1r8b v16.8b
88 #define dW1i v17.2s 88 #define dW1i v17.2s
89 #define dY0r v14.2s 89 #define dY0r v14.2s
90 #define dY0i v15.2s 90 #define dY0i v15.2s
91 #define dY1r v16.2s 91 #define dY1r v16.2s
92 #define dY1i v17.2s 92 #define dY1i v17.2s
93 #define qT2 v18.2s 93 #define qT2 v18.2s
94 #define qT3 v20.2s 94 #define qT3 v20.2s
95 95
96 #define half v0.2s 96 #define half v0.s
97 #define dZip v21.2s 97 #define dZip v21.2s
98 #define dZip8b v21.8b 98 #define dZip8b v21.8b
99 99
100 // Allocate stack memory required by the function 100 // Allocate stack memory required by the function
101 101
102 // Write function header 102 // Write function header
103 M_START ComplexToRealFixup,,d15 103 M_START ComplexToRealFixup,,d15
104 104
105 asr N, N, #1 105 asr N, N, #1
106 106
107 clz order, subFFTNum // N = 2^order 107 clz order, subFFTNum // N = 2^order
108 108
109 RSB order,order,#63 109 rsb order,order,#63
110 MOV subFFTSize,subFFTNum // subFFTSize = N/2 110 MOV subFFTSize,subFFTNum // subFFTSize = N/2
111 //MOV subFFTNum,N 111 //MOV subFFTNum,N
112 mov argDst, pDst 112 mov argDst, pDst
113 mov argTwiddle, pTwiddle 113 mov argTwiddle, pTwiddle
114 114
115 // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)] 115 // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
116 // 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)] 116 // 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
117 // 1/2[2a+j0] - j [0+j2b] 117 // 1/2[2a+j0] - j [0+j2b]
118 // (a+b, 0) 118 // (a+b, 0)
119 119
120 // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)] 120 // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
121 // 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)] 121 // 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
122 // 1/2[2a+j0] + j [0+j2b] 122 // 1/2[2a+j0] + j [0+j2b]
123 // (a-b, 0) 123 // (a-b, 0)
124 124
125 // F(0) and F(N/2) 125 // F(0) and F(N/2)
126 ld2 {dX0rs,dX0is}[0],[pSrc], #8 126 ld2 {dX0rs,dX0is}[0],[pSrc], #8
127 MOV zero,#0 127 MOV zero,#0
128 mov dX0rs[1],zero 128 mov dX0rs[1],zero
129 lsl step,subFFTSize, #3 // step = N/2 * 8 bytes 129 lsl step,subFFTSize, #3 // step = N/2 * 8 bytes
130 mov dX0i[1],zero 130 mov dX0is[1],zero
131 // twStep = 3N/8 * 8 bytes pointing to W^1 131 // twStep = 3N/8 * 8 bytes pointing to W^1
132 SUB twStep,step,subFFTSize,LSL #1 132 SUB twStep,step,subFFTSize,LSL #1
133 133
134 fadd dY0r,dX0r,dX0i // F(0) = ((Z0.r+Z0.i) , 0) 134 fadd dY0r,dX0r,dX0i // F(0) = ((Z0.r+Z0.i) , 0)
135 lsl step1,subFFTSize, #2 // step1 = N/2 * 4 bytes 135 lsl step1,subFFTSize, #2 // step1 = N/2 * 4 bytes
136 fsub dY0i,dX0r,dX0i // F(N/2) = ((Z0.r-Z0.i) , 0) 136 fsub dY0i,dX0r,dX0i // F(N/2) = ((Z0.r-Z0.i) , 0)
137 SUBS subFFTSize,subFFTSize,#2 137 SUBS subFFTSize,subFFTSize,#2
138 138
139 st1 {dY0r},[argDst],step 139 st1 {dY0r},[argDst],step
140 ADD pTwiddleTmp,argTwiddle,#8 // W^2 140 ADD pTwiddleTmp,argTwiddle,#8 // W^2
141 st1 {dY0i},[argDst], #8 141 st1 {dY0i},[argDst], #8
142 ADD argTwiddle,argTwiddle,twStep // W^1 142 ADD argTwiddle,argTwiddle,twStep // W^1
143 143
144 // dup dzero,zero 144 // dup dzero,zero
145 SUB argDst,argDst,step 145 SUB argDst,argDst,step
146 146
147 BLT End 147 BLT End
148 BEQ lastElement 148 BEQ lastElement
149 SUB step,step,#24 149 SUB step,step,#24
150 SUB step1,step1,#8 // (N/4-1)*8 bytes 150 SUB step1,step1,#8 // (N/4-1)*8 bytes
151 151
152 // F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)] 152 // F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
153 // Note: W^k is stored as negative values in the table 153 // Note: W^k is stored as negative values in the table
154 // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) 154 // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
155 // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1) 155 // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
156 156
157 fmov half, #0.5 157 fmov v0.2s, #0.5
Raymond Toy (Google) 2015/10/30 22:38:46 I think you should change the definition of half i
Riku Voipio 2015/11/02 12:24:12 Ok, I'll define half = v0.2s, and halfs = v0.s to
158 158
159 evenOddButterflyLoop: 159 evenOddButterflyLoop:
160 160
161 161
162 ld1 {dW0r},[argTwiddle],step1 162 ld1 {dW0r},[argTwiddle],step1
163 ld1 {dW1r},[argTwiddle], #8 163 ld1 {dW1r},[argTwiddle], #8
164 164
165 ld2 {dX0r,dX0i},[pSrc],step 165 ld2 {dX0r,dX0i},[pSrc],step
166 SUB argTwiddle,argTwiddle,step1 166 SUB argTwiddle,argTwiddle,step1
167 ld2 {dX1r,dX1i},[pSrc], #16 167 ld2 {dX1r,dX1i},[pSrc], #16
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
252 252
253 st1 {dX0rs}[0],[argDst], #4 253 st1 {dX0rs}[0],[argDst], #4
254 fneg dX0r,dX0r 254 fneg dX0r,dX0r
255 st1 {dX0rs}[1],[argDst], #4 255 st1 {dX0rs}[1],[argDst], #4
256 End: 256 End:
257 257
258 // Write function tail 258 // Write function tail
259 M_END 259 M_END
260 260
261 .end 261 .end
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698