OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 /* | |
12 * The rdft AEC algorithm, neon version of speed-critical functions. | |
13 * | |
14 * Based on the sse2 version. | |
15 */ | |
16 | |
17 | |
18 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | |
19 | |
20 #include <arm_neon.h> | |
21 | |
// Sign pattern {-1, 1, -1, 1}. Multiplying a pair-swapped (vrev64q'd) vector
// by this negates the swapped lanes 0 and 2, which implements the +/-i
// rotation of the FFT butterflies in a single multiply-accumulate.
static const ALIGN16_BEG float ALIGN16_END
    k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
24 | |
// First radix-4 butterfly stage of the 128-point complex FFT, NEON version
// (counterpart of the scalar/SSE2 cft1st_128). Consumes the in-place data
// |a| (interleaved re/im floats) in groups of 16 floats (8 complex values),
// applying the twiddle-factor tables rdft_wk1r/i, rdft_wk2r/i, rdft_wk3r/i.
static void cft1st_128_neon(float* a) {
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  int j, k2;

  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
    // Load 16 consecutive floats and interleave the 64-bit halves so each
    // q-register holds the matching complex pairs of two butterflies.
    float32x4_t a00v = vld1q_f32(&a[j + 0]);
    float32x4_t a04v = vld1q_f32(&a[j + 4]);
    float32x4_t a08v = vld1q_f32(&a[j + 8]);
    float32x4_t a12v = vld1q_f32(&a[j + 12]);
    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
    // Twiddle factors for this group of butterflies.
    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
    // Radix-4 butterfly sums/differences.
    float32x4_t x0v = vaddq_f32(a01v, a23v);
    const float32x4_t x1v = vsubq_f32(a01v, a23v);
    const float32x4_t x2v = vaddq_f32(a45v, a67v);
    const float32x4_t x3v = vsubq_f32(a45v, a67v);
    const float32x4_t x3w = vrev64q_f32(x3v);  // Swap re/im of x3.
    float32x4_t x0w;
    a01v = vaddq_f32(x0v, x2v);
    x0v = vsubq_f32(x0v, x2v);
    x0w = vrev64q_f32(x0v);
    // Complex multiply by wk2: real part via wk2r*x, cross term via the
    // re/im-swapped vector so one mla covers both lanes.
    a45v = vmulq_f32(wk2rv, x0v);
    a45v = vmlaq_f32(a45v, wk2iv, x0w);
    // x1 + i*x3 (the +/- pattern in vec_swap_sign applies the i rotation),
    // then complex multiply by wk1.
    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a23v = vmulq_f32(wk1rv, x0v);
    a23v = vmlaq_f32(a23v, wk1iv, x0w);
    // x1 - i*x3, then complex multiply by wk3.
    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a67v = vmulq_f32(wk3rv, x0v);
    a67v = vmlaq_f32(a67v, wk3iv, x0w);
    // De-interleave back to the original memory layout and store in place.
    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
    vst1q_f32(&a[j + 0], a00v);
    vst1q_f32(&a[j + 4], a04v);
    vst1q_f32(&a[j + 8], a08v);
    vst1q_f32(&a[j + 12], a12v);
  }
}
73 | |
// Middle radix-4 butterfly stage (stride l = 8) of the 128-point complex
// FFT, NEON version (counterpart of the scalar/SSE2 cftmdl_128). The first
// loop handles the k = 0 group (trivial twiddles except the wk1r rotation
// from cftmdl_wk1r); the second block handles the k = 64 group with the
// full set of twiddle tables. Operates in place on |a|.
static void cftmdl_128_neon(float* a) {
  int j;
  const int l = 8;
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);

  for (j = 0; j < l; j += 2) {
    // Gather one complex value from each of the four butterfly legs, for
    // two butterflies at once (offsets 0/8/16/24 and 32/40/48/56).
    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    // x1 +/- i*x3 via the re/im-swapped x3 and the {-1,1,-1,1} sign mask.
    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
    const float32x4_t x1_x3_add =
        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x4_t x1_x3_sub =
        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    // Broadcast the high-half lanes of add/sub and recombine so a single
    // multiply by wk1rv performs the 45-degree rotations of this stage.
    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
    const float32x4_t xx1_rev = vrev64q_f32(xx1);
    const float32x4_t yy4_rev = vrev64q_f32(yy4);

    vst1_f32(&a[j + 0], vget_low_f32(xx0));
    vst1_f32(&a[j + 32], vget_high_f32(xx0));
    vst1_f32(&a[j + 16], vget_low_f32(xx1));
    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));

    // The re/im swap above leaves the wrong sign on the real part; fix it.
    a[j + 48] = -a[j + 48];

    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
    vst1_f32(&a[j + 40], vget_low_f32(yy4));
    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
  }

  {
    // Second group (k = 64): same butterfly structure, but every output leg
    // is rotated by its twiddle factor from the rdft_wk* tables.
    const int k = 64;
    const int k1 = 2;
    const int k2 = 2 * k1;
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
    for (j = k; j < l + k; j += 2) {
      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
      const float32x4_t x1_x3_add =
          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      const float32x4_t x1_x3_sub =
          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      // Complex multiplies by wk2, wk1 and wk3: real product plus the
      // cross term formed from the re/im-swapped operand.
      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));

      vst1_f32(&a[j + 0], vget_low_f32(xx));
      vst1_f32(&a[j + 32], vget_high_f32(xx));
      vst1_f32(&a[j + 16], vget_low_f32(xx4));
      vst1_f32(&a[j + 48], vget_high_f32(xx4));
      vst1_f32(&a[j + 8], vget_low_f32(xx12));
      vst1_f32(&a[j + 40], vget_high_f32(xx12));
      vst1_f32(&a[j + 24], vget_low_f32(xx22));
      vst1_f32(&a[j + 56], vget_high_f32(xx22));
    }
  }
}
180 | |
// Reverses the four lanes of |in|: A B C D -> D C B A.
__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
  // A B C D -> B A D C (reverse within each 64-bit pair).
  const float32x4_t pair_rev = vrev64q_f32(in);
  // B A D C -> D C B A (swap the two halves).
  return vcombine_f32(vget_high_f32(pair_rev), vget_low_f32(pair_rev));
}
187 | |
// Post-processing of the forward 128-point real FFT (NEON counterpart of the
// scalar rftfsub_128): recombines the conjugate-symmetric spectrum halves
// a[j2] and a[128 - j2] using the rotation table rdft_w. The main loop does
// four symmetric pairs per iteration; a scalar loop finishes the remainder.
static void rftfsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2;
  const float32x4_t mm_half = vdupq_n_f32(0.5f);

  // Vectorized code (four at once).
  // Note: commented number are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t c_j1 = vld1q_f32(&c[j1]);       //  1,  2,  3,  4,
    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);  // 28, 29, 30, 31,
    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
    const float32x4_t wki_ = c_j1;  //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    //   2,   4,   6,   8,   3,   5,   7,   9
    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
    // 120, 122, 124, 126, 121, 123, 125, 127,
    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
    // 126, 124, 122, 120
    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
    // 127, 125, 123, 121
    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
    // Calculate 'x'.
    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr - wki * xi;
    //    yi = wkr * xi + wki * xr;
    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
    const float32x4_t b_ = vmulq_f32(wki_, xi_);
    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
    const float32x4_t d_ = vmulq_f32(wki_, xr_);
    const float32x4_t yr_ = vsubq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t yi_ = vaddq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] -= yi;
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] -= yi;
    // 126, 124, 122, 120,
    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
    // 127, 125, 123, 121,
    const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
    // Shuffle in right order and store.
    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
    // 124, 125, 126, 127, 120, 121, 122, 123
    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
    // 2, 4, 6, 8,
    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
    // 3, 5, 7, 9,
    a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
    // 2, 3, 4, 5, 6, 7, 8, 9,
    vst2q_f32(&a[0 + j2], a_j2_p);

    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
  }

  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr - wki * xi;
    const float yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
}
266 | |
// Pre-processing of the inverse 128-point real FFT (NEON counterpart of the
// scalar rftbsub_128). Same symmetric-pair recombination as rftfsub_128_neon
// but with conjugated rotation (note the flipped signs in yr/yi and the
// a[1]/a[65] negations required by the packed real-spectrum format).
static void rftbsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2;
  const float32x4_t mm_half = vdupq_n_f32(0.5f);

  a[1] = -a[1];
  // Vectorized code (four at once).
  // Note: commented number are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t c_j1 = vld1q_f32(&c[j1]);       //  1,  2,  3,  4,
    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);  // 28, 29, 30, 31,
    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
    const float32x4_t wki_ = c_j1;  //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    //   2,   4,   6,   8,   3,   5,   7,   9
    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
    // 120, 122, 124, 126, 121, 123, 125, 127,
    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
    // 126, 124, 122, 120
    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
    // 127, 125, 123, 121
    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
    // Calculate 'x'.
    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y' (conjugate rotation vs. rftfsub).
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
    const float32x4_t b_ = vmulq_f32(wki_, xi_);
    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
    const float32x4_t d_ = vmulq_f32(wki_, xr_);
    const float32x4_t yr_ = vaddq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t yi_ = vsubq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] -= yi;
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] -= yi;
    // 126, 124, 122, 120,
    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
    // 127, 125, 123, 121,
    const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
    // Shuffle in right order and store.
    // 2, 3, 4, 5, 6, 7, 8, 9,
    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
    // 124, 125, 126, 127, 120, 121, 122, 123
    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
    // 2, 4, 6, 8,
    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
    // 3, 5, 7, 9,
    a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
    // 2, 3, 4, 5, 6, 7, 8, 9,
    vst2q_f32(&a[0 + j2], a_j2_p);

    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
  }

  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr + wki * xi;
    const float yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}
348 | |
349 void aec_rdft_init_neon(void) { | |
350 cft1st_128 = cft1st_128_neon; | |
351 cftmdl_128 = cftmdl_128_neon; | |
352 rftfsub_128 = rftfsub_128_neon; | |
353 rftbsub_128 = rftbsub_128_neon; | |
354 } | |
355 | |
OLD | NEW |