Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
volk_neon_intrinsics.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 */

/*
 * Copyright (c) 2016-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * _vtaylor_polyq_f32
 * _vlogq_f32
 *
 */

/* Copyright (C) 2011 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)

   _vsincosq_f32

*/

/*
 * This file is intended to hold NEON intrinsics of intrinsics.
 * They should be used in VOLK kernels to avoid copy-pasta.
 */

#ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/* Magnitude squared for float32x4x2_t */
static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
    float32x4_t iValue, qValue, result;
    iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
    qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
    result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
    return result;
}

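/* Illustrative usage sketch (not part of the VOLK API; the function and parameter
 * names below are hypothetical): a kernel-style loop that computes the squared
 * magnitude of interleaved complex float samples four at a time, with a scalar
 * tail loop for the leftover points. */
static inline void _example_magnitude_squared_f32(float* out,
                                                  const float* complex_in,
                                                  unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;
    for (number = 0; number < quarter_points; number++) {
        // vld2q_f32 de-interleaves: val[0] holds real parts, val[1] imaginary parts
        const float32x4x2_t cmplx = vld2q_f32(complex_in);
        vst1q_f32(out, _vmagnitudesquaredq_f32(cmplx));
        complex_in += 8;
        out += 4;
    }
    for (number = quarter_points * 4; number < num_points; number++) {
        const float re = *complex_in++;
        const float im = *complex_in++;
        *out++ = re * re + im * im;
    }
}
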
/* Inverse square root for float32x4_t */
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
    // Two Newton-Raphson refinements of the vrsqrteq_f32 estimate
    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
    sqrt_reciprocal = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
    sqrt_reciprocal = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);

    return sqrt_reciprocal;
}

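/* Illustrative sketch (hypothetical helper, not part of the VOLK API): since
 * sqrt(x) == x * (1/sqrt(x)), combining the two helpers above yields complex
 * magnitudes.  Note that a zero input produces 0 * inf = NaN with this
 * formulation, so callers may need to special-case zeros. */
static inline float32x4_t _example_magnitudeq_f32(float32x4x2_t cmplxValue)
{
    const float32x4_t mag_squared = _vmagnitudesquaredq_f32(cmplxValue);
    return vmulq_f32(mag_squared, _vinvsqrtq_f32(mag_squared));
}
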
/* Inverse */
static inline float32x4_t _vinvq_f32(float32x4_t x)
{
    // Newton's method
    float32x4_t recip = vrecpeq_f32(x);
    recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
    recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
    return recip;
}

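/* Illustrative sketch (hypothetical helper, not part of the VOLK API): ARMv7 NEON
 * has no vector divide instruction, so an elementwise quotient is typically
 * formed as a * (1/b) using the refined reciprocal above. */
static inline float32x4_t _example_divideq_f32(float32x4_t a, float32x4_t b)
{
    return vmulq_f32(a, _vinvq_f32(b));
}
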
/* Complex multiplication for float32x4x2_t */
static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
                                                    float32x4x2_t b_val)
{
    float32x4x2_t tmp_real;
    float32x4x2_t tmp_imag;
    float32x4x2_t c_val;

    // multiply the real*real and imag*imag to get real result
    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
    // Multiply cross terms to get the imaginary result
    // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
    // combine the products
    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
    return c_val;
}

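/* Illustrative usage sketch (hypothetical function, not part of the VOLK API):
 * elementwise product of two interleaved complex float arrays, in the style of a
 * volk_32fc_x2_multiply kernel.  The scalar tail loop is omitted for brevity. */
static inline void _example_multiply_complex_f32(float* out,
                                                 const float* a_in,
                                                 const float* b_in,
                                                 unsigned int num_points)
{
    unsigned int number;
    for (number = 0; number < num_points / 4; number++) {
        const float32x4x2_t a_val = vld2q_f32(a_in); // de-interleave into real/imag
        const float32x4x2_t b_val = vld2q_f32(b_in);
        const float32x4x2_t c_val = _vmultiply_complexq_f32(a_val, b_val);
        vst2q_f32(out, c_val); // re-interleave and store
        a_in += 8;
        b_in += 8;
        out += 8;
    }
}
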
/* From ARM Compute Library, MIT license */
static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
{
    float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
    float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
    float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
    float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
    float32x4_t x2 = vmulq_f32(x, x);
    float32x4_t x4 = vmulq_f32(x2, x2);
    float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
    return res;
}

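/* For reference (derived from the code above, since vmlaq_f32(a, b, c) == a + b*c):
 * _vtaylor_polyq_f32 evaluates the degree-7 polynomial
 *   coeffs[0] + coeffs[4]*x + coeffs[2]*x^2 + coeffs[6]*x^3
 *   + coeffs[1]*x^4 + coeffs[5]*x^5 + coeffs[3]*x^6 + coeffs[7]*x^7
 * in an Estrin-style arrangement, so coefficient tables (such as log_tab below)
 * must follow this interleaved ordering. */
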
/* Natural logarithm.
 * From ARM Compute Library, MIT license */
static inline float32x4_t _vlogq_f32(float32x4_t x)
{
    const float32x4_t log_tab[8] = {
        vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
        vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
        vdupq_n_f32(5.17591238022f),  vdupq_n_f32(0.844007015228f),
        vdupq_n_f32(4.58445882797f),  vdupq_n_f32(0.0141278216615f),
    };

    const int32x4_t CONST_127 = vdupq_n_s32(127);              // 127
    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f);  // ln(2)

    // Extract exponent
    int32x4_t m = vsubq_s32(
        vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
    float32x4_t val =
        vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));

    // Polynomial Approximation
    float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);

    // Reconstruct
    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);

    return poly;
}

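/* Illustrative sketch (hypothetical helper, not part of the VOLK API): converting a
 * vector of squared magnitudes to decibels via log10(x) = ln(x) / ln(10), i.e.
 * 10*log10(x) = (10 / ln(10)) * ln(x). */
static inline float32x4_t _example_power_to_db_f32(float32x4_t mag_squared)
{
    const float32x4_t ten_over_ln10 = vdupq_n_f32(4.342944819f); // 10 / ln(10)
    return vmulq_f32(ten_over_ln10, _vlogq_f32(mag_squared));
}
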
/* Evaluation of 4 sines & cosines at once.
 * Optimized from here (zlib license)
 * http://gruntthepeon.free.fr/ssemath/ */
static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
{
    const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
    const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
    const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
    const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
    const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
    const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
    const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
    const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
    const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
    const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI

    const float32x4_t CONST_1 = vdupq_n_f32(1.f);
    const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
    const float32x4_t CONST_0 = vdupq_n_f32(0.f);
    const uint32x4_t CONST_2 = vdupq_n_u32(2);
    const uint32x4_t CONST_4 = vdupq_n_u32(4);

    uint32x4_t emm2;

    uint32x4_t sign_mask_sin, sign_mask_cos;
    sign_mask_sin = vcltq_f32(x, CONST_0);
    x = vabsq_f32(x);
    // scale by 4/pi
    float32x4_t y = vmulq_f32(x, c_cephes_FOPI);

    // store the integer part of y in emm2
    emm2 = vcvtq_u32_f32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
    emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
    y = vcvtq_f32_u32(emm2);

    /* get the polynomial selection mask:
       there is one polynomial for 0 <= x <= Pi/4
       and another one for Pi/4 < x <= Pi/2.
       Both branches will be computed. */
    const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);

    // The magic pass: "Extended precision modular arithmetic"
    x = vmlaq_f32(x, y, c_minus_cephes_DP1);
    x = vmlaq_f32(x, y, c_minus_cephes_DP2);
    x = vmlaq_f32(x, y, c_minus_cephes_DP3);

    sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
    sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);

    /* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
       and the second polynomial (Pi/4 < x <= Pi/2) in y2 */
    float32x4_t y1, y2;
    float32x4_t z = vmulq_f32(x, x);

    y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
    y1 = vmlaq_f32(c_coscof_p2, z, y1);
    y1 = vmulq_f32(y1, z);
    y1 = vmulq_f32(y1, z);
    y1 = vmlsq_f32(y1, z, CONST_1_2);
    y1 = vaddq_f32(y1, CONST_1);

    y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
    y2 = vmlaq_f32(c_sincof_p2, z, y2);
    y2 = vmulq_f32(y2, z);
    y2 = vmlaq_f32(x, x, y2);

    /* select the correct result from the two polynomials */
    const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
    const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);

    float32x4x2_t sincos;
    sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
    sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));

    return sincos;
}

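/* Illustrative usage sketch (hypothetical function, not part of the VOLK API):
 * turning four phase values into complex exponentials e^(j*phase), i.e.
 * cos(phase) + j*sin(phase), stored as interleaved real/imaginary floats. */
static inline void _example_expjq_f32(float* complex_out, float32x4_t phase)
{
    const float32x4x2_t sincos = _vsincosq_f32(phase);
    float32x4x2_t result;
    result.val[0] = sincos.val[1]; // real parts: cos(phase)
    result.val[1] = sincos.val[0]; // imaginary parts: sin(phase)
    vst2q_f32(complex_out, result);
}
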
static inline float32x4_t _vsinq_f32(float32x4_t x)
{
    const float32x4x2_t sincos = _vsincosq_f32(x);
    return sincos.val[0];
}

static inline float32x4_t _vcosq_f32(float32x4_t x)
{
    const float32x4x2_t sincos = _vsincosq_f32(x);
    return sincos.val[1];
}

static inline float32x4_t _vtanq_f32(float32x4_t x)
{
    // tan(x) = sin(x) / cos(x); the division is done via the refined reciprocal
    const float32x4x2_t sincos = _vsincosq_f32(x);
    return vmulq_f32(sincos.val[0], _vinvq_f32(sincos.val[1]));
}

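/* The helper below accumulates a scaled squared difference: it returns
 * sq_acc + rec * (aux*val - acc)^2, the vector building block for running
 * mean/variance style accumulation (e.g. a stddev-and-mean kernel).
 * On ARMv8 the final multiply-add is fused. */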
static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
                                                          float32x4_t acc,
                                                          float32x4_t val,
                                                          float32x4_t rec,
                                                          float32x4_t aux)
{
    aux = vmulq_f32(aux, val);
    aux = vsubq_f32(aux, acc);
    aux = vmulq_f32(aux, aux);
#ifdef LV_HAVE_NEONV8
    return vfmaq_f32(sq_acc, aux, rec);
#else
    aux = vmulq_f32(aux, rec);
    return vaddq_f32(sq_acc, aux);
#endif
}

#endif /*LV_HAVE_NEON*/
#endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */