Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
volk_32fc_s32f_magnitude_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: GPL-3.0-or-later
8 */
9
60#ifdef LV_HAVE_GENERIC
61#include <volk/volk_common.h>
62
63static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector,
64 const lv_32fc_t* complexVector,
65 const float scalar,
66 unsigned int num_points)
67{
68 const float* complexVectorPtr = (float*)complexVector;
69 int16_t* magnitudeVectorPtr = magnitudeVector;
70 unsigned int number = 0;
71 for (number = 0; number < num_points; number++) {
72 __VOLK_VOLATILE float real = *complexVectorPtr++;
73 __VOLK_VOLATILE float imag = *complexVectorPtr++;
74 real *= real;
75 imag *= imag;
76 *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag));
77 }
78}
79#endif /* LV_HAVE_GENERIC */
80
81#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
82#define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
83
84#include <inttypes.h>
85#include <math.h>
86#include <stdio.h>
87#include <volk/volk_common.h>
88
89#ifdef LV_HAVE_AVX2
90#include <immintrin.h>
91
92static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector,
93 const lv_32fc_t* complexVector,
94 const float scalar,
95 unsigned int num_points)
96{
97 unsigned int number = 0;
98 const unsigned int eighthPoints = num_points / 8;
99
100 const float* complexVectorPtr = (const float*)complexVector;
101 int16_t* magnitudeVectorPtr = magnitudeVector;
102
103 __m256 vScalar = _mm256_set1_ps(scalar);
104 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
105 __m256 cplxValue1, cplxValue2, result;
106 __m256i resultInt;
107 __m128i resultShort;
108
109 for (; number < eighthPoints; number++) {
110 cplxValue1 = _mm256_load_ps(complexVectorPtr);
111 complexVectorPtr += 8;
112
113 cplxValue2 = _mm256_load_ps(complexVectorPtr);
114 complexVectorPtr += 8;
115
116 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
117 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
118
119 result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
120
121 result = _mm256_sqrt_ps(result);
122
123 result = _mm256_mul_ps(result, vScalar);
124
125 resultInt = _mm256_cvtps_epi32(result);
126 resultInt = _mm256_packs_epi32(resultInt, resultInt);
127 resultInt = _mm256_permutevar8x32_epi32(
128 resultInt, idx); // permute to compensate for shuffling in hadd and packs
129 resultShort = _mm256_extracti128_si256(resultInt, 0);
130 _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort);
131 magnitudeVectorPtr += 8;
132 }
133
134 number = eighthPoints * 8;
136 magnitudeVector + number, complexVector + number, scalar, num_points - number);
137}
138#endif /* LV_HAVE_AVX2 */
139
140#ifdef LV_HAVE_SSE3
141#include <pmmintrin.h>
142
143static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector,
144 const lv_32fc_t* complexVector,
145 const float scalar,
146 unsigned int num_points)
147{
148 unsigned int number = 0;
149 const unsigned int quarterPoints = num_points / 4;
150
151 const float* complexVectorPtr = (const float*)complexVector;
152 int16_t* magnitudeVectorPtr = magnitudeVector;
153
154 __m128 vScalar = _mm_set_ps1(scalar);
155
156 __m128 cplxValue1, cplxValue2, result;
157
158 __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
159
160 for (; number < quarterPoints; number++) {
161 cplxValue1 = _mm_load_ps(complexVectorPtr);
162 complexVectorPtr += 4;
163
164 cplxValue2 = _mm_load_ps(complexVectorPtr);
165 complexVectorPtr += 4;
166
167 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
168 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
169
170 result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
171
172 result = _mm_sqrt_ps(result);
173
174 result = _mm_mul_ps(result, vScalar);
175
176 _mm_store_ps(floatBuffer, result);
177 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
178 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
179 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
180 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
181 }
182
183 number = quarterPoints * 4;
185 magnitudeVector + number, complexVector + number, scalar, num_points - number);
186}
187#endif /* LV_HAVE_SSE3 */
188
189
190#ifdef LV_HAVE_SSE
191#include <xmmintrin.h>
192
193static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
194 const lv_32fc_t* complexVector,
195 const float scalar,
196 unsigned int num_points)
197{
198 unsigned int number = 0;
199 const unsigned int quarterPoints = num_points / 4;
200
201 const float* complexVectorPtr = (const float*)complexVector;
202 int16_t* magnitudeVectorPtr = magnitudeVector;
203
204 __m128 vScalar = _mm_set_ps1(scalar);
205
206 __m128 cplxValue1, cplxValue2, result;
207 __m128 iValue, qValue;
208
209 __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
210
211 for (; number < quarterPoints; number++) {
212 cplxValue1 = _mm_load_ps(complexVectorPtr);
213 complexVectorPtr += 4;
214
215 cplxValue2 = _mm_load_ps(complexVectorPtr);
216 complexVectorPtr += 4;
217
218 // Arrange in i1i2i3i4 format
219 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
220 // Arrange in q1q2q3q4 format
221 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
222
223 __VOLK_VOLATILE __m128 iValue2 =
224 _mm_mul_ps(iValue, iValue); // Square the I values
225 __VOLK_VOLATILE __m128 qValue2 =
226 _mm_mul_ps(qValue, qValue); // Square the Q Values
227
228 result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
229
230 result = _mm_sqrt_ps(result);
231
232 result = _mm_mul_ps(result, vScalar);
233
234 _mm_store_ps(floatBuffer, result);
235 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
236 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
237 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
238 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
239 }
240
241 number = quarterPoints * 4;
243 magnitudeVector + number, complexVector + number, scalar, num_points - number);
244}
245#endif /* LV_HAVE_SSE */
246
247
248#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
249
250#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
251#define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
252
253#include <inttypes.h>
254#include <math.h>
255#include <stdio.h>
256#include <volk/volk_common.h>
257
258#ifdef LV_HAVE_AVX2
259#include <immintrin.h>
260
261static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
262 const lv_32fc_t* complexVector,
263 const float scalar,
264 unsigned int num_points)
265{
266 unsigned int number = 0;
267 const unsigned int eighthPoints = num_points / 8;
268
269 const float* complexVectorPtr = (const float*)complexVector;
270 int16_t* magnitudeVectorPtr = magnitudeVector;
271
272 __m256 vScalar = _mm256_set1_ps(scalar);
273 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
274 __m256 cplxValue1, cplxValue2, result;
275 __m256i resultInt;
276 __m128i resultShort;
277
278 for (; number < eighthPoints; number++) {
279 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
280 complexVectorPtr += 8;
281
282 cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
283 complexVectorPtr += 8;
284
285 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
286 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
287
288 result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
289
290 result = _mm256_sqrt_ps(result);
291
292 result = _mm256_mul_ps(result, vScalar);
293
294 resultInt = _mm256_cvtps_epi32(result);
295 resultInt = _mm256_packs_epi32(resultInt, resultInt);
296 resultInt = _mm256_permutevar8x32_epi32(
297 resultInt, idx); // permute to compensate for shuffling in hadd and packs
298 resultShort = _mm256_extracti128_si256(resultInt, 0);
299 _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort);
300 magnitudeVectorPtr += 8;
301 }
302
303 number = eighthPoints * 8;
305 magnitudeVector + number, complexVector + number, scalar, num_points - number);
306}
307#endif /* LV_HAVE_AVX2 */
308
309#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */