Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32fc_x2_add_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
74 #ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H
75 #define INCLUDED_volk_32fc_x2_add_32fc_u_H
76 
77 #ifdef LV_HAVE_AVX
78 #include <immintrin.h>
79 
80 static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector,
81  const lv_32fc_t* aVector,
82  const lv_32fc_t* bVector,
83  unsigned int num_points)
84 {
85  unsigned int number = 0;
86  const unsigned int quarterPoints = num_points / 4;
87 
88  lv_32fc_t* cPtr = cVector;
89  const lv_32fc_t* aPtr = aVector;
90  const lv_32fc_t* bPtr = bVector;
91 
92  __m256 aVal, bVal, cVal;
93  for (; number < quarterPoints; number++) {
94 
95  aVal = _mm256_loadu_ps((float*)aPtr);
96  bVal = _mm256_loadu_ps((float*)bPtr);
97 
98  cVal = _mm256_add_ps(aVal, bVal);
99 
100  _mm256_storeu_ps((float*)cPtr,
101  cVal); // Store the results back into the C container
102 
103  aPtr += 4;
104  bPtr += 4;
105  cPtr += 4;
106  }
107 
108  number = quarterPoints * 4;
109  for (; number < num_points; number++) {
110  *cPtr++ = (*aPtr++) + (*bPtr++);
111  }
112 }
113 #endif /* LV_HAVE_AVX */
114 
115 
116 #ifdef LV_HAVE_AVX
117 #include <immintrin.h>
118 
119 static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector,
120  const lv_32fc_t* aVector,
121  const lv_32fc_t* bVector,
122  unsigned int num_points)
123 {
124  unsigned int number = 0;
125  const unsigned int quarterPoints = num_points / 4;
126 
127  lv_32fc_t* cPtr = cVector;
128  const lv_32fc_t* aPtr = aVector;
129  const lv_32fc_t* bPtr = bVector;
130 
131  __m256 aVal, bVal, cVal;
132  for (; number < quarterPoints; number++) {
133 
134  aVal = _mm256_load_ps((float*)aPtr);
135  bVal = _mm256_load_ps((float*)bPtr);
136 
137  cVal = _mm256_add_ps(aVal, bVal);
138 
139  _mm256_store_ps((float*)cPtr,
140  cVal); // Store the results back into the C container
141 
142  aPtr += 4;
143  bPtr += 4;
144  cPtr += 4;
145  }
146 
147  number = quarterPoints * 4;
148  for (; number < num_points; number++) {
149  *cPtr++ = (*aPtr++) + (*bPtr++);
150  }
151 }
152 #endif /* LV_HAVE_AVX */
153 
154 
155 #ifdef LV_HAVE_SSE
156 #include <xmmintrin.h>
157 
158 static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector,
159  const lv_32fc_t* aVector,
160  const lv_32fc_t* bVector,
161  unsigned int num_points)
162 {
163  unsigned int number = 0;
164  const unsigned int halfPoints = num_points / 2;
165 
166  lv_32fc_t* cPtr = cVector;
167  const lv_32fc_t* aPtr = aVector;
168  const lv_32fc_t* bPtr = bVector;
169 
170  __m128 aVal, bVal, cVal;
171  for (; number < halfPoints; number++) {
172 
173  aVal = _mm_loadu_ps((float*)aPtr);
174  bVal = _mm_loadu_ps((float*)bPtr);
175 
176  cVal = _mm_add_ps(aVal, bVal);
177 
178  _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container
179 
180  aPtr += 2;
181  bPtr += 2;
182  cPtr += 2;
183  }
184 
185  number = halfPoints * 2;
186  for (; number < num_points; number++) {
187  *cPtr++ = (*aPtr++) + (*bPtr++);
188  }
189 }
190 #endif /* LV_HAVE_SSE */
191 
192 
193 #ifdef LV_HAVE_GENERIC
194 
195 static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector,
196  const lv_32fc_t* aVector,
197  const lv_32fc_t* bVector,
198  unsigned int num_points)
199 {
200  lv_32fc_t* cPtr = cVector;
201  const lv_32fc_t* aPtr = aVector;
202  const lv_32fc_t* bPtr = bVector;
203  unsigned int number = 0;
204 
205  for (number = 0; number < num_points; number++) {
206  *cPtr++ = (*aPtr++) + (*bPtr++);
207  }
208 }
209 #endif /* LV_HAVE_GENERIC */
210 
211 
212 #ifdef LV_HAVE_SSE
213 #include <xmmintrin.h>
214 
215 static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector,
216  const lv_32fc_t* aVector,
217  const lv_32fc_t* bVector,
218  unsigned int num_points)
219 {
220  unsigned int number = 0;
221  const unsigned int halfPoints = num_points / 2;
222 
223  lv_32fc_t* cPtr = cVector;
224  const lv_32fc_t* aPtr = aVector;
225  const lv_32fc_t* bPtr = bVector;
226 
227  __m128 aVal, bVal, cVal;
228  for (; number < halfPoints; number++) {
229  aVal = _mm_load_ps((float*)aPtr);
230  bVal = _mm_load_ps((float*)bPtr);
231 
232  cVal = _mm_add_ps(aVal, bVal);
233 
234  _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
235 
236  aPtr += 2;
237  bPtr += 2;
238  cPtr += 2;
239  }
240 
241  number = halfPoints * 2;
242  for (; number < num_points; number++) {
243  *cPtr++ = (*aPtr++) + (*bPtr++);
244  }
245 }
246 #endif /* LV_HAVE_SSE */
247 
248 
249 #ifdef LV_HAVE_NEON
250 #include <arm_neon.h>
251 
/*
 * Element-wise sum of two complex-float vectors using NEON:
 * cVector[i] = aVector[i] + bVector[i].
 *
 * cVector    - output buffer; num_points elements are written
 * aVector    - first addend; num_points elements are read
 * bVector    - second addend; num_points elements are read
 * num_points - number of complex elements to process
 */
static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const lv_32fc_t* bVector,
                                                unsigned int num_points)
{
    /* A float32x4_t holds 4 floats, i.e. 2 complex points per vector add. */
    const unsigned int vectorEnd = (num_points / 2) * 2;
    unsigned int i;

    for (i = 0; i < vectorEnd; i += 2) {
        const float32x4_t lhs = vld1q_f32((const float32_t*)(aVector + i));
        const float32x4_t rhs = vld1q_f32((const float32_t*)(bVector + i));

        /* Hint the inputs of the next iteration. */
        __VOLK_PREFETCH(aVector + i + 2);
        __VOLK_PREFETCH(bVector + i + 2);

        vst1q_f32((float*)(cVector + i), vaddq_f32(lhs, rhs));
    }

    /* Scalar tail: at most one leftover element when num_points is odd. */
    for (; i < num_points; i++) {
        cVector[i] = aVector[i] + bVector[i];
    }
}
286 
287 #endif /* LV_HAVE_NEON */
288 
289 
290 #endif /* INCLUDED_volk_32fc_x2_add_32fc_u_H */
volk_32fc_x2_add_32fc_a_avx
static void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:119
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
volk_32fc_x2_add_32fc_u_sse
static void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:158
volk_32fc_x2_add_32fc_u_neon
static void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:252
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:70
volk_32fc_x2_add_32fc_u_avx
static void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:80
volk_32fc_x2_add_32fc_a_sse
static void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:215
volk_32fc_x2_add_32fc_generic
static void volk_32fc_x2_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:195