27 #ifndef EIGEN_PACKET_MATH_NEON_H
28 #define EIGEN_PACKET_MATH_NEON_H

// Tuning knobs, overridable by the user before including this header:
// the size threshold at which matrix products switch to the cache-friendly
// kernel, and the number of vector registers assumed by the blocking
// heuristics (conservative default of 8 for 32-bit ARM NEON).
34 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
35 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
40 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
41 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8

// Declare a function-local packet constant p4f_NAME / p4i_NAME with every
// lane set to X.  The _FROM_INT variant splats an integer bit pattern and
// bit-casts it to float (used for sign masks and similar bit-level
// constants).  Do not insert text between the backslash-continued lines.
48 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
49 const Packet4f p4f_##NAME = pset1<Packet4f>(X)
51 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
52 const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
54 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
55 const Packet4i p4i_##NAME = pset1<Packet4i>(X)

// llvm-gcc (__llvm__ defined but not __clang__) requires doubled braces
// when brace-initializing NEON vector types; other compilers take a single
// pair.  NOTE(review): the #else/#endif separating the two variants was
// elided by the extraction of this chunk.
57 #if defined(__llvm__) && !defined(__clang__)
59 #define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}}
60 #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
63 #define EIGEN_INIT_NEON_PACKET2(X, Y) {X, Y}
64 #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}

// Software prefetch: emits the ARM "pld" (preload data) instruction for
// address x via inline assembly.
68 #define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" );
// Advertise NEON vectorization of float (as Packet4f, 4 lanes).
// NOTE(review): the trait bodies (Vectorizable/size/HasDiv enums, etc.)
// are not visible in this chunk — elided by the extraction.
71 template<>
struct packet_traits<float> : default_packet_traits
// Same for int (as Packet4i, 4 lanes).
88 template<>
struct packet_traits<
int> : default_packet_traits
// Workaround: gcc <= 4.4 declares the NEON load/store intrinsics with
// float32_t* parameters instead of float*, so calls with plain float
// pointers fail to compile.  These inline overloads insert the cast and
// forward to the real intrinsics (the "::" scope forces the global ones).
// Not needed for llvm-based compilers.
99 #if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
// 128-bit load of 4 floats.
101 EIGEN_STRONG_INLINE float32x4_t vld1q_f32(
const float* x) { return ::vld1q_f32((
const float32_t*)x); }
// 64-bit load of 2 floats.
102 EIGEN_STRONG_INLINE float32x2_t vld1_f32 (
const float* x) { return ::vld1_f32 ((
const float32_t*)x); }
// 128-bit store of 4 floats.
103 EIGEN_STRONG_INLINE void vst1q_f32(
float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
// 64-bit store of 2 floats.
104 EIGEN_STRONG_INLINE void vst1_f32 (
float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
107 template<>
struct unpacket_traits<
Packet4f> {
typedef float type;
enum {size=4}; };
108 template<>
struct unpacket_traits<
Packet4i> {
typedef int type;
enum {size=4}; };
// --- pdiv<Packet4f> body (fragment) ---
// NEON (ARMv7) has no vector float divide; a/b is computed as
// a * reciprocal(b): start from the hardware estimate (vrecpeq_f32) and
// refine with Newton-Raphson steps, where vrecpsq_f32(b, inv) yields the
// correction factor (2 - b*inv).  NOTE(review): the extraction dropped an
// earlier refinement step between these lines — confirm against the full
// file.
145 inv = vrecpeq_f32(b);
149 restep = vrecpsq_f32(b, inv);
150 inv = vmulq_f32(restep, inv);
// Final quotient: a times the refined reciprocal of b.
153 div = vmulq_f32(a, inv);

// --- pdiv<Packet4i> body (fragment) ---
// Integer packet division is not available on NEON: fail loudly via
// eigen_assert instead of returning garbage.
158 {
eigen_assert(
false &&
"packet integer division are not supported by NEON");
// Bitwise ops on float packets (fragments of pand/por/pxor/pandnot):
// NEON only defines bitwise ops on integer vectors, so each lane is
// reinterpreted as u32, the integer op is applied, and the result is
// cast back — the bit patterns are preserved exactly.
// pand: a & b
175 return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
// por: a | b
181 return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
// pxor: a ^ b
187 return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
// pandnot: a & ~b (vbicq_u32 is "bit clear": first AND NOT second).
193 return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
// ploaddup bodies (fragments): load two scalars and duplicate each into a
// 64-bit half-vector, then combine, producing
// { from[0], from[0], from[1], from[1] }.
// Float variant:
206 lo = vdup_n_f32(*from);
207 hi = vdup_n_f32(*(from+1));
208 return vcombine_f32(lo, hi);
// Integer variant, same pattern:
213 lo = vdup_n_s32(*from);
214 hi = vdup_n_s32(*(from+1));
215 return vcombine_s32(lo, hi);
// preverse bodies (fragments): reverse all 4 lanes.  vrev64q reverses the
// pair inside each 64-bit half ({a1,a0,a3,a2}); swapping the two halves
// via vcombine(hi, lo) then yields the fully reversed {a3,a2,a1,a0}.
// Float variant:
232 float32x2_t a_lo, a_hi;
235 a_r64 = vrev64q_f32(a);
236 a_lo = vget_low_f32(a_r64);
237 a_hi = vget_high_f32(a_r64);
238 return vcombine_f32(a_hi, a_lo);
// Integer variant, same pattern:
241 int32x2_t a_lo, a_hi;
244 a_r64 = vrev64q_s32(a);
245 a_lo = vget_low_s32(a_r64);
246 a_hi = vget_high_s32(a_r64);
247 return vcombine_s32(a_hi, a_lo);
// predux<Packet4f> body (fragment): horizontal sum of all 4 lanes.
// vpadd of the two halves gives {a0+a1, a2+a3}; a second pairwise add
// leaves the total in both lanes of the 64-bit result.
254 float32x2_t a_lo, a_hi, sum;
257 a_lo = vget_low_f32(a);
258 a_hi = vget_high_f32(a);
259 sum = vpadd_f32(a_lo, a_hi);
260 sum = vpadd_f32(sum, sum);

// preduxp<Packet4f> body (fragment): reduce 4 packets at once so that
// output lane i is the sum of vecs[i]'s lanes.  Two rounds of vzipq
// effect a 4x4 transpose (despite the "vtrn" variable names, these hold
// vzipq results); summing the transposed rows then gives all four
// horizontal sums in one packet.
268 float32x4x2_t vtrn1, vtrn2, res1, res2;
273 vtrn1 = vzipq_f32(vecs[0], vecs[2]);
274 vtrn2 = vzipq_f32(vecs[1], vecs[3]);
275 res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
276 res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
279 sum1 = vaddq_f32(res1.val[0], res1.val[1]);
280 sum2 = vaddq_f32(res2.val[0], res2.val[1]);
281 sum = vaddq_f32(sum1, sum2);
// predux<Packet4i> body (fragment): horizontal sum of all 4 integer
// lanes via two pairwise adds, mirroring the float version above.
288 int32x2_t a_lo, a_hi, sum;
291 a_lo = vget_low_s32(a);
292 a_hi = vget_high_s32(a);
293 sum = vpadd_s32(a_lo, a_hi);
294 sum = vpadd_s32(sum, sum);

// preduxp<Packet4i> body (fragment): 4x4 transpose via two vzipq rounds
// (variables named vtrn but holding vzipq results), then three vector
// adds so output lane i holds the sum of vecs[i]'s lanes.
302 int32x4x2_t vtrn1, vtrn2, res1, res2;
307 vtrn1 = vzipq_s32(vecs[0], vecs[2]);
308 vtrn2 = vzipq_s32(vecs[1], vecs[3]);
309 res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
310 res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
313 sum1 = vaddq_s32(res1.val[0], res1.val[1]);
314 sum2 = vaddq_s32(res2.val[0], res2.val[1]);
315 sum = vaddq_s32(sum1, sum2);
// predux_mul bodies (fragments): horizontal product of all 4 lanes.
// Multiplying the two halves gives {a0*a2, a1*a3}; multiplying that by
// its own 64-bit reverse leaves a0*a1*a2*a3 in both lanes (extraction of
// lane 0 is elided by this chunk).
// Float variant:
324 float32x2_t a_lo, a_hi, prod;
328 a_lo = vget_low_f32(a);
329 a_hi = vget_high_f32(a);
331 prod = vmul_f32(a_lo, a_hi);
333 prod = vmul_f32(prod, vrev64_f32(prod));
// Integer variant, same pattern:
340 int32x2_t a_lo, a_hi, prod;
344 a_lo = vget_low_s32(a);
345 a_hi = vget_high_s32(a);
347 prod = vmul_s32(a_lo, a_hi);
349 prod = vmul_s32(prod, vrev64_s32(prod));
// predux_min bodies (fragments): horizontal minimum of all 4 lanes.
// vpmin of the halves gives {min(a0,a1), min(a2,a3)}; a second pairwise
// min leaves the overall minimum in both lanes.
// Float variant:
358 float32x2_t a_lo, a_hi, min;
361 a_lo = vget_low_f32(a);
362 a_hi = vget_high_f32(a);
363 min = vpmin_f32(a_lo, a_hi);
364 min = vpmin_f32(min, min);
// Integer variant, same pattern:
371 int32x2_t a_lo, a_hi, min;
374 a_lo = vget_low_s32(a);
375 a_hi = vget_high_s32(a);
376 min = vpmin_s32(a_lo, a_hi);
377 min = vpmin_s32(min, min);
// predux_max bodies (fragments): horizontal maximum of all 4 lanes,
// the mirror image of predux_min using pairwise vpmax.
// Float variant:
386 float32x2_t a_lo, a_hi, max;
389 a_lo = vget_low_f32(a);
390 a_hi = vget_high_f32(a);
391 max = vpmax_f32(a_lo, a_hi);
392 max = vpmax_f32(max, max);
// Integer variant, same pattern:
399 int32x2_t a_lo, a_hi, max;
402 a_lo = vget_low_s32(a);
403 a_hi = vget_high_s32(a);
404 max = vpmax_s32(a_lo, a_hi);
405 max = vpmax_s32(max, max);
// palign_impl specializations (fragment): "first" becomes the result of
// shifting the concatenation [first | second] left by Offset elements,
// implemented with the NEON vext instruction.  Offset 0 needs no
// specialization.  NOTE(review): the extraction elided the macro's brace
// lines and the Offset-1 float instantiation; nothing may be inserted
// between the backslash-continued lines below.
413 #define PALIGN_NEON(Offset,Type,Command) \
415 struct palign_impl<Offset,Type>\
417 EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
420 first = Command(first, second, Offset);\
426 PALIGN_NEON(2,Packet4f,vextq_f32) // Packet4f shifted by 2 lanes
427 PALIGN_NEON(3,Packet4f,vextq_f32) // Packet4f shifted by 3 lanes
429 PALIGN_NEON(1,Packet4i,vextq_s32) // Packet4i shifted by 1 lane
430 PALIGN_NEON(2,Packet4i,vextq_s32) // Packet4i shifted by 2 lanes
431 PALIGN_NEON(3,Packet4i,vextq_s32) // Packet4i shifted by 3 lanes
439 #endif // EIGEN_PACKET_MATH_NEON_H