#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
template<> struct is_arithmetic<__m128>  { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };
#define vec4f_swizzle1(v,p,q,r,s) \
  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))

#define vec4i_swizzle1(v,p,q,r,s) \
  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
#define vec2d_swizzle1(v,p,q) \
  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (((q)*2+1)<<6|((q)*2)<<4|((p)*2+1)<<2|((p)*2)))))
#define vec4f_swizzle2(a,b,p,q,r,s) \
  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))

#define vec4i_swizzle2(a,b,p,q,r,s) \
  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
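// Illustrative use of the swizzle helpers above; the packet names are
// hypothetical locals, not part of this header.  The selector indices pick
// source lanes, lowest lane first:
//
//   Packet4f v    = pset1<Packet4f>(1.f);          // some packet value
//   Packet4f xxxx = vec4f_swizzle1(v, 0,0,0,0);    // broadcast lane 0
//   Packet4f wzyx = vec4f_swizzle1(v, 3,2,1,0);    // reverse the four lanes
//   Packet4f lolo = vec4f_swizzle2(v, v, 0,1,0,1); // low half of v, twice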
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
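// Illustrative use of the constant-declaration helpers above, inside a function
// body; the chosen NAMEs are hypothetical:
//
//   _EIGEN_DECLARE_CONST_Packet4f(one, 1.0f);                      // const Packet4f p4f_one
//   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); // bit-pattern constant
//   _EIGEN_DECLARE_CONST_Packet4i(minus1, -1);                     // const Packet4i p4i_minus1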
template<> struct packet_traits<float>  : default_packet_traits
template<> struct packet_traits<double> : default_packet_traits
template<> struct packet_traits<int>    : default_packet_traits
template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
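// unpacket_traits lets generic code recover the scalar type and lane count of a
// packet type, e.g. (illustrative):
//
//   typedef unpacket_traits<Packet4f>::type Scalar;   // float
//   enum { N = unpacket_traits<Packet4f>::size };     // 4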
#if defined(_MSC_VER) && (_MSC_VER==1500)
  // flip the sign bit of each of the four float lanes
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
  return _mm_xor_ps(a,mask);

  // flip the sign bit (high word) of each of the two double lanes
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
  return _mm_xor_pd(a,mask);

  // integer negation as 0 - a
  return psub(_mm_setr_epi32(0,0,0,0), a);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a,b);
{
  eigen_assert(false && "packet integer division is not supported by SSE");
  // per-lane select: keep a where a < b, otherwise b (emulated integer min)
  Packet4i mask = _mm_cmplt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));

  // per-lane select: keep a where a > b, otherwise b (emulated integer max)
  Packet4i mask = _mm_cmpgt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
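// A scalar model of the compare-and-blend idiom above (illustrative; the helper
// name is hypothetical).  The comparison produces an all-ones or all-zeros mask
// per lane, and and/andnot/or then pick one operand per lane:
//
//   int select_min(int a, int b)
//   {
//     int mask = (a < b) ? ~0 : 0;       // _mm_cmplt_epi32, per lane
//     return (mask & a) | (~mask & b);   // _mm_and_si128 / _mm_andnot_si128 / _mm_or_si128
//   }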
#if defined(_MSC_VER)
  __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
  res = _mm_loadh_pi(res, (const __m64*)(from+2));

  return _mm_loadu_ps(from);
#if defined(__GNUC__) && defined(__i386__)
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#elif defined(__clang__)
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#else
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
#endif
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
  return _mm_loadu_ps(from);
#else
  __m128d res;
  res = _mm_load_sd((const double*)(from));
  res = _mm_loadh_pd(res, (const double*)(from+2));
  return _mm_castpd_ps(res);
#endif
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
  return _mm_loadu_pd(from);
#else
  __m128d res;
  res = _mm_load_sd(from);
  res = _mm_loadh_pd(res,from+1);
  return res;
#endif
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
#else
  __m128d res;
  res = _mm_load_sd((const double*)(from));
  res = _mm_loadh_pd(res, (const double*)(from+2));
  return _mm_castpd_si128(res);
#endif
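// Usage sketch for the unaligned loads above (illustrative; the buffer is a
// hypothetical example):
//
//   EIGEN_ALIGN16 float buf[5] = { 0.f, 1.f, 2.f, 3.f, 4.f };
//   Packet4f p = ploadu<Packet4f>(buf + 1);   // reads {1,2,3,4} from a misaligned address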
  // duplicate two floats: returns { from[0], from[0], from[1], from[1] }
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);

  tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
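// The duplicating loads above read two scalars and repeat each one; usage
// sketch (illustrative):
//
//   float ab[2] = { 1.f, 2.f };
//   Packet4f p = ploaddup<Packet4f>(ab);   // yields { 1, 1, 2, 2 }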
  // unaligned store: write the low and high halves separately
  _mm_storel_pd((to), from);
  _mm_storeh_pd((to+1), from);
#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
{ return _mm_shuffle_ps(a,a,0x1B); }   // reverse the four float lanes
{ return _mm_shuffle_pd(a,a,0x1); }    // swap the two double lanes
{ return _mm_shuffle_epi32(a,0x1B); }  // reverse the four int lanes
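// The reversing shuffles above back preverse(); usage sketch (illustrative):
//
//   Packet4f p = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
//   Packet4f r = preverse(p);   // { 3, 2, 1, 0 }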
  // clear the sign bit of each float lane
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm_and_ps(a,mask);

  // clear the sign bit of each double lane
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm_and_pd(a,mask);
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
  // branchless |a|: aux is 0 for non-negative lanes and -1 for negative ones
  Packet4i aux = _mm_srai_epi32(a,31);
  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#endif
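// Scalar model of the SSE2 absolute-value trick above (illustrative; the helper
// name is hypothetical).  The arithmetic shift yields 0 for non-negative inputs
// and -1 for negative ones, so xor-then-subtract reduces to two's-complement
// negation exactly when the input is negative:
//
//   int abs_branchless(int a)
//   {
//     int s = a >> 31;        // _mm_srai_epi32(a, 31)
//     return (a ^ s) - s;     // _mm_sub_epi32(_mm_xor_si128(a, s), s)
//   }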
  // broadcast each lane of vecs[0] into its own packet; vecs[0] is overwritten last
  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
#ifdef EIGEN_VECTORIZE_SSE3
  // horizontal adds: lane i of the result is the sum of the elements of vecs[i]
  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));

  return _mm_hadd_pd(vecs[0], vecs[1]);
  return pfirst(_mm_hadd_ps(tmp0, tmp0));
  // sum the four float lanes: add the high half onto the low half, then the two remaining lanes
  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));

  // sum the two double lanes
  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
  // transpose-and-add: after this sequence, lane i of the result holds the
  // sum of the four elements of vecs[i]
  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
  tmp0 = _mm_add_ps(tmp0, tmp1);
  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
  tmp1 = _mm_add_ps(tmp1, tmp2);
  tmp2 = _mm_movehl_ps(tmp1, tmp0);
  tmp0 = _mm_movelh_ps(tmp0, tmp1);
  return _mm_add_ps(tmp0, tmp2);
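// A scalar model of the contract implemented by the unpack/add sequences above
// and below (illustrative; result is a hypothetical output array): lane i of
// the returned packet is the horizontal sum of vecs[i].
//
//   for (int i = 0; i < 4; ++i)
//     result[i] = vecs[i][0] + vecs[i][1] + vecs[i][2] + vecs[i][3];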
  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));

  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
  tmp0 = _mm_add_epi32(tmp0, tmp1);
  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  tmp1 = _mm_add_epi32(tmp1, tmp2);
  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
  return _mm_add_epi32(tmp0, tmp2);
  // multiply the four float lanes together
  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));

  // multiply the two double lanes together
  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));

  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
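// predux_mul reduces a packet by multiplication; usage sketch (illustrative):
//
//   Packet4f p    = pset1<Packet4f>(2.f);
//   float    prod = predux_mul(p);   // 2*2*2*2 == 16
//
// Plain SSE2 has no packed 32-bit _mm_mullo_epi32 (that is SSE4.1), which is
// presumably why the integer version spills to the aux array above and
// multiplies in scalar code.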
  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));

  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));

  // SSE2 has no packed 32-bit integer min, so reduce with scalar compares
  register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
  register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
  return aux0<aux2 ? aux0 : aux2;
  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));

  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));

  // SSE2 has no packed 32-bit integer max, so reduce with scalar compares
  register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
  register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
  return aux0>aux2 ? aux0 : aux2;
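// Usage sketch for the min/max reductions above (illustrative):
//
//   Packet4i p  = _mm_setr_epi32(3, 1, 4, 1);
//   int      lo = predux_min(p);   // 1
//   int      hi = predux_max(p);   // 4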
#if (defined __GNUC__)
#ifdef EIGEN_VECTORIZE_SSSE3
  // concatenate second:first and shift right by Offset elements (4 bytes each)
  first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));

  first = _mm_alignr_epi8(second,first, Offset*4);

  // double case: a single alignr by 8 bytes (one double lane)
  first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
  // Offset == 1: bring second[0] into lane 0, then rotate the lanes down by one
  first = _mm_move_ss(first,second);
  first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));

  // Offset == 2: result is { first[2], first[3], second[0], second[1] }
  first = _mm_movehl_ps(first,first);
  first = _mm_movelh_ps(first,second);

  // Offset == 3: result is { first[3], second[0], second[1], second[2] }
  first = _mm_move_ss(first,second);
  first = _mm_shuffle_ps(first,second,0x93);

  // same three cases for Packet4i, going through float casts
  first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
  first = _mm_shuffle_epi32(first,0x39);

  first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
  first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));

  first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
  first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));

  // Packet2d, Offset == 1: result is { first[1], second[0] }
  first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
  first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
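// palign shifts a window of Offset elements across the pair (first, second), so
// the result is { first[Offset], ..., first[3], second[0], ..., second[Offset-1] }.
// A scalar model (illustrative; names are hypothetical):
//
//   void palign_model(float first[4], const float second[4], int Offset)
//   {
//     float tmp[8];
//     for (int i = 0; i < 4; ++i) { tmp[i] = first[i]; tmp[4+i] = second[i]; }
//     for (int i = 0; i < 4; ++i) first[i] = tmp[Offset + i];
//   }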
#endif // EIGEN_PACKET_MATH_SSE_H