#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
#define EIGEN_MATH_FUNCTIONS_SSE_H
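
// The plog, pexp, psin and pcos kernels below follow the classic Cephes
// single-precision approximations, in the style of Julien Pommier's SSE math
// library (http://gruntthepeon.free.fr/ssemath/): range-reduce the argument,
// evaluate a small polynomial, then undo the reduction with integer bit tricks.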
// Remember which lanes hold an invalid argument (log is undefined for x <= 0).
Packet4f invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

x = pmax(x, p4f_min_norm_pos);  // cut off denormalized inputs
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);  // extract the biased exponent

// Keep only the mantissa bits and map them into [0.5, 1).
x = _mm_and_ps(x, p4f_inv_mant_mask);
x = _mm_or_ps(x, p4f_half);

emm0 = _mm_sub_epi32(emm0, p4i_0x7f);  // remove the IEEE-754 exponent bias (127)
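
// Conceptually the input has now been split as x = m * 2^e with m in [0.5, 1),
// so that log(x) = log(m) + e * log(2); the rest of the routine approximates
// log(m) with a polynomial and adds the e * log(2) term back at the end.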
// Where the mantissa is below sqrt(1/2), subtract one from the exponent so
// that, together with the matching mantissa adjustment, the polynomial
// argument stays close to zero (see Cephes logf).
Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
e = psub(e, _mm_and_ps(p4f_1, mask));
y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
y  = pmadd(y , x, p4f_cephes_log_p2);
y1 = pmadd(y1, x, p4f_cephes_log_p5);
y2 = pmadd(y2, x, p4f_cephes_log_p8);
y  = pmadd(y, x3, y1);
y  = pmadd(y, x3, y2);
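
// The nine coefficients are evaluated as three independent Horner chains in x
// that are then recombined as a short Horner chain in x3 = x^3; interleaving
// the chains shortens the serial dependency path and lets the CPU overlap the
// multiply-adds across lanes.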
y1  = pmul(e, p4f_cephes_log_q1);
tmp = pmul(x2, p4f_half);
y2  = pmul(e, p4f_cephes_log_q2);
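
// The final combination amounts to log(x) ~= p(x) + e * log(2), with log(2)
// split across cephes_log_q1 and cephes_log_q2 (q1 + q2 = log(2)) so that the
// large e * log(2) term loses as little precision as possible.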
return _mm_or_ps(x, invalid_mask);  // ORing with the all-ones mask yields NaN for x <= 0
Packet4f tmp = _mm_setzero_ps(), fx;
x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);  // clamp x to the range where expf neither overflows nor underflows

// Express exp(x) as exp(g) * 2^n with n = round(x / log(2)); adding 0.5 here
// turns the floor() computed below into a round-to-nearest.
fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
// Emulate floorf(): _mm_cvttps_epi32 truncates toward zero, so subtract 1
// wherever the truncation rounded up (negative non-integer values).
emm0 = _mm_cvttps_epi32(fx);
tmp = _mm_cvtepi32_ps(emm0);
Packet4f mask = _mm_cmpgt_ps(tmp, fx);
mask = _mm_and_ps(mask, p4f_1);
fx = psub(tmp, mask);
tmp = pmul(fx, p4f_cephes_exp_C1);
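
// This is the classic Cody-and-Waite style reduction used by Cephes: log(2)
// is split into a coarse part C1 (few significant bits) and a small correction
// C2 with C1 + C2 = log(2), and g = x - n*C1 - n*C2 is computed in two steps
// so the subtraction cancels almost no precision. The reduced g lies in
// roughly [-log(2)/2, log(2)/2].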
y = pmadd(y, x, p4f_cephes_exp_p1);
y = pmadd(y, x, p4f_cephes_exp_p2);
y = pmadd(y, x, p4f_cephes_exp_p3);
y = pmadd(y, x, p4f_cephes_exp_p4);
y = pmadd(y, x, p4f_cephes_exp_p5);
// Build 2^n directly in the exponent field: place n + 127 into the eight
// exponent bits of each float.
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, p4i_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
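
// For -126 <= n <= 127 the float with bit pattern (n + 127) << 23 is exactly
// 2^n, so the scaling below costs a single multiply and rounds only once.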
return pmul(y, _mm_castsi128_ps(emm0));
Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);  // extract the sign bit (upper one)

y = pmul(x, p4f_cephes_FOPI);  // scale by 4/pi
emm2 = _mm_cvttps_epi32(y);  // store the integer part of y in emm2
emm2 = _mm_add_epi32(emm2, p4i_1);  // j = (j + 1) & ~1 (see the Cephes sources)
emm2 = _mm_and_si128(emm2, p4i_not1);
y = _mm_cvtepi32_ps(emm2);

emm0 = _mm_and_si128(emm2, p4i_4);  // get the swap-sign flag from bit 2 of j
emm0 = _mm_slli_epi32(emm0, 29);  // move it up to the float sign-bit position
// Build the polynomial selection mask: one polynomial covers 0 <= x <= pi/4,
// another covers pi/4 < x <= pi/2; both are computed and blended at the end.
emm2 = _mm_and_si128(emm2, p4i_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
Packet4f poly_mask = _mm_castsi128_ps(emm2);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
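
// In other words: j indexes the octant of |x|. Rounding j up to an even value
// recenters the argument on a multiple of pi/2, bit 1 of j picks between the
// sine and cosine polynomials, and bit 2 of j says whether the sign flips in
// that half-period. Every lane follows the same branch-free path.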
xmm1 = pmul(y, p4f_minus_cephes_DP1);
xmm2 = pmul(y, p4f_minus_cephes_DP2);
xmm3 = pmul(y, p4f_minus_cephes_DP3);
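
// "Extended precision modular arithmetic" (Cephes): pi/4 is split into three
// constants DP1 + DP2 + DP3 of decreasing magnitude, and the reduction
// x = ((x - y*DP1) - y*DP2) - y*DP3 is applied term by term (the constants
// here are pre-negated) so the reduced argument keeps nearly full single
// precision even for fairly large |x|.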
y = pmadd(y, z, p4f_coscof_p1);
y = pmadd(y, z, p4f_coscof_p2);
y2 = pmadd(y2, z, p4f_sincof_p1);
y2 = pmadd(y2, z, p4f_sincof_p2);
// Select the correct result from the two polynomials.
y2 = _mm_and_ps(poly_mask, y2);
y = _mm_andnot_ps(poly_mask, y);
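
// Note that both polynomial branches are evaluated for every lane and the
// masks merely blend them; that is the usual price, and benefit, of
// branch-free SIMD code.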
return _mm_xor_ps(y, sign_bit);  // update the sign
Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
y = pmul(x, p4f_cephes_FOPI);  // scale by 4/pi
emm2 = _mm_cvttps_epi32(y);  // store the integer part of y in emm2
emm2 = _mm_add_epi32(emm2, p4i_1);  // j = (j + 1) & ~1 (see the Cephes sources)
emm2 = _mm_and_si128(emm2, p4i_not1);
y = _mm_cvtepi32_ps(emm2);
emm2 = _mm_sub_epi32(emm2, p4i_2);  // shift the octant index by a quadrant

emm0 = _mm_andnot_si128(emm2, p4i_4);  // swap-sign flag, from the complement of bit 2
emm0 = _mm_slli_epi32(emm0, 29);

emm2 = _mm_and_si128(emm2, p4i_2);  // get the polynomial selection mask
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
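
// Compared with psin above, the only differences are the j -= 2 offset and
// the andnot when extracting the swap-sign flag; together they realize
// cos(x) = sin(x + pi/2) inside the same octant machinery, so cosine needs no
// separate range reduction.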
Packet4f sign_bit = _mm_castsi128_ps(emm0);
Packet4f poly_mask = _mm_castsi128_ps(emm2);
// Extended precision modular arithmetic, exactly as in psin.
xmm1 = pmul(y, p4f_minus_cephes_DP1);
xmm2 = pmul(y, p4f_minus_cephes_DP2);
xmm3 = pmul(y, p4f_minus_cephes_DP3);
y = pmadd(y, z, p4f_coscof_p1);
y = pmadd(y, z, p4f_coscof_p2);
Packet4f tmp = _mm_mul_ps(z, p4f_half);
y2 = pmadd(y2, z, p4f_sincof_p1);
y2 = pmadd(y2, z, p4f_sincof_p2);

y2 = pmadd(y2, x, x);
// Select the correct result from the two polynomials, then apply the sign.
y2 = _mm_and_ps(poly_mask, y2);
y = _mm_andnot_ps(poly_mask, y);

return _mm_xor_ps(y, sign_bit);
Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));  // zero out the lanes rejected by non_zero_mask
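
// _mm_rsqrt_ps only guarantees about 12 bits of precision (Intel documents a
// maximum relative error of 1.5 * 2^-12), so a raw estimate like this is
// typically refined with one Newton-Raphson step,
// x <- x * (1.5 - 0.5 * _x * x * x), to approach full single precision;
// masking with non_zero_mask keeps the filtered-out lanes from turning the
// estimate into infinities.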
#endif // EIGEN_MATH_FUNCTIONS_SSE_H