1 #ifndef LIBRARY_SIMD_SSE2_H_ 2 #define LIBRARY_SIMD_SSE2_H_ 32 #define vadd_ps _mm_add_ps 33 #define vsub_ps _mm_sub_ps 34 #define vmul_ps _mm_mul_ps 35 #define vdiv_ps _mm_div_ps 36 #define vsqrt_ps _mm_sqrt_ps 37 #define vrsqrt_ps _mm_rsqrt_ps_ 38 #define vfmadd_ps _mm_fmadd_ps 39 #define vfmsub_ps _mm_fmsub_ps 40 #define vfnmadd_ps _mm_fnmadd_ps 41 #define vfnmsub_ps _mm_fnmsub_ps 47 #define vadd_pd _mm_add_pd 48 #define vsub_pd _mm_sub_pd 49 #define vmul_pd _mm_mul_pd 50 #define vdiv_pd _mm_div_pd 51 #define vsqrt_pd _mm_sqrt_pd 52 #define vrsqrt_pd _mm_rsqrt_pd_ 53 #define vfmadd_pd _mm_fmadd_pd 54 #define vfmsub_pd _mm_fmsub_pd 55 #define vfnmadd_pd _mm_fnmadd_pd 56 #define vfnmsub_pd _mm_fnmsub_pd 66 #define vsin_ps _mm_sin_ps_ 67 #define vcos_ps _mm_cos_ps_ 68 #define vsincos_ps _mm_sincos_ps_ 69 #define vexp_ps _mm_exp_ps_ 75 #define vsin_pd _mm_sin_pd_ 76 #define vcos_pd _mm_cos_pd_ 77 #define vsincos_pd _mm_sincos_pd_ 78 #define vexp_pd _mm_exp_pd_ 88 #define vload_ps _mm_load_ps 89 #define vload1_ps _mm_load1_ps 90 #define vloadu_ps _mm_loadu_ps 91 #define vset1_ps _mm_set1_ps 92 #define vsetzero_ps _mm_setzero_ps 93 #define vstore_ps _mm_store_ps 94 #define vstoreu_ps _mm_storeu_ps 100 #define vload_pd _mm_load_pd 101 #define vload1_pd _mm_load1_pd 102 #define vloadu_pd _mm_loadu_pd 103 #define vset1_pd _mm_set1_pd 104 #define vsetzero_pd _mm_setzero_pd 105 #define vstore_pd _mm_store_pd 106 #define vstoreu_pd _mm_storeu_pd 116 #define vcmpeq_ps(a,b) _mm_cmpeq_ps(a,b) 117 #define vcmpneq_ps(a,b) _mm_cmpneq_ps(a,b) 118 #define vcmpge_ps(a,b) _mm_cmpge_ps(a,b) 119 #define vcmpgt_ps(a,b) _mm_cmpgt_ps(a,b) 120 #define vcmpnge_ps(a,b) _mm_cmpnge_ps(a,b) 121 #define vcmpngt_ps(a,b) _mm_cmpngt_ps(a,b) 122 #define vcmple_ps(a,b) _mm_cmple_ps(a,b) 123 #define vcmplt_ps(a,b) _mm_cmplt_ps(a,b) 124 #define vcmpnle_ps(a,b) _mm_cmpnle_ps(a,b) 125 #define vcmpnlt_ps(a,b) _mm_cmpnlt_ps(a,b) 131 #define vcmpeq_pd(a,b) _mm_cmpeq_pd(a,b) 132 #define vcmpneq_pd(a,b) 
_mm_cmpneq_pd(a,b) 133 #define vcmpge_pd(a,b) _mm_cmpge_pd(a,b) 134 #define vcmpgt_pd(a,b) _mm_cmpgt_pd(a,b) 135 #define vcmpnge_pd(a,b) _mm_cmpnge_pd(a,b) 136 #define vcmpngt_pd(a,b) _mm_cmpngt_pd(a,b) 137 #define vcmple_pd(a,b) _mm_cmple_pd(a,b) 138 #define vcmplt_pd(a,b) _mm_cmplt_pd(a,b) 139 #define vcmpnle_pd(a,b) _mm_cmpnle_pd(a,b) 140 #define vcmpnlt_pd(a,b) _mm_cmpnlt_pd(a,b) 150 #define vand_ps _mm_and_ps 151 #define vandnot_ps _mm_andnot_ps 152 #define vor_ps _mm_or_ps 153 #define vxor_ps _mm_xor_ps 159 #define vand_pd _mm_and_pd 160 #define vandnot_pd _mm_andnot_pd 161 #define vor_pd _mm_or_pd 162 #define vxor_pd _mm_xor_pd 172 #define vreduce_ps _mm_reduce_ps_ 178 #define vreduce_pd _mm_reduce_pd_ 188 #define vdot3_ps _mm_dot3_ps_ 194 #define vdot3_pd _mm_dot3_pd_ 205 static inline vecf _mm_fmadd_ps(vecf a, vecf b, vecf c) {
206 return vadd_ps(vmul_ps(a, b), c);
209 static inline vecf _mm_fmsub_ps(vecf a, vecf b, vecf c) {
210 return vsub_ps(vmul_ps(a, b), c);
213 static inline vecf _mm_fnmadd_ps(vecf a, vecf b, vecf c) {
214 return vsub_ps(c, vmul_ps(a, b));
217 static inline vecf _mm_fnmsub_ps(vecf a, vecf b, vecf c) {
218 return vmul_ps(vset1_ps(-1.0f), vadd_ps(vmul_ps(a, b), c));
227 static inline vecd _mm_fmadd_pd(vecd a, vecd b, vecd c) {
228 return vadd_pd(vmul_pd(a, b), c);
231 static inline vecd _mm_fmsub_pd(vecd a, vecd b, vecd c) {
232 return vsub_pd(vmul_pd(a, b), c);
235 static inline vecd _mm_fnmadd_pd(vecd a, vecd b, vecd c) {
236 return vsub_pd(c, vmul_pd(a, b));
239 static inline vecd _mm_fnmsub_pd(vecd a, vecd b, vecd c) {
240 return vmul_pd(vset1_pd(-1.0), vadd_pd(vmul_pd(a, b), c));
245 static inline vecf _mm_floor_ps_(vecf a) {
246 return _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
249 static inline vecd _mm_floor_pd_(vecd a) {
250 return _mm_cvtepi32_pd(_mm_cvttpd_epi32(a));
253 static inline vecf _mm_floor_ps_(vecf a) {
254 return _mm_floor_ps(a);
257 static inline vecd _mm_floor_pd_(vecd a) {
258 return _mm_floor_pd(a);
/* SSE2 fallback for _mm_cvtepi32_epi64: sign-extend the two low
   32-bit lanes of a to two 64-bit lanes.
   BUG FIX: the original mask/or construction placed 0x80000000 in the
   high half of negative lanes instead of 0xFFFFFFFF, so it did not
   match the SSE4.1 intrinsic it emulates (e.g. -1 became
   0x800000007FFFFFFF).  It only worked by accident in _mm_exp_pd_,
   where the result is shifted left by 52 and only the low 12 bits
   survive.  The standard SSE2 idiom below is exact: interleave each
   lane with its sign word (arithmetic shift right by 31). */
static inline __m128i _mm_cvtepi32_epi64_(__m128i a) {
  __m128i sign;

  sign = _mm_srai_epi32(a, 31); /* 0 or 0xFFFFFFFF per lane */
  return _mm_unpacklo_epi32(a, sign);
}
/* SSE4.1 path: hardware sign extension of the two low 32-bit lanes. */
static inline __m128i _mm_cvtepi32_epi64_(__m128i v) {
  return _mm_cvtepi32_epi64(v);
}
290 static inline vecf _mm_rsqrt_ps_(vecf x) {
293 x2 = vmul_ps(x, vset1_ps(0.5f));
298 t3 = vmul_ps(vset1_ps(1.5f), x);
299 x = vfnmadd_ps(t1, t2, t3);
304 static inline vecd _mm_rsqrt_pd_(vecd x) {
308 cd_15 = vset1_pd(1.5);
310 x2 = vmul_pd(x, vset1_pd(0.5));
311 x = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x)));
315 t3 = vmul_pd(cd_15, x);
316 x = vfnmadd_pd(t1, t2, t3);
320 t3 = vmul_pd(cd_15, x);
321 x = vfnmadd_pd(t1, t2, t3);
330 static inline float _mm_reduce_ps_(vecf x) {
331 return x[0] + x[1] + x[2] + x[3];
338 static inline double _mm_reduce_pd_(vecd x) {
/* Taylor-series coefficients for the single-precision trig kernels.
   coeff_fcos[k] = 1/((2k+1)(2k+2)), coeff_fsin[k] = 1/((2k+2)(2k+3));
   each factor links consecutive series terms, so the kernels never
   form large factorials explicitly. */
static const float coeff_fcos[] = {
  1.0 / (1.0 * 2.0),   1.0 / (3.0 * 4.0),   1.0 / (5.0 * 6.0),
  1.0 / (7.0 * 8.0),   1.0 / (9.0 * 10.0),  1.0 / (11.0 * 12.0),
  1.0 / (13.0 * 14.0), 1.0 / (15.0 * 16.0), 1.0 / (17.0 * 18.0),
  1.0 / (19.0 * 20.0), 1.0 / (21.0 * 22.0), 1.0 / (23.0 * 24.0),
  1.0 / (25.0 * 26.0), 1.0 / (27.0 * 28.0)
};
static const float coeff_fsin[] = {
  1.0 / (2.0 * 3.0),   1.0 / (4.0 * 5.0),   1.0 / (6.0 * 7.0),
  1.0 / (8.0 * 9.0),   1.0 / (10.0 * 11.0), 1.0 / (12.0 * 13.0),
  1.0 / (14.0 * 15.0), 1.0 / (16.0 * 17.0), 1.0 / (18.0 * 19.0),
  1.0 / (20.0 * 21.0), 1.0 / (22.0 * 23.0), 1.0 / (24.0 * 25.0),
  1.0 / (26.0 * 27.0), 1.0 / (28.0 * 29.0)
};

/* Number of series terms used by the float kernels (indices 0..f_terms+1
   of the tables above are read). */
static const int f_terms = 5;

static const float fpi = 3.14159265358979323846264338327950288L;
static const float fone_2pi = 1.0L / 6.28318530717958647692528676655900577L;
367 static inline vecf _mm_cos_ps_(vecf xi) {
368 vecf xi2, n, v_one, c, c_const, c_const2;
371 v_one = vset1_ps(1.0f);
373 n = _mm_floor_ps_(vmul_ps(xi, vset1_ps(fone_2pi)));
374 n = vfmadd_ps(vset1_ps(2.0f), n, v_one);
376 xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
377 xi2 = vmul_ps(xi, xi);
379 c_const2 = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms + 1]));
380 c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms]));
381 c = vfnmadd_ps(c_const2, c_const, c_const);
383 for (i = f_terms - 1; i >= 0; i--) {
384 c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[i]));
385 c = vfnmadd_ps(c, c_const, c_const);
389 c = vsub_ps(c, v_one);
394 static inline vecf _mm_sin_ps_(vecf xi) {
395 vecf xi2, n, v_one, s, s_const;
398 v_one = vset1_ps(1.0f);
400 n = _mm_floor_ps_(vmul_ps(xi, vset1_ps(fone_2pi)));
401 n = vfmadd_ps(vset1_ps(2.0f), n, v_one);
403 xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
404 xi2 = vmul_ps(xi, xi);
406 s = vmul_ps(xi2, vset1_ps(coeff_fsin[f_terms]));
408 for (i = f_terms - 1; i >= 0; i--) {
409 s_const = vmul_ps(xi2, vset1_ps(coeff_fsin[i]));
410 s = vfnmadd_ps(s, s_const, s_const);
413 s = vfmsub_ps(xi, s, xi);
418 static inline void _mm_sincos_ps_(vecf xi, vecf *cp, vecf *sp) {
419 vecf xi2, n, v_one, c, c_const, c_const2, s, s_const;
422 v_one = vset1_ps(1.0f);
424 n = _mm_floor_ps_(vmul_ps(xi, vset1_ps(fone_2pi)));
425 n = vfmadd_ps(vset1_ps(2.0f), n, v_one);
427 xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
428 xi2 = vmul_ps(xi, xi);
430 s = vmul_ps(xi2, vset1_ps(coeff_fsin[f_terms]));
432 c_const2 = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms + 1]));
433 c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms]));
434 c = vfnmadd_ps(c_const2, c_const, c_const);
436 for (i = f_terms - 1; i >= 0; i--) {
437 c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[i]));
438 s_const = vmul_ps(xi2, vset1_ps(coeff_fsin[i]));
440 c = vfnmadd_ps(c, c_const, c_const);
441 s = vfnmadd_ps(s, s_const, s_const);
444 *cp = vsub_ps(c, v_one);
445 *sp = vfmsub_ps(xi, s, xi);
/* Taylor-series coefficients for the single-precision exp kernel:
   coeff_fexp[k] = 1/(k+1), the factor linking consecutive terms. */
static const float coeff_fexp[] = {
  1.0 / 1.0,  1.0 / 2.0,  1.0 / 3.0,  1.0 / 4.0,  1.0 / 5.0,
  1.0 / 6.0,  1.0 / 7.0,  1.0 / 8.0,  1.0 / 9.0,  1.0 / 10.0,
  1.0 / 11.0, 1.0 / 12.0, 1.0 / 13.0, 1.0 / 14.0, 1.0 / 15.0
};

/* log2(e) and ln(2), used to split x into k*ln2 + residual. */
static const float flde = 1.4426950408889634e+00f;
static const float fln2 = 6.9314718055994529e-01f;

/* Number of series terms used by the float exp kernel. */
static const int f_expterms = 7;
458 static inline vecf _mm_exp_ps_(vecf x) {
460 __m128 x1, y1, y, c_one, c;
463 c_one = vset1_ps(1.0f);
465 x1 = vmul_ps(vset1_ps(flde), x);
467 pow2 = _mm_cvttps_epi32(x1);
469 x1 = vfnmadd_ps(vset1_ps(fln2), _mm_cvtepi32_ps(pow2), x);
471 c = vset1_ps(coeff_fexp[f_expterms - 1]);
472 y1 = vfmadd_ps(c, x1, c_one);
473 for (i = f_expterms - 2; i >= 0; i--) {
474 c = vset1_ps(coeff_fexp[i]);
475 y1 = vfmadd_ps(vmul_ps(c, x1), y1, c_one);
479 by1 = _mm_castps_si128(y1);
480 by1 = _mm_add_epi32(by1, _mm_slli_epi32(pow2, 23));
482 y = _mm_castsi128_ps(by1);
/* Taylor-series coefficients for the double-precision trig kernels.
   coeff_dcos[k] = 1/((2k+1)(2k+2)), coeff_dsin[k] = 1/((2k+2)(2k+3)). */
static const double coeff_dcos[] = {
  1.0 / (1.0 * 2.0),   1.0 / (3.0 * 4.0),   1.0 / (5.0 * 6.0),
  1.0 / (7.0 * 8.0),   1.0 / (9.0 * 10.0),  1.0 / (11.0 * 12.0),
  1.0 / (13.0 * 14.0), 1.0 / (15.0 * 16.0), 1.0 / (17.0 * 18.0),
  1.0 / (19.0 * 20.0), 1.0 / (21.0 * 22.0), 1.0 / (23.0 * 24.0),
  1.0 / (25.0 * 26.0), 1.0 / (27.0 * 28.0)
};
static const double coeff_dsin[] = {
  1.0 / (2.0 * 3.0),   1.0 / (4.0 * 5.0),   1.0 / (6.0 * 7.0),
  1.0 / (8.0 * 9.0),   1.0 / (10.0 * 11.0), 1.0 / (12.0 * 13.0),
  1.0 / (14.0 * 15.0), 1.0 / (16.0 * 17.0), 1.0 / (18.0 * 19.0),
  1.0 / (20.0 * 21.0), 1.0 / (22.0 * 23.0), 1.0 / (24.0 * 25.0),
  1.0 / (26.0 * 27.0), 1.0 / (28.0 * 29.0)
};

/* Number of series terms used by the double kernels (indices
   0..d_terms+1 of the tables above are read). */
static const int d_terms = 10;

static const double dpi = 3.14159265358979323846264338327950288L;
static const double done_2pi = 1.0L / 6.28318530717958647692528676655900577L;
508 static inline vecd _mm_cos_pd_(vecd xi) {
509 vecd xi2, n, v_one, c, c_const, c_const2;
512 v_one = vset1_pd(1.0);
514 n = _mm_floor_pd_(vmul_pd(xi, vset1_pd(done_2pi)));
515 n = vfmadd_pd(vset1_pd(2.0), n, v_one);
517 xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
518 xi2 = vmul_pd(xi, xi);
520 c_const2 = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms + 1]));
521 c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms]));
522 c = vfnmadd_pd(c_const2, c_const, c_const);
524 for (i = d_terms - 1; i >= 0; i--) {
525 c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[i]));
526 c = vfnmadd_pd(c, c_const, c_const);
530 c = vsub_pd(c, v_one);
535 static inline vecd _mm_sin_pd_(vecd xi) {
536 vecd xi2, n, v_one, s, s_const;
539 v_one = vset1_pd(1.0);
541 n = _mm_floor_pd_(vmul_pd(xi, vset1_pd(done_2pi)));
542 n = vfmadd_pd(vset1_pd(2.0), n, v_one);
544 xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
545 xi2 = vmul_pd(xi, xi);
547 s = vmul_pd(xi2, vset1_pd(coeff_dsin[d_terms]));
549 for (i = d_terms - 1; i >= 0; i--) {
550 s_const = vmul_pd(xi2, vset1_pd(coeff_dsin[i]));
551 s = vfnmadd_pd(s, s_const, s_const);
554 s = vfmsub_pd(xi, s, xi);
559 static inline void _mm_sincos_pd_(vecd xi, vecd *cp, vecd *sp) {
560 vecd xi2, n, v_one, c, c_const, c_const2, s, s_const;
563 v_one = vset1_pd(1.0);
565 n = _mm_floor_pd_(vmul_pd(xi, vset1_pd(done_2pi)));
566 n = vfmadd_pd(vset1_pd(2.0), n, v_one);
568 xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
569 xi2 = vmul_pd(xi, xi);
571 s = vmul_pd(xi2, vset1_pd(coeff_dsin[d_terms]));
573 c_const2 = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms + 1]));
574 c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms]));
575 c = vfnmadd_pd(c_const2, c_const, c_const);
577 for (i = d_terms - 1; i >= 0; i--) {
578 c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[i]));
579 s_const = vmul_pd(xi2, vset1_pd(coeff_dsin[i]));
581 c = vfnmadd_pd(c, c_const, c_const);
582 s = vfnmadd_pd(s, s_const, s_const);
585 *cp = vsub_pd(c, v_one);
586 *sp = vfmsub_pd(xi, s, xi);
/* Taylor-series coefficients for the double-precision exp kernel:
   coeff_dexp[k] = 1/(k+1), the factor linking consecutive terms. */
static const double coeff_dexp[] = {
  1.0 / 1.0,  1.0 / 2.0,  1.0 / 3.0,  1.0 / 4.0,  1.0 / 5.0,
  1.0 / 6.0,  1.0 / 7.0,  1.0 / 8.0,  1.0 / 9.0,  1.0 / 10.0,
  1.0 / 11.0, 1.0 / 12.0, 1.0 / 13.0, 1.0 / 14.0, 1.0 / 15.0
};

/* log2(e) and ln(2), used to split x into k*ln2 + residual. */
static const double dlde = 1.4426950408889634e+00;
static const double dln2 = 6.9314718055994529e-01;

/* Number of series terms used by the double exp kernel. */
static const int d_expterms = 14;
599 static inline vecd _mm_exp_pd_(vecd x) {
601 __m128d x1, y1, y, c_one, c;
604 c_one = vset1_pd(1.0);
606 x1 = vmul_pd(vset1_pd(dlde), x);
607 pow2 = _mm_cvttpd_epi32(x1);
609 x1 = vfnmadd_pd(vset1_pd(dln2), _mm_cvtepi32_pd(pow2), x);
611 c = vset1_pd(coeff_dexp[d_expterms - 1]);
612 y1 = vfmadd_pd(c, x1, c_one);
613 for (i = d_expterms - 2; i >= 0; i--) {
614 c = vset1_pd(coeff_dexp[i]);
615 y1 = vfmadd_pd(vmul_pd(c, x1), y1, c_one);
619 by1 = _mm_castpd_si128(y1);
620 by1 = _mm_add_epi64(by1, _mm_slli_epi64(_mm_cvtepi32_epi64_(pow2), 52));
621 y = _mm_castsi128_pd(by1);
634 static inline vecf _mm_dot3_ps_(vecf x[3], vecf y[3]) {
637 res = vmul_ps(x[0], y[0]);
638 res = vfmadd_ps(x[1], y[1], res);
639 res = vfmadd_ps(x[2], y[2], res);
648 static inline vecd _mm_dot3_pd_(vecd x[3], vecd y[3]) {
651 res = vmul_pd(x[0], y[0]);
652 res = vfmadd_pd(x[1], y[1], res);
653 res = vfmadd_pd(x[2], y[2], res);