#ifndef LIBRARY_SIMD_AVX_H_
#define LIBRARY_SIMD_AVX_H_

/* The original include/typedef block is not shown here; the following is a
   minimal reconstruction (assumption): vecf/vecd are the 256-bit AVX vector
   types used throughout this header. */
#include <immintrin.h>

typedef __m256 vecf;
typedef __m256d vecd;

/* single-precision arithmetic */
#define vadd_ps    _mm256_add_ps
#define vsub_ps    _mm256_sub_ps
#define vmul_ps    _mm256_mul_ps
#define vdiv_ps    _mm256_div_ps
#define vsqrt_ps   _mm256_sqrt_ps
#define vrsqrt_ps  _mm256_rsqrt_ps_
#define vfmadd_ps  _mm256_fmadd_ps
#define vfmsub_ps  _mm256_fmsub_ps
#define vfnmadd_ps _mm256_fnmadd_ps
#define vfnmsub_ps _mm256_fnmsub_ps

/* double-precision arithmetic */
#define vadd_pd    _mm256_add_pd
#define vsub_pd    _mm256_sub_pd
#define vmul_pd    _mm256_mul_pd
#define vdiv_pd    _mm256_div_pd
#define vsqrt_pd   _mm256_sqrt_pd
#define vrsqrt_pd  _mm256_rsqrt_pd_
#define vfmadd_pd  _mm256_fmadd_pd
#define vfmsub_pd  _mm256_fmsub_pd
#define vfnmadd_pd _mm256_fnmadd_pd
#define vfnmsub_pd _mm256_fnmsub_pd

/* single-precision transcendentals (implemented below) */
#define vsin_ps    _mm256_sin_ps_
#define vcos_ps    _mm256_cos_ps_
#define vsincos_ps _mm256_sincos_ps_
#define vexp_ps    _mm256_exp_ps_

/* double-precision transcendentals (implemented below) */
#define vsin_pd    _mm256_sin_pd_
#define vcos_pd    _mm256_cos_pd_
#define vsincos_pd _mm256_sincos_pd_
#define vexp_pd    _mm256_exp_pd_

/* single-precision loads and stores */
#define vload_ps    _mm256_load_ps
#define vload1_ps   _mm256_broadcast_ss
#define vloadu_ps   _mm256_loadu_ps
#define vset1_ps    _mm256_set1_ps
#define vsetzero_ps _mm256_setzero_ps
#define vstore_ps   _mm256_store_ps
#define vstoreu_ps  _mm256_storeu_ps

/* double-precision loads and stores */
#define vload_pd    _mm256_load_pd
#define vload1_pd   _mm256_broadcast_sd
#define vloadu_pd   _mm256_loadu_pd
#define vset1_pd    _mm256_set1_pd
#define vsetzero_pd _mm256_setzero_pd
#define vstore_pd   _mm256_store_pd
#define vstoreu_pd  _mm256_storeu_pd

/* single-precision comparisons (quiet predicates; note that the negated
   predicates only exist in unordered form, so _UQ is used for those instead
   of the nonexistent _OQ spellings) */
#define vcmpeq_ps(a,b)  _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
#define vcmpneq_ps(a,b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
#define vcmpge_ps(a,b)  _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define vcmpgt_ps(a,b)  _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define vcmpnge_ps(a,b) _mm256_cmp_ps(a, b, _CMP_NGE_UQ)
#define vcmpngt_ps(a,b) _mm256_cmp_ps(a, b, _CMP_NGT_UQ)
#define vcmple_ps(a,b)  _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define vcmplt_ps(a,b)  _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define vcmpnle_ps(a,b) _mm256_cmp_ps(a, b, _CMP_NLE_UQ)
#define vcmpnlt_ps(a,b) _mm256_cmp_ps(a, b, _CMP_NLT_UQ)

/* double-precision comparisons */
#define vcmpeq_pd(a,b)  _mm256_cmp_pd(a, b, _CMP_EQ_OQ)
#define vcmpneq_pd(a,b) _mm256_cmp_pd(a, b, _CMP_NEQ_OQ)
#define vcmpge_pd(a,b)  _mm256_cmp_pd(a, b, _CMP_GE_OQ)
#define vcmpgt_pd(a,b)  _mm256_cmp_pd(a, b, _CMP_GT_OQ)
#define vcmpnge_pd(a,b) _mm256_cmp_pd(a, b, _CMP_NGE_UQ)
#define vcmpngt_pd(a,b) _mm256_cmp_pd(a, b, _CMP_NGT_UQ)
#define vcmple_pd(a,b)  _mm256_cmp_pd(a, b, _CMP_LE_OQ)
#define vcmplt_pd(a,b)  _mm256_cmp_pd(a, b, _CMP_LT_OQ)
#define vcmpnle_pd(a,b) _mm256_cmp_pd(a, b, _CMP_NLE_UQ)
#define vcmpnlt_pd(a,b) _mm256_cmp_pd(a, b, _CMP_NLT_UQ)

/* single-precision bitwise operations */
#define vand_ps    _mm256_and_ps
#define vandnot_ps _mm256_andnot_ps
#define vor_ps     _mm256_or_ps
#define vxor_ps    _mm256_xor_ps

/* double-precision bitwise operations */
#define vand_pd    _mm256_and_pd
#define vandnot_pd _mm256_andnot_pd
#define vor_pd     _mm256_or_pd
#define vxor_pd    _mm256_xor_pd

/* horizontal reductions (implemented below) */
#define vreduce_ps _mm256_reduce_ps_
#define vreduce_pd _mm256_reduce_pd_

/* three-component dot products (implemented below) */
#define vdot3_ps _mm256_dot3_ps_
#define vdot3_pd _mm256_dot3_pd_

/* Fallback implementations of the FMA intrinsics for builds without FMA
   support (the guard condition is an assumption; the original is not shown). */
#ifndef __FMA__

/* a*b + c */
static inline vecf _mm256_fmadd_ps(vecf a, vecf b, vecf c) {
  return vadd_ps(vmul_ps(a, b), c);
}

/* a*b - c */
static inline vecf _mm256_fmsub_ps(vecf a, vecf b, vecf c) {
  return vsub_ps(vmul_ps(a, b), c);
}

/* c - a*b */
static inline vecf _mm256_fnmadd_ps(vecf a, vecf b, vecf c) {
  return vsub_ps(c, vmul_ps(a, b));
}

/* -(a*b + c) */
static inline vecf _mm256_fnmsub_ps(vecf a, vecf b, vecf c) {
  return vmul_ps(vset1_ps(-1.0f), vadd_ps(vmul_ps(a, b), c));
}

/* a*b + c */
static inline vecd _mm256_fmadd_pd(vecd a, vecd b, vecd c) {
  return vadd_pd(vmul_pd(a, b), c);
}

/* a*b - c */
static inline vecd _mm256_fmsub_pd(vecd a, vecd b, vecd c) {
  return vsub_pd(vmul_pd(a, b), c);
}

/* c - a*b */
static inline vecd _mm256_fnmadd_pd(vecd a, vecd b, vecd c) {
  return vsub_pd(c, vmul_pd(a, b));
}

/* -(a*b + c) */
static inline vecd _mm256_fnmsub_pd(vecd a, vecd b, vecd c) {
  return vmul_pd(vset1_pd(-1.0), vadd_pd(vmul_pd(a, b), c));
}

#endif /* !__FMA__ (assumed guard, see above) */

/* Approximate 1/sqrt(x): hardware estimate refined with one Newton-Raphson
   step, x <- 1.5*x - (0.5*x0)*x^3.  The t1/t2 products fall in a gap of the
   original and are reconstructed here (assumption). */
static inline vecf _mm256_rsqrt_ps_(vecf x) {
  vecf x2, t1, t2, t3;

  x2 = vmul_ps(x, vset1_ps(0.5f));
  x = _mm256_rsqrt_ps(x);

  t1 = vmul_ps(x, x);
  t2 = vmul_ps(x2, x);
  t3 = vmul_ps(vset1_ps(1.5f), x);
  x = vfnmadd_ps(t1, t2, t3);

  return x;
}

/* Approximate 1/sqrt(x) in double precision: single-precision hardware
   estimate refined with two Newton-Raphson steps.  As above, the t1/t2
   products fall in gaps of the original and are reconstructed (assumption). */
static inline vecd _mm256_rsqrt_pd_(vecd x) {
  vecd x2, t1, t2, t3, cd_15;

  cd_15 = vset1_pd(1.5);

  x2 = vmul_pd(x, vset1_pd(0.5));
  x = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x)));

  t1 = vmul_pd(x, x);
  t2 = vmul_pd(x2, x);
  t3 = vmul_pd(cd_15, x);
  x = vfnmadd_pd(t1, t2, t3);

  t1 = vmul_pd(x, x);
  t2 = vmul_pd(x2, x);
  t3 = vmul_pd(cd_15, x);
  x = vfnmadd_pd(t1, t2, t3);

  return x;
}
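
/* Note (not from the original header): the hardware rsqrt estimate is accurate
   to roughly 12 bits, so the single refinement step above recovers close to
   full single precision, and the two steps in the double variant roughly
   double that again. */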

/* Horizontal sums; element access by subscripting relies on the GCC/Clang
   vector extensions. */
static inline float _mm256_reduce_ps_(vecf x) {
  return x[0] + x[1] + x[2] + x[3] + x[4] + x[5] + x[6] + x[7];
}

static inline double _mm256_reduce_pd_(vecd x) {
  return x[0] + x[1] + x[2] + x[3];
}
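
/*
 * Usage sketch (illustrative, not part of the original header): composing the
 * wrappers into a dot product over float arrays.  Assumes n is a multiple
 * of 8 and the vecf/vreduce_ps definitions above.
 *
 *   static inline float dot_f(const float *a, const float *b, int n) {
 *     vecf acc = vsetzero_ps();
 *     for (int i = 0; i < n; i += 8)
 *       acc = vfmadd_ps(vloadu_ps(a + i), vloadu_ps(b + i), acc);
 *     return vreduce_ps(acc);
 *   }
 */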

/* Taylor-series reciprocal factors for cos and sin; the final element of each
   array falls in a gap of the original and is completed by pattern
   (assumption). */
static const float coeff_fcos[] = {
  1.0 / (1.0 * 2.0),   1.0 / (3.0 * 4.0),   1.0 / (5.0 * 6.0),
  1.0 / (7.0 * 8.0),   1.0 / (9.0 * 10.0),  1.0 / (11.0 * 12.0),
  1.0 / (13.0 * 14.0), 1.0 / (15.0 * 16.0), 1.0 / (17.0 * 18.0),
  1.0 / (19.0 * 20.0), 1.0 / (21.0 * 22.0), 1.0 / (23.0 * 24.0),
  1.0 / (25.0 * 26.0), 1.0 / (27.0 * 28.0) };

static const float coeff_fsin[] = {
  1.0 / (2.0 * 3.0),   1.0 / (4.0 * 5.0),   1.0 / (6.0 * 7.0),
  1.0 / (8.0 * 9.0),   1.0 / (10.0 * 11.0), 1.0 / (12.0 * 13.0),
  1.0 / (14.0 * 15.0), 1.0 / (16.0 * 17.0), 1.0 / (18.0 * 19.0),
  1.0 / (20.0 * 21.0), 1.0 / (22.0 * 23.0), 1.0 / (24.0 * 25.0),
  1.0 / (26.0 * 27.0), 1.0 / (28.0 * 29.0) };

static const int f_terms = 5;

static const float fpi = 3.14159265358979323846264338327950288L;
static const float fone_2pi = 1.0L / 6.28318530717958647692528676655900577L;
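
/*
 * Explanatory note (not part of the original header): the kernels below first
 * reduce the argument by the nearest odd multiple of pi, which flips the sign
 * of both sin and cos, and then evaluate the Taylor series as a nested
 * product.  Scalar sketch of the cosine recurrence:
 *
 *   c = 0.0f;
 *   for (i = f_terms + 1; i >= 0; i--)
 *     c = xi2 * coeff_fcos[i] * (1.0f - c);  // coeff_fcos[i] = 1/((2i+1)*(2i+2))
 *   // 1 - c ~= cos(reduced xi), so returning c - 1 undoes the sign flip
 */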

static inline vecf _mm256_cos_ps_(vecf xi) {
  vecf xi2, n, v_one, c, c_const, c_const2;
  int i;

  v_one = vset1_ps(1.0f);

  /* reduce by the nearest odd multiple of pi: xi <- xi - (2*floor(xi/2pi)+1)*pi */
  n = _mm256_floor_ps(vmul_ps(xi, vset1_ps(fone_2pi)));
  n = vfmadd_ps(vset1_ps(2.0f), n, v_one);

  xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
  xi2 = vmul_ps(xi, xi);

  c_const2 = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms + 1]));
  c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms]));
  c = vfnmadd_ps(c_const2, c_const, c_const);

  for (i = f_terms - 1; i >= 0; i--) {
    c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[i]));
    c = vfnmadd_ps(c, c_const, c_const);
  }

  /* 1 - c ~= cos(reduced xi); the reduction flipped the sign, so return c - 1 */
  c = vsub_ps(c, v_one);

  return c;
}

static inline vecf _mm256_sin_ps_(vecf xi) {
  vecf xi2, n, v_one, s, s_const;
  int i;

  v_one = vset1_ps(1.0f);

  n = _mm256_floor_ps(vmul_ps(xi, vset1_ps(fone_2pi)));
  n = vfmadd_ps(vset1_ps(2.0f), n, v_one);

  xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
  xi2 = vmul_ps(xi, xi);

  s = vmul_ps(xi2, vset1_ps(coeff_fsin[f_terms]));

  for (i = f_terms - 1; i >= 0; i--) {
    s_const = vmul_ps(xi2, vset1_ps(coeff_fsin[i]));
    s = vfnmadd_ps(s, s_const, s_const);
  }

  /* xi*(1 - s) ~= sin(reduced xi); the reduction flipped the sign, so return xi*s - xi */
  s = vfmsub_ps(xi, s, xi);

  return s;
}

/* Computes cos and sin of xi together, sharing one argument reduction. */
static inline void _mm256_sincos_ps_(vecf xi, vecf *cp, vecf *sp) {
  vecf xi2, n, v_one, c, c_const, c_const2, s, s_const;
  int i;

  v_one = vset1_ps(1.0f);

  n = _mm256_floor_ps(vmul_ps(xi, vset1_ps(fone_2pi)));
  n = vfmadd_ps(vset1_ps(2.0f), n, v_one);

  xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
  xi2 = vmul_ps(xi, xi);

  s = vmul_ps(xi2, vset1_ps(coeff_fsin[f_terms]));

  c_const2 = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms + 1]));
  c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms]));
  c = vfnmadd_ps(c_const2, c_const, c_const);

  for (i = f_terms - 1; i >= 0; i--) {
    c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[i]));
    s_const = vmul_ps(xi2, vset1_ps(coeff_fsin[i]));

    c = vfnmadd_ps(c, c_const, c_const);
    s = vfnmadd_ps(s, s_const, s_const);
  }

  *cp = vsub_ps(c, v_one);
  *sp = vfmsub_ps(xi, s, xi);
}
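
/*
 * Usage sketch (illustrative, not from the original header):
 *
 *   vecf angles = vloadu_ps(theta);   // 8 angles in radians (theta is hypothetical)
 *   vecf c, s;
 *   vsincos_ps(angles, &c, &s);       // c = cos(theta), s = sin(theta), per lane
 */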

/* Reciprocal factors for the exp series: coeff_fexp[i] = 1/(i+1). */
static const float coeff_fexp[] = {
  1.0 / 1.0,  1.0 / 2.0,  1.0 / 3.0,  1.0 / 4.0,  1.0 / 5.0,
  1.0 / 6.0,  1.0 / 7.0,  1.0 / 8.0,  1.0 / 9.0,  1.0 / 10.0,
  1.0 / 11.0, 1.0 / 12.0, 1.0 / 13.0, 1.0 / 14.0, 1.0 / 15.0 };

static const float flde = 1.4426950408889634e+00f;  /* log2(e) */
static const float fln2 = 6.9314718055994529e-01f;  /* ln(2)   */

/* Two variants of exp follow; the split is assumed to be an AVX2 feature
   check, since only the first variant uses 256-bit integer instructions. */
#if defined(__AVX2__)

static const int f_expterms = 6;

static inline vecf _mm256_exp_ps_(vecf x) {
  __m256 x1, y1, y, c_one, c;
  __m256i pow2, by1;
  int i;

  c_one = vset1_ps(1.0f);

  /* e^x = 2^pow2 * e^x1 with pow2 = x*log2(e) rounded, x1 = x - pow2*ln(2) */
  pow2 = _mm256_cvtps_epi32(vmul_ps(vset1_ps(flde), x));
  x1 = vfnmadd_ps(vset1_ps(fln2), _mm256_cvtepi32_ps(pow2), x);

  c = vset1_ps(coeff_fexp[f_expterms - 1]);
  y1 = vfmadd_ps(c, x1, c_one);
  for (i = f_expterms - 2; i >= 0; i--) {
    c = vset1_ps(coeff_fexp[i]);
    y1 = vfmadd_ps(vmul_ps(c, x1), y1, c_one);
  }

  /* scale by 2^pow2 by adding pow2 to the IEEE-754 exponent bits */
  by1 = _mm256_castps_si256(y1);
  by1 = _mm256_add_epi32(by1, _mm256_slli_epi32(pow2, 23));
  y = _mm256_castsi256_ps(by1);

  return y;
}

#else /* AVX without AVX2: do the integer arithmetic on 128-bit halves */

static const int f_expterms = 7;

static inline vecf _mm256_exp_ps_(vecf x) {
  __m256 x1, y1, y, c_one, c;
  __m256i pow2, by1;
  __m128i pow2_1, pow2_2, by1_1, by1_2;
  int i;

  c_one = vset1_ps(1.0f);

  x1 = vmul_ps(vset1_ps(flde), x);

  pow2 = _mm256_cvttps_epi32(x1);
  pow2_1 = _mm256_extractf128_si256(pow2, 0);
  pow2_2 = _mm256_extractf128_si256(pow2, 1);

  x1 = vfnmadd_ps(vset1_ps(fln2), _mm256_cvtepi32_ps(pow2), x);

  c = vset1_ps(coeff_fexp[f_expterms - 1]);
  y1 = vfmadd_ps(c, x1, c_one);
  for (i = f_expterms - 2; i >= 0; i--) {
    c = vset1_ps(coeff_fexp[i]);
    y1 = vfmadd_ps(vmul_ps(c, x1), y1, c_one);
  }

  /* scale by 2^pow2, half a register at a time */
  by1_1 = _mm_castps_si128(_mm256_extractf128_ps(y1, 0));
  by1_1 = _mm_add_epi32(by1_1, _mm_slli_epi32(pow2_1, 23));
  by1_2 = _mm_castps_si128(_mm256_extractf128_ps(y1, 1));
  by1_2 = _mm_add_epi32(by1_2, _mm_slli_epi32(pow2_2, 23));
  by1 = _mm256_setzero_si256();
  by1 = _mm256_insertf128_si256(by1, by1_1, 0);
  by1 = _mm256_insertf128_si256(by1, by1_2, 1);
  y = _mm256_castsi256_ps(by1);

  return y;
}

#endif /* __AVX2__ (assumed) */
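
/*
 * Explanatory note (not part of the original header): both exp kernels split
 * e^x = 2^pow2 * e^x1 with pow2 ~ x*log2(e) and x1 = x - pow2*ln(2), evaluate
 * e^x1 as the nested product 1 + x1*(1 + x1/2*(1 + x1/3*(...))) using
 * coeff_fexp, and apply the 2^pow2 factor by adding pow2 to the IEEE-754
 * exponent field (the 23-bit shift above).  The AVX2 variant does that
 * integer add on the whole 256-bit register; the second variant splits it
 * into two 128-bit halves so it works without AVX2.
 */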

/* Double-precision analogues of the sin/cos kernels above; the final array
   elements are again completed by pattern (assumption). */
static const double coeff_dcos[] = {
  1.0 / (1.0 * 2.0),   1.0 / (3.0 * 4.0),   1.0 / (5.0 * 6.0),
  1.0 / (7.0 * 8.0),   1.0 / (9.0 * 10.0),  1.0 / (11.0 * 12.0),
  1.0 / (13.0 * 14.0), 1.0 / (15.0 * 16.0), 1.0 / (17.0 * 18.0),
  1.0 / (19.0 * 20.0), 1.0 / (21.0 * 22.0), 1.0 / (23.0 * 24.0),
  1.0 / (25.0 * 26.0), 1.0 / (27.0 * 28.0) };

static const double coeff_dsin[] = {
  1.0 / (2.0 * 3.0),   1.0 / (4.0 * 5.0),   1.0 / (6.0 * 7.0),
  1.0 / (8.0 * 9.0),   1.0 / (10.0 * 11.0), 1.0 / (12.0 * 13.0),
  1.0 / (14.0 * 15.0), 1.0 / (16.0 * 17.0), 1.0 / (18.0 * 19.0),
  1.0 / (20.0 * 21.0), 1.0 / (22.0 * 23.0), 1.0 / (24.0 * 25.0),
  1.0 / (26.0 * 27.0), 1.0 / (28.0 * 29.0) };

static const int d_terms = 10;

static const double dpi = 3.14159265358979323846264338327950288L;
static const double done_2pi = 1.0L / 6.28318530717958647692528676655900577L;

static inline vecd _mm256_cos_pd_(vecd xi) {
  vecd xi2, n, v_one, c, c_const, c_const2;
  int i;

  v_one = vset1_pd(1.0);

  n = _mm256_floor_pd(vmul_pd(xi, vset1_pd(done_2pi)));
  n = vfmadd_pd(vset1_pd(2.0), n, v_one);

  xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
  xi2 = vmul_pd(xi, xi);

  c_const2 = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms + 1]));
  c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms]));
  c = vfnmadd_pd(c_const2, c_const, c_const);

  for (i = d_terms - 1; i >= 0; i--) {
    c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[i]));
    c = vfnmadd_pd(c, c_const, c_const);
  }

  c = vsub_pd(c, v_one);

  return c;
}

static inline vecd _mm256_sin_pd_(vecd xi) {
  vecd xi2, n, v_one, s, s_const;
  int i;

  v_one = vset1_pd(1.0);

  n = _mm256_floor_pd(vmul_pd(xi, vset1_pd(done_2pi)));
  n = vfmadd_pd(vset1_pd(2.0), n, v_one);

  xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
  xi2 = vmul_pd(xi, xi);

  s = vmul_pd(xi2, vset1_pd(coeff_dsin[d_terms]));

  for (i = d_terms - 1; i >= 0; i--) {
    s_const = vmul_pd(xi2, vset1_pd(coeff_dsin[i]));
    s = vfnmadd_pd(s, s_const, s_const);
  }

  s = vfmsub_pd(xi, s, xi);

  return s;
}

static inline void _mm256_sincos_pd_(vecd xi, vecd *cp, vecd *sp) {
  vecd xi2, n, v_one, c, c_const, c_const2, s, s_const;
  int i;

  v_one = vset1_pd(1.0);

  n = _mm256_floor_pd(vmul_pd(xi, vset1_pd(done_2pi)));
  n = vfmadd_pd(vset1_pd(2.0), n, v_one);

  xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
  xi2 = vmul_pd(xi, xi);

  s = vmul_pd(xi2, vset1_pd(coeff_dsin[d_terms]));

  c_const2 = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms + 1]));
  c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms]));
  c = vfnmadd_pd(c_const2, c_const, c_const);

  for (i = d_terms - 1; i >= 0; i--) {
    c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[i]));
    s_const = vmul_pd(xi2, vset1_pd(coeff_dsin[i]));

    c = vfnmadd_pd(c, c_const, c_const);
    s = vfnmadd_pd(s, s_const, s_const);
  }

  *cp = vsub_pd(c, v_one);
  *sp = vfmsub_pd(xi, s, xi);
}

static const double coeff_dexp[] = {
  1.0 / 1.0,  1.0 / 2.0,  1.0 / 3.0,  1.0 / 4.0,  1.0 / 5.0,
  1.0 / 6.0,  1.0 / 7.0,  1.0 / 8.0,  1.0 / 9.0,  1.0 / 10.0,
  1.0 / 11.0, 1.0 / 12.0, 1.0 / 13.0, 1.0 / 14.0, 1.0 / 15.0 };

static const double dlde = 1.4426950408889634e+00;  /* log2(e) */
static const double dln2 = 6.9314718055994529e-01;  /* ln(2)   */

/* As with the float variants, the split below is assumed to be an AVX2
   feature check. */
#if defined(__AVX2__)

static const int d_expterms = 11;

static inline vecd _mm256_exp_pd_(vecd x) {
  __m256d x1, y1, y, c_one, c;
  __m128i pow2;
  __m256i by1;
  int i;

  c_one = vset1_pd(1.0);

  /* e^x = 2^pow2 * e^x1 with pow2 = x*log2(e) rounded, x1 = x - pow2*ln(2) */
  pow2 = _mm256_cvtpd_epi32(vmul_pd(vset1_pd(dlde), x));
  x1 = vfnmadd_pd(vset1_pd(dln2), _mm256_cvtepi32_pd(pow2), x);

  c = vset1_pd(coeff_dexp[d_expterms - 1]);
  y1 = vfmadd_pd(c, x1, c_one);
  for (i = d_expterms - 2; i >= 0; i--) {
    c = vset1_pd(coeff_dexp[i]);
    y1 = vfmadd_pd(vmul_pd(c, x1), y1, c_one);
  }

  /* scale by 2^pow2 by adding pow2 to the exponent field (bit 52 upward) */
  by1 = _mm256_castpd_si256(y1);
  by1 = _mm256_add_epi64(by1,
      _mm256_slli_epi64(_mm256_cvtepi32_epi64(pow2), 52));
  y = _mm256_castsi256_pd(by1);

  return y;
}

#else /* AVX without AVX2: do the integer arithmetic on 128-bit halves */

static const int d_expterms = 14;

static inline vecd _mm256_exp_pd_(vecd x) {
  __m256d x1, y1, y, c_one, c;
  __m256i by1;
  __m128i pow2, pow2_1, pow2_2, by1_1, by1_2;
  int i;

  c_one = vset1_pd(1.0);

  x1 = vmul_pd(vset1_pd(dlde), x);
  pow2_1 = _mm_cvttpd_epi32(_mm256_extractf128_pd(x1, 0));
  pow2_2 = _mm_cvttpd_epi32(_mm256_extractf128_pd(x1, 1));
  pow2 = _mm_setzero_si128();
  pow2 = _mm_unpacklo_epi32(pow2_1, pow2_2);
  pow2 = _mm_shuffle_epi32(pow2, 216);  /* 216 = _MM_SHUFFLE(3,1,2,0): restore lane order */

  x1 = vfnmadd_pd(vset1_pd(dln2), _mm256_cvtepi32_pd(pow2), x);

  c = vset1_pd(coeff_dexp[d_expterms - 1]);
  y1 = vfmadd_pd(c, x1, c_one);
  for (i = d_expterms - 2; i >= 0; i--) {
    c = vset1_pd(coeff_dexp[i]);
    y1 = vfmadd_pd(vmul_pd(c, x1), y1, c_one);
  }

  /* scale by 2^pow2, half a register at a time */
  by1_1 = _mm_castpd_si128(_mm256_extractf128_pd(y1, 0));
  by1_1 = _mm_add_epi64(by1_1, _mm_slli_epi64(_mm_cvtepi32_epi64(pow2_1), 52));
  by1_2 = _mm_castpd_si128(_mm256_extractf128_pd(y1, 1));
  by1_2 = _mm_add_epi64(by1_2, _mm_slli_epi64(_mm_cvtepi32_epi64(pow2_2), 52));
  by1 = _mm256_setzero_si256();
  by1 = _mm256_insertf128_si256(by1, by1_1, 0);
  by1 = _mm256_insertf128_si256(by1, by1_2, 1);
  y = _mm256_castsi256_pd(by1);

  return y;
}

#endif /* __AVX2__ (assumed) */
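
/*
 * Usage sketch (illustrative, not from the original header; src/dst are
 * hypothetical double pointers):
 *
 *   vecd v = vloadu_pd(src);         // 4 doubles
 *   vstoreu_pd(dst, vexp_pd(v));     // dst[i] = exp(src[i]), series approximation
 */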

/* Lane-wise dot product of two 3-component vectors stored as one vecf per
   component (structure-of-arrays layout). */
static inline vecf _mm256_dot3_ps_(vecf x[3], vecf y[3]) {
  vecf res;

  res = vmul_ps(x[0], y[0]);
  res = vfmadd_ps(x[1], y[1], res);
  res = vfmadd_ps(x[2], y[2], res);

  return res;
}

static inline vecd _mm256_dot3_pd_(vecd x[3], vecd y[3]) {
  vecd res;

  res = vmul_pd(x[0], y[0]);
  res = vfmadd_pd(x[1], y[1], res);
  res = vfmadd_pd(x[2], y[2], res);

  return res;
}
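
/*
 * Usage sketch (illustrative, not from the original header): vdot3_ps expects
 * the components as separate vecf registers, e.g. loaded from component
 * arrays ax/ay/az and bx/by/bz (hypothetical names).
 *
 *   vecf a[3] = { vloadu_ps(ax), vloadu_ps(ay), vloadu_ps(az) };
 *   vecf b[3] = { vloadu_ps(bx), vloadu_ps(by), vloadu_ps(bz) };
 *   vecf d = vdot3_ps(a, b);   // per lane: ax*bx + ay*by + az*bz
 */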