H2Lib 3.0
simd_avx.h
#ifndef LIBRARY_SIMD_AVX_H_
#define LIBRARY_SIMD_AVX_H_

#ifdef USE_SIMD

#include <math.h>
#include <immintrin.h>

/****************************************************
 * Define vector sizes and alignment
 ****************************************************/

#define VFLOAT 8
#define VDOUBLE 4
#define VALIGN 64

/****************************************************
 * Define vector types
 ****************************************************/

#define vecf __m256
#define vecd __m256d

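/* Example (illustrative, not part of the original header): a vecd
 * holds VDOUBLE doubles, so a loop over an array aligned to VALIGN
 * bytes can advance VDOUBLE entries per iteration using the
 * load/store macros defined below:
 *
 *   double buf[1024] __attribute__((aligned(VALIGN)));
 *   vecd v;
 *   for (i = 0; i + VDOUBLE <= 1024; i += VDOUBLE)
 *     v = vload_pd(buf + i);
 */
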
/****************************************************
 * Define arithmetic operations
 ****************************************************/

/****************************************************
 * Arithmetic operations for float
 ****************************************************/

#define vadd_ps _mm256_add_ps
#define vsub_ps _mm256_sub_ps
#define vmul_ps _mm256_mul_ps
#define vdiv_ps _mm256_div_ps
#define vsqrt_ps _mm256_sqrt_ps
#define vrsqrt_ps _mm256_rsqrt_ps_
#define vfmadd_ps _mm256_fmadd_ps
#define vfmsub_ps _mm256_fmsub_ps
#define vfnmadd_ps _mm256_fnmadd_ps
#define vfnmsub_ps _mm256_fnmsub_ps

/****************************************************
 * Arithmetic operations for double
 ****************************************************/

#define vadd_pd _mm256_add_pd
#define vsub_pd _mm256_sub_pd
#define vmul_pd _mm256_mul_pd
#define vdiv_pd _mm256_div_pd
#define vsqrt_pd _mm256_sqrt_pd
#define vrsqrt_pd _mm256_rsqrt_pd_
#define vfmadd_pd _mm256_fmadd_pd
#define vfmsub_pd _mm256_fmsub_pd
#define vfnmadd_pd _mm256_fnmadd_pd
#define vfnmsub_pd _mm256_fnmsub_pd

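/* Summary added for clarity: the fused operations follow the Intel
 * intrinsics conventions (shown for double, analogously for float):
 *
 *   vfmadd_pd(a, b, c)  =  a * b + c
 *   vfmsub_pd(a, b, c)  =  a * b - c
 *   vfnmadd_pd(a, b, c) = -a * b + c
 *   vfnmsub_pd(a, b, c) = -a * b - c
 */
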
/****************************************************
 * Define advanced arithmetic operations
 ****************************************************/

/****************************************************
 * Advanced arithmetic operations for float
 ****************************************************/

#define vsin_ps _mm256_sin_ps_
#define vcos_ps _mm256_cos_ps_
#define vsincos_ps _mm256_sincos_ps_
#define vexp_ps _mm256_exp_ps_

/****************************************************
 * Advanced arithmetic operations for double
 ****************************************************/

#define vsin_pd _mm256_sin_pd_
#define vcos_pd _mm256_cos_pd_
#define vsincos_pd _mm256_sincos_pd_
#define vexp_pd _mm256_exp_pd_

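/* Example (illustrative, not part of the original header): computing
 * cosine and sine of VDOUBLE angles with a single call to the
 * implementation further below:
 *
 *   vecd c, s;
 *   vsincos_pd(vload_pd(angles), &c, &s);
 */
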
/****************************************************
 * Define load/store operations
 ****************************************************/

/****************************************************
 * Load/store operations for float
 ****************************************************/

#define vload_ps _mm256_load_ps
#define vload1_ps _mm256_broadcast_ss
#define vloadu_ps _mm256_loadu_ps
#define vset1_ps _mm256_set1_ps
#define vsetzero_ps _mm256_setzero_ps
#define vstore_ps _mm256_store_ps
#define vstoreu_ps _mm256_storeu_ps

/****************************************************
 * Load/store operations for double
 ****************************************************/

#define vload_pd _mm256_load_pd
#define vload1_pd _mm256_broadcast_sd
#define vloadu_pd _mm256_loadu_pd
#define vset1_pd _mm256_set1_pd
#define vsetzero_pd _mm256_setzero_pd
#define vstore_pd _mm256_store_pd
#define vstoreu_pd _mm256_storeu_pd

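/* Note added for clarity: vload_ps/vload_pd and vstore_ps/vstore_pd
 * require 32-byte aligned addresses, while the vloadu/vstoreu
 * variants accept arbitrary addresses, e.g.
 *
 *   vecd v = vloadu_pd(x + 1);   (possibly unaligned access)
 */
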
/****************************************************
 * Define compare operations
 ****************************************************/

/****************************************************
 * Define compare operations for float
 ****************************************************/

#define vcmpeq_ps(a,b) _mm256_cmp_ps(a,b, _CMP_EQ_OQ)
#define vcmpneq_ps(a,b) _mm256_cmp_ps(a,b, _CMP_NEQ_OQ)
#define vcmpge_ps(a,b) _mm256_cmp_ps(a,b, _CMP_GE_OQ)
#define vcmpgt_ps(a,b) _mm256_cmp_ps(a,b, _CMP_GT_OQ)
#define vcmpnge_ps(a,b) _mm256_cmp_ps(a,b, _CMP_NGE_OQ)
#define vcmpngt_ps(a,b) _mm256_cmp_ps(a,b, _CMP_NGT_OQ)
#define vcmple_ps(a,b) _mm256_cmp_ps(a,b, _CMP_LE_OQ)
#define vcmplt_ps(a,b) _mm256_cmp_ps(a,b, _CMP_LT_OQ)
#define vcmpnle_ps(a,b) _mm256_cmp_ps(a,b, _CMP_NLE_OQ)
#define vcmpnlt_ps(a,b) _mm256_cmp_ps(a,b, _CMP_NLT_OQ)

/****************************************************
 * Define compare operations for double
 ****************************************************/

#define vcmpeq_pd(a,b) _mm256_cmp_pd(a,b, _CMP_EQ_OQ)
#define vcmpneq_pd(a,b) _mm256_cmp_pd(a,b, _CMP_NEQ_OQ)
#define vcmpge_pd(a,b) _mm256_cmp_pd(a,b, _CMP_GE_OQ)
#define vcmpgt_pd(a,b) _mm256_cmp_pd(a,b, _CMP_GT_OQ)
#define vcmpnge_pd(a,b) _mm256_cmp_pd(a,b, _CMP_NGE_OQ)
#define vcmpngt_pd(a,b) _mm256_cmp_pd(a,b, _CMP_NGT_OQ)
#define vcmple_pd(a,b) _mm256_cmp_pd(a,b, _CMP_LE_OQ)
#define vcmplt_pd(a,b) _mm256_cmp_pd(a,b, _CMP_LT_OQ)
#define vcmpnle_pd(a,b) _mm256_cmp_pd(a,b, _CMP_NLE_OQ)
#define vcmpnlt_pd(a,b) _mm256_cmp_pd(a,b, _CMP_NLT_OQ)

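/* Example (illustrative, not part of the original header): the
 * comparisons return per-lane masks of all ones or all zeros, which
 * combine with blends or the bit operations defined below, e.g. to
 * clamp negative entries to zero:
 *
 *   vecd mask = vcmplt_pd(x, vsetzero_pd());
 *   x = _mm256_blendv_pd(x, vsetzero_pd(), mask);
 */
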
/****************************************************
 * Definitions of bit operations
 ****************************************************/

/****************************************************
 * Definitions of bit operations for float
 ****************************************************/

#define vand_ps _mm256_and_ps
#define vandnot_ps _mm256_andnot_ps
#define vor_ps _mm256_or_ps
#define vxor_ps _mm256_xor_ps

/****************************************************
 * Definitions of bit operations for double
 ****************************************************/

#define vand_pd _mm256_and_pd
#define vandnot_pd _mm256_andnot_pd
#define vor_pd _mm256_or_pd
#define vxor_pd _mm256_xor_pd

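/* Example (illustrative, not part of the original header): vandnot_pd
 * computes (~a) & b, so clearing the sign bit yields absolute values:
 *
 *   vecd sign = vset1_pd(-0.0);       (only the sign bit set)
 *   vecd absx = vandnot_pd(sign, x);  (|x| in every lane)
 */
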
/****************************************************
 * Define reductions of vector registers
 ****************************************************/

/****************************************************
 * Define reductions of vector registers for floats
 ****************************************************/

#define vreduce_ps _mm256_reduce_ps_

/****************************************************
 * Define reductions of vector registers for doubles
 ****************************************************/

#define vreduce_pd _mm256_reduce_pd_

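/* Both reductions return the sum of all vector lanes as a scalar; see
 * the implementations and the usage sketch further below. */
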
/****************************************************
 * Definition of little helper functions
 ****************************************************/

/****************************************************
 * Definition of little helper functions for float
 ****************************************************/

#define vdot3_ps _mm256_dot3_ps_

/****************************************************
 * Definition of little helper functions for double
 ****************************************************/

#define vdot3_pd _mm256_dot3_pd_

/****************************************************
 * Implementation of FMA functions, if needed
 ****************************************************/

/****************************************************
 * Implementation of FMA functions for float, if needed
 ****************************************************/

#ifndef __FMA__
static inline vecf _mm256_fmadd_ps(vecf a, vecf b, vecf c) {
  return vadd_ps(vmul_ps(a, b), c);
}

static inline vecf _mm256_fmsub_ps(vecf a, vecf b, vecf c) {
  return vsub_ps(vmul_ps(a, b), c);
}

static inline vecf _mm256_fnmadd_ps(vecf a, vecf b, vecf c) {
  return vsub_ps(c, vmul_ps(a, b));
}

static inline vecf _mm256_fnmsub_ps(vecf a, vecf b, vecf c) {
  return vmul_ps(vset1_ps(-1.0f), vadd_ps(vmul_ps(a, b), c));
}
#endif

/****************************************************
 * Implementation of FMA functions for double, if needed
 ****************************************************/

#ifndef __FMA__
static inline vecd _mm256_fmadd_pd(vecd a, vecd b, vecd c) {
  return vadd_pd(vmul_pd(a, b), c);
}

static inline vecd _mm256_fmsub_pd(vecd a, vecd b, vecd c) {
  return vsub_pd(vmul_pd(a, b), c);
}

static inline vecd _mm256_fnmadd_pd(vecd a, vecd b, vecd c) {
  return vsub_pd(c, vmul_pd(a, b));
}

static inline vecd _mm256_fnmsub_pd(vecd a, vecd b, vecd c) {
  return vmul_pd(vset1_pd(-1.0), vadd_pd(vmul_pd(a, b), c));
}
#endif

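/* Note added for clarity: the emulated versions compute the same
 * expressions as the fused intrinsics, but round twice (after the
 * multiplication and after the addition) instead of once. */
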
/****************************************************
 * Implementations of basic vector functions
 ****************************************************/

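/* Both reciprocal square roots start from the hardware estimate
 * _mm256_rsqrt_ps (roughly 12 bits of accuracy) and refine it by
 * Newton-Raphson iteration,
 *
 *   y' = y * (1.5 - 0.5 * x * y * y),
 *
 * which roughly doubles the number of correct bits per step: one step
 * for float, two steps for double (whose initial estimate is obtained
 * by converting through single precision). */
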
static inline vecf _mm256_rsqrt_ps_(vecf x) {
  vecf x2, t1, t2, t3;

  x2 = vmul_ps(x, vset1_ps(0.5f));
  x = _mm256_rsqrt_ps(x);

  /* One Newton step: x <- 1.5 * x - (0.5 * x_old) * x^3 */
  t1 = vmul_ps(x, x);
  t2 = vmul_ps(x2, x);
  t3 = vmul_ps(vset1_ps(1.5f), x);
  x = vfnmadd_ps(t1, t2, t3);

  return x;
}

static inline vecd _mm256_rsqrt_pd_(vecd x) {
  vecd x2, t1, t2, t3;
  vecd cd_15;

  cd_15 = vset1_pd(1.5);

  x2 = vmul_pd(x, vset1_pd(0.5));
  /* Initial estimate via single precision */
  x = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x)));

  /* Two Newton steps */
  t1 = vmul_pd(x, x);
  t2 = vmul_pd(x2, x);
  t3 = vmul_pd(cd_15, x);
  x = vfnmadd_pd(t1, t2, t3);

  t1 = vmul_pd(x, x);
  t2 = vmul_pd(x2, x);
  t3 = vmul_pd(cd_15, x);
  x = vfnmadd_pd(t1, t2, t3);

  return x;
}

/****************************************************
 * Implementations of basic vector functions for float
 ****************************************************/

static inline float _mm256_reduce_ps_(vecf x) {
  /* Lane subscripting is a GCC/Clang vector extension */
  return x[0] + x[1] + x[2] + x[3] + x[4] + x[5] + x[6] + x[7];
}

/****************************************************
 * Implementations of basic vector functions for double
 ****************************************************/

static inline double _mm256_reduce_pd_(vecd x) {
  return x[0] + x[1] + x[2] + x[3];
}

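/* Illustrative usage sketch (not part of the original header):
 * horizontal sum of an array, assuming n is a multiple of VDOUBLE and
 * x is 32-byte aligned. */
static inline double example_sum_pd(const double *x, int n) {
  vecd s;
  int i;

  s = vsetzero_pd();
  for (i = 0; i < n; i += VDOUBLE)
    s = vadd_pd(s, vload_pd(x + i));

  return vreduce_pd(s);
}
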
/****************************************************
 * Implementation of advanced arithmetic operations
 ****************************************************/

/****************************************************
 * Implementation of advanced arithmetic operations for float
 ****************************************************/

static const float coeff_fcos[] = {
  1.0 / (1.0 * 2.0), 1.0 / (3.0 * 4.0), 1.0 / (5.0 * 6.0), 1.0 / (7.0 * 8.0),
  1.0 / (9.0 * 10.0), 1.0 / (11.0 * 12.0), 1.0 / (13.0 * 14.0), 1.0 / (15.0 * 16.0),
  1.0 / (17.0 * 18.0), 1.0 / (19.0 * 20.0), 1.0 / (21.0 * 22.0), 1.0 / (23.0 * 24.0),
  1.0 / (25.0 * 26.0), 1.0 / (27.0 * 28.0) };
static const float coeff_fsin[] = {
  1.0 / (2.0 * 3.0), 1.0 / (4.0 * 5.0), 1.0 / (6.0 * 7.0), 1.0 / (8.0 * 9.0),
  1.0 / (10.0 * 11.0), 1.0 / (12.0 * 13.0), 1.0 / (14.0 * 15.0), 1.0 / (16.0 * 17.0),
  1.0 / (18.0 * 19.0), 1.0 / (20.0 * 21.0), 1.0 / (22.0 * 23.0), 1.0 / (24.0 * 25.0),
  1.0 / (26.0 * 27.0), 1.0 / (28.0 * 29.0) };
static const int f_terms = 5;

static const float fpi = 3.14159265358979323846264338327950288L;
static const float fone_2pi = 1.0L / 6.28318530717958647692528676655900577L;

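/* Sketch added for clarity: the trigonometric functions first reduce
 * the argument by an odd multiple of pi,
 *
 *   xi = x - (2 * floor(x * fone_2pi) + 1) * fpi,   xi in [-pi, pi),
 *
 * so that cos(x) = -cos(xi) and sin(x) = -sin(xi), and then evaluate
 * the Taylor series in nested form, e.g. for the cosine
 *
 *   cos(x) = xi^2/(1*2) * (1 - xi^2/(3*4) * (1 - ...)) - 1,
 *
 * with f_terms = 5 nesting levels in single precision. */
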
static inline vecf _mm256_cos_ps_(vecf xi) {
  vecf xi2, n, v_one, c, c_const, c_const2;
  int i;

  v_one = vset1_ps(1.0f);

  /* Range reduction to [-pi, pi) */
  n = _mm256_floor_ps(vmul_ps(xi, vset1_ps(fone_2pi)));
  n = vfmadd_ps(vset1_ps(2.0f), n, v_one);

  xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
  xi2 = vmul_ps(xi, xi);

  /* Evaluate the nested series from the innermost factor outwards */
  c_const2 = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms + 1]));
  c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms]));
  c = vfnmadd_ps(c_const2, c_const, c_const);

  for (i = f_terms - 1; i >= 0; i--) {
    c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[i]));
    c = vfnmadd_ps(c, c_const, c_const);
  }

  c = vsub_ps(c, v_one);

  return c;
}

static inline vecf _mm256_sin_ps_(vecf xi) {
  vecf xi2, n, v_one, s, s_const;
  int i;

  v_one = vset1_ps(1.0f);

  n = _mm256_floor_ps(vmul_ps(xi, vset1_ps(fone_2pi)));
  n = vfmadd_ps(vset1_ps(2.0f), n, v_one);

  xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
  xi2 = vmul_ps(xi, xi);

  s = vmul_ps(xi2, vset1_ps(coeff_fsin[f_terms]));

  for (i = f_terms - 1; i >= 0; i--) {
    s_const = vmul_ps(xi2, vset1_ps(coeff_fsin[i]));
    s = vfnmadd_ps(s, s_const, s_const);
  }

  s = vfmsub_ps(xi, s, xi);

  return s;
}

static inline void _mm256_sincos_ps_(vecf xi, vecf *cp, vecf *sp) {
  vecf xi2, n, v_one, c, c_const, c_const2, s, s_const;
  int i;

  v_one = vset1_ps(1.0f);

  n = _mm256_floor_ps(vmul_ps(xi, vset1_ps(fone_2pi)));
  n = vfmadd_ps(vset1_ps(2.0f), n, v_one);

  xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
  xi2 = vmul_ps(xi, xi);

  s = vmul_ps(xi2, vset1_ps(coeff_fsin[f_terms]));

  c_const2 = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms + 1]));
  c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms]));
  c = vfnmadd_ps(c_const2, c_const, c_const);

  for (i = f_terms - 1; i >= 0; i--) {
    c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[i]));
    s_const = vmul_ps(xi2, vset1_ps(coeff_fsin[i]));

    c = vfnmadd_ps(c, c_const, c_const);
    s = vfnmadd_ps(s, s_const, s_const);
  }

  *cp = vsub_ps(c, v_one);
  *sp = vfmsub_ps(xi, s, xi);
}

static const float coeff_fexp[] = {
  1.0 / 1.0, 1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0, 1.0 / 5.0,
  1.0 / 6.0, 1.0 / 7.0, 1.0 / 8.0, 1.0 / 9.0, 1.0 / 10.0,
  1.0 / 11.0, 1.0 / 12.0, 1.0 / 13.0, 1.0 / 14.0, 1.0 / 15.0 };

static const float flde = 1.4426950408889634e+00f;
static const float fln2 = 6.9314718055994529e-01f;

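/* Sketch added for clarity: the exponentials split the argument as
 *
 *   x = pow2 * ln(2) + x1,   pow2 integral
 *
 * (flde = log2(e), fln2 = ln(2)), evaluate exp(x1) by a short Taylor
 * series in nested Horner form, and finally scale by 2^pow2 by adding
 * pow2 directly to the exponent field of the IEEE-754 representation. */
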
#ifdef __AVX2__
static const int f_expterms = 6;

static inline vecf _mm256_exp_ps_(vecf x) {
  int i;
  __m256 x1, y1, y, c_one, c;
  __m256i by1, pow2;

  c_one = vset1_ps(1.0f);

  pow2 = _mm256_cvtps_epi32(vmul_ps(vset1_ps(flde), x));
  x1 = vfnmadd_ps(vset1_ps(fln2), _mm256_cvtepi32_ps(pow2), x);

  c = vset1_ps(coeff_fexp[f_expterms - 1]);
  y1 = vfmadd_ps(c, x1, c_one);
  for (i = f_expterms - 2; i >= 0; i--) {
    c = vset1_ps(coeff_fexp[i]);
    y1 = vfmadd_ps(vmul_ps(c, x1), y1, c_one);
  }

  /* Multiply by 2^pow2 by adding to exponent in binary representation */
  by1 = _mm256_castps_si256(y1);
  by1 = _mm256_add_epi32(by1, _mm256_slli_epi32(pow2, 23));
  y = _mm256_castsi256_ps(by1);

  return y;
}

#else
static const int f_expterms = 7;

static inline vecf _mm256_exp_ps_(vecf x) {
  int i;
  __m256 x1, y1, y, c_one, c;
  __m128i pow2_1, pow2_2, by1_1, by1_2;
  __m256i by1, pow2;

  c_one = vset1_ps(1.0f);

  x1 = vmul_ps(vset1_ps(flde), x);

  pow2 = _mm256_cvttps_epi32(x1);
  pow2_1 = _mm256_extractf128_si256(pow2, 0);
  pow2_2 = _mm256_extractf128_si256(pow2, 1);

  x1 = vfnmadd_ps(vset1_ps(fln2), _mm256_cvtepi32_ps(pow2), x);

  c = vset1_ps(coeff_fexp[f_expterms - 1]);
  y1 = vfmadd_ps(c, x1, c_one);
  for (i = f_expterms - 2; i >= 0; i--) {
    c = vset1_ps(coeff_fexp[i]);
    y1 = vfmadd_ps(vmul_ps(c, x1), y1, c_one);
  }

  /* Multiply by 2^pow2 by adding to exponent in binary representation,
   * one 128-bit half at a time */
  by1_1 = _mm_castps_si128(_mm256_extractf128_ps(y1, 0));
  by1_1 = _mm_add_epi32(by1_1, _mm_slli_epi32(pow2_1, 23));
  by1_2 = _mm_castps_si128(_mm256_extractf128_ps(y1, 1));
  by1_2 = _mm_add_epi32(by1_2, _mm_slli_epi32(pow2_2, 23));
  by1 = _mm256_setzero_si256();
  by1 = _mm256_insertf128_si256(by1, by1_1, 0);
  by1 = _mm256_insertf128_si256(by1, by1_2, 1);
  y = _mm256_castsi256_ps(by1);

  return y;
}
#endif

/****************************************************
 * Implementation of advanced arithmetic operations for double
 ****************************************************/

static const double coeff_dcos[] = {
  1.0 / (1.0 * 2.0), 1.0 / (3.0 * 4.0), 1.0 / (5.0 * 6.0), 1.0 / (7.0 * 8.0),
  1.0 / (9.0 * 10.0), 1.0 / (11.0 * 12.0), 1.0 / (13.0 * 14.0), 1.0 / (15.0 * 16.0),
  1.0 / (17.0 * 18.0), 1.0 / (19.0 * 20.0), 1.0 / (21.0 * 22.0), 1.0 / (23.0 * 24.0),
  1.0 / (25.0 * 26.0), 1.0 / (27.0 * 28.0) };
static const double coeff_dsin[] = {
  1.0 / (2.0 * 3.0), 1.0 / (4.0 * 5.0), 1.0 / (6.0 * 7.0), 1.0 / (8.0 * 9.0),
  1.0 / (10.0 * 11.0), 1.0 / (12.0 * 13.0), 1.0 / (14.0 * 15.0), 1.0 / (16.0 * 17.0),
  1.0 / (18.0 * 19.0), 1.0 / (20.0 * 21.0), 1.0 / (22.0 * 23.0), 1.0 / (24.0 * 25.0),
  1.0 / (26.0 * 27.0), 1.0 / (28.0 * 29.0) };
static const int d_terms = 10;

static const double dpi = 3.14159265358979323846264338327950288L;
static const double done_2pi = 1.0L / 6.28318530717958647692528676655900577L;

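/* The double-precision versions use the same range reduction and
 * nested series as the float versions above, with d_terms = 10
 * nesting levels to reach double accuracy. */
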
static inline vecd _mm256_cos_pd_(vecd xi) {
  vecd xi2, n, v_one, c, c_const, c_const2;
  int i;

  v_one = vset1_pd(1.0);

  n = _mm256_floor_pd(vmul_pd(xi, vset1_pd(done_2pi)));
  n = vfmadd_pd(vset1_pd(2.0), n, v_one);

  xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
  xi2 = vmul_pd(xi, xi);

  c_const2 = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms + 1]));
  c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms]));
  c = vfnmadd_pd(c_const2, c_const, c_const);

  for (i = d_terms - 1; i >= 0; i--) {
    c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[i]));
    c = vfnmadd_pd(c, c_const, c_const);
  }

  c = vsub_pd(c, v_one);

  return c;
}

static inline vecd _mm256_sin_pd_(vecd xi) {
  vecd xi2, n, v_one, s, s_const;
  int i;

  v_one = vset1_pd(1.0);

  n = _mm256_floor_pd(vmul_pd(xi, vset1_pd(done_2pi)));
  n = vfmadd_pd(vset1_pd(2.0), n, v_one);

  xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
  xi2 = vmul_pd(xi, xi);

  s = vmul_pd(xi2, vset1_pd(coeff_dsin[d_terms]));

  for (i = d_terms - 1; i >= 0; i--) {
    s_const = vmul_pd(xi2, vset1_pd(coeff_dsin[i]));
    s = vfnmadd_pd(s, s_const, s_const);
  }

  s = vfmsub_pd(xi, s, xi);

  return s;
}

static inline void _mm256_sincos_pd_(vecd xi, vecd *cp, vecd *sp) {
  vecd xi2, n, v_one, c, c_const, c_const2, s, s_const;
  int i;

  v_one = vset1_pd(1.0);

  n = _mm256_floor_pd(vmul_pd(xi, vset1_pd(done_2pi)));
  n = vfmadd_pd(vset1_pd(2.0), n, v_one);

  xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
  xi2 = vmul_pd(xi, xi);

  s = vmul_pd(xi2, vset1_pd(coeff_dsin[d_terms]));

  c_const2 = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms + 1]));
  c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms]));
  c = vfnmadd_pd(c_const2, c_const, c_const);

  for (i = d_terms - 1; i >= 0; i--) {
    c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[i]));
    s_const = vmul_pd(xi2, vset1_pd(coeff_dsin[i]));

    c = vfnmadd_pd(c, c_const, c_const);
    s = vfnmadd_pd(s, s_const, s_const);
  }

  *cp = vsub_pd(c, v_one);
  *sp = vfmsub_pd(xi, s, xi);
}

static const double coeff_dexp[] = {
  1.0 / 1.0, 1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0, 1.0 / 5.0,
  1.0 / 6.0, 1.0 / 7.0, 1.0 / 8.0, 1.0 / 9.0, 1.0 / 10.0,
  1.0 / 11.0, 1.0 / 12.0, 1.0 / 13.0, 1.0 / 14.0, 1.0 / 15.0 };

static const double dlde = 1.4426950408889634e+00;
static const double dln2 = 6.9314718055994529e-01;

#ifdef __AVX2__
static const int d_expterms = 11;

static inline vecd _mm256_exp_pd_(vecd x) {
  int i;
  __m256d x1, y1, y, c_one, c;
  __m128i pow2;
  __m256i by1;

  c_one = vset1_pd(1.0);

  pow2 = _mm256_cvtpd_epi32(vmul_pd(vset1_pd(dlde), x));
  x1 = vfnmadd_pd(vset1_pd(dln2), _mm256_cvtepi32_pd(pow2), x);

  c = vset1_pd(coeff_dexp[d_expterms - 1]);
  y1 = vfmadd_pd(c, x1, c_one);
  for (i = d_expterms - 2; i >= 0; i--) {
    c = vset1_pd(coeff_dexp[i]);
    y1 = vfmadd_pd(vmul_pd(c, x1), y1, c_one);
  }

  /* Multiply by 2^pow2 by adding to exponent in binary representation */
  by1 = _mm256_castpd_si256(y1);
  by1 = _mm256_add_epi64(by1,
      _mm256_slli_epi64(_mm256_cvtepi32_epi64(pow2), 52));
  y = _mm256_castsi256_pd(by1);

  return y;
}

#else
static const int d_expterms = 14;

static inline vecd _mm256_exp_pd_(vecd x) {
  int i;
  __m256d x1, y1, y, c_one, c;
  __m128i pow2, pow2_1, pow2_2, by1_1, by1_2;
  __m256i by1;

  c_one = vset1_pd(1.0);

  x1 = vmul_pd(vset1_pd(dlde), x);
  pow2_1 = _mm_cvttpd_epi32(_mm256_extractf128_pd(x1, 0));
  pow2_2 = _mm_cvttpd_epi32(_mm256_extractf128_pd(x1, 1));
  pow2 = _mm_setzero_si128();
  pow2 = _mm_unpacklo_epi32(pow2_1, pow2_2);
  pow2 = _mm_shuffle_epi32(pow2, 216);

  x1 = vfnmadd_pd(vset1_pd(dln2), _mm256_cvtepi32_pd(pow2), x);

  c = vset1_pd(coeff_dexp[d_expterms - 1]);
  y1 = vfmadd_pd(c, x1, c_one);
  for (i = d_expterms - 2; i >= 0; i--) {
    c = vset1_pd(coeff_dexp[i]);
    y1 = vfmadd_pd(vmul_pd(c, x1), y1, c_one);
  }

  /* Multiply by 2^pow2 by adding to exponent in binary representation */
  by1_1 = _mm_castpd_si128(_mm256_extractf128_pd(y1, 0));
  by1_1 = _mm_add_epi64(by1_1, _mm_slli_epi64(_mm_cvtepi32_epi64(pow2_1), 52));
  by1_2 = _mm_castpd_si128(_mm256_extractf128_pd(y1, 1));
  by1_2 = _mm_add_epi64(by1_2, _mm_slli_epi64(_mm_cvtepi32_epi64(pow2_2), 52));
  by1 = _mm256_setzero_si256();
  by1 = _mm256_insertf128_si256(by1, by1_1, 0);
  by1 = _mm256_insertf128_si256(by1, by1_2, 1);
  y = _mm256_castsi256_pd(by1);

  return y;
}
#endif

/****************************************************
 * Implementation of little helper functions
 ****************************************************/

/****************************************************
 * Implementation of little helper functions for float
 ****************************************************/

static inline vecf _mm256_dot3_ps_(vecf x[3], vecf y[3]) {
  vecf res;

  res = vmul_ps(x[0], y[0]);
  res = vfmadd_ps(x[1], y[1], res);
  res = vfmadd_ps(x[2], y[2], res);

  return res;
}

/****************************************************
 * Implementation of little helper functions for double
 ****************************************************/

static inline vecd _mm256_dot3_pd_(vecd x[3], vecd y[3]) {
  vecd res;

  res = vmul_pd(x[0], y[0]);
  res = vfmadd_pd(x[1], y[1], res);
  res = vfmadd_pd(x[2], y[2], res);

  return res;
}

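/****************************************************
 * Example (illustrative sketch, not part of the
 * original header): normalize VDOUBLE 3D vectors at
 * once, stored component-wise as x[0], x[1], x[2]
 ****************************************************/

static inline void example_normalize3_pd(vecd x[3]) {
  vecd rnorm;

  /* 1/sqrt(<x,x>) in every lane, via the dot product and the
   * approximate reciprocal square root defined above */
  rnorm = vrsqrt_pd(vdot3_pd(x, x));

  x[0] = vmul_pd(x[0], rnorm);
  x[1] = vmul_pd(x[1], rnorm);
  x[2] = vmul_pd(x[2], rnorm);
}
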
#endif /* USE_SIMD */

#endif /* LIBRARY_SIMD_AVX_H_ */