H2Lib  3.0
simd_sse2.h
1 #ifndef LIBRARY_SIMD_SSE2_H_
2 #define LIBRARY_SIMD_SSE2_H_
3 
4 #ifdef USE_SIMD
5 
6 #include <math.h>
7 #include <immintrin.h>
8 
9 /****************************************************
10  * Define vector sizes and alignment
11  ****************************************************/
12 
13 #define VFLOAT 4
14 #define VDOUBLE 2
15 #define VALIGN 64
16 
17 /****************************************************
18  * Define vector types
19  ****************************************************/
20 
21 #define vecf __m128
22 #define vecd __m128d
23 
24 /****************************************************
25  * Define arithmetic operations
26  ****************************************************/
27 
28 /****************************************************
29  * Arithmetic operations for float
30  ****************************************************/
31 
32 #define vadd_ps _mm_add_ps
33 #define vsub_ps _mm_sub_ps
34 #define vmul_ps _mm_mul_ps
35 #define vdiv_ps _mm_div_ps
36 #define vsqrt_ps _mm_sqrt_ps
37 #define vrsqrt_ps _mm_rsqrt_ps_
38 #define vfmadd_ps _mm_fmadd_ps
39 #define vfmsub_ps _mm_fmsub_ps
40 #define vfnmadd_ps _mm_fnmadd_ps
41 #define vfnmsub_ps _mm_fnmsub_ps
42 
43 /****************************************************
44  * Arithmetic operations for double
45  ****************************************************/
46 
47 #define vadd_pd _mm_add_pd
48 #define vsub_pd _mm_sub_pd
49 #define vmul_pd _mm_mul_pd
50 #define vdiv_pd _mm_div_pd
51 #define vsqrt_pd _mm_sqrt_pd
52 #define vrsqrt_pd _mm_rsqrt_pd_
53 #define vfmadd_pd _mm_fmadd_pd
54 #define vfmsub_pd _mm_fmsub_pd
55 #define vfnmadd_pd _mm_fnmadd_pd
56 #define vfnmsub_pd _mm_fnmsub_pd
57 
58 /****************************************************
59  * Define advanced arithmetic operations
60  ****************************************************/
61 
62 /****************************************************
63  * Advanced arithmetic operations for float
64  ****************************************************/
65 
66 #define vsin_ps _mm_sin_ps_
67 #define vcos_ps _mm_cos_ps_
68 #define vsincos_ps _mm_sincos_ps_
69 #define vexp_ps _mm_exp_ps_
70 
71 /****************************************************
72  * Advanced arithmetic operations for double
73  ****************************************************/
74 
75 #define vsin_pd _mm_sin_pd_
76 #define vcos_pd _mm_cos_pd_
77 #define vsincos_pd _mm_sincos_pd_
78 #define vexp_pd _mm_exp_pd_
79 
80 /****************************************************
81  * Define load/store operations
82  ****************************************************/
83 
84 /****************************************************
85  * Load/store operations for float
86  ****************************************************/
87 
88 #define vload_ps _mm_load_ps
89 #define vload1_ps _mm_load1_ps
90 #define vloadu_ps _mm_loadu_ps
91 #define vset1_ps _mm_set1_ps
92 #define vsetzero_ps _mm_setzero_ps
93 #define vstore_ps _mm_store_ps
94 #define vstoreu_ps _mm_storeu_ps
95 
96 /****************************************************
97  * Load/store operations for double
98  ****************************************************/
99 
100 #define vload_pd _mm_load_pd
101 #define vload1_pd _mm_load1_pd
102 #define vloadu_pd _mm_loadu_pd
103 #define vset1_pd _mm_set1_pd
104 #define vsetzero_pd _mm_setzero_pd
105 #define vstore_pd _mm_store_pd
106 #define vstoreu_pd _mm_storeu_pd
107 
108 /****************************************************
109  * Define compare operations
110  ****************************************************/
111 
112 /****************************************************
113  * Define compare operations for float
114  ****************************************************/
115 
116 #define vcmpeq_ps(a,b) _mm_cmpeq_ps(a,b)
117 #define vcmpneq_ps(a,b) _mm_cmpneq_ps(a,b)
118 #define vcmpge_ps(a,b) _mm_cmpge_ps(a,b)
119 #define vcmpgt_ps(a,b) _mm_cmpgt_ps(a,b)
120 #define vcmpnge_ps(a,b) _mm_cmpnge_ps(a,b)
121 #define vcmpngt_ps(a,b) _mm_cmpngt_ps(a,b)
122 #define vcmple_ps(a,b) _mm_cmple_ps(a,b)
123 #define vcmplt_ps(a,b) _mm_cmplt_ps(a,b)
124 #define vcmpnle_ps(a,b) _mm_cmpnle_ps(a,b)
125 #define vcmpnlt_ps(a,b) _mm_cmpnlt_ps(a,b)
126 
127 /****************************************************
128  * Define compare operations for float
129  ****************************************************/
130 
131 #define vcmpeq_pd(a,b) _mm_cmpeq_pd(a,b)
132 #define vcmpneq_pd(a,b) _mm_cmpneq_pd(a,b)
133 #define vcmpge_pd(a,b) _mm_cmpge_pd(a,b)
134 #define vcmpgt_pd(a,b) _mm_cmpgt_pd(a,b)
135 #define vcmpnge_pd(a,b) _mm_cmpnge_pd(a,b)
136 #define vcmpngt_pd(a,b) _mm_cmpngt_pd(a,b)
137 #define vcmple_pd(a,b) _mm_cmple_pd(a,b)
138 #define vcmplt_pd(a,b) _mm_cmplt_pd(a,b)
139 #define vcmpnle_pd(a,b) _mm_cmpnle_pd(a,b)
140 #define vcmpnlt_pd(a,b) _mm_cmpnlt_pd(a,b)
141 
142 /****************************************************
143  * Definitions of bit operations
144  ****************************************************/
145 
146 /****************************************************
147  * Definitions of bit operations for float
148  ****************************************************/
149 
150 #define vand_ps _mm_and_ps
151 #define vandnot_ps _mm_andnot_ps
152 #define vor_ps _mm_or_ps
153 #define vxor_ps _mm_xor_ps
154 
155 /****************************************************
156  * Definitions of bit operations for double
157  ****************************************************/
158 
159 #define vand_pd _mm_and_pd
160 #define vandnot_pd _mm_andnot_pd
161 #define vor_pd _mm_or_pd
162 #define vxor_pd _mm_xor_pd
163 
164 /****************************************************
165  * Define reductions of vector registers
166  ****************************************************/
167 
168 /****************************************************
169  * Define reductions of vector registers for floats
170  ****************************************************/
171 
172 #define vreduce_ps _mm_reduce_ps_
173 
174 /****************************************************
175  * Define reductions of vector registers for floats
176  ****************************************************/
177 
178 #define vreduce_pd _mm_reduce_pd_
179 
180 /****************************************************
181  * Definition of little helper functions
182  ****************************************************/
183 
184 /****************************************************
185  * Definition of little helfer functions for float
186  ****************************************************/
187 
188 #define vdot3_ps _mm_dot3_ps_
189 
190 /****************************************************
191  * Definition of little helfer functions for double
192  ****************************************************/
193 
194 #define vdot3_pd _mm_dot3_pd_
195 
196 /****************************************************
197  * Implementation of FMA function, if needed
198  ****************************************************/
199 
200 /****************************************************
201  * Implementation of FMA function, if needed for float
202  ****************************************************/
203 
204 #ifndef __FMA__
205 static inline vecf _mm_fmadd_ps(vecf a, vecf b, vecf c) {
206  return vadd_ps(vmul_ps(a, b), c);
207 }
208 
209 static inline vecf _mm_fmsub_ps(vecf a, vecf b, vecf c) {
210  return vsub_ps(vmul_ps(a, b), c);
211 }
212 
213 static inline vecf _mm_fnmadd_ps(vecf a, vecf b, vecf c) {
214  return vsub_ps(c, vmul_ps(a, b));
215 }
216 
217 static inline vecf _mm_fnmsub_ps(vecf a, vecf b, vecf c) {
218  return vmul_ps(vset1_ps(-1.0f), vadd_ps(vmul_ps(a, b), c));
219 }
220 #endif
221 
222 /****************************************************
223  * Implementation of FMA function, if needed for double
224  ****************************************************/
225 
226 #ifndef __FMA__
227 static inline vecd _mm_fmadd_pd(vecd a, vecd b, vecd c) {
228  return vadd_pd(vmul_pd(a, b), c);
229 }
230 
231 static inline vecd _mm_fmsub_pd(vecd a, vecd b, vecd c) {
232  return vsub_pd(vmul_pd(a, b), c);
233 }
234 
235 static inline vecd _mm_fnmadd_pd(vecd a, vecd b, vecd c) {
236  return vsub_pd(c, vmul_pd(a, b));
237 }
238 
239 static inline vecd _mm_fnmsub_pd(vecd a, vecd b, vecd c) {
240  return vmul_pd(vset1_pd(-1.0), vadd_pd(vmul_pd(a, b), c));
241 }
242 #endif
243 
244 #ifndef __SSE4_1__
245 static inline vecf _mm_floor_ps_(vecf a) {
246  return _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
247 }
248 
249 static inline vecd _mm_floor_pd_(vecd a) {
250  return _mm_cvtepi32_pd(_mm_cvttpd_epi32(a));
251 }
252 #else
253 static inline vecf _mm_floor_ps_(vecf a) {
254  return _mm_floor_ps(a);
255 }
256 
257 static inline vecd _mm_floor_pd_(vecd a) {
258  return _mm_floor_pd(a);
259 }
260 #endif
261 
262 #ifndef __SSE4_1__
263 static inline __m128i _mm_cvtepi32_epi64_(__m128i a) {
264  __m128i sel = _mm_set1_epi32(0x80000000);
265  __m128i mask;
266  __m128i a1;
267 
268  __m128i b, c;
269 
270  mask = _mm_cmpeq_epi32(_mm_and_si128(a, sel), sel);
271  a1 = _mm_andnot_si128(sel, a);
272 
273  b = _mm_or_si128(_mm_and_si128(mask, a1), _mm_andnot_si128(mask, a));
274  c = _mm_or_si128(_mm_and_si128(mask, sel),
275  _mm_andnot_si128(mask, _mm_setzero_si128()));
276  b = _mm_unpacklo_epi32(b,c);
277 
278  return b;
279 }
280 #else
281 static inline __m128i _mm_cvtepi32_epi64_(__m128i a) {
282  return _mm_cvtepi32_epi64(a);
283 }
284 #endif
285 
286 /****************************************************
287  * Implementations of basic vector functions
288  ****************************************************/
289 
290 static inline vecf _mm_rsqrt_ps_(vecf x) {
291  vecf x2, t1, t2, t3;
292 
293  x2 = vmul_ps(x, vset1_ps(0.5f));
294  x = _mm_rsqrt_ps(x);
295 
296  t1 = vmul_ps(x, x);
297  t2 = vmul_ps(x2, x);
298  t3 = vmul_ps(vset1_ps(1.5f), x);
299  x = vfnmadd_ps(t1, t2, t3);
300 
301  return x;
302 }
303 
304 static inline vecd _mm_rsqrt_pd_(vecd x) {
305  vecd x2, t1, t2, t3;
306  vecd cd_15;
307 
308  cd_15 = vset1_pd(1.5);
309 
310  x2 = vmul_pd(x, vset1_pd(0.5));
311  x = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x)));
312 
313  t1 = vmul_pd(x, x);
314  t2 = vmul_pd(x2, x);
315  t3 = vmul_pd(cd_15, x);
316  x = vfnmadd_pd(t1, t2, t3);
317 
318  t1 = vmul_pd(x, x);
319  t2 = vmul_pd(x2, x);
320  t3 = vmul_pd(cd_15, x);
321  x = vfnmadd_pd(t1, t2, t3);
322 
323  return x;
324 }
325 
326 /****************************************************
327  * Implementations of basic vector functions for float
328  ****************************************************/
329 
330 static inline float _mm_reduce_ps_(vecf x) {
331  return x[0] + x[1] + x[2] + x[3];
332 }
333 
334 /****************************************************
335  * Implementations of basic vector functions for double
336  ****************************************************/
337 
338 static inline double _mm_reduce_pd_(vecd x) {
339  return x[0] + x[1];
340 }
341 
342 /****************************************************
343  * Implementation of advanced arithmetic operations
344  ****************************************************/
345 
346 /****************************************************
347  * Implementation of advanced arithmetic operations for float
348  ****************************************************/
349 
350 static const float coeff_fcos[] = {
351  1.0 / (1.0 * 2.0), 1.0 / (3.0 * 4.0), 1.0 / (5.0 * 6.0), 1.0 / (7.0 * 8.0),
352  1.0 / (9.0 * 10.0), 1.0 / (11.0 * 12.0), 1.0 / (13.0 * 14.0), 1.0
353  / (15.0 * 16.0), 1.0 / (17.0 * 18.0), 1.0 / (19.0 * 20.0), 1.0
354  / (21.0 * 22.0), 1.0 / (23.0 * 24.0), 1.0 / (25.0 * 26.0), 1.0
355  / (27.0 * 28.0) };
356 static const float coeff_fsin[] = {
357  1.0 / (2.0 * 3.0), 1.0 / (4.0 * 5.0), 1.0 / (6.0 * 7.0), 1.0 / (8.0 * 9.0),
358  1.0 / (10.0 * 11.0), 1.0 / (12.0 * 13.0), 1.0 / (14.0 * 15.0), 1.0
359  / (16.0 * 17.0), 1.0 / (18.0 * 19.0), 1.0 / (20.0 * 21.0), 1.0
360  / (22.0 * 23.0), 1.0 / (24.0 * 25.0), 1.0 / (26.0 * 27.0), 1.0
361  / (28.0 * 29.0) };
362 static const int f_terms = 5;
363 
364 static const float fpi = 3.14159265358979323846264338327950288L;
365 static const float fone_2pi = 1.0L / 6.28318530717958647692528676655900577L;
366 
367 static inline vecf _mm_cos_ps_(vecf xi) {
368  vecf xi2, n, v_one, c, c_const, c_const2;
369  int i;
370 
371  v_one = vset1_ps(1.0f);
372 
373  n = _mm_floor_ps_(vmul_ps(xi, vset1_ps(fone_2pi)));
374  n = vfmadd_ps(vset1_ps(2.0f), n, v_one);
375 
376  xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
377  xi2 = vmul_ps(xi, xi);
378 
379  c_const2 = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms + 1]));
380  c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms]));
381  c = vfnmadd_ps(c_const2, c_const, c_const);
382 
383  for (i = f_terms - 1; i >= 0; i--) {
384  c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[i]));
385  c = vfnmadd_ps(c, c_const, c_const);
386  ;
387  }
388 
389  c = vsub_ps(c, v_one);
390 
391  return c;
392 }
393 
394 static inline vecf _mm_sin_ps_(vecf xi) {
395  vecf xi2, n, v_one, s, s_const;
396  int i;
397 
398  v_one = vset1_ps(1.0f);
399 
400  n = _mm_floor_ps_(vmul_ps(xi, vset1_ps(fone_2pi)));
401  n = vfmadd_ps(vset1_ps(2.0f), n, v_one);
402 
403  xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
404  xi2 = vmul_ps(xi, xi);
405 
406  s = vmul_ps(xi2, vset1_ps(coeff_fsin[f_terms]));
407 
408  for (i = f_terms - 1; i >= 0; i--) {
409  s_const = vmul_ps(xi2, vset1_ps(coeff_fsin[i]));
410  s = vfnmadd_ps(s, s_const, s_const);
411  }
412 
413  s = vfmsub_ps(xi, s, xi);
414 
415  return s;
416 }
417 
418 static inline void _mm_sincos_ps_(vecf xi, vecf *cp, vecf *sp) {
419  vecf xi2, n, v_one, c, c_const, c_const2, s, s_const;
420  int i;
421 
422  v_one = vset1_ps(1.0f);
423 
424  n = _mm_floor_ps_(vmul_ps(xi, vset1_ps(fone_2pi)));
425  n = vfmadd_ps(vset1_ps(2.0f), n, v_one);
426 
427  xi = vfnmadd_ps(n, vset1_ps(fpi), xi);
428  xi2 = vmul_ps(xi, xi);
429 
430  s = vmul_ps(xi2, vset1_ps(coeff_fsin[f_terms]));
431 
432  c_const2 = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms + 1]));
433  c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[f_terms]));
434  c = vfnmadd_ps(c_const2, c_const, c_const);
435 
436  for (i = f_terms - 1; i >= 0; i--) {
437  c_const = vmul_ps(xi2, vset1_ps(coeff_fcos[i]));
438  s_const = vmul_ps(xi2, vset1_ps(coeff_fsin[i]));
439 
440  c = vfnmadd_ps(c, c_const, c_const);
441  s = vfnmadd_ps(s, s_const, s_const);
442  }
443 
444  *cp = vsub_ps(c, v_one);
445  *sp = vfmsub_ps(xi, s, xi);
446 }
447 
448 static const float coeff_fexp[] = {
449  1.0 / 1.0, 1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0, 1.0 / 5.0, 1.0 / 6.0, 1.0 / 7.0,
450  1.0 / 8.0, 1.0 / 9.0, 1.0 / 10.0, 1.0 / 11.0, 1.0 / 12.0, 1.0 / 13.0, 1.0
451  / 14.0, 1.0 / 15.0 };
452 
453 static const float flde = 1.4426950408889634e+00f;
454 static const float fln2 = 6.9314718055994529e-01f;
455 
456 static const int f_expterms = 7;
457 
458 static inline vecf _mm_exp_ps_(vecf x) {
459  int i;
460  __m128 x1, y1, y, c_one, c;
461  __m128i by1, pow2;
462 
463  c_one = vset1_ps(1.0f);
464 
465  x1 = vmul_ps(vset1_ps(flde), x);
466 
467  pow2 = _mm_cvttps_epi32(x1);
468 
469  x1 = vfnmadd_ps(vset1_ps(fln2), _mm_cvtepi32_ps(pow2), x);
470 
471  c = vset1_ps(coeff_fexp[f_expterms - 1]);
472  y1 = vfmadd_ps(c, x1, c_one);
473  for (i = f_expterms - 2; i >= 0; i--) {
474  c = vset1_ps(coeff_fexp[i]);
475  y1 = vfmadd_ps(vmul_ps(c, x1), y1, c_one);
476  }
477 
478  /* Multiply by 2^pow2 by adding to exponent in binary representation */
479  by1 = _mm_castps_si128(y1);
480  by1 = _mm_add_epi32(by1, _mm_slli_epi32(pow2, 23));
481  ;
482  y = _mm_castsi128_ps(by1);
483 
484  return y;
485 }
486 
487 /****************************************************
488  * Implementation of advanced arithmetic operations for double
489  ****************************************************/
490 
491 static const double coeff_dcos[] = {
492  1.0 / (1.0 * 2.0), 1.0 / (3.0 * 4.0), 1.0 / (5.0 * 6.0), 1.0 / (7.0 * 8.0),
493  1.0 / (9.0 * 10.0), 1.0 / (11.0 * 12.0), 1.0 / (13.0 * 14.0), 1.0
494  / (15.0 * 16.0), 1.0 / (17.0 * 18.0), 1.0 / (19.0 * 20.0), 1.0
495  / (21.0 * 22.0), 1.0 / (23.0 * 24.0), 1.0 / (25.0 * 26.0), 1.0
496  / (27.0 * 28.0) };
497 static const double coeff_dsin[] = {
498  1.0 / (2.0 * 3.0), 1.0 / (4.0 * 5.0), 1.0 / (6.0 * 7.0), 1.0 / (8.0 * 9.0),
499  1.0 / (10.0 * 11.0), 1.0 / (12.0 * 13.0), 1.0 / (14.0 * 15.0), 1.0
500  / (16.0 * 17.0), 1.0 / (18.0 * 19.0), 1.0 / (20.0 * 21.0), 1.0
501  / (22.0 * 23.0), 1.0 / (24.0 * 25.0), 1.0 / (26.0 * 27.0), 1.0
502  / (28.0 * 29.0) };
503 static const int d_terms = 10;
504 
505 static const double dpi = 3.14159265358979323846264338327950288L;
506 static const double done_2pi = 1.0L / 6.28318530717958647692528676655900577L;
507 
508 static inline vecd _mm_cos_pd_(vecd xi) {
509  vecd xi2, n, v_one, c, c_const, c_const2;
510  int i;
511 
512  v_one = vset1_pd(1.0);
513 
514  n = _mm_floor_pd_(vmul_pd(xi, vset1_pd(done_2pi)));
515  n = vfmadd_pd(vset1_pd(2.0), n, v_one);
516 
517  xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
518  xi2 = vmul_pd(xi, xi);
519 
520  c_const2 = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms + 1]));
521  c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms]));
522  c = vfnmadd_pd(c_const2, c_const, c_const);
523 
524  for (i = d_terms - 1; i >= 0; i--) {
525  c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[i]));
526  c = vfnmadd_pd(c, c_const, c_const);
527  ;
528  }
529 
530  c = vsub_pd(c, v_one);
531 
532  return c;
533 }
534 
535 static inline vecd _mm_sin_pd_(vecd xi) {
536  vecd xi2, n, v_one, s, s_const;
537  int i;
538 
539  v_one = vset1_pd(1.0);
540 
541  n = _mm_floor_pd_(vmul_pd(xi, vset1_pd(done_2pi)));
542  n = vfmadd_pd(vset1_pd(2.0), n, v_one);
543 
544  xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
545  xi2 = vmul_pd(xi, xi);
546 
547  s = vmul_pd(xi2, vset1_pd(coeff_dsin[d_terms]));
548 
549  for (i = d_terms - 1; i >= 0; i--) {
550  s_const = vmul_pd(xi2, vset1_pd(coeff_dsin[i]));
551  s = vfnmadd_pd(s, s_const, s_const);
552  }
553 
554  s = vfmsub_pd(xi, s, xi);
555 
556  return s;
557 }
558 
559 static inline void _mm_sincos_pd_(vecd xi, vecd *cp, vecd *sp) {
560  vecd xi2, n, v_one, c, c_const, c_const2, s, s_const;
561  int i;
562 
563  v_one = vset1_pd(1.0);
564 
565  n = _mm_floor_pd_(vmul_pd(xi, vset1_pd(done_2pi)));
566  n = vfmadd_pd(vset1_pd(2.0), n, v_one);
567 
568  xi = vfnmadd_pd(n, vset1_pd(dpi), xi);
569  xi2 = vmul_pd(xi, xi);
570 
571  s = vmul_pd(xi2, vset1_pd(coeff_dsin[d_terms]));
572 
573  c_const2 = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms + 1]));
574  c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[d_terms]));
575  c = vfnmadd_pd(c_const2, c_const, c_const);
576 
577  for (i = d_terms - 1; i >= 0; i--) {
578  c_const = vmul_pd(xi2, vset1_pd(coeff_dcos[i]));
579  s_const = vmul_pd(xi2, vset1_pd(coeff_dsin[i]));
580 
581  c = vfnmadd_pd(c, c_const, c_const);
582  s = vfnmadd_pd(s, s_const, s_const);
583  }
584 
585  *cp = vsub_pd(c, v_one);
586  *sp = vfmsub_pd(xi, s, xi);
587 }
588 
589 static const double coeff_dexp[] = {
590  1.0 / 1.0, 1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0, 1.0 / 5.0, 1.0 / 6.0, 1.0 / 7.0,
591  1.0 / 8.0, 1.0 / 9.0, 1.0 / 10.0, 1.0 / 11.0, 1.0 / 12.0, 1.0 / 13.0, 1.0
592  / 14.0, 1.0 / 15.0 };
593 
594 static const double dlde = 1.4426950408889634e+00;
595 static const double dln2 = 6.9314718055994529e-01;
596 
597 static const int d_expterms = 14;
598 
599 static inline vecd _mm_exp_pd_(vecd x) {
600  int i;
601  __m128d x1, y1, y, c_one, c;
602  __m128i pow2, by1;
603 
604  c_one = vset1_pd(1.0);
605 
606  x1 = vmul_pd(vset1_pd(dlde), x);
607  pow2 = _mm_cvttpd_epi32(x1);
608 
609  x1 = vfnmadd_pd(vset1_pd(dln2), _mm_cvtepi32_pd(pow2), x);
610 
611  c = vset1_pd(coeff_dexp[d_expterms - 1]);
612  y1 = vfmadd_pd(c, x1, c_one);
613  for (i = d_expterms - 2; i >= 0; i--) {
614  c = vset1_pd(coeff_dexp[i]);
615  y1 = vfmadd_pd(vmul_pd(c, x1), y1, c_one);
616  }
617 
618  /* Multiply by 2^pow2 by adding to exponent in binary representation */
619  by1 = _mm_castpd_si128(y1);
620  by1 = _mm_add_epi64(by1, _mm_slli_epi64(_mm_cvtepi32_epi64_(pow2), 52));
621  y = _mm_castsi128_pd(by1);
622 
623  return y;
624 }
625 
626 /****************************************************
627  * Implementation of little helper functions
628  ****************************************************/
629 
630 /****************************************************
631  * Implementation of little helper functions for float
632  ****************************************************/
633 
634 static inline vecf _mm_dot3_ps_(vecf x[3], vecf y[3]) {
635  vecf res;
636 
637  res = vmul_ps(x[0], y[0]);
638  res = vfmadd_ps(x[1], y[1], res);
639  res = vfmadd_ps(x[2], y[2], res);
640 
641  return res;
642 }
643 
644 /****************************************************
645  * Implementation of little helper functions for double
646  ****************************************************/
647 
648 static inline vecd _mm_dot3_pd_(vecd x[3], vecd y[3]) {
649  vecd res;
650 
651  res = vmul_pd(x[0], y[0]);
652  res = vfmadd_pd(x[1], y[1], res);
653  res = vfmadd_pd(x[2], y[2], res);
654 
655  return res;
656 }
657 
658 #endif
659 
660 #endif /* LIBRARY_SIMD_AVX_H_ */