vectorize_math.h
/* This file is part of Cloudy and is copyright (C)1978-2017 by Gary J. Ferland and
 * others. For conditions of distribution and use see copyright notice in license.txt */

#ifndef VECTORIZE_MATH_H
#define VECTORIZE_MATH_H

// uint64 is required for AVX support -- uint64 should be present on all modern systems
#ifndef HAVE_UINT64
#undef __AVX__
#endif

#ifdef __AVX__

// unconditionally disable avx512f optimizations since these have not been tested yet
// remove the following line once the testing is complete...
#undef __AVX512F__

// use shorthand for packed types
typedef __m128d v2df;
typedef __m128 v4sf;
typedef __m128i v2di;
typedef __m128i v4si;

typedef __m256d v4df;
typedef __m256 v8sf;
typedef __m256i v4di;
typedef __m256i v8si;

#ifdef __AVX512F__

typedef __m512d v8df;
typedef __m512 v16sf;
typedef __m512i v8di;
typedef __m512i v16si;

// some macros for defining vector constants

#define VECD_CONST(name,x) \
ALIGNED(64) static const double __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)

#define VECDI_CONST(name,x) \
ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)

#define VECF_CONST(name,x) \
ALIGNED(64) static const sys_float __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)

#define VECFI_CONST(name,x) \
ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)

#define VECI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

#define VECII_CONST(name,x) \
ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16si& name = *reinterpret_cast<const v16si*>(__avx_##name)

#define VECL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)

#define VECLL_CONST(name,x) \
ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8di& name = *reinterpret_cast<const v8di*>(__avx_##name)

union vpack
{
	ALIGNED(64) v8df pd;
	ALIGNED(64) v16sf ps;
	ALIGNED(64) double d[8];
	ALIGNED(64) int64 l[8];
	ALIGNED(64) sys_float f[16];
	ALIGNED(64) int32 i[16];
};

inline bool getmaskbit(unsigned int mask, unsigned int n)
{
	return ( ((mask>>n)&1) == 1 );
}

template<class T>
inline string print_pack(const T x[], int npack, unsigned int mask, const string& name)
{
	ostringstream oss;
	for( int i=0; i < npack; ++i )
	{
		if( getmaskbit(mask, i) )
			oss << " => ";
		else
			oss << " ";
		oss << name << "[" << i << "] = " << x[i] << "\n";
	}
	return oss.str();
}

inline string DEMsg(const string& routine, v8df x, __mmask8 mask)
{
	vpack p;
	ostringstream oss;
	oss << "domain error in " << routine << ".\n";
	p.pd = x;
	oss << print_pack(p.d, 8, mask, "x");
	return oss.str();
}

inline string DEMsg(const string& routine, v16sf x, __mmask16 mask)
{
	vpack p;
	ostringstream oss;
	oss << "domain error in " << routine << ".\n";
	p.ps = x;
	oss << print_pack(p.f, 16, mask, "x");
	return oss.str();
}

inline string DEMsg(const string& routine, v8df x, __mmask8 mask1, v8df y, __mmask8 mask2)
{
	vpack p;
	ostringstream oss;
	oss << "domain error in " << routine << ".\n";
	p.pd = x;
	oss << print_pack(p.d, 8, mask1, "x");
	p.pd = y;
	oss << print_pack(p.d, 8, mask2, "y");
	return oss.str();
}

inline string DEMsg(const string& routine, v16sf x, __mmask16 mask1, v16sf y, __mmask16 mask2)
{
	vpack p;
	ostringstream oss;
	oss << "domain error in " << routine << ".\n";
	p.ps = x;
	oss << print_pack(p.f, 16, mask1, "x");
	p.ps = y;
	oss << print_pack(p.f, 16, mask2, "y");
	return oss.str();
}

#else // __AVX512F__

#define VECD_CONST(name,x) \
ALIGNED(32) static const double __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

#define VECDI_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

#define VECF_CONST(name,x) \
ALIGNED(32) static const sys_float __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

#define VECFI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

#define VECI_CONST(name,x) \
ALIGNED(16) static const uint32 __avx_##name[4] = {x,x,x,x}; \
static const v4si& name = *reinterpret_cast<const v4si*>(__avx_##name)

#define VECII_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

#define VECL_CONST(name,x) \
ALIGNED(16) static const uint64 __avx_##name[2] = {x,x}; \
static const v2di& name = *reinterpret_cast<const v2di*>(__avx_##name)

#define VECLL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)

union vpack
{
	ALIGNED(32) v4df pd;
	ALIGNED(32) v8sf ps;
	ALIGNED(32) double d[4];
	ALIGNED(32) int64 l[4];
	ALIGNED(32) sys_float f[8];
	ALIGNED(32) int32 i[8];
};

template<class T, class U>
inline string print_pack(const T x[], const U m[], int npack, const string& name)
{
	ostringstream oss;
	for( int i=0; i < npack; ++i )
	{
		if( m[i] < 0 )
			oss << " => ";
		else
			oss << " ";
		oss << name << "[" << i << "] = " << x[i] << "\n";
	}
	return oss.str();
}

inline string DEMsg(const string& routine, v4df x, v4df mask)
{
	vpack p, m;
	ostringstream oss;
	oss << "domain error in " << routine << ".\n";
	p.pd = x;
	m.pd = mask;
	oss << print_pack(p.d, m.l, 4, "x");
	return oss.str();
}

inline string DEMsg(const string& routine, v8sf x, v8sf mask)
{
	vpack p, m;
	ostringstream oss;
	oss << "domain error in " << routine << ".\n";
	p.ps = x;
	m.ps = mask;
	oss << print_pack(p.f, m.i, 8, "x");
	return oss.str();
}

inline string DEMsg(const string& routine, v4df x, v4df mask1, v4df y, v4df mask2)
{
	vpack p, m;
	ostringstream oss;
	oss << "domain error in " << routine << ".\n";
	p.pd = x;
	m.pd = mask1;
	oss << print_pack(p.d, m.l, 4, "x");
	p.pd = y;
	m.pd = mask2;
	oss << print_pack(p.d, m.l, 4, "y");
	return oss.str();
}

inline string DEMsg(const string& routine, v8sf x, v8sf mask1, v8sf y, v8sf mask2)
{
	vpack p, m;
	ostringstream oss;
	oss << "domain error in " << routine << ".\n";
	p.ps = x;
	m.ps = mask1;
	oss << print_pack(p.f, m.i, 8, "x");
	p.ps = y;
	m.ps = mask2;
	oss << print_pack(p.f, m.i, 8, "y");
	return oss.str();
}

#endif // __AVX512F__
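
// Illustrative sketch (not part of the original header): a vectorized kernel could use
// DEMsg() to report which elements of a packed argument triggered a domain error.
// The kernel name v1fund() and the mask computation are hypothetical; only DEMsg(),
// the v4df type, and the dbl_max constant defined below come from this file, and
// std::domain_error is used here purely for illustration.
//
//   v4df v1fund(v4df x)
//   {
//       // flag elements that are out of range for the (hypothetical) function
//       v4df invalid = _mm256_cmp_pd(x, dbl_max, _CMP_GT_OQ);
//       if( _mm256_movemask_pd(invalid) != 0 )
//           throw std::domain_error( DEMsg("v1fund", x, invalid) );
//       /* ... evaluate the function on the packed register ... */
//   }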

VECD_CONST(mthree,-3.);
VECD_CONST(mone,-1.);
VECD_CONST(mhalf,-0.5);
VECD_CONST(zero,0.);
VECDI_CONST(third,0x3fd5555555555555); // 0.33333333333333333333
VECD_CONST(half,0.5);
VECD_CONST(one,1.);
VECD_CONST(c1p5,1.5);
VECD_CONST(two,2.);
VECD_CONST(three,3.);
VECD_CONST(six,6.);
VECD_CONST(ten,10.);
VECD_CONST(c1023,1023.);

VECDI_CONST(ln_two,0x3fe62e42fefa39ef); // log(2.)
VECDI_CONST(log2e,0x3ff71547652b82fe); // log2(e) = 1./log(2.)
VECDI_CONST(ln_ten,0x40026bb1bbb55516); // log(10.)
VECDI_CONST(log10e,0x3fdbcb7b1526e50e); // log10(e) = 1./log(10.)

VECDI_CONST(dbl_min,0x0010000000000000); // DBL_MIN
VECDI_CONST(dbl_max,0x7fefffffffffffff); // DBL_MAX
VECDI_CONST(sqrt_dbl_max,0x5fefffffffffffff); // sqrt(DBL_MAX)

VECF_CONST(mthreef,-3.f);
VECF_CONST(monef,-1.f);
VECF_CONST(mhalff,-0.5f);
VECF_CONST(zerof,0.f);
VECFI_CONST(thirdf,0x3eaaaaab); // 0.3333333333f
VECF_CONST(halff,0.5f);
VECF_CONST(onef,1.f);
VECF_CONST(twof,2.f);
VECF_CONST(c1p5f,1.5f);
VECF_CONST(threef,3.f);
VECF_CONST(sixf,6.f);
VECF_CONST(tenf,10.f);
VECF_CONST(c127,127.f);

VECFI_CONST(ln_twof,0x3f317218); // logf(2.f)
VECFI_CONST(log2ef,0x3fb8aa3b); // log2f(e) = 1.f/logf(2.f)
VECFI_CONST(ln_tenf,0x40135d8e); // logf(10.f)
VECFI_CONST(log10ef,0x3ede5bd9); // log10f(e) = 1.f/logf(10.f)

VECFI_CONST(flt_min,0x00800000); // FLT_MIN
VECFI_CONST(flt_max,0x7f7fffff); // FLT_MAX
VECFI_CONST(sqrt_flt_max,0x5f7fffff); // sqrt(FLT_MAX)

template<class V>
inline void vecfun_partial(const sys_float x[], sys_float y[], long nlo, long nhi, long npack, V (*vecfun1)(V))
{
	vpack xx, yy;
	for( long i=0; i < npack; ++i )
		xx.f[i] = x[min(nlo+i,nhi-1)];
	yy.ps = vecfun1(xx.ps);
	for( long i=nlo; i < nhi; ++i )
		y[i] = yy.f[i-nlo];
}

template<class V>
inline void vecfun_partial(const double x[], double y[], long nlo, long nhi, long npack, V (*vecfun1)(V))
{
	vpack xx, yy;
	for( long i=0; i < npack; ++i )
		xx.d[i] = x[min(nlo+i,nhi-1)];
	yy.pd = vecfun1(xx.pd);
	for( long i=nlo; i < nhi; ++i )
		y[i] = yy.d[i-nlo];
}

template<class V>
inline void vecfun2_partial(const sys_float x1[], const sys_float x2[], sys_float y[], long nlo, long nhi,
			    long npack, V (*vecfun1)(V, V))
{
	vpack xx1, xx2, yy;
	for( long i=0; i < npack; ++i )
	{
		xx1.f[i] = x1[min(nlo+i,nhi-1)];
		xx2.f[i] = x2[min(nlo+i,nhi-1)];
	}
	yy.ps = vecfun1(xx1.ps, xx2.ps);
	for( long i=nlo; i < nhi; ++i )
		y[i] = yy.f[i-nlo];
}

template<class V>
inline void vecfun2_partial(const double x1[], const double x2[], double y[], long nlo, long nhi, long npack,
			    V (*vecfun1)(V, V))
{
	vpack xx1, xx2, yy;
	for( long i=0; i < npack; ++i )
	{
		xx1.d[i] = x1[min(nlo+i,nhi-1)];
		xx2.d[i] = x2[min(nlo+i,nhi-1)];
	}
	yy.pd = vecfun1(xx1.pd, xx2.pd);
	for( long i=nlo; i < nhi; ++i )
		y[i] = yy.d[i-nlo];
}

// this is the generic wrapper routine for all vectorized math functions
// it makes sure that the vectorized function gets arguments that are properly aligned
// scalfun1 is the normal scalar routine working on a single argument
// vecfun1 is the vectorized routine working on a packed register
template<class T, class V>
inline void vecfun(const T x[], T y[], long nlo, long nhi, T (*)(T), V (*vecfun1)(V))
{
	if( nhi <= nlo )
		return;

	// determine number of elements of type T in a packed register
	long npack = sizeof(V)/sizeof(T);
	if( nhi-nlo < npack )
	{
		vecfun_partial(x, y, nlo, nhi, npack, vecfun1);
		return;
	}
	long i, ilo = nlo;
	long ox = (reinterpret_cast<long>(x)/sizeof(T)+nlo)%npack;
	long oy = (reinterpret_cast<long>(y)/sizeof(T)+nlo)%npack;
	bool lgNonAligned = ( ox != oy );
	T *yl, *ylocal = NULL;
	if( lgNonAligned )
	{
		ylocal = new T[nhi+npack-1];
		long ol = (reinterpret_cast<long>(ylocal)/sizeof(T)+nlo)%npack;
		if( ox >= ol )
			yl = &ylocal[ox-ol];
		else
			yl = &ylocal[ox+npack-ol];
	}
	else
	{
		yl = y;
	}
	// if the initial element is not aligned correctly,
	// use the partial routine until we reach correct alignment
	if( ox > 0 )
	{
		vecfun_partial(x, yl, ilo, min(ilo+npack-ox,nhi), npack, vecfun1);
		ilo = min(ilo+npack-ox,nhi);
	}
	// use vectorized routine as long as there are at least npack evaluations left
	for( i=ilo; i < nhi-npack+1; i += npack )
	{
		const V& xx = *reinterpret_cast<const V*>(&x[i]);
		V& yy = *reinterpret_cast<V*>(&yl[i]);
		yy = vecfun1(xx);
	}
	ilo = i;
	// use partial routine again for the remaining calls (if any)...
	if( ilo < nhi )
		vecfun_partial(x, yl, ilo, nhi, npack, vecfun1);
	if( lgNonAligned )
	{
		for( i=nlo; i < nhi; ++i )
			y[i] = yl[i];
		delete[] ylocal;
	}
}
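
// Illustrative sketch (not part of the original header): a public array routine can be
// built by handing vecfun() a scalar fallback and a packed kernel. The names my_vexp()
// and v1expd() are hypothetical here; the actual wrappers and kernels live in the other
// vectorize_*.h / vectorize_*.cpp files.
//
//   inline void my_vexp(const double x[], double y[], long nlo, long nhi)
//   {
//       // evaluates y[i] = exp(x[i]) for i in [nlo,nhi); vecfun() handles the
//       // unaligned head and the partial pack at the tail automatically
//       vecfun( x, y, nlo, nhi, exp, v1expd );
//   }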

template<class T, class V>
inline void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T (*)(T, T), V (*vecfun1)(V, V))
{
	if( nhi <= nlo )
		return;

	// determine number of elements of type T in a packed register
	long npack = sizeof(V)/sizeof(T);
	if( nhi-nlo < npack )
	{
		vecfun2_partial(x1, x2, y, nlo, nhi, npack, vecfun1);
		return;
	}
	long i, ilo = nlo;
	long ox1 = (reinterpret_cast<long>(x1)/sizeof(T)+nlo)%npack;
	long ox2 = (reinterpret_cast<long>(x2)/sizeof(T)+nlo)%npack;
	long oy = (reinterpret_cast<long>(y)/sizeof(T)+nlo)%npack;
	bool lgNonAligned1 = ( ox1 != ox2 );
	bool lgNonAligned2 = ( ox1 != oy );
	const T *x2l;
	T *x2local = NULL;
	if( lgNonAligned1 )
	{
		x2local = new T[nhi+npack-1];
		long ol = (reinterpret_cast<long>(x2local)/sizeof(T)+nlo)%npack;
		T *ptr;
		if( ox1 >= ol )
			ptr = &x2local[ox1-ol];
		else
			ptr = &x2local[ox1+npack-ol];
		memcpy(ptr+nlo, x2+nlo, size_t((nhi-nlo)*sizeof(T)));
		x2l = ptr;
	}
	else
	{
		x2l = x2;
	}
	T *yl, *ylocal = NULL;
	if( lgNonAligned2 )
	{
		ylocal = new T[nhi+npack-1];
		long ol = (reinterpret_cast<long>(ylocal)/sizeof(T)+nlo)%npack;
		if( ox1 >= ol )
			yl = &ylocal[ox1-ol];
		else
			yl = &ylocal[ox1+npack-ol];
	}
	else
	{
		yl = y;
	}
	// if the initial element is not aligned correctly,
	// use the partial routine until we reach correct alignment
	if( ox1 > 0 )
	{
		vecfun2_partial(x1, x2l, yl, ilo, min(ilo+npack-ox1,nhi), npack, vecfun1);
		ilo = min(ilo+npack-ox1,nhi);
	}
	// use vectorized routine as long as there are at least npack evaluations left
	for( i=ilo; i < nhi-npack+1; i += npack )
	{
		const V& xx1 = *reinterpret_cast<const V*>(&x1[i]);
		const V& xx2 = *reinterpret_cast<const V*>(&x2l[i]);
		V& yy = *reinterpret_cast<V*>(&yl[i]);
		yy = vecfun1(xx1, xx2);
	}
	ilo = i;
	// use partial routine again for the remaining calls (if any)...
	if( ilo < nhi )
		vecfun2_partial(x1, x2l, yl, ilo, nhi, npack, vecfun1);
	if( lgNonAligned1 )
		delete[] x2local;
	if( lgNonAligned2 )
	{
		for( i=nlo; i < nhi; ++i )
			y[i] = yl[i];
		delete[] ylocal;
	}
}
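
// vecfun2() is used in the same way for two-argument functions; for example, an array
// version of pow() could be written as (hypothetical names, for illustration only)
//
//   inline void my_vpow(const double x[], const double y[], double z[], long nlo, long nhi)
//   {
//       vecfun2( x, y, z, nlo, nhi, pow, v1powd );
//   }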

#ifdef __AVX512F__
#define V1FUN_PD_4(FUN, V) \
	v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
	v8df yy = v1##FUN##d(xx); \
	memcpy(y, &yy, 4*sizeof(double))
#else
#define V1FUN_PD_4(FUN, V) \
	v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
	v4df yy = v1##FUN##d(xx); \
	memcpy(y, &yy, 4*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN_PD_8(FUN, V) \
	v8df xx = _mm512_set_pd( x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v8df yy = v1##FUN##d(xx); \
	memcpy(y, &yy, 8*sizeof(double))
#else
#define V1FUN_PD_8(FUN, V) \
	v4df yy[2]; \
	v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
	yy[0] = v1##FUN##d(xx); \
	xx = _mm256_set_pd( x7, x6, x5, x4 ); \
	yy[1] = v1##FUN##d(xx); \
	memcpy(y, &yy, 8*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_4(FUN, V) \
	v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
	v16sf yy = v1##FUN##f(xx); \
	memcpy(y, &yy, 4*sizeof(sys_float))
#else
#define V1FUN_PS_4(FUN, V) \
	v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
	v8sf yy = v1##FUN##f(xx); \
	memcpy(y, &yy, 4*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_8(FUN, V) \
	v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v16sf yy = v1##FUN##f(xx); \
	memcpy(y, &yy, 8*sizeof(sys_float))
#else
#define V1FUN_PS_8(FUN, V) \
	v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v8sf yy = v1##FUN##f(xx); \
	memcpy(y, &yy, 8*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_16(FUN, V) \
	v16sf xx = _mm512_set_ps( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v16sf yy = v1##FUN##f(xx); \
	memcpy(y, &yy, 16*sizeof(sys_float))
#else
#define V1FUN_PS_16(FUN, V) \
	v8sf yy[2]; \
	v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
	yy[0] = v1##FUN##f(xx); \
	xx = _mm256_set_ps( x15, x14, x13, x12, x11, x10, x9, x8 ); \
	yy[1] = v1##FUN##f(xx); \
	memcpy(y, &yy, 16*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN2_PD_4(FUN, V) \
	v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
	v8df yy = _mm512_set_pd( V, V, V, V, y3, y2, y1, y0 ); \
	v8df zz = v1##FUN##d(xx, yy); \
	memcpy(z, &zz, 4*sizeof(double))
#else
#define V1FUN2_PD_4(FUN, V) \
	v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
	v4df yy = _mm256_set_pd( y3, y2, y1, y0 ); \
	v4df zz = v1##FUN##d(xx, yy); \
	memcpy(z, &zz, 4*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN2_PS_4(FUN, V) \
	v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
	v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, y3, y2, y1, y0 ); \
	v16sf zz = v1##FUN##f(xx, yy); \
	memcpy(z, &zz, 4*sizeof(sys_float))
#else
#define V1FUN2_PS_4(FUN, V) \
	v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
	v8sf yy = _mm256_set_ps( V, V, V, V, y3, y2, y1, y0 ); \
	v8sf zz = v1##FUN##f(xx, yy); \
	memcpy(z, &zz, 4*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN2_PS_8(FUN, V) \
	v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, y7, y6, y5, y4, y3, y2, y1, y0 ); \
	v16sf zz = v1##FUN##f(xx, yy); \
	memcpy(z, &zz, 8*sizeof(sys_float))
#else
#define V1FUN2_PS_8(FUN, V) \
	v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v8sf yy = _mm256_set_ps( y7, y6, y5, y4, y3, y2, y1, y0 ); \
	v8sf zz = v1##FUN##f(xx, yy); \
	memcpy(z, &zz, 8*sizeof(sys_float))
#endif
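
// Illustrative sketch (not part of the original header): the V1FUN_* and V1FUN2_*
// macros are meant to be expanded inside small wrappers that take explicit scalar
// arguments x0, x1, ... (and y0, y1, ... for the two-argument variants). FUN is pasted
// into the kernel name (v1##FUN##d or v1##FUN##f) and the second argument V is used to
// pad unused lanes when the packed register is wider than the argument list. The
// wrapper and kernel names below are hypothetical.
//
//   inline void my_exp4(double x0, double x1, double x2, double x3, double y[])
//   {
//       V1FUN_PD_4(exp, 0.);   // a single packed call of a v1expd() kernel
//   }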

#else // __AVX__

// fallback for non-AVX capable hardware
template<class T, class V>
inline void vecfun(const T x[], T y[], long nlo, long nhi, T (*scalfun1)(T), V (*)(V))
{
	for( long i=nlo; i < nhi; ++i )
		y[i] = scalfun1(x[i]);
}

template<class T, class V>
inline void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T (*scalfun1)(T, T), V (*)(V, V))
{
	for( long i=nlo; i < nhi; ++i )
		y[i] = scalfun1(x1[i], x2[i]);
}

#define V1FUN_4(FUN, V) \
	y[0] = FUN(x0); \
	y[1] = FUN(x1); \
	y[2] = FUN(x2); \
	y[3] = FUN(x3)

#define V1FUN_8(FUN, V) \
	y[0] = FUN(x0); \
	y[1] = FUN(x1); \
	y[2] = FUN(x2); \
	y[3] = FUN(x3); \
	y[4] = FUN(x4); \
	y[5] = FUN(x5); \
	y[6] = FUN(x6); \
	y[7] = FUN(x7)

#define V1FUN_16(FUN, V) \
	y[0] = FUN(x0); \
	y[1] = FUN(x1); \
	y[2] = FUN(x2); \
	y[3] = FUN(x3); \
	y[4] = FUN(x4); \
	y[5] = FUN(x5); \
	y[6] = FUN(x6); \
	y[7] = FUN(x7); \
	y[8] = FUN(x8); \
	y[9] = FUN(x9); \
	y[10] = FUN(x10); \
	y[11] = FUN(x11); \
	y[12] = FUN(x12); \
	y[13] = FUN(x13); \
	y[14] = FUN(x14); \
	y[15] = FUN(x15)

#define V1FUN_PD_4(FUN, V) \
	V1FUN_4(FUN, V)

#define V1FUN_PD_8(FUN, V) \
	V1FUN_8(FUN, V)

#define V1FUN_PS_4(FUN, V) \
	V1FUN_4(FUN##f, V)

#define V1FUN_PS_8(FUN, V) \
	V1FUN_8(FUN##f, V)

#define V1FUN_PS_16(FUN, V) \
	V1FUN_16(FUN##f, V)

#define V1FUN2_4(FUN, V) \
	z[0] = FUN(x0, y0); \
	z[1] = FUN(x1, y1); \
	z[2] = FUN(x2, y2); \
	z[3] = FUN(x3, y3)

#define V1FUN2_8(FUN, V) \
	z[0] = FUN(x0, y0); \
	z[1] = FUN(x1, y1); \
	z[2] = FUN(x2, y2); \
	z[3] = FUN(x3, y3); \
	z[4] = FUN(x4, y4); \
	z[5] = FUN(x5, y5); \
	z[6] = FUN(x6, y6); \
	z[7] = FUN(x7, y7)

#define V1FUN2_PD_4(FUN, V) \
	V1FUN2_4(FUN, V)

#define V1FUN2_PS_4(FUN, V) \
	V1FUN2_4(FUN##f, V)

#define V1FUN2_PS_8(FUN, V) \
	V1FUN2_8(FUN##f, V)

#endif // __AVX__

#endif // VECTORIZE_MATH_H