4 #ifndef VECTORIZE_MATH_H
5 #define VECTORIZE_MATH_H
// --- AVX-512 constant machinery ---------------------------------------------
// NOTE(review): this region presumably sits inside an #ifdef __AVX512F__ block
// (an AVX2 twin of every macro appears further down, and an
// "#endif // __AVX512F__" is visible later); the guard lines were lost in
// extraction. The leading integer on each line looks like an original file
// line number fused into the text by extraction — not part of the code.
// Each VEC*_CONST macro defines an aligned static array initialized with a
// repeated scalar (or raw IEEE-754 bit pattern for the *I_ variants) and then
// exposes it as a reference to the matching SIMD vector type.
// NOTE(review): the reinterpret_cast type-punning here technically violates
// strict aliasing; presumably relied upon as a compiler-specific guarantee.
34 typedef __m512i v16si;
// 8 doubles -> v8df (64-byte aligned for 512-bit loads)
38 #define VECD_CONST(name,x) \
39 ALIGNED(64) static const double __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
40 static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)
// 8 uint64 bit patterns reinterpreted as v8df (exact IEEE-754 constants)
42 #define VECDI_CONST(name,x) \
43 ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
44 static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)
// 16 floats -> v16sf
46 #define VECF_CONST(name,x) \
47 ALIGNED(64) static const sys_float __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
48 static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)
// 16 uint32 bit patterns reinterpreted as v16sf
50 #define VECFI_CONST(name,x) \
51 ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
52 static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)
// 8 uint32 -> v8si (256-bit integer vector; 32-byte alignment suffices)
54 #define VECI_CONST(name,x) \
55 ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
56 static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)
// 16 uint32 -> v16si (512-bit integer vector)
58 #define VECII_CONST(name,x) \
59 ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
60 static const v16si& name = *reinterpret_cast<const v16si*>(__avx_##name)
// 4 uint64 -> v4di (256-bit long-integer vector)
62 #define VECL_CONST(name,x) \
63 ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
64 static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)
// 8 uint64 -> v8di (512-bit long-integer vector)
66 #define VECLL_CONST(name,x) \
67 ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
68 static const v8di& name = *reinterpret_cast<const v8di*>(__avx_##name)
/** getmaskbit - test whether bit n of a SIMD lane mask is set.
 *
 * Used below to decide which vector lanes triggered a domain error and
 * therefore need to be printed in the diagnostic message.
 *
 * \param mask lane bitmask (e.g. from an AVX-512 compare)
 * \param n    lane index to test; caller must keep n < bit-width of mask
 * \return true if bit n is set
 *
 * NOTE(review): braces and surrounding layout were lost in this chunk's
 * extraction (along with fused line numbers); restored here — the visible
 * tokens are otherwise unchanged.
 */
inline
bool getmaskbit(unsigned int mask, unsigned int n)
{
	return ( ((mask>>n)&1) == 1 );
}
// print_pack: format the lanes of a SIMD pack x[0..npack-1] whose bit is set
// in mask, one "name[i] = value" line each; returns the accumulated string.
// NOTE(review): extraction lost interior lines here — presumably a
// "template<class T>" header, an "ostringstream oss;" declaration, braces,
// and a final "return oss.str();"; only add-comment edits made.
86 inline string print_pack(
const T x[],
int npack,
unsigned int mask,
const string& name)
// loop over all lanes of the pack
89 for(
int i=0; i < npack; ++i )
// only report lanes flagged in the error mask
91 if( getmaskbit(mask, i) )
95 oss << name <<
"[" << i <<
"] = " << x[i] <<
"\n";
// DEMsg: build a domain-error message for a v8df argument, listing the lanes
// selected by the AVX-512 mask. NOTE(review): extraction lost the
// declarations of "oss" and the union "p" (which presumably overlays the
// v8df x with a double d[8] array for lane access) — confirm against the
// full file.
100 inline string DEMsg(
const string& routine, v8df x, __mmask8 mask)
104 oss <<
"domain error in " << routine <<
".\n";
// append the offending lanes of x
106 oss << print_pack(p.d, 8, mask,
"x");
// DEMsg: single-precision overload — domain-error message for a v16sf,
// listing the lanes selected by the 16-bit mask. NOTE(review): "oss" and the
// union "p" (v16sf overlaid with sys_float f[16]) were lost in extraction.
110 inline string DEMsg(
const string& routine, v16sf x, __mmask16 mask)
114 oss <<
"domain error in " << routine <<
".\n";
// append the offending lanes of x
116 oss << print_pack(p.f, 16, mask,
"x");
// DEMsg: two-argument double overload — reports offending lanes of both x
// (mask1) and y (mask2) for binary vector functions (e.g. pow).
// NOTE(review): the declarations of "oss" and the lane-access union "p" were
// lost in extraction; also unclear from here how y's lanes are accessed
// (presumably a second union) — confirm against the full file.
120 inline string DEMsg(
const string& routine, v8df x, __mmask8 mask1, v8df y, __mmask8 mask2)
124 oss <<
"domain error in " << routine <<
".\n";
// lanes of the first argument
126 oss << print_pack(p.d, 8, mask1,
"x");
// lanes of the second argument
128 oss << print_pack(p.d, 8, mask2,
"y");
// DEMsg: two-argument single-precision overload — reports offending lanes of
// x and y for binary vector functions. NOTE(review): "oss" and the
// lane-access union(s) "p" were lost in extraction.
132 inline string DEMsg(
const string& routine, v16sf x, __mmask16 mask1, v16sf y, __mmask16 mask2)
136 oss <<
"domain error in " << routine <<
".\n";
// lanes of the first argument
138 oss << print_pack(p.f, 16, mask1,
"x");
// lanes of the second argument
140 oss << print_pack(p.f, 16, mask2,
"y");
// --- AVX/AVX2 fallback constant machinery ------------------------------------
// Same macros as the AVX-512 set above, at half vector width (presumably the
// #else branch of the __AVX512F__ guard; the preprocessor lines were lost in
// extraction). Leading integers are fused original line numbers, not code.
// 4 doubles -> v4df (32-byte aligned for 256-bit loads)
146 #define VECD_CONST(name,x) \
147 ALIGNED(32) static const double __avx_##name[4] = {x,x,x,x}; \
148 static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)
// 4 uint64 bit patterns reinterpreted as v4df (exact IEEE-754 constants)
150 #define VECDI_CONST(name,x) \
151 ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
152 static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)
// 8 floats -> v8sf
154 #define VECF_CONST(name,x) \
155 ALIGNED(32) static const sys_float __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
156 static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)
// 8 uint32 bit patterns reinterpreted as v8sf
158 #define VECFI_CONST(name,x) \
159 ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
160 static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)
// 4 uint32 -> v4si (128-bit integer vector)
162 #define VECI_CONST(name,x) \
163 ALIGNED(16) static const uint32 __avx_##name[4] = {x,x,x,x}; \
164 static const v4si& name = *reinterpret_cast<const v4si*>(__avx_##name)
// 8 uint32 -> v8si (256-bit integer vector)
166 #define VECII_CONST(name,x) \
167 ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
168 static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)
// 2 uint64 -> v2di (128-bit long-integer vector)
170 #define VECL_CONST(name,x) \
171 ALIGNED(16) static const uint64 __avx_##name[2] = {x,x}; \
172 static const v2di& name = *reinterpret_cast<const v2di*>(__avx_##name)
// 4 uint64 -> v4di (256-bit long-integer vector)
174 #define VECLL_CONST(name,x) \
175 ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
176 static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)
// print_pack (AVX2 variant): here the lane mask is an integer array m[]
// (AVX/AVX2 compares produce vector masks, not __mmask bits) instead of a
// bitmask. Formats flagged lanes of x as "name[i] = value" lines.
// NOTE(review): extraction lost interior lines — presumably the
// "ostringstream oss;" declaration, the test on m[i], braces, and a final
// "return oss.str();"; only comments added here.
188 template<class T, class U>
189 inline
string print_pack(const T x[], const U m[],
int npack, const
string& name)
// loop over all lanes; the (lost) body presumably tests m[i] != 0
192 for(
int i=0; i < npack; ++i )
198 oss << name <<
"[" << i <<
"] = " << x[i] <<
"\n";
// DEMsg (AVX2): domain-error message for a v4df; the mask is itself a v4df
// compare result, accessed lane-wise through the (lost) union "m" as
// integers m.l. NOTE(review): declarations of "oss", "p" and "m" were lost
// in extraction — confirm against the full file.
203 inline string DEMsg(
const string& routine, v4df x, v4df mask)
207 oss <<
"domain error in " << routine <<
".\n";
// append offending lanes of x (m.l: mask lanes viewed as integers)
210 oss << print_pack(p.d, m.l, 4,
"x");
// DEMsg (AVX2): single-precision overload for a v8sf with a v8sf vector
// mask. NOTE(review): declarations of "oss", "p" and "m" lost in extraction.
214 inline string DEMsg(
const string& routine, v8sf x, v8sf mask)
218 oss <<
"domain error in " << routine <<
".\n";
// append offending lanes of x (m.i: mask lanes viewed as integers)
221 oss << print_pack(p.f, m.i, 8,
"x");
// DEMsg (AVX2): two-argument double overload — reports offending lanes of x
// (mask1) and y (mask2). NOTE(review): declarations lost in extraction; both
// print_pack calls visibly reuse "p"/"m", which looks like a transcription
// artifact — presumably the second call uses the y/mask2 unions; confirm
// against the full file.
225 inline string DEMsg(
const string& routine, v4df x, v4df mask1, v4df y, v4df mask2)
229 oss <<
"domain error in " << routine <<
".\n";
// lanes of the first argument
232 oss << print_pack(p.d, m.l, 4,
"x");
// lanes of the second argument
235 oss << print_pack(p.d, m.l, 4,
"y");
// DEMsg (AVX2): two-argument single-precision overload — reports offending
// lanes of x and y. NOTE(review): declarations lost in extraction; as above,
// the second print_pack visibly reusing "p"/"m" is presumably an extraction
// artifact for the y/mask2 unions.
239 inline string DEMsg(
const string& routine, v8sf x, v8sf mask1, v8sf y, v8sf mask2)
243 oss <<
"domain error in " << routine <<
".\n";
// lanes of the first argument
246 oss << print_pack(p.f, m.i, 8,
"x");
// lanes of the second argument
249 oss << print_pack(p.f, m.i, 8,
"y");
253 #endif // __AVX512F__
// --- Named SIMD constants -----------------------------------------------------
// Instantiated through whichever VEC*_CONST set is active (AVX-512 or AVX2).
// The *I_ variants take the raw IEEE-754 bit pattern so the constant is exact
// to the last bit (e.g. 1/3, ln 2), which a decimal literal could not
// guarantee across compilers.
// simple double constants
255 VECD_CONST(mthree,-3.);
256 VECD_CONST(mone,-1.);
257 VECD_CONST(mhalf,-0.5);
// 0x3fd5555555555555 is the correctly-rounded double for 1/3
259 VECDI_CONST(third,0x3fd5555555555555);
260 VECD_CONST(half,0.5);
262 VECD_CONST(c1p5,1.5);
264 VECD_CONST(three,3.);
// 1023 = IEEE-754 double exponent bias (used by exp/log style routines)
267 VECD_CONST(c1023,1023.);
// exact logarithm constants: ln 2, log2 e, ln 10, log10 e
269 VECDI_CONST(ln_two,0x3fe62e42fefa39ef);
270 VECDI_CONST(log2e,0x3ff71547652b82fe);
271 VECDI_CONST(ln_ten,0x40026bb1bbb55516);
272 VECDI_CONST(log10e,0x3fdbcb7b1526e50e);
// double range limits: DBL_MIN, DBL_MAX, sqrt(DBL_MAX)
274 VECDI_CONST(dbl_min,0x0010000000000000);
275 VECDI_CONST(dbl_max,0x7fefffffffffffff);
276 VECDI_CONST(sqrt_dbl_max,0x5fefffffffffffff);
// single-precision counterparts of the constants above
278 VECF_CONST(mthreef,-3.f);
279 VECF_CONST(monef,-1.f);
280 VECF_CONST(mhalff,-0.5f);
281 VECF_CONST(zerof,0.f);
282 VECFI_CONST(thirdf,0x3eaaaaab);
283 VECF_CONST(halff,0.5f);
284 VECF_CONST(onef,1.f);
285 VECF_CONST(twof,2.f);
286 VECF_CONST(c1p5f,1.5f);
287 VECF_CONST(threef,3.f);
288 VECF_CONST(sixf,6.f);
289 VECF_CONST(tenf,10.f);
// 127 = IEEE-754 single exponent bias
290 VECF_CONST(c127,127.f);
// exact single-precision logarithm constants
292 VECFI_CONST(ln_twof,0x3f317218);
293 VECFI_CONST(log2ef,0x3fb8aa3b);
294 VECFI_CONST(ln_tenf,0x40135d8e);
295 VECFI_CONST(log10ef,0x3ede5bd9);
// single range limits: FLT_MIN, FLT_MAX, sqrt(FLT_MAX)
297 VECFI_CONST(flt_min,0x00800000);
298 VECFI_CONST(flt_max,0x7f7fffff);
299 VECFI_CONST(sqrt_flt_max,0x5f7fffff);
// vecfun_partial (float): apply the vector function vecfun1 to the ragged
// tail x[nlo..nhi-1] when fewer than a full pack of elements remain. The
// pack is padded by clamping the load index with min(nlo+i,nhi-1) — i.e.
// the last valid element is replicated into unused lanes — then the results
// are copied back for indices nlo..nhi-1 only.
// NOTE(review): extraction lost interior lines — presumably a template
// header, the declarations of the load/store unions xx/yy, braces, and the
// final store loop body; only comments added here.
302 inline void vecfun_partial(
const sys_float x[],
sys_float y[],
long nlo,
long nhi,
long npack, V (*vecfun1)(V))
// gather a full pack, clamping reads so no out-of-range access occurs
305 for(
long i=0; i < npack; ++i )
306 xx.f[i] = x[
min(nlo+i,nhi-1)];
// one vector evaluation of the whole pack
307 yy.ps = vecfun1(xx.ps);
// scatter only the valid lanes back to y (loop body lost in extraction)
308 for(
long i=nlo; i < nhi; ++i )
// vecfun_partial (double): double-precision twin of the routine above —
// pads the final partial pack by replicating the last valid element
// (min(nlo+i,nhi-1)), evaluates one vector call, and stores the valid lanes.
// NOTE(review): same extraction losses as the float version (unions, braces,
// store-loop body); only comments added here.
313 inline void vecfun_partial(
const double x[],
double y[],
long nlo,
long nhi,
long npack, V (*vecfun1)(V))
// gather a full pack with clamped indices
316 for(
long i=0; i < npack; ++i )
317 xx.d[i] = x[
min(nlo+i,nhi-1)];
// one vector evaluation
318 yy.pd = vecfun1(xx.pd);
// scatter valid lanes back to y (loop body lost in extraction)
319 for(
long i=nlo; i < nhi; ++i )
// vecfun2_partial (float flavor): two-argument analogue of vecfun_partial —
// pads partial packs of both inputs and applies the binary vector function.
// NOTE(review): the start of this definition (presumably
// "inline void vecfun2_partial(const sys_float x1[], const sys_float x2[],
// sys_float y[], long nlo, long nhi," plus unions and braces) was lost in
// extraction; only the tail of the signature is visible below.
325 long npack, V (*vecfun1)(V, V))
// gather both input packs with clamped indices
328 for(
long i=0; i < npack; ++i )
330 xx1.f[i] = x1[
min(nlo+i,nhi-1)];
331 xx2.f[i] = x2[
min(nlo+i,nhi-1)];
// one binary vector evaluation
333 yy.ps = vecfun1(xx1.ps, xx2.ps);
// scatter valid lanes back to y (loop body lost in extraction)
334 for(
long i=nlo; i < nhi; ++i )
// vecfun2_partial (double): binary-function tail handler — pads the final
// partial packs of x1 and x2 by replicating the last valid element, performs
// one vector call, and stores only the valid lanes of the result.
// NOTE(review): extraction lost interior lines (template header, unions,
// braces, store-loop body); only comments added here.
339 inline void vecfun2_partial(
const double x1[],
const double x2[],
double y[],
long nlo,
long nhi,
long npack,
// gather both input packs with clamped indices
343 for(
long i=0; i < npack; ++i )
345 xx1.d[i] = x1[
min(nlo+i,nhi-1)];
346 xx2.d[i] = x2[
min(nlo+i,nhi-1)];
// one binary vector evaluation
348 yy.pd = vecfun1(xx1.pd, xx2.pd);
// scatter valid lanes back to y (loop body lost in extraction)
349 for(
long i=nlo; i < nhi; ++i )
// vecfun: apply a vectorized unary function over x[nlo..nhi-1] into y.
// Strategy: (1) if fewer than one pack of elements, delegate entirely to
// vecfun_partial; (2) otherwise compute each array's lane offset from its
// address so loads/stores in the main loop are pack-aligned; (3) if x and y
// have different offsets, write into a temporary ylocal chosen to share x's
// alignment, then copy back at the end. The unused scalar-function
// parameter T(*)(T) keeps this signature interchangeable with the scalar
// fallback overload further down.
// NOTE(review): reinterpret_cast<long>(ptr) assumes long is pointer-sized
// (true on LP64, not on Windows LLP64) — flagging, not changing.
// NOTE(review): several interior lines (braces, the lgNonAligned branches,
// the main-loop body, the copy-back and delete[] of ylocal) were lost in
// extraction; only comments added here.
357 template<
class T,
class V>
358 inline void vecfun(
const T x[], T y[],
long nlo,
long nhi, T (*)(T), V (*vecfun1)(V))
// number of T elements in one vector register
364 long npack =
sizeof(V)/
sizeof(T);
365 if( nhi-nlo < npack )
367 vecfun_partial(x, y, nlo, nhi, npack, vecfun1);
// lane offset of x[nlo] and y[nlo] within a pack-aligned block
371 long ox = (
reinterpret_cast<long>(x)/
sizeof(T)+nlo)%npack;
372 long oy = (
reinterpret_cast<long>(y)/
sizeof(T)+nlo)%npack;
// differing offsets mean y cannot be written with aligned stores directly
373 bool lgNonAligned = ( ox != oy );
374 T *yl, *ylocal = NULL;
// allocate a scratch output and align it to x's lane offset
377 ylocal =
new T[nhi+npack-1];
378 long ol = (
reinterpret_cast<long>(ylocal)/
sizeof(T)+nlo)%npack;
382 yl = &ylocal[ox+npack-ol];
// peel the unaligned head so the main loop starts on a pack boundary
392 vecfun_partial(x, yl, ilo,
min(ilo+npack-ox,nhi), npack, vecfun1);
393 ilo =
min(ilo+npack-ox,nhi);
// main loop: whole packs via aligned reinterpret_cast loads/stores
396 for( i=ilo; i < nhi-npack+1; i += npack )
398 const V& xx = *
reinterpret_cast<const V*
>(&x[i]);
399 V& yy = *
reinterpret_cast<V*
>(&yl[i]);
// handle the remaining tail
405 vecfun_partial(x, yl, ilo, nhi, npack, vecfun1);
// copy scratch results back to y when the non-aligned path was used
408 for( i=nlo; i < nhi; ++i )
// vecfun2: apply a vectorized binary function over x1/x2[nlo..nhi-1] into y.
// Same alignment strategy as vecfun, with one extra complication: if x2's
// lane offset differs from x1's, x2 is first memcpy'd into a scratch buffer
// (x2local) positioned to share x1's alignment; likewise y may be routed
// through ylocal. The unused scalar parameter T(*)(T,T) mirrors the scalar
// fallback overload's signature.
// NOTE(review): reinterpret_cast<long>(ptr) assumes pointer-sized long
// (LP64 assumption) — flagging, not changing.
// NOTE(review): interior lines (braces, the lgNonAligned1/2 branches, the
// copy-back loop, delete[] of the scratch buffers, and the declarations of
// x2l/ptr/i) were lost in extraction; only comments added here.
414 template<
class T,
class V>
415 inline void vecfun2(
const T x1[],
const T x2[], T y[],
long nlo,
long nhi, T (*)(T, T), V (*vecfun1)(V, V))
// number of T elements in one vector register
421 long npack =
sizeof(V)/
sizeof(T);
422 if( nhi-nlo < npack )
424 vecfun2_partial(x1, x2, y, nlo, nhi, npack, vecfun1);
// lane offsets of the three arrays within a pack-aligned block
428 long ox1 = (
reinterpret_cast<long>(
x1)/
sizeof(T)+nlo)%npack;
429 long ox2 = (
reinterpret_cast<long>(
x2)/
sizeof(T)+nlo)%npack;
430 long oy = (
reinterpret_cast<long>(y)/
sizeof(T)+nlo)%npack;
// x2 misaligned w.r.t. x1 -> copy x2 into aligned scratch
431 bool lgNonAligned1 = ( ox1 != ox2 );
// y misaligned w.r.t. x1 -> route output through aligned scratch
432 bool lgNonAligned2 = ( ox1 != oy );
// scratch copy of x2, placed so its lane offset matches x1's
437 x2local =
new T[nhi+npack-1];
438 long ol = (
reinterpret_cast<long>(x2local)/
sizeof(T)+nlo)%npack;
441 ptr = &x2local[ox1-ol];
443 ptr = &x2local[ox1+npack-ol];
444 memcpy(ptr+nlo, x2+nlo,
size_t((nhi-nlo)*
sizeof(T)));
451 T *yl, *ylocal = NULL;
// scratch output aligned to x1's lane offset
454 ylocal =
new T[nhi+npack-1];
455 long ol = (
reinterpret_cast<long>(ylocal)/
sizeof(T)+nlo)%npack;
457 yl = &ylocal[ox1-ol];
459 yl = &ylocal[ox1+npack-ol];
// peel the unaligned head so the main loop starts on a pack boundary
469 vecfun2_partial(x1, x2l, yl, ilo,
min(ilo+npack-ox1,nhi), npack, vecfun1);
470 ilo =
min(ilo+npack-ox1,nhi);
// main loop: whole packs via aligned reinterpret_cast loads/stores
473 for( i=ilo; i < nhi-npack+1; i += npack )
475 const V& xx1 = *
reinterpret_cast<const V*
>(&x1[i]);
476 const V& xx2 = *
reinterpret_cast<const V*
>(&x2l[i]);
477 V& yy = *
reinterpret_cast<V*
>(&yl[i]);
478 yy = vecfun1(xx1, xx2);
// handle the remaining tail
483 vecfun2_partial(x1, x2l, yl, ilo, nhi, npack, vecfun1);
// copy scratch results back to y when the non-aligned path was used
488 for( i=nlo; i < nhi; ++i )
// --- V1FUN_P* dispatch macros -------------------------------------------------
// Helpers that pack N scalar arguments (x0..xN-1) into a vector, call the
// vector routine v1<FUN>d / v1<FUN>f, and memcpy the first N results to y.
// Unused upper lanes are filled with the safe value V so they cannot raise
// a spurious domain error. Each macro appears twice: an _mm512_* version and
// an _mm256_* version — presumably the two arms of an #ifdef __AVX512F__
// whose guard lines were lost in extraction (some interior lines, e.g. the
// "v4df yy[2];" declaration of the split-pack variants, are missing too).
// 4 doubles, one 512-bit pack (upper 4 lanes padded with V)
495 #define V1FUN_PD_4(FUN, V) \
496 v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
497 v8df yy = v1##FUN##d(xx); \
498 memcpy(y, &yy, 4*sizeof(double))
// 4 doubles, one 256-bit pack (AVX2 arm)
500 #define V1FUN_PD_4(FUN, V) \
501 v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
502 v4df yy = v1##FUN##d(xx); \
503 memcpy(y, &yy, 4*sizeof(double))
// 8 doubles, one 512-bit pack
507 #define V1FUN_PD_8(FUN, V) \
508 v8df xx = _mm512_set_pd( x7, x6, x5, x4, x3, x2, x1, x0 ); \
509 v8df yy = v1##FUN##d(xx); \
510 memcpy(y, &yy, 8*sizeof(double))
// 8 doubles split into two 256-bit packs (AVX2 arm; yy[] decl lost)
512 #define V1FUN_PD_8(FUN, V) \
514 v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
515 yy[0] = v1##FUN##d(xx); \
516 xx = _mm256_set_pd( x7, x6, x5, x4 ); \
517 yy[1] = v1##FUN##d(xx); \
518 memcpy(y, &yy, 8*sizeof(double))
// 4 floats, one 512-bit pack (12 pad lanes)
522 #define V1FUN_PS_4(FUN, V) \
523 v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
524 v16sf yy = v1##FUN##f(xx); \
525 memcpy(y, &yy, 4*sizeof(sys_float))
// 4 floats, one 256-bit pack (AVX2 arm)
527 #define V1FUN_PS_4(FUN, V) \
528 v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
529 v8sf yy = v1##FUN##f(xx); \
530 memcpy(y, &yy, 4*sizeof(sys_float))
// 8 floats, one 512-bit pack (8 pad lanes)
534 #define V1FUN_PS_8(FUN, V) \
535 v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
536 v16sf yy = v1##FUN##f(xx); \
537 memcpy(y, &yy, 8*sizeof(sys_float))
// 8 floats, one 256-bit pack (AVX2 arm)
539 #define V1FUN_PS_8(FUN, V) \
540 v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
541 v8sf yy = v1##FUN##f(xx); \
542 memcpy(y, &yy, 8*sizeof(sys_float))
// 16 floats, one 512-bit pack
546 #define V1FUN_PS_16(FUN, V) \
547 v16sf xx = _mm512_set_ps( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ); \
548 v16sf yy = v1##FUN##f(xx); \
549 memcpy(y, &yy, 16*sizeof(sys_float))
// 16 floats split into two 256-bit packs (AVX2 arm; yy[] decl lost)
551 #define V1FUN_PS_16(FUN, V) \
553 v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
554 yy[0] = v1##FUN##f(xx); \
555 xx = _mm256_set_ps( x15, x14, x13, x12, x11, x10, x9, x8 ); \
556 yy[1] = v1##FUN##f(xx); \
557 memcpy(y, &yy, 16*sizeof(sys_float))
// --- V1FUN2_P* dispatch macros ------------------------------------------------
// Two-argument analogues of the V1FUN_P* macros above: pack x0.. and y0..
// into vectors, call the binary vector routine, and memcpy N results to z.
// Pad lanes use the safe value V. As above, each macro has an _mm512_* and
// an _mm256_* arm whose #ifdef __AVX512F__ guards were lost in extraction.
// 4 doubles, 512-bit packs
561 #define V1FUN2_PD_4(FUN, V) \
562 v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
563 v8df yy = _mm512_set_pd( V, V, V, V, y3, y2, y1, y0 ); \
564 v8df zz = v1##FUN##d(xx, yy); \
565 memcpy(z, &zz, 4*sizeof(double))
// 4 doubles, 256-bit packs (AVX2 arm)
567 #define V1FUN2_PD_4(FUN, V) \
568 v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
569 v4df yy = _mm256_set_pd( y3, y2, y1, y0 ); \
570 v4df zz = v1##FUN##d(xx, yy); \
571 memcpy(z, &zz, 4*sizeof(double))
// 4 floats, 512-bit packs (12 pad lanes)
575 #define V1FUN2_PS_4(FUN, V) \
576 v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
577 v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, y3, y2, y1, y0 ); \
578 v16sf zz = v1##FUN##f(xx, yy); \
579 memcpy(z, &zz, 4*sizeof(sys_float))
// 4 floats, 256-bit packs (AVX2 arm)
581 #define V1FUN2_PS_4(FUN, V) \
582 v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
583 v8sf yy = _mm256_set_ps( V, V, V, V, y3, y2, y1, y0 ); \
584 v8sf zz = v1##FUN##f(xx, yy); \
585 memcpy(z, &zz, 4*sizeof(sys_float))
// 8 floats, 512-bit packs (8 pad lanes)
589 #define V1FUN2_PS_8(FUN, V) \
590 v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
591 v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, y7, y6, y5, y4, y3, y2, y1, y0 ); \
592 v16sf zz = v1##FUN##f(xx, yy); \
593 memcpy(z, &zz, 8*sizeof(sys_float))
// 8 floats, 256-bit packs (AVX2 arm)
595 #define V1FUN2_PS_8(FUN, V) \
596 v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
597 v8sf yy = _mm256_set_ps( y7, y6, y5, y4, y3, y2, y1, y0 ); \
598 v8sf zz = v1##FUN##f(xx, yy); \
599 memcpy(z, &zz, 8*sizeof(sys_float))
// vecfun (scalar fallback): when no vector hardware path is compiled in,
// simply apply the scalar function element-by-element. The ignored V(*)(V)
// parameter keeps the signature identical to the vectorized overload so call
// sites need not change.
// NOTE(review): braces lost in extraction; only comments added here.
605 template<
class T,
class V>
606 inline void vecfun(
const T x[], T y[],
long nlo,
long nhi, T (*scalfun1)(T), V (*)(V))
608 for(
long i=nlo; i < nhi; ++i )
609 y[i] = scalfun1(x[i]);
// vecfun2 (scalar fallback): element-by-element application of the scalar
// binary function; the ignored V(*)(V,V) parameter mirrors the vectorized
// overload's signature.
// NOTE(review): braces lost in extraction; only comments added here.
612 template<
class T,
class V>
613 inline void vecfun2(
const T x1[],
const T x2[], T y[],
long nlo,
long nhi, T (*scalfun1)(T, T), V (*)(V, V))
615 for(
long i=nlo; i < nhi; ++i )
616 y[i] = scalfun1(x1[i], x2[i]);
// --- Scalar-fallback V1FUN* macros --------------------------------------------
// Non-SIMD versions of the dispatch macros: each just calls FUN on every
// scalar argument pair and stores to y[]/z[].
// NOTE(review): this run is badly truncated by extraction — several macro
// bodies (V1FUN_4/8/16, the *_PD_*/*_PS_* redirects, V1FUN2_PD_4 etc.) are
// missing lines, and trailing backslashes splice adjacent #defines together.
// No comments can be inserted inside the run without changing preprocessor
// splicing, so the text below is left byte-identical; reconstruct from the
// upstream file.
619 #define V1FUN_4(FUN, V) \
625 #define V1FUN_8(FUN, V) \
635 #define V1FUN_16(FUN, V) \
653 #define V1FUN_PD_4(FUN, V) \
656 #define V1FUN_PD_8(FUN, V) \
659 #define V1FUN_PS_4(FUN, V) \
662 #define V1FUN_PS_8(FUN, V) \
665 #define V1FUN_PS_16(FUN, V) \
668 #define V1FUN2_4(FUN, V) \
669 z[0] = FUN(x0, y0); \
670 z[1] = FUN(x1, y1); \
671 z[2] = FUN(x2, y2); \
674 #define V1FUN2_8(FUN, V) \
675 z[0] = FUN(x0, y0); \
676 z[1] = FUN(x1, y1); \
677 z[2] = FUN(x2, y2); \
678 z[3] = FUN(x3, y3); \
679 z[4] = FUN(x4, y4); \
680 z[5] = FUN(x5, y5); \
681 z[6] = FUN(x6, y6); \
684 #define V1FUN2_PD_4(FUN, V) \
687 #define V1FUN2_PS_4(FUN, V) \
690 #define V1FUN2_PS_8(FUN, V) \
void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T(*scalfun1)(T, T), V(*)(V, V))
ALIGNED(CD_ALIGN) static const double qg32_w[numPoints]
void vecfun(const T x[], T y[], long nlo, long nhi, T(*scalfun1)(T), V(*)(V))