2intersect.h |
|
8026 |
4dpwssd.h |
!defined(SIMDE_X86_AVX512_4DPWSSD_H) |
2637 |
4dpwssds.h |
!defined(SIMDE_X86_AVX512_4DPWSSDS_H) |
2672 |
abs.h |
|
19089 |
add.h |
|
21097 |
adds.h |
|
18352 |
and.h |
|
9570 |
andnot.h |
|
7564 |
avg.h |
|
8799 |
bitshuffle.h |
|
7989 |
blend.h |
|
9480 |
broadcast.h |
|
29208 |
cast.h |
|
9094 |
cmp.h |
|
24863 |
cmpeq.h |
|
7897 |
cmpge.h |
|
50531 |
cmpgt.h |
|
6982 |
cmple.h |
|
50422 |
cmplt.h |
!defined(SIMDE_X86_AVX512_CMPLT_H) |
4022 |
cmpneq.h |
|
17516 |
compress.h |
|
24852 |
conflict.h |
|
11984 |
copysign.h |
!defined(SIMDE_X86_AVX512_COPYSIGN_H) |
3008 |
cvt.h |
https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx |
10197 |
cvts.h |
|
23453 |
cvtt.h |
!defined(SIMDE_X86_AVX512_CVTT_H) |
3794 |
dbsad.h |
|
15983 |
div.h |
|
5289 |
dpbf16.h |
|
9991 |
dpbusd.h |
|
10824 |
dpbusds.h |
|
13835 |
dpwssd.h |
|
8962 |
dpwssds.h |
|
11188 |
expand.h |
!defined(SIMDE_X86_AVX512_EXPAND_H) |
3115 |
extract.h |
GCC 6 generates an ICE |
9572 |
fixupimm.h |
|
28836 |
fixupimm_round.h |
|
26373 |
flushsubnormal.h |
!defined(SIMDE_X86_AVX512_FLUSHSUBNORMAL_H) |
2380 |
fmadd.h |
!defined(SIMDE_X86_AVX512_FMADD_H) |
4563 |
fmsub.h |
|
9231 |
fnmadd.h |
!defined(SIMDE_X86_AVX512_FNMADD_H) |
3620 |
fnmsub.h |
!defined(SIMDE_X86_AVX512_FNMSUB_H) |
3620 |
insert.h |
|
18531 |
knot.h |
!defined(SIMDE_X86_AVX512_KNOT_H) |
3435 |
kshift.h |
|
6599 |
kxor.h |
!defined(SIMDE_X86_AVX512_KXOR_H) |
3559 |
load.h |
!defined(SIMDE_X86_AVX512_LOAD_H) |
3495 |
loadu.h |
|
6349 |
lzcnt.h |
|
7815 |
madd.h |
|
5934 |
maddubs.h |
|
6126 |
max.h |
|
18597 |
min.h |
|
18599 |
mov.h |
N.B. CM: No fallbacks as there are only two elements |
30273 |
mov_mask.h |
There is no 32-bit _mm_movemask_* function, so we use
_mm_movemask_epi8 then extract the odd bits. |
11929 |
movm.h |
|
15715 |
mul.h |
|
9151 |
mulhi.h |
!defined(SIMDE_X86_AVX512_MULHI_H) |
2282 |
mulhrs.h |
!defined(SIMDE_X86_AVX512_MULHRS_H) |
2272 |
mullo.h |
|
5509 |
multishift.h |
|
6519 |
negate.h |
!defined(SIMDE_X86_AVX512_NEGATE_H) |
2639 |
or.h |
|
9812 |
packs.h |
|
6819 |
packus.h |
|
6833 |
permutex2var.h |
The following generic code avoids many, nearly identical, repetitions of fairly complex code.
If the compiler optimizes well, in particular extracting invariant code from loops
and simplifying code involving constants passed as arguments, it should not be
significantly slower than specific code.
Note that when the original vector contains few elements, these implementations
may not be faster than portable code.
|
69833 |
permutexvar.h |
|
49807 |
popcnt.h |
v -= ((v >> 1) & UINT8_C(0x55)); |
46335 |
range.h |
|
28109 |
range_round.h |
|
25064 |
rol.h |
|
14839 |
rolv.h |
|
14812 |
ror.h |
|
14869 |
rorv.h |
|
14732 |
round.h |
For architectures which lack a current direction SIMD instruction.
Note that NEON actually has a current rounding mode instruction,
but in ARMv8+ the rounding mode is ignored and nearest is always
used, so we treat ARMv7 as having a rounding mode but ARMv8 as
not. |
13094 |
roundscale.h |
|
23297 |
roundscale_round.h |
|
26912 |
sad.h |
!defined(SIMDE_X86_AVX512_SAD_H) |
2638 |
scalef.h |
|
14399 |
set.h |
|
15717 |
set1.h |
|
9209 |
set4.h |
!defined(SIMDE_X86_AVX512_SET4_H) |
3705 |
setone.h |
!defined(SIMDE_X86_AVX512_SETONE_H) |
2104 |
setr.h |
!defined(SIMDE_X86_AVX512_SETR_H) |
4723 |
setr4.h |
!defined(SIMDE_X86_AVX512_SETR4_H) |
3735 |
setzero.h |
!defined(SIMDE_X86_AVX512_SETZERO_H) |
2906 |
shldv.h |
|
5050 |
shuffle.h |
|
11561 |
sll.h |
|
8169 |
slli.h |
I guess the restriction was added in 6.4, back-ported to 5.5, then
removed (fixed) in 7? |
6854 |
sllv.h |
!defined(SIMDE_X86_AVX512_SLLV_H) |
4081 |
sqrt.h |
!defined(SIMDE_X86_AVX512_SQRT_H) |
3977 |
sra.h |
!defined(SIMDE_X86_AVX512_SRA_H) |
2684 |
srai.h |
!defined(SIMDE_X86_AVX512_SRAI_H) |
2340 |
srav.h |
!defined(SIMDE_X86_AVX512_SRAV_H) |
2271 |
srl.h |
|
6928 |
srli.h |
|
6641 |
srlv.h |
|
9485 |
store.h |
!defined(SIMDE_X86_AVX512_STORE_H) |
3507 |
storeu.h |
|
6905 |
sub.h |
|
10888 |
subs.h |
|
7107 |
ternarylogic.h |
The ternarylogic implementation is based on Wojciech Muła's work at
https://github.com/WojciechMula/ternary-logic |
112262 |
test.h |
|
7540 |
testn.h |
!defined(SIMDE_X86_AVX512_TESTN_H) |
2130 |
types.h |
The problem is that Microsoft doesn't support 64-byte aligned parameters, except for
__m512/__m512i/__m512d. Since our private union has an __m512 member it will be 64-byte
aligned even if we reduce the alignment requirements of other members.
Even if we're on x86 and use the native AVX-512 types for arguments/return values, the
to/from private functions will break, and I'm not willing to change their APIs to use
pointers (which would also require more verbose code on the caller side) just to make
MSVC happy.
If you want to use AVX-512 in SIMDe, you'll need to either upgrade to MSVC 2017 or later,
or upgrade to a different compiler (clang-cl, perhaps?). If you have an idea of how to
fix this without requiring API changes (except transparently through macros), patches
are welcome.
|
32459 |
unpackhi.h |
|
28925 |
unpacklo.h |
|
28831 |
xor.h |
TODO: generate reduced case to give to Intel |
10505 |
xorsign.h |
This is a SIMDe extension which is not part of AVX-512. It exists
because a lot of numerical methods in SIMDe have algorithms which do
something like:
float sgn = input < 0 ? -1 : 1;
...
return res * sgn;
Which can be replaced with a much more efficient call to xorsign:
return simde_x_mm512_xorsign_ps(res, input);
While this was originally intended for use in SIMDe, please feel
free to use it in your code.
|
2426 |