Name Description Size
2intersect.h 8026
4dpwssd.h !defined(SIMDE_X86_AVX512_4DPWSSD_H) 2637
4dpwssds.h !defined(SIMDE_X86_AVX512_4DPWSSDS_H) 2672
abs.h 19089
add.h 21097
adds.h 18352
and.h 9570
andnot.h 7564
avg.h 8799
bitshuffle.h 7989
blend.h 9480
broadcast.h 29208
cast.h 9094
cmp.h 24863
cmpeq.h 7897
cmpge.h 50531
cmpgt.h 6982
cmple.h 50422
cmplt.h !defined(SIMDE_X86_AVX512_CMPLT_H) 4022
cmpneq.h 17516
compress.h 24852
conflict.h 11984
copysign.h !defined(SIMDE_X86_AVX512_COPYSIGN_H) 3008
cvt.h https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx 10197
cvts.h 23453
cvtt.h !defined(SIMDE_X86_AVX512_CVTT_H) 3794
dbsad.h 15983
div.h 5289
dpbf16.h 9991
dpbusd.h 10824
dpbusds.h 13835
dpwssd.h 8962
dpwssds.h 11188
expand.h !defined(SIMDE_X86_AVX512_EXPAND_H) 3115
extract.h GCC 6 generates an ICE 9572
fixupimm.h 28836
fixupimm_round.h 26373
flushsubnormal.h !defined(SIMDE_X86_AVX512_FLUSHSUBNORMAL_H) 2380
fmadd.h !defined(SIMDE_X86_AVX512_FMADD_H) 4563
fmsub.h 9231
fnmadd.h !defined(SIMDE_X86_AVX512_FNMADD_H) 3620
fnmsub.h !defined(SIMDE_X86_AVX512_FNMSUB_H) 3620
insert.h 18531
knot.h !defined(SIMDE_X86_AVX512_KNOT_H) 3435
kshift.h 6599
kxor.h !defined(SIMDE_X86_AVX512_KXOR_H) 3559
load.h !defined(SIMDE_X86_AVX512_LOAD_H) 3495
loadu.h 6349
lzcnt.h 7815
madd.h 5934
maddubs.h 6126
max.h 18597
min.h 18599
mov.h N.B. CM: No fallbacks as there are only two elements 30273
mov_mask.h There is no 32-bit _mm_movemask_* function, so we use _mm_movemask_epi8 then extract the odd bits. 11929
movm.h 15715
mul.h 9151
mulhi.h !defined(SIMDE_X86_AVX512_MULHI_H) 2282
mulhrs.h !defined(SIMDE_X86_AVX512_MULHRS_H) 2272
mullo.h 5509
multishift.h 6519
negate.h !defined(SIMDE_X86_AVX512_NEGATE_H) 2639
or.h 9812
packs.h 6819
packus.h 6833
permutex2var.h The following generic code avoids many, nearly identical, repetitions of fairly complex code. If the compiler optimizes well, in particular extracting invariant code from loops and simplifying code involving constants passed as arguments, it should not be significantly slower than specific code. Note that when the original vector contains few elements, these implementations may not be faster than portable code. 69833
permutexvar.h 49807
popcnt.h v -= ((v >> 1) & UINT8_C(0x55)); 46335
range.h 28109
range_round.h 25064
rol.h 14839
rolv.h 14812
ror.h 14869
rorv.h 14732
round.h For architectures which lack a SIMD instruction that rounds using the current rounding direction. Note that NEON actually has a current rounding mode instruction, but in ARMv8+ the rounding mode is ignored and nearest is always used, so we treat ARMv7 as having a rounding mode but ARMv8 as not. 13094
roundscale.h 23297
roundscale_round.h 26912
sad.h !defined(SIMDE_X86_AVX512_SAD_H) 2638
scalef.h 14399
set.h 15717
set1.h 9209
set4.h !defined(SIMDE_X86_AVX512_SET4_H) 3705
setone.h !defined(SIMDE_X86_AVX512_SETONE_H) 2104
setr.h !defined(SIMDE_X86_AVX512_SETR_H) 4723
setr4.h !defined(SIMDE_X86_AVX512_SETR4_H) 3735
setzero.h !defined(SIMDE_X86_AVX512_SETZERO_H) 2906
shldv.h 5050
shuffle.h 11561
sll.h 8169
slli.h I guess the restriction was added in 6.4, back-ported to 5.5, then removed (fixed) in 7? 6854
sllv.h !defined(SIMDE_X86_AVX512_SLLV_H) 4081
sqrt.h !defined(SIMDE_X86_AVX512_SQRT_H) 3977
sra.h !defined(SIMDE_X86_AVX512_SRA_H) 2684
srai.h !defined(SIMDE_X86_AVX512_SRAI_H) 2340
srav.h !defined(SIMDE_X86_AVX512_SRAV_H) 2271
srl.h 6928
srli.h 6641
srlv.h 9485
store.h !defined(SIMDE_X86_AVX512_STORE_H) 3507
storeu.h 6905
sub.h 10888
subs.h 7107
ternarylogic.h The ternarylogic implementation is based on Wojciech Muła's work at https://github.com/WojciechMula/ternary-logic 112262
test.h 7540
testn.h !defined(SIMDE_X86_AVX512_TESTN_H) 2130
types.h The problem is that Microsoft doesn't support 64-byte aligned parameters, except for __m512/__m512i/__m512d. Since our private union has an __m512 member it will be 64-byte aligned even if we reduce the alignment requirements of other members. Even if we're on x86 and use the native AVX-512 types for arguments/return values, the to/from private functions will break, and I'm not willing to change their APIs to use pointers (which would also require more verbose code on the caller side) just to make MSVC happy. If you want to use AVX-512 in SIMDe, you'll need to either upgrade to MSVC 2017 or later, or upgrade to a different compiler (clang-cl, perhaps?). If you have an idea of how to fix this without requiring API changes (except transparently through macros), patches are welcome. 32459
unpackhi.h 28925
unpacklo.h 28831
xor.h TODO: generate reduced case to give to Intel 10505
xorsign.h This is a SIMDe extension which is not part of AVX-512. It exists because a lot of numerical methods in SIMDe have algorithms which do something like: float sgn = input < 0 ? -1 : 1; ... return res * sgn; which can be replaced with a much more efficient call to xorsign: return simde_x_mm512_xorsign_ps(res, input); While this was originally intended for use in SIMDe, please feel free to use it in your code. 2426