Name Description Size Coverage
aba.h 6276 -
abd.h 15750 -
abdl.h !defined(SIMDE_ARM_NEON_ABDL_H) 4266 -
abs.h 13752 -
add.h 21266 -
addhn.h 7275 -
addl.h !defined(SIMDE_ARM_NEON_ADDL_H) 3912 -
addl_high.h !defined(SIMDE_ARM_NEON_ADDL_HIGH_H) 4141 -
addlv.h 9747 -
addv.h 10727 -
addw.h 7501 -
addw_high.h 6592 -
and.h 16286 -
bcax.h 5564 -
bic.h 14608 -
bsl.h 26650 -
cage.h 5902 -
cagt.h 5934 -
ceq.h 25317 -
ceqz.h 11098 -
cge.h 27527 -
cgez.h 13009 -
cgt.h 24738 -
cgtz.h 13030 -
cle.h 25850 -
clez.h 13121 -
cls.h !defined(SIMDE_ARM_NEON_CLS_H) 4854 -
clt.h 25069 -
cltz.h TODO: float fallbacks should use vclt(a, vdup_n(0.0)) 9949 -
clz.h 12547 -
cmla.h 5028 -
cmla_rot90.h 5891 -
cmla_rot180.h 5895 -
cmla_rot270.h 5909 -
cnt.h The x86 implementations are stolen from https://github.com/WebAssembly/simd/pull/379. They could be cleaned up a bit if someone is bored; they're mostly just direct translations from the assembly. 5869 -
combine.h Note: __builtin_shufflevector can have the output contain twice the number of elements, __builtin_shuffle cannot. Using SIMDE_SHUFFLE_VECTOR_ here would not work. 11610 -
create.h N.B. CM: vcreate_f16 and vcreate_bf16 are omitted as SIMDe has no 16-bit floating point support. Idem for the poly types. 5174 -
cvt.h 39971 -
cvtn.h 5487 -
dot.h 6728 -
dot_lane.h 16404 -
dup_lane.h 45828 -
dup_n.h 19922 -
eor.h 16286 -
ext.h 41893 -
fma.h !defined(SIMDE_ARM_NEON_CMLA_H) 4189 -
fma_lane.h simde_vfmad_lane_f64 8714 -
fma_n.h !defined(SIMDE_ARM_NEON_CMLA_H) 3660 -
get_high.h 9471 -
get_lane.h 14801 -
get_low.h 10181 -
hadd.h TODO: the 128-bit versions only require AVX-512 because of the final conversions from larger types down to smaller ones. We could get the same results from AVX/AVX2 instructions with some shuffling to extract the low half of each input element to the low half of a 256-bit vector, then cast that to a 128-bit vector. 10597 -
hsub.h TODO: the 128-bit versions only require AVX-512 because of the final conversions from larger types down to smaller ones. We could get the same results from AVX/AVX2 instructions with some shuffling to extract the low half of each input element to the low half of a 256-bit vector, then cast that to a 128-bit vector. 10597 -
ld1.h 12207 -
ld1_dup.h 11068 -
ld1_lane.h 14058 -
ld1_x2.h 9771 -
ld1_x3.h 10390 -
ld1_x4.h 11119 -
ld1q_x2.h 9868 -
ld1q_x3.h 10489 -
ld1q_x4.h 11226 -
ld2.h 22947 -
ld3.h 16637 -
ld4.h 18030 -
ld4_lane.h In older versions of clang, __builtin_neon_vld4_lane_v would generate a diagnostic for most variants (those which didn't use signed 8-bit integers). I believe this was fixed by 78ad22e0cc6390fcd44b2b7b5132f1b960ff975d. Since we have to use macros (due to the immediate-mode parameter) we can't just disable it once in this file; we have to use statement exprs and push / pop the stack for each macro. 21587 -
max.h 19557 -
maxnm.h 7025 -
maxv.h 9895 -
min.h 21635 -
minnm.h 7249 -
minv.h 10751 -
mla.h 8908 -
mla_lane.h 5623 -
mla_n.h 10388 -
mlal.h 5025 -
mlal_high.h 5259 -
mlal_high_n.h !defined(SIMDE_ARM_NEON_MLAL_HIGH_N_H) 4299 -
mlal_lane.h !defined(SIMDE_ARM_NEON_MLAL_LANE_H) 4826 -
mlal_n.h !defined(SIMDE_ARM_NEON_MLAL_N_H) 4149 -
mls.h 8532 -
mls_n.h 5586 -
mlsl.h !defined(SIMDE_ARM_NEON_MLSL_H) 3888 -
mlsl_high.h !defined(SIMDE_ARM_NEON_MLSL_HIGH_H) 4092 -
mlsl_high_n.h !defined(SIMDE_ARM_NEON_MLSL_HIGH_N_H) 4299 -
mlsl_lane.h !defined(SIMDE_ARM_NEON_MLSL_LANE_H) 4826 -
mlsl_n.h !defined(SIMDE_ARM_NEON_MLSL_N_H) 3110 -
movl.h 7490 -
movl_high.h !defined(SIMDE_ARM_NEON_MOVL_HIGH_H) 3784 -
movn.h 5749 -
movn_high.h !defined(SIMDE_ARM_NEON_MOVN_HIGH_H) 3905 -
mul.h 16404 -
mul_lane.h 23046 -
mul_n.h 6102 -
mull.h 7990 -
mull_high.h !defined(SIMDE_ARM_NEON_MULL_HIGH_H) 4044 -
mull_lane.h !defined(SIMDE_ARM_NEON_MULL_LANE_H) 4634 -
mull_n.h 5368 -
mvn.h 12486 -
neg.h 11885 -
orn.h 14783 -
orr.h 16254 -
padal.h 6129 -
padd.h 11574 -
paddl.h 10862 -
pmax.h 8449 -
pmin.h 9213 -
qabs.h 8314 -
qadd.h 25575 -
qdmulh.h 5265 -
qdmulh_lane.h 6291 -
qdmulh_n.h !defined(SIMDE_ARM_NEON_QDMULH_N_H) 2873 -
qdmull.h Implementation notes (seanptmaher): It won't overflow during the multiplication, it'll only ever double the bit length, we only care about the overflow during the shift, so do the multiplication, then the shift with saturation 5689 -
qmovn.h 8266 -
qmovn_high.h !defined(SIMDE_ARM_NEON_QMOVN_HIGH_H) 4029 -
qmovun.h 5196 -
qneg.h 8600 -
qrdmulh.h https://github.com/WebAssembly/simd/pull/365 6167 -
qrdmulh_lane.h 6252 -
qrdmulh_n.h !defined(SIMDE_ARM_NEON_QRDMULH_H) 4088 -
qrshrn_n.h !defined(SIMDE_ARM_NEON_QRSHRN_N_H) 4913 -
qrshrun_n.h !defined(SIMDE_ARM_NEON_QRSHRUN_N_H) 3259 -
qshl.h https://github.com/llvm/llvm-project/commit/f0a78bdfdc6d56b25e0081884580b3960a3c2429 19189 -
qshlu_n.h 17502 -
qshrn_n.h !defined(SIMDE_ARM_NEON_QSHRN_N_H) 4888 -
qshrun_n.h !defined(SIMDE_ARM_NEON_QSHRUN_N_H) 3280 -
qsub.h 22099 -
qtbl.h 18234 -
qtbx.h 20301 -
rbit.h The GFNI implementation is based on Wojciech Muła's work at http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html#bit-shuffling via https://github.com/InstLatx64/InstLatX64_Demo/blob/49c27effdfd5a45f27e0ccb6e2f3be5f27c3845d/GFNI_Demo.h#L173 6633 -
recpe.h https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 7949 -
recps.h !defined(SIMDE_ARM_NEON_RECPS_H) 3893 -
reinterpret.h 97470 -
rev16.h !defined(SIMDE_ARM_NEON_REV16_H) 4718 -
rev32.h 8045 -
rev64.h N.B. CM: vrev64_f16 and vrev64q_f16 are omitted as SIMDe has no 16-bit floating point support. 11921 -
rhadd.h Formula to average two unsigned integers without overflow is from Hacker's Delight (ISBN 978-0-321-84268-8). https://web.archive.org/web/20180831033349/http://hackersdelight.org/basics2.pdf#G525596 avg_u = (x | y) - ((x ^ y) >> 1); Formula to average two signed integers (without widening): avg_s = (x >> 1) + (y >> 1) + ((x | y) & 1); // use arithmetic shifts If hardware has avg_u but not avg_s then rebase input to be unsigned. For example: s8 (-128..127) can be converted to u8 (0..255) by adding +128. Idea borrowed from Intel's ARM_NEON_2_x86_SSE project. https://github.com/intel/ARM_NEON_2_x86_SSE/blob/3c9879bf2dbef3274e0ed20f93cb8da3a2115ba1/NEON_2_SSE.h#L3171 avg_s8 = avg_u8(a ^ 0x80, b ^ 0x80) ^ 0x80; 17720 -
rnd.h !defined(SIMDE_ARM_NEON_RND_H) 4379 -
rndi.h !defined(SIMDE_ARM_NEON_RNDI_H) 4190 -
rndm.h !defined(SIMDE_ARM_NEON_RNDM_H) 4408 -
rndn.h !defined(SIMDE_ARM_NEON_RNDN_H) 4602 -
rndp.h !defined(SIMDE_ARM_NEON_RNDP_H) 4400 -
rshl.h Notes from the implementer (Christopher Moore aka rosbif) I have tried to exactly reproduce the documented behaviour of the ARM NEON rshl and rshlq intrinsics. This is complicated for the following reasons:- a) Negative shift counts shift right. b) Only the low byte of the shift count is used but the shift count is not limited to 8-bit values (-128 to 127). c) Overflow must be avoided when rounding, together with sign change warning/errors in the C versions. d) Intel SIMD is not nearly as complete as NEON and AltiVec. There were no intrinsics with a vector shift count before AVX2 which only has 32 and 64-bit logical ones and only a 32-bit arithmetic one. The others need AVX512. There are no 8-bit shift intrinsics at all, even with a scalar shift count. It is surprising to use AVX2 and even AVX512 to implement a 64-bit vector operation. e) Many shift implementations, and the C standard, do not treat a shift count >= the object's size in bits as one would expect. (Personally I feel that > is silly but == can be useful.) Note that even the C17/18 standard does not define the behaviour of a right shift of a negative value. However Evan and I agree that all compilers likely to be used implement this as an arithmetic right shift with sign extension. If this is not the case it could be replaced by a logical right shift if negative values are complemented before and after the shift. Some of the SIMD translations may be slower than the portable code, particularly those for vectors with only one or two elements. But I had fun writing them ;-) 45253 -
rshr_n.h 18114 -
rshrn_n.h !defined(SIMDE_ARM_NEON_RSHRN_N_H) 3506 -
rsqrte.h https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf Pages 100 - 103 12020 -
rsqrts.h !defined(SIMDE_ARM_NEON_RSQRTS_H) 4450 -
rsra_n.h Remark: For these instructions 1 <= n <= data element size in bits so 0 <= n - 1 < data element size in bits 8089 -
set_lane.h 13789 -
shl.h Notes from the implementer (Christopher Moore aka rosbif) I have tried to exactly reproduce the documented behaviour of the ARM NEON shl and shlq intrinsics. This is complicated for the following reasons:- a) Negative shift counts shift right. b) Only the low byte of the shift count is used but the shift count is not limited to 8-bit values (-128 to 127). c) Intel SIMD is not nearly as complete as NEON and AltiVec. There were no intrinsics with a vector shift count before AVX2 which only has 32 and 64-bit logical ones and only a 32-bit arithmetic one. The others need AVX512. There are no 8-bit shift intrinsics at all, even with a scalar shift count. It is surprising to use AVX2 and even AVX512 to implement a 64-bit vector operation. d) Many shift implementations, and the C standard, do not treat a shift count >= the object's size in bits as one would expect. (Personally I feel that > is silly but == can be useful.) Maybe it would be useful for SIMDe to have a flag enabling a fast implementation where the result is only guaranteed for shift counts conforming to the C standard. Note that even the C17/18 standard does not define the behaviour of a right shift of a negative value. However Evan and I agree that all compilers likely to be used implement this as an arithmetic right shift with sign extension. If this is not the case it could be replaced by a logical right shift if negative values are complemented before and after the shift. Some of the SIMD translations may be slower than the portable code, particularly those for vectors with only one or two elements. But I had fun writing them ;-) 37122 -
shl_n.h 19519 -
shll_n.h The constant range requirements for the shift amount *n* look strange. The ARM Neon Intrinsics Reference states that for *_s8, 0 <= n <= 7. This does not match the actual instruction decoding in the ARM Reference manual, which states that the shift amount "must be equal to the source element width in bits" (ARM DDI 0487F.b C7-1959). So for *_s8 instructions, *n* must be 8, for *_s16, it must be 16, and *_s32 must be 32 (similarly for unsigned). 6147 -
shr_n.h 21291 -
shrn_n.h 5227 -
sqadd.h 10250 -
sra_n.h 7798 -
sri_n.h 9661 -
st1.h 12358 -
st1_lane.h 12450 -
st2.h 13088 -
st2_lane.h 14198 -
st3.h 35028 -
st3_lane.h 14199 -
st4.h 16018 -
st4_lane.h 14151 -
sub.h 19685 -
subhn.h 7275 -
subl.h !defined(SIMDE_ARM_NEON_SUBL_H) 3912 -
subl_high.h !defined(SIMDE_ARM_NEON_SUBL_HIGH_H) 4068 -
subw.h 7418 -
subw_high.h 7930 -
tbl.h 8375 -
tbx.h 9359 -
trn.h 7435 -
trn1.h 14460 -
trn2.h 14595 -
tst.h 18580 -
types.h 38668 -
uqadd.h TODO: I suspect there is room for improvement here. This is just the first thing that worked, and I don't feel like messing with it now. 10603 -
uzp.h 7435 -
uzp1.h 21179 -
uzp2.h 21357 -
xar.h !defined(SIMDE_ARM_NEON_XAR_H) 2270 -
zip.h 7435 -
zip1.h 22121 -
zip2.h 20911 -