aba.h |
|
6276 |
abd.h |
|
15750 |
abdl.h |
!defined(SIMDE_ARM_NEON_ABDL_H) |
4266 |
abs.h |
|
13752 |
add.h |
|
21266 |
addhn.h |
|
7275 |
addl.h |
!defined(SIMDE_ARM_NEON_ADDL_H) |
3912 |
addl_high.h |
!defined(SIMDE_ARM_NEON_ADDL_HIGH_H) |
4141 |
addlv.h |
|
9747 |
addv.h |
|
10727 |
addw.h |
|
7501 |
addw_high.h |
|
6592 |
and.h |
|
16286 |
bcax.h |
|
5564 |
bic.h |
|
14608 |
bsl.h |
|
26650 |
cage.h |
|
5902 |
cagt.h |
|
5934 |
ceq.h |
|
25317 |
ceqz.h |
|
11098 |
cge.h |
|
27527 |
cgez.h |
|
13009 |
cgt.h |
|
24738 |
cgtz.h |
|
13030 |
cle.h |
|
25850 |
clez.h |
|
13121 |
cls.h |
!defined(SIMDE_ARM_NEON_CLS_H) |
4854 |
clt.h |
|
25069 |
cltz.h |
TODO: float fallbacks should use vclt(a, vdup_n(0.0)) |
9949 |
clz.h |
|
12547 |
cmla.h |
|
5028 |
cmla_rot90.h |
|
5891 |
cmla_rot180.h |
|
5895 |
cmla_rot270.h |
|
5909 |
cnt.h |
The x86 implementations are stolen from
https://github.com/WebAssembly/simd/pull/379. They could be cleaned
up a bit if someone is bored; they're mostly just direct
translations from the assembly. |
5869 |
combine.h |
Note: __builtin_shufflevector can have the output contain
twice the number of elements, __builtin_shuffle cannot.
Using SIMDE_SHUFFLE_VECTOR_ here would not work. |
11610 |
create.h |
N.B. CM: vcreate_f16 and vcreate_bf16 are omitted as
SIMDe has no 16-bit floating point support.
Idem for the poly types. |
5174 |
cvt.h |
|
39971 |
cvtn.h |
|
5487 |
dot.h |
|
6728 |
dot_lane.h |
|
16404 |
dup_lane.h |
|
45828 |
dup_n.h |
|
19922 |
eor.h |
|
16286 |
ext.h |
|
41893 |
fma.h |
!defined(SIMDE_ARM_NEON_CMLA_H) |
4189 |
fma_lane.h |
simde_vfmad_lane_f64 |
8714 |
fma_n.h |
!defined(SIMDE_ARM_NEON_CMLA_H) |
3660 |
get_high.h |
|
9471 |
get_lane.h |
|
14801 |
get_low.h |
|
10181 |
hadd.h |
TODO: the 128-bit versions only require AVX-512 because of the final
conversions from larger types down to smaller ones. We could get
the same results from AVX/AVX2 instructions with some shuffling
to extract the low half of each input element to the low half
of a 256-bit vector, then cast that to a 128-bit vector. |
10597 |
hsub.h |
TODO: the 128-bit versions only require AVX-512 because of the final
conversions from larger types down to smaller ones. We could get
the same results from AVX/AVX2 instructions with some shuffling
to extract the low half of each input element to the low half
of a 256-bit vector, then cast that to a 128-bit vector. |
10597 |
ld1.h |
|
12207 |
ld1_dup.h |
|
11068 |
ld1_lane.h |
|
14058 |
ld1_x2.h |
|
9771 |
ld1_x3.h |
|
10390 |
ld1_x4.h |
|
11119 |
ld1q_x2.h |
|
9868 |
ld1q_x3.h |
|
10489 |
ld1q_x4.h |
|
11226 |
ld2.h |
|
22947 |
ld3.h |
|
16637 |
ld4.h |
|
18030 |
ld4_lane.h |
In older versions of clang, __builtin_neon_vld4_lane_v would
generate a diagnostic for most variants (those which didn't
use signed 8-bit integers). I believe this was fixed by
78ad22e0cc6390fcd44b2b7b5132f1b960ff975d.
Since we have to use macros (due to the immediate-mode parameter)
we can't just disable it once in this file; we have to use statement
exprs and push / pop the stack for each macro. |
21587 |
max.h |
|
19557 |
maxnm.h |
|
7025 |
maxv.h |
|
9895 |
min.h |
|
21635 |
minnm.h |
|
7249 |
minv.h |
|
10751 |
mla.h |
|
8908 |
mla_lane.h |
|
5623 |
mla_n.h |
|
10388 |
mlal.h |
|
5025 |
mlal_high.h |
|
5259 |
mlal_high_n.h |
!defined(SIMDE_ARM_NEON_MLAL_HIGH_N_H) |
4299 |
mlal_lane.h |
!defined(SIMDE_ARM_NEON_MLAL_LANE_H) |
4826 |
mlal_n.h |
!defined(SIMDE_ARM_NEON_MLAL_N_H) |
4149 |
mls.h |
|
8532 |
mls_n.h |
|
5586 |
mlsl.h |
!defined(SIMDE_ARM_NEON_MLSL_H) |
3888 |
mlsl_high.h |
!defined(SIMDE_ARM_NEON_MLSL_HIGH_H) |
4092 |
mlsl_high_n.h |
!defined(SIMDE_ARM_NEON_MLSL_HIGH_N_H) |
4299 |
mlsl_lane.h |
!defined(SIMDE_ARM_NEON_MLSL_LANE_H) |
4826 |
mlsl_n.h |
!defined(SIMDE_ARM_NEON_MLSL_N_H) |
3110 |
movl.h |
|
7490 |
movl_high.h |
!defined(SIMDE_ARM_NEON_MOVL_HIGH_H) |
3784 |
movn.h |
|
5749 |
movn_high.h |
!defined(SIMDE_ARM_NEON_MOVN_HIGH_H) |
3905 |
mul.h |
|
16404 |
mul_lane.h |
|
23046 |
mul_n.h |
|
6102 |
mull.h |
|
7990 |
mull_high.h |
!defined(SIMDE_ARM_NEON_MULL_HIGH_H) |
4044 |
mull_lane.h |
!defined(SIMDE_ARM_NEON_MULL_LANE_H) |
4634 |
mull_n.h |
|
5368 |
mvn.h |
|
12486 |
neg.h |
|
11885 |
orn.h |
|
14783 |
orr.h |
|
16254 |
padal.h |
|
6129 |
padd.h |
|
11574 |
paddl.h |
|
10862 |
pmax.h |
|
8449 |
pmin.h |
|
9213 |
qabs.h |
|
8314 |
qadd.h |
|
25575 |
qdmulh.h |
|
5265 |
qdmulh_lane.h |
|
6291 |
qdmulh_n.h |
!defined(SIMDE_ARM_NEON_QDMULH_N_H) |
2873 |
qdmull.h |
Implementation notes (seanptmaher):
The multiplication itself cannot overflow; it only ever doubles
the bit length. We only care about overflow during the shift,
so do the multiplication first, then the shift with saturation.
|
5689 |
qmovn.h |
|
8266 |
qmovn_high.h |
!defined(SIMDE_ARM_NEON_QMOVN_HIGH_H) |
4029 |
qmovun.h |
|
5196 |
qneg.h |
|
8600 |
qrdmulh.h |
https://github.com/WebAssembly/simd/pull/365 |
6167 |
qrdmulh_lane.h |
|
6252 |
qrdmulh_n.h |
!defined(SIMDE_ARM_NEON_QRDMULH_H) |
4088 |
qrshrn_n.h |
!defined(SIMDE_ARM_NEON_QRSHRN_N_H) |
4913 |
qrshrun_n.h |
!defined(SIMDE_ARM_NEON_QRSHRUN_N_H) |
3259 |
qshl.h |
https://github.com/llvm/llvm-project/commit/f0a78bdfdc6d56b25e0081884580b3960a3c2429 |
19189 |
qshlu_n.h |
|
17502 |
qshrn_n.h |
!defined(SIMDE_ARM_NEON_QSHRN_N_H) |
4888 |
qshrun_n.h |
!defined(SIMDE_ARM_NEON_QSHRUN_N_H) |
3280 |
qsub.h |
|
22099 |
qtbl.h |
|
18234 |
qtbx.h |
|
20301 |
rbit.h |
The GFNI implementation is based on Wojciech Muła's work at
http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html#bit-shuffling via
https://github.com/InstLatx64/InstLatX64_Demo/blob/49c27effdfd5a45f27e0ccb6e2f3be5f27c3845d/GFNI_Demo.h#L173 |
6633 |
recpe.h |
https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 |
7949 |
recps.h |
!defined(SIMDE_ARM_NEON_RECPS_H) |
3893 |
reinterpret.h |
|
97470 |
rev16.h |
!defined(SIMDE_ARM_NEON_REV16_H) |
4718 |
rev32.h |
|
8045 |
rev64.h |
N.B. CM: vrev64_f16 and vrev64q_f16 are omitted as
SIMDe has no 16-bit floating point support. |
11921 |
rhadd.h |
Formula to average two unsigned integers without overflow is from Hacker's Delight (ISBN 978-0-321-84268-8).
https://web.archive.org/web/20180831033349/http://hackersdelight.org/basics2.pdf#G525596
avg_u = (x | y) - ((x ^ y) >> 1);
Formula to average two signed integers (without widening):
avg_s = (x >> 1) + (y >> 1) + ((x | y) & 1); // use arithmetic shifts
If hardware has avg_u but not avg_s then rebase input to be unsigned.
For example: s8 (-128..127) can be converted to u8 (0..255) by adding +128.
Idea borrowed from Intel's ARM_NEON_2_x86_SSE project.
https://github.com/intel/ARM_NEON_2_x86_SSE/blob/3c9879bf2dbef3274e0ed20f93cb8da3a2115ba1/NEON_2_SSE.h#L3171
avg_s8 = avg_u8(a ^ 0x80, b ^ 0x80) ^ 0x80;
|
17720 |
rnd.h |
!defined(SIMDE_ARM_NEON_RND_H) |
4379 |
rndi.h |
!defined(SIMDE_ARM_NEON_RNDI_H) |
4190 |
rndm.h |
!defined(SIMDE_ARM_NEON_RNDM_H) |
4408 |
rndn.h |
!defined(SIMDE_ARM_NEON_RNDN_H) |
4602 |
rndp.h |
!defined(SIMDE_ARM_NEON_RNDP_H) |
4400 |
rshl.h |
Notes from the implementer (Christopher Moore aka rosbif)
I have tried to exactly reproduce the documented behaviour of the
ARM NEON rshl and rshlq intrinsics.
This is complicated for the following reasons:-
a) Negative shift counts shift right.
b) Only the low byte of the shift count is used but the shift count
is not limited to 8-bit values (-128 to 127).
c) Overflow must be avoided when rounding, together with sign change
warning/errors in the C versions.
d) Intel SIMD is not nearly as complete as NEON and AltiVec.
There were no intrinsics with a vector shift count before AVX2 which
only has 32 and 64-bit logical ones and only a 32-bit arithmetic
one. The others need AVX512. There are no 8-bit shift intrinsics at
all, even with a scalar shift count. It is surprising to use AVX2
and even AVX512 to implement a 64-bit vector operation.
e) Many shift implementations, and the C standard, do not treat a
shift count >= the object's size in bits as one would expect.
(Personally I feel that > is silly but == can be useful.)
Note that even the C17/18 standard does not define the behaviour of
a right shift of a negative value.
However Evan and I agree that all compilers likely to be used
implement this as an arithmetic right shift with sign extension.
If this is not the case it could be replaced by a logical right shift
if negative values are complemented before and after the shift.
Some of the SIMD translations may be slower than the portable code,
particularly those for vectors with only one or two elements.
But I had fun writing them ;-)
|
45253 |
rshr_n.h |
|
18114 |
rshrn_n.h |
!defined(SIMDE_ARM_NEON_RSHRN_N_H) |
3506 |
rsqrte.h |
https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
Pages 100 - 103 |
12020 |
rsqrts.h |
!defined(SIMDE_ARM_NEON_RSQRTS_H) |
4450 |
rsra_n.h |
Remark: For these instructions
1 <= n <= data element size in bits
so 0 <= n - 1 < data element size in bits
|
8089 |
set_lane.h |
|
13789 |
shl.h |
Notes from the implementer (Christopher Moore aka rosbif)
I have tried to exactly reproduce the documented behaviour of the
ARM NEON shl and shlq intrinsics.
This is complicated for the following reasons:-
a) Negative shift counts shift right.
b) Only the low byte of the shift count is used but the shift count
is not limited to 8-bit values (-128 to 127).
c) Intel SIMD is not nearly as complete as NEON and AltiVec.
There were no intrinsics with a vector shift count before AVX2 which
only has 32 and 64-bit logical ones and only a 32-bit arithmetic
one. The others need AVX512. There are no 8-bit shift intrinsics at
all, even with a scalar shift count. It is surprising to use AVX2
and even AVX512 to implement a 64-bit vector operation.
d) Many shift implementations, and the C standard, do not treat a
shift count >= the object's size in bits as one would expect.
(Personally I feel that > is silly but == can be useful.)
Maybe it would be useful for SIMDe to have a flag enabling a fast
implementation where the result is only guaranteed for shift counts
conforming to the C standard.
Note that even the C17/18 standard does not define the behaviour of
a right shift of a negative value.
However Evan and I agree that all compilers likely to be used
implement this as an arithmetic right shift with sign extension.
If this is not the case it could be replaced by a logical right shift
if negative values are complemented before and after the shift.
Some of the SIMD translations may be slower than the portable code,
particularly those for vectors with only one or two elements.
But I had fun writing them ;-)
|
37122 |
shl_n.h |
|
19519 |
shll_n.h |
The constant range requirements for the shift amount *n* look strange.
The ARM Neon Intrinsics Reference states that for *_s8, 0 << n << 7. This
does not match the actual instruction decoding in the ARM Reference manual,
which states that the shift amount "must be equal to the source element width
in bits" (ARM DDI 0487F.b C7-1959). So for *_s8 instructions, *n* must be 8,
for *_s16, it must be 16, and *_s32 must be 32 (similarly for unsigned).
|
6147 |
shr_n.h |
|
21291 |
shrn_n.h |
|
5227 |
sqadd.h |
|
10250 |
sra_n.h |
|
7798 |
sri_n.h |
|
9661 |
st1.h |
|
12358 |
st1_lane.h |
|
12450 |
st2.h |
|
13088 |
st2_lane.h |
|
14198 |
st3.h |
|
35028 |
st3_lane.h |
|
14199 |
st4.h |
|
16018 |
st4_lane.h |
|
14151 |
sub.h |
|
19685 |
subhn.h |
|
7275 |
subl.h |
!defined(SIMDE_ARM_NEON_SUBL_H) |
3912 |
subl_high.h |
!defined(SIMDE_ARM_NEON_SUBL_HIGH_H) |
4068 |
subw.h |
|
7418 |
subw_high.h |
|
7930 |
tbl.h |
|
8375 |
tbx.h |
|
9359 |
trn.h |
|
7435 |
trn1.h |
|
14460 |
trn2.h |
|
14595 |
tst.h |
|
18580 |
types.h |
|
38668 |
uqadd.h |
TODO: I suspect there is room for improvement here. This is
just the first thing that worked, and I don't feel like messing
with it now. |
10603 |
uzp.h |
|
7435 |
uzp1.h |
|
21179 |
uzp2.h |
|
21357 |
xar.h |
!defined(SIMDE_ARM_NEON_XAR_H) |
2270 |
zip.h |
|
7435 |
zip1.h |
|
22121 |
zip2.h |
|
20911 |