/*
* Copyright © 2024, Arm Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
#define PREP_BIAS 32, lsl #8 // 8192
#define PREP_BIAS_NEG 224, lsl #8 // -8192
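// Note: these are written in the (imm8, lsl #8) form matching the NEON/SVE
// 8-bit immediate encodings used below; 224 << 8 == 0xE000, i.e. -8192 as a
// signed 16-bit value.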
#if HAVE_SVE2
ENABLE_SVE
ENABLE_SVE2
// No spaces in these expressions, due to gas-preprocessor. The indices are
// biased by -1 so that no extra subtraction is needed when computing the
// address into `mc_subpel_filters`.
#define REGULAR1 (((0*15-1)<<7)|(3*15-1))
#define SMOOTH1 (((1*15-1)<<7)|(4*15-1))
#define SHARP1 (((2*15-1)<<7)|(3*15-1))
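// Worked example (assuming the usual [filter_type][15][8] layout of
// `mc_subpel_filters`): with \my in 1..15 and w11 == (1<<14)|(1<<7)|1,
// `madd \my, \my, w11, w10` produces two packed 7-bit fields, each equal to
// filter_type * 15 + (\my - 1), i.e. the row index of an 8-coefficient
// filter. `ubfx`/`and` then pick the large- or small-block variant and
// `lsl #3` scales the row index to a byte offset.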
#define FUNC_ALIGN 2
#define JUMP_ALIGN 2
#define LOOP_ALIGN 2
// Shuffle indices to permute horizontal samples in preparation for input to
// 16-bit SDOT instructions. The 8-tap horizontal convolution uses sample
// indices in the interval [-3, 4] relative to the current sample position.
const h_tbl_sve, align=4
.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
.byte 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
endconst
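// For example, applied to 16-bit samples starting at x, the first table row
// (v30 below) gathers the 4-sample groups {x+0..x+3, x+1..x+4} and the second
// row (v31) gathers {x+2..x+5, x+3..x+6}, so one SDOT with taps 0..3 plus one
// SDOT of the data loaded 4 samples later with taps 4..7 accumulates a full
// 8-tap sum per 64-bit lane.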
// Vertical convolutions also use 16-bit SDOT instructions, where two 128-bit
// registers contain a transposed 4x4 matrix of values. Subsequent iterations
// of the vertical convolution can reuse the 3x4 sub-matrix from the previous
// loop iteration. These shuffle indices shift and merge this 4x4 matrix with
// the values of a new line.
const v_tbl_sve, align=4
.byte 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25
.byte 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 18, 19
.byte 2, 3, 4, 5, 6, 7, 20, 21, 10, 11, 12, 13, 14, 15, 22, 23
.byte 2, 3, 4, 5, 6, 7, 24, 25, 10, 11, 12, 13, 14, 15, 26, 27
.byte 2, 3, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 14, 15, 30, 31
endconst
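// For example, with a transposed 4x4 block held in a register pair, the first
// row (loaded as v6 below) drops the oldest sample of each 4-sample column,
// shifts the remaining three up and appends an element taken from the second
// TBL source register; the remaining rows differ only in which elements of
// the register holding the newly loaded line they append.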
.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
function \op\()_8tap_\type\()_16bpc_\isa, export=1, align=FUNC_ALIGN
mov x9, \type_h
mov x10, \type_v
.if \jump
b \op\()_8tap_\isa
.endif
endfunc
.endm
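// For example, `make_8tap_fn put, sharp, SHARP1, SHARP1, sve2` defines the
// exported sharp/sharp entry point, which only seeds x9/x10 with the packed
// filter-type constants and then branches into the shared put_8tap_sve2 body
// generated below (the regular/regular variant falls through instead).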
.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, xmx, xmy, ldst, lsrc, wd_strd, ws_strd
make_8tap_fn \type, sharp, SHARP1, SHARP1, \isa
make_8tap_fn \type, sharp_smooth, SHARP1, SMOOTH1, \isa
make_8tap_fn \type, sharp_regular, SHARP1, REGULAR1, \isa
make_8tap_fn \type, smooth_sharp, SMOOTH1, SHARP1, \isa
make_8tap_fn \type, smooth, SMOOTH1, SMOOTH1, \isa
make_8tap_fn \type, smooth_regular, SMOOTH1, REGULAR1, \isa
make_8tap_fn \type, regular_sharp, REGULAR1, SHARP1, \isa
make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1, \isa
make_8tap_fn \type, regular, REGULAR1, REGULAR1, \isa, jump=0
function \type\()_8tap_\isa, align=FUNC_ALIGN
clz w8, \w
mov w11, #0x4081 // (1<<14) | (1<<7) | 1
ptrue p0.b, vl16
sub w8, w8, #24 // for jump tables
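// e.g. clz(128) - 24 == 0 selects the widest (128xN) entry of the
// jump tables below and clz(4) - 24 == 5 the 4xN entry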
movrel x12, X(mc_subpel_filters)
cbnz \mx, L(\type\()_8tap_h_hv_\isa)
.ifc \type, prep
cbz \my, prep_sve
.else // put
cbnz \my, L(\type\()_8tap_v_\isa)
mov w9, w8
b X(put_16bpc_neon)
.align JUMP_ALIGN
.endif
L(\type\()_8tap_v_\isa):
madd \my, \my, w11, w10
movrel x13, v_tbl_sve
.ifc \bdmax, w8 // put case, but skip
ld1r {v5.8h}, [sp] // loading into w8
.endif
sub \src, \src, \s_strd // src - s_strd
ubfx w11, \my, #7, #7
and \my, \my, #0x7F
ldr q6, [x13]
cmp \h, #4
csel \my, \my, w11, le
sub \src, \src, \s_strd, lsl #1 // src - 3 * s_strd
add \xmy, x12, \xmy, lsl #3 // subpel V filter address
ldp q28, q29, [x13, #16]
ld1sb {z7.h}, p0/z, [\xmy]
.ifc \type, prep
clz \bdmax, \bdmax
sub \bdmax, \bdmax, #24
dup v5.4s, \bdmax
.endif
cmp \w, #8
b.lt 40f
// .align JUMP_ALIGN // fallthrough
80: // V - 8xN+
ldp q30, q31, [x13, #48]
.ifc \type, prep
add \wd_strd, \w, \w // d_strd = 2 * w
.endif
.align LOOP_ALIGN
81:
add \lsrc, \src, \s_strd, lsl #1
ldr q16, [\src]
ldr q17, [\src, \s_strd]
ldr q18, [\lsrc]
ldr q19, [\lsrc, \s_strd]
add \lsrc, \lsrc, \s_strd, lsl #1
mov \ldst, \dst
ldr q20, [\lsrc]
ldr q21, [\lsrc, \s_strd]
add \lsrc, \lsrc, \s_strd, lsl #1
ldr q22, [\lsrc]
ldr q23, [\lsrc, \s_strd]
add \lsrc, \lsrc, \s_strd, lsl #1
sub w8, \h, #1
zip1 v0.8h, v16.8h, v17.8h
zip2 v1.8h, v16.8h, v17.8h
zip1 v2.8h, v18.8h, v19.8h
zip2 v3.8h, v18.8h, v19.8h
zip1 v18.8h, v20.8h, v21.8h
zip2 v21.8h, v20.8h, v21.8h
zip1 v24.8h, v22.8h, v23.8h
zip2 v27.8h, v22.8h, v23.8h
zip1 v16.4s, v0.4s, v2.4s
zip2 v19.4s, v0.4s, v2.4s
zip1 v22.4s, v1.4s, v3.4s
zip2 v25.4s, v1.4s, v3.4s
zip1 v17.4s, v18.4s, v24.4s
zip2 v20.4s, v18.4s, v24.4s
zip1 v23.4s, v21.4s, v27.4s
zip2 v26.4s, v21.4s, v27.4s
.align LOOP_ALIGN
8:
ld1 {v18.16b}, [\lsrc], \s_strd
movi v0.2d, #0
movi v1.2d, #0
movi v2.2d, #0
movi v3.2d, #0
mov v21.16b, v18.16b
mov v24.16b, v18.16b
mov v27.16b, v18.16b
sdot z0.d, z16.h, z7.h[0]
tbl v16.16b, {v16.16b, v17.16b}, v6.16b
sdot z1.d, z19.h, z7.h[0]
tbl v19.16b, {v19.16b, v20.16b}, v6.16b
sdot z2.d, z22.h, z7.h[0]
tbl v22.16b, {v22.16b, v23.16b}, v6.16b
subs w8, w8, #1
sdot z3.d, z25.h, z7.h[0]
tbl v25.16b, {v25.16b, v26.16b}, v6.16b
sdot z0.d, z17.h, z7.h[1]
tbl v17.16b, {v17.16b, v18.16b}, v28.16b
sdot z1.d, z20.h, z7.h[1]
tbl v20.16b, {v20.16b, v21.16b}, v29.16b
sdot z2.d, z23.h, z7.h[1]
tbl v23.16b, {v23.16b, v24.16b}, v30.16b
sdot z3.d, z26.h, z7.h[1]
tbl v26.16b, {v26.16b, v27.16b}, v31.16b
uzp1 v0.4s, v0.4s, v1.4s
uzp1 v1.4s, v2.4s, v3.4s
.ifc \type, prep
srshl v0.4s, v0.4s, v5.4s
srshl v1.4s, v1.4s, v5.4s
uzp1 v0.8h, v0.8h, v1.8h
sub z0.h, z0.h, #PREP_BIAS
.else // put
sqrshrun v0.4h, v0.4s, #6
sqrshrun2 v0.8h, v1.4s, #6
umin v0.8h, v0.8h, v5.8h
.endif
st1 {v0.16b}, [\ldst], \d_strd
b.gt 8b
movi v0.2d, #0
movi v1.2d, #0
movi v2.2d, #0
movi v3.2d, #0
sdot z0.d, z16.h, z7.h[0]
sdot z1.d, z19.h, z7.h[0]
sdot z2.d, z22.h, z7.h[0]
sdot z3.d, z25.h, z7.h[0]
sdot z0.d, z17.h, z7.h[1]
sdot z1.d, z20.h, z7.h[1]
sdot z2.d, z23.h, z7.h[1]
sdot z3.d, z26.h, z7.h[1]
subs \w, \w, #8
uzp1 v0.4s, v0.4s, v1.4s
uzp1 v1.4s, v2.4s, v3.4s
.ifc \type, prep
srshl v0.4s, v0.4s, v5.4s
srshl v1.4s, v1.4s, v5.4s
uzp1 v0.8h, v0.8h, v1.8h
sub z0.h, z0.h, #PREP_BIAS
.else // put
sqrshrun v0.4h, v0.4s, #6
sqrshrun2 v0.8h, v1.4s, #6
umin v0.8h, v0.8h, v5.8h
.endif
str q0, [\ldst]
add \dst, \dst, #16
add \src, \src, #16
b.gt 81b
ret
.align JUMP_ALIGN
40: // V - 4xN, put only: 2xN
.ifc \type, put
lsr \d_strd, \d_strd, #1 // hword index for `st1h`
whilelt p1.h, wzr, \w // masking for writes
.endif
cmp \h, #4
b.le 44f
ldr d16, [\src]
ldr d17, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
ldr d18, [\src]
ldr d19, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
ldr d20, [\src]
ldr d21, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
ldr d22, [\src]
ldr d23, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
sub \h, \h, #2
zip1 v0.8h, v16.8h, v17.8h
zip1 v2.8h, v18.8h, v19.8h
zip1 v18.8h, v20.8h, v21.8h
zip1 v24.8h, v22.8h, v23.8h
zip1 v16.4s, v0.4s, v2.4s
zip2 v19.4s, v0.4s, v2.4s
zip1 v17.4s, v18.4s, v24.4s
zip2 v20.4s, v18.4s, v24.4s
.align LOOP_ALIGN
4:
ldr d18, [\src]
ldr d24, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
movi v0.2d, #0
movi v1.2d, #0
movi v2.2d, #0
movi v3.2d, #0
mov v21.16b, v18.16b
mov v27.16b, v24.16b
sdot z0.d, z16.h, z7.h[0]
tbl v22.16b, {v16.16b, v17.16b}, v6.16b
sdot z1.d, z19.h, z7.h[0]
tbl v25.16b, {v19.16b, v20.16b}, v6.16b
sdot z0.d, z17.h, z7.h[1]
tbl v23.16b, {v17.16b, v18.16b}, v28.16b
sdot z1.d, z20.h, z7.h[1]
tbl v26.16b, {v20.16b, v21.16b}, v29.16b
subs \h, \h, #2
sdot z2.d, z22.h, z7.h[0]
tbl v16.16b, {v22.16b, v23.16b}, v6.16b
sdot z3.d, z25.h, z7.h[0]
tbl v19.16b, {v25.16b, v26.16b}, v6.16b
sdot z2.d, z23.h, z7.h[1]
tbl v17.16b, {v23.16b, v24.16b}, v28.16b
sdot z3.d, z26.h, z7.h[1]
tbl v20.16b, {v26.16b, v27.16b}, v29.16b
uzp1 v0.4s, v0.4s, v1.4s
uzp1 v1.4s, v2.4s, v3.4s
.ifc \type, prep
srshl v0.4s, v0.4s, v5.4s
srshl v1.4s, v1.4s, v5.4s
uzp1 v0.8h, v0.8h, v1.8h
sub z0.h, z0.h, #PREP_BIAS
str q0, [\dst], #16
.else // put
sqrshrun v0.4h, v0.4s, #6
sqrshrun v1.4h, v1.4s, #6
umin v0.4h, v0.4h, v5.4h
umin v1.4h, v1.4h, v5.4h
st1h {z0.h}, p1, [\dst]
st1h {z1.h}, p1, [\dst, \d_strd, lsl #1]
add \dst, \dst, \d_strd, lsl #2
.endif
b.gt 4b
ldr d18, [\src]
movi v0.2d, #0
movi v1.2d, #0
movi v2.2d, #0
movi v3.2d, #0
mov v21.16b, v18.16b
sdot z0.d, z16.h, z7.h[0]
tbl v22.16b, {v16.16b, v17.16b}, v6.16b
sdot z1.d, z19.h, z7.h[0]
tbl v25.16b, {v19.16b, v20.16b}, v6.16b
sdot z0.d, z17.h, z7.h[1]
tbl v23.16b, {v17.16b, v18.16b}, v28.16b
sdot z1.d, z20.h, z7.h[1]
tbl v26.16b, {v20.16b, v21.16b}, v29.16b
sdot z2.d, z22.h, z7.h[0]
sdot z3.d, z25.h, z7.h[0]
sdot z2.d, z23.h, z7.h[1]
sdot z3.d, z26.h, z7.h[1]
uzp1 v0.4s, v0.4s, v1.4s
uzp1 v1.4s, v2.4s, v3.4s
.ifc \type, prep
srshl v0.4s, v0.4s, v5.4s
srshl v1.4s, v1.4s, v5.4s
uzp1 v0.8h, v0.8h, v1.8h
sub z0.h, z0.h, #PREP_BIAS
str q0, [\dst]
.else // put
sqrshrun v0.4h, v0.4s, #6
sqrshrun v1.4h, v1.4s, #6
umin v0.4h, v0.4h, v5.4h
umin v1.4h, v1.4h, v5.4h
st1h {z0.h}, p1, [\dst]
st1h {z1.h}, p1, [\dst, \d_strd, lsl #1]
.endif
ret
.align JUMP_ALIGN
44: // V - 4x4, put only: 4x2, 2x4, 2x2
add \src, \src, \s_strd, lsl #1 // src - s_strd
subs \h, \h, #2
ldr d16, [\src]
ldr d17, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
ldr d18, [\src]
ldr d19, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
ext v7.16b, v7.16b, v7.16b, #4 // [\xmy + 2 * 2]
zip1 v0.8h, v16.8h, v17.8h
zip1 v2.8h, v18.8h, v19.8h
zip1 v16.4s, v0.4s, v2.4s
zip2 v19.4s, v0.4s, v2.4s
.ifc \type, put
b.eq 42f
.endif
ldr d17, [\src]
ldr d23, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
movi v0.2d, #0
movi v1.2d, #0
movi v2.2d, #0
movi v3.2d, #0
mov v20.16b, v17.16b
mov v26.16b, v23.16b
sdot z0.d, z16.h, z7.h[0]
tbl v22.16b, {v16.16b, v17.16b}, v28.16b
sdot z1.d, z19.h, z7.h[0]
tbl v25.16b, {v19.16b, v20.16b}, v29.16b
sdot z2.d, z22.h, z7.h[0]
tbl v16.16b, {v22.16b, v23.16b}, v28.16b
sdot z3.d, z25.h, z7.h[0]
tbl v19.16b, {v25.16b, v26.16b}, v29.16b
uzp1 v0.4s, v0.4s, v1.4s
uzp1 v1.4s, v2.4s, v3.4s
.ifc \type, prep
srshl v0.4s, v0.4s, v5.4s
srshl v1.4s, v1.4s, v5.4s
uzp1 v0.8h, v0.8h, v1.8h
sub z0.h, z0.h, #PREP_BIAS
str q0, [\dst], #16
.else // put
sqrshrun v0.4h, v0.4s, #6
sqrshrun v1.4h, v1.4s, #6
umin v0.4h, v0.4h, v5.4h
umin v1.4h, v1.4h, v5.4h
st1h {z0.h}, p1, [\dst]
st1h {z1.h}, p1, [\dst, \d_strd, lsl #1]
add \dst, \dst, \d_strd, lsl #2
.endif
.ifc \type, put
.align JUMP_ALIGN
42:
.endif
ldr d17, [\src]
movi v0.2d, #0
movi v1.2d, #0
movi v2.2d, #0
movi v3.2d, #0
mov v20.16b, v17.16b
sdot z0.d, z16.h, z7.h[0]
tbl v22.16b, {v16.16b, v17.16b}, v28.16b
sdot z1.d, z19.h, z7.h[0]
tbl v25.16b, {v19.16b, v20.16b}, v29.16b
sdot z2.d, z22.h, z7.h[0]
sdot z3.d, z25.h, z7.h[0]
uzp1 v0.4s, v0.4s, v1.4s
uzp1 v1.4s, v2.4s, v3.4s
.ifc \type, prep
srshl v0.4s, v0.4s, v5.4s
srshl v1.4s, v1.4s, v5.4s
uzp1 v0.8h, v0.8h, v1.8h
sub z0.h, z0.h, #PREP_BIAS
str q0, [\dst]
.else // put
sqrshrun v0.4h, v0.4s, #6
sqrshrun v1.4h, v1.4s, #6
umin v0.4h, v0.4h, v5.4h
umin v1.4h, v1.4h, v5.4h
st1h {z0.h}, p1, [\dst]
st1h {z1.h}, p1, [\dst, \d_strd, lsl #1]
.endif
ret
.align JUMP_ALIGN
L(\type\()_8tap_h_hv_\isa):
madd \mx, \mx, w11, w9
movrel x13, h_tbl_sve
sub \src, \src, #6 // src - 3 * 2
ubfx w9, \mx, #7, #7
and \mx, \mx, #0x7F
cmp \w, #4
csel \mx, \mx, w9, le
ldp q30, q31, [x13]
add \xmx, x12, \xmx, lsl #3 // subpel H filter address
cbz \my, L(\type\()_8tap_h_\isa)
// HV cases
madd w14, \my, w11, w10
.ifc \bdmax, w8
ldr \bdmax, [sp]
.endif
ubfx w11, w14, #7, #7
and w14, w14, #0x7F
ld1sb {z4.h}, p0/z, [\xmx]
cmp \h, #4
csel w14, w14, w11, le
.ifc \type, put
dup v29.8h, \bdmax
.endif
clz \bdmax, \bdmax
add \xmy, x12, x14, lsl #3 // subpel V filter address
ld1sb {z7.h}, p0/z, [\xmy]
.ifc \type, put
mov w9, #12
sub w9, w9, \bdmax
dup v6.4s, w9
.endif
sub \bdmax, \bdmax, #24
mov x15, x30
sub \src, \src, \s_strd // src - s_strd - 3 * 2
dup v5.4s, \bdmax
cmp w10, SHARP1
b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1
// HV 8-tap cases
cmp \w, #4
b.le 40f
// .align JUMP_ALIGN // fallthrough
80: // HV8 - 8xN+
.ifc \type, prep
add \wd_strd, \w, \w // d_strd = 2 * w
.endif
cmp \h, #4
b.le 84f
sub \src, \src, \s_strd, lsl #1 // src - 3 * s_strd - 3 * 2
.align LOOP_ALIGN
81:
mov \lsrc, \src
mov \ldst, \dst
mov w8, \h
bl L(\type\()_hv_filter8_\isa)
uzp1 v16.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v17.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v18.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v19.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v20.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v21.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v22.8h, v23.8h, v24.8h
.align LOOP_ALIGN
8:
ldp q24, q28, [\lsrc]
smull v0.4s, v16.4h, v7.h[0]
smull2 v1.4s, v16.8h, v7.h[0]
mov v16.16b, v17.16b
movi v2.2d, #0
movi v3.2d, #0
tbl v23.16b, {v24.16b}, v30.16b
tbl v24.16b, {v24.16b}, v31.16b
ldur q26, [\lsrc, #8]
smlal v0.4s, v17.4h, v7.h[1]
smlal2 v1.4s, v17.8h, v7.h[1]
mov v17.16b, v18.16b
add \lsrc, \lsrc, \s_strd
sdot z2.d, z23.h, z4.h[0]
sdot z3.d, z24.h, z4.h[0]
movi v23.2d, #0
movi v24.2d, #0
tbl v25.16b, {v26.16b}, v30.16b
tbl v26.16b, {v26.16b}, v31.16b
smlal v0.4s, v18.4h, v7.h[2]
smlal2 v1.4s, v18.8h, v7.h[2]
mov v18.16b, v19.16b
sdot z23.d, z25.h, z4.h[0]
sdot z24.d, z26.h, z4.h[0]
tbl v27.16b, {v28.16b}, v30.16b
tbl v28.16b, {v28.16b}, v31.16b
smlal v0.4s, v19.4h, v7.h[3]
smlal2 v1.4s, v19.8h, v7.h[3]
mov v19.16b, v20.16b
subs w8, w8, #1
sdot z2.d, z25.h, z4.h[1]
sdot z3.d, z26.h, z4.h[1]
sdot z23.d, z27.h, z4.h[1]
sdot z24.d, z28.h, z4.h[1]
smlal v0.4s, v20.4h, v7.h[4]
smlal2 v1.4s, v20.8h, v7.h[4]
mov v20.16b, v21.16b
uzp1 v3.4s, v2.4s, v3.4s
uzp1 v24.4s, v23.4s, v24.4s
smlal v0.4s, v21.4h, v7.h[5]
smlal2 v1.4s, v21.8h, v7.h[5]
mov v21.16b, v22.16b
srshl v23.4s, v3.4s, v5.4s
srshl v24.4s, v24.4s, v5.4s
smlal v0.4s, v22.4h, v7.h[6]
smlal2 v1.4s, v22.8h, v7.h[6]
uzp1 v22.8h, v23.8h, v24.8h
smlal v0.4s, v22.4h, v7.h[7]
smlal2 v1.4s, v22.8h, v7.h[7]
.ifc \type, prep
rshrn v0.4h, v0.4s, #6
rshrn2 v0.8h, v1.4s, #6
sub z0.h, z0.h, #PREP_BIAS
.else // put
srshl v0.4s, v0.4s, v6.4s
srshl v1.4s, v1.4s, v6.4s
sqxtun v0.4h, v0.4s
sqxtun2 v0.8h, v1.4s
umin v0.8h, v0.8h, v29.8h
.endif
st1 {v0.8h}, [\ldst], \d_strd
b.gt 8b
subs \w, \w, #8
add \src, \src, #16
add \dst, \dst, #16
b.gt 81b
ret x15
.align JUMP_ALIGN
40: // HV8 - 4xN, put only: 2xN
.ifc \type, put
lsr \d_strd, \d_strd, #1 // hword index for `st1h`
whilelt p1.h, wzr, \w // masking for writes
.endif
ext v4.16b, v4.16b, v4.16b, #4 // [\xmx + 2 * 2]
add \src, \src, #4
cmp \h, #4
b.le 44f
sub \src, \src, \s_strd, lsl #1 // src - 3 * s_strd - 1 * 2
bl L(\type\()_hv_filter4_\isa)
xtn v16.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v17.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v18.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v19.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v20.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v21.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v22.4h, v0.4s
.align LOOP_ALIGN
4:
ld1 {v3.16b}, [\src], \s_strd
smull v24.4s, v16.4h, v7.h[0]
smlal v24.4s, v17.4h, v7.h[1]
tbl v2.16b, {v3.16b}, v30.16b
tbl v3.16b, {v3.16b}, v31.16b
movi v0.2d, #0
movi v1.2d, #0
mov v16.16b, v17.16b
mov v17.16b, v18.16b
smlal v24.4s, v18.4h, v7.h[2]
smlal v24.4s, v19.4h, v7.h[3]
sdot z0.d, z2.h, z4.h[0]
sdot z1.d, z3.h, z4.h[0]
mov v18.16b, v19.16b
mov v19.16b, v20.16b
uzp1 v0.4s, v0.4s, v1.4s
smlal v24.4s, v20.4h, v7.h[4]
smlal v24.4s, v21.4h, v7.h[5]
srshl v0.4s, v0.4s, v5.4s
mov v20.16b, v21.16b
mov v21.16b, v22.16b
subs \h, \h, #1
smlal v24.4s, v22.4h, v7.h[6]
xtn v22.4h, v0.4s
smlal v24.4s, v22.4h, v7.h[7]
.ifc \type, prep
rshrn v0.4h, v24.4s, #6
sub z0.h, z0.h, #PREP_BIAS
str d0, [\dst], #8
.else // put
srshl v0.4s, v24.4s, v6.4s
sqxtun v0.4h, v0.4s
umin v0.4h, v0.4h, v29.4h
st1h {z0.h}, p1, [\dst]
add \dst, \dst, \d_strd, lsl #1
.endif
b.gt 4b
ret x15
.align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
cmp \w, #4
b.le 46f
// .align JUMP_ALIGN // fallthrough
80: // HV6 - 8xN+
.ifc \type, prep
add \wd_strd, \w, \w // d_strd = 2 * w
.endif
cmp \h, #4
b.le 84f
sub \src, \src, \s_strd // src - 2 * s_strd - 3 * 2
.align LOOP_ALIGN
81:
mov \lsrc, \src
mov \ldst, \dst
mov w8, \h
bl L(\type\()_hv_filter8_\isa)
uzp1 v16.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v17.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v18.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v19.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v20.8h, v23.8h, v24.8h
.align LOOP_ALIGN
8:
ldp q24, q28, [\lsrc]
smull v0.4s, v16.4h, v7.h[1]
smull2 v1.4s, v16.8h, v7.h[1]
mov v16.16b, v17.16b
tbl v23.16b, {v24.16b}, v30.16b
tbl v24.16b, {v24.16b}, v31.16b
movi v2.2d, #0
movi v3.2d, #0
ldur q26, [\lsrc, #8]
add \lsrc, \lsrc, \s_strd
sdot z2.d, z23.h, z4.h[0]
sdot z3.d, z24.h, z4.h[0]
tbl v25.16b, {v26.16b}, v30.16b
tbl v26.16b, {v26.16b}, v31.16b
movi v23.2d, #0
movi v24.2d, #0
sdot z23.d, z25.h, z4.h[0]
sdot z24.d, z26.h, z4.h[0]
tbl v27.16b, {v28.16b}, v30.16b
tbl v28.16b, {v28.16b}, v31.16b
smlal v0.4s, v17.4h, v7.h[2]
smlal2 v1.4s, v17.8h, v7.h[2]
mov v17.16b, v18.16b
sdot z2.d, z25.h, z4.h[1]
sdot z3.d, z26.h, z4.h[1]
sdot z23.d, z27.h, z4.h[1]
sdot z24.d, z28.h, z4.h[1]
smlal v0.4s, v18.4h, v7.h[3]
smlal2 v1.4s, v18.8h, v7.h[3]
mov v18.16b, v19.16b
uzp1 v3.4s, v2.4s, v3.4s
uzp1 v24.4s, v23.4s, v24.4s
smlal v0.4s, v19.4h, v7.h[4]
smlal2 v1.4s, v19.8h, v7.h[4]
mov v19.16b, v20.16b
srshl v23.4s, v3.4s, v5.4s
srshl v24.4s, v24.4s, v5.4s
smlal v0.4s, v20.4h, v7.h[5]
smlal2 v1.4s, v20.8h, v7.h[5]
subs w8, w8, #1
uzp1 v20.8h, v23.8h, v24.8h
smlal v0.4s, v20.4h, v7.h[6]
smlal2 v1.4s, v20.8h, v7.h[6]
.ifc \type, prep
rshrn v0.4h, v0.4s, #6
rshrn2 v0.8h, v1.4s, #6
sub z0.h, z0.h, #PREP_BIAS
.else // put
srshl v0.4s, v0.4s, v6.4s
srshl v1.4s, v1.4s, v6.4s
sqxtun v0.4h, v0.4s
sqxtun2 v0.8h, v1.4s
umin v0.8h, v0.8h, v29.8h
.endif
st1 {v0.8h}, [\ldst], \d_strd
b.gt 8b
add \dst, \dst, #16
subs \w, \w, #8
add \src, \src, #16
b.gt 81b
ret x15
.align LOOP_ALIGN
84: // HV4 - 8x4, 8x2
mov \lsrc, \src
mov \ldst, \dst
mov w8, \h
bl L(\type\()_hv_filter8_\isa)
uzp1 v17.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v18.8h, v23.8h, v24.8h
bl L(\type\()_hv_filter8_\isa)
uzp1 v19.8h, v23.8h, v24.8h
.align LOOP_ALIGN
81:
ldp q24, q28, [\lsrc]
ldur q26, [\lsrc, #8]
add \lsrc, \lsrc, \s_strd
tbl v23.16b, {v24.16b}, v30.16b
tbl v24.16b, {v24.16b}, v31.16b
movi v2.2d, #0
movi v3.2d, #0
sdot z2.d, z23.h, z4.h[0]
sdot z3.d, z24.h, z4.h[0]
tbl v25.16b, {v26.16b}, v30.16b
tbl v26.16b, {v26.16b}, v31.16b
movi v23.2d, #0
movi v24.2d, #0
sdot z23.d, z25.h, z4.h[0]
sdot z24.d, z26.h, z4.h[0]
tbl v27.16b, {v28.16b}, v30.16b
tbl v28.16b, {v28.16b}, v31.16b
sdot z2.d, z25.h, z4.h[1]
sdot z3.d, z26.h, z4.h[1]
sdot z23.d, z27.h, z4.h[1]
sdot z24.d, z28.h, z4.h[1]
smull v0.4s, v17.4h, v7.h[2]
smull2 v1.4s, v17.8h, v7.h[2]
mov v17.16b, v18.16b
subs w8, w8, #1
uzp1 v3.4s, v2.4s, v3.4s
uzp1 v24.4s, v23.4s, v24.4s
smlal v0.4s, v18.4h, v7.h[3]
smlal2 v1.4s, v18.8h, v7.h[3]
mov v18.16b, v19.16b
srshl v23.4s, v3.4s, v5.4s
srshl v24.4s, v24.4s, v5.4s
smlal v0.4s, v19.4h, v7.h[4]
smlal2 v1.4s, v19.8h, v7.h[4]
uzp1 v19.8h, v23.8h, v24.8h
smlal v0.4s, v19.4h, v7.h[5]
smlal2 v1.4s, v19.8h, v7.h[5]
.ifc \type, prep
rshrn v0.4h, v0.4s, #6
rshrn2 v0.8h, v1.4s, #6
sub z0.h, z0.h, #PREP_BIAS
.else // put
srshl v0.4s, v0.4s, v6.4s
srshl v1.4s, v1.4s, v6.4s
sqxtun v0.4h, v0.4s
sqxtun2 v0.8h, v1.4s
umin v0.8h, v0.8h, v29.8h
.endif
st1 {v0.8h}, [\ldst], \d_strd
b.gt 81b
subs \w, \w, #8
add \dst, \dst, #16
add \src, \src, #16
b.gt 84b
ret x15
.align FUNC_ALIGN
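// Shared horizontal 8-tap helper: filters one 8-pixel row at \lsrc (advancing
// it by \s_strd) and leaves the two shifted 32-bit halves in v23/v24 for the
// caller to pack with uzp1. Clobbers v2-v3 and v25-v28.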
L(\type\()_hv_filter8_\isa):
ldp q24, q28, [\lsrc]
ldur q26, [\lsrc, #8]
add \lsrc, \lsrc, \s_strd
tbl v23.16b, {v24.16b}, v30.16b
tbl v24.16b, {v24.16b}, v31.16b
movi v2.2d, #0
movi v3.2d, #0
sdot z2.d, z23.h, z4.h[0]
sdot z3.d, z24.h, z4.h[0]
tbl v25.16b, {v26.16b}, v30.16b
tbl v26.16b, {v26.16b}, v31.16b
movi v23.2d, #0
movi v24.2d, #0
sdot z23.d, z25.h, z4.h[0]
sdot z24.d, z26.h, z4.h[0]
tbl v27.16b, {v28.16b}, v30.16b
tbl v28.16b, {v28.16b}, v31.16b
sdot z2.d, z25.h, z4.h[1]
sdot z3.d, z26.h, z4.h[1]
sdot z23.d, z27.h, z4.h[1]
sdot z24.d, z28.h, z4.h[1]
uzp1 v3.4s, v2.4s, v3.4s
uzp1 v24.4s, v23.4s, v24.4s
srshl v23.4s, v3.4s, v5.4s
srshl v24.4s, v24.4s, v5.4s
ret
.align FUNC_ALIGN
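// Shared horizontal 4-tap helper: filters one row at \src (advancing it by
// \s_strd) and returns the four shifted 32-bit results in v0, clobbering
// v1-v3.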
L(\type\()_hv_filter4_\isa):
ld1 {v3.16b}, [\src], \s_strd
tbl v2.16b, {v3.16b}, v30.16b
tbl v3.16b, {v3.16b}, v31.16b
movi v0.2d, #0
movi v1.2d, #0
sdot z0.d, z2.h, z4.h[0]
sdot z1.d, z3.h, z4.h[0]
uzp1 v0.4s, v0.4s, v1.4s
srshl v0.4s, v0.4s, v5.4s
ret
.align JUMP_ALIGN
46: // H4V6 - 4xN, put only: 2xN
.ifc \type, put
lsr \d_strd, \d_strd, #1 // hword index for `st1h`
whilelt p1.h, wzr, \w // masking for writes
.endif
ext v4.16b, v4.16b, v4.16b, #4 // [\xmx + 2 * 2]
add \src, \src, #4
cmp \h, #4
b.le 44f
sub \src, \src, \s_strd // src - 2 * s_strd - 1 * 2
bl L(\type\()_hv_filter4_\isa)
xtn v16.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v17.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v18.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v19.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v20.4h, v0.4s
.align LOOP_ALIGN
4:
ld1 {v3.16b}, [\src], \s_strd
smull v24.4s, v16.4h, v7.h[1]
smlal v24.4s, v17.4h, v7.h[2]
tbl v2.16b, {v3.16b}, v30.16b
tbl v3.16b, {v3.16b}, v31.16b
movi v0.2d, #0
movi v1.2d, #0
sdot z0.d, z2.h, z4.h[0]
sdot z1.d, z3.h, z4.h[0]
mov v16.16b, v17.16b
mov v17.16b, v18.16b
smlal v24.4s, v18.4h, v7.h[3]
smlal v24.4s, v19.4h, v7.h[4]
uzp1 v0.4s, v0.4s, v1.4s
mov v18.16b, v19.16b
mov v19.16b, v20.16b
subs \h, \h, #1
srshl v0.4s, v0.4s, v5.4s
smlal v24.4s, v20.4h, v7.h[5]
xtn v20.4h, v0.4s
smlal v24.4s, v20.4h, v7.h[6]
.ifc \type, prep
rshrn v0.4h, v24.4s, #6
sub z0.h, z0.h, #PREP_BIAS
str d0, [\dst], #8
.else // put
srshl v0.4s, v24.4s, v6.4s
sqxtun v0.4h, v0.4s
umin v0.4h, v0.4h, v29.4h
st1h {z0.h}, p1, [\dst]
add \dst, \dst, \d_strd, lsl #1
.endif
b.gt 4b
ret x15
.align JUMP_ALIGN
44: // H4V4 - 4x4, put only: 4x2, 2x4, 2x2
bl L(\type\()_hv_filter4_\isa)
xtn v17.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v18.4h, v0.4s
bl L(\type\()_hv_filter4_\isa)
xtn v19.4h, v0.4s
.align LOOP_ALIGN
4:
ld1 {v3.16b}, [\src], \s_strd
smull v24.4s, v17.4h, v7.h[2]
smlal v24.4s, v18.4h, v7.h[3]
tbl v2.16b, {v3.16b}, v30.16b
tbl v3.16b, {v3.16b}, v31.16b
movi v0.2d, #0
movi v1.2d, #0
sdot z0.d, z2.h, z4.h[0]
sdot z1.d, z3.h, z4.h[0]
uzp1 v0.4s, v0.4s, v1.4s
mov v17.16b, v18.16b
mov v18.16b, v19.16b
subs \h, \h, #1
srshl v0.4s, v0.4s, v5.4s
smlal v24.4s, v19.4h, v7.h[4]
xtn v19.4h, v0.4s
smlal v24.4s, v19.4h, v7.h[5]
.ifc \type, prep
rshrn v0.4h, v24.4s, #6
sub z0.h, z0.h, #PREP_BIAS
str d0, [\dst], #8
.else // put
srshl v0.4s, v24.4s, v6.4s
sqxtun v0.4h, v0.4s
umin v0.4h, v0.4h, v29.4h
st1h {z0.h}, p1, [\dst]
add \dst, \dst, \d_strd, lsl #1
.endif
b.gt 4b
ret x15
.align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
movrel x11, \type\()_8tap_h_\isa\()_tbl
ldrsw x12, [x11, x8, lsl #2]
.ifc \bdmax, w8
ldr \bdmax, [sp]
.endif
.ifc \type, prep
clz \bdmax, \bdmax
sub \bdmax, \bdmax, #24
dup v5.4s, \bdmax
.else // put
mov w9, #34 // rounding for 10-bit case
mov w10, #40 // rounding for 12-bit case
cmp \bdmax, #0xFFF
csel w9, w9, w10, ne // select rounding based on \bdmax
dup v5.8h, \bdmax
dup v6.2d, x9
.endif
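// Note that for put the 64-bit accumulators below are seeded with v6 instead
// of zero, so the final narrowing can use plain sqshrun #6 with the rounding
// already folded in.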
add x11, x11, x12
ld1sb {z4.h}, p0/z, [\xmx]
br x11
.align JUMP_ALIGN
20: // H - 4xN, put only: 2xN
40:
AARCH64_VALID_JUMP_TARGET
add \src, \src, #4 // src - 1 * 2
ext v4.16b, v4.16b, v4.16b, #4 // [\xmx + 2 * 2]
.ifc \type, put
lsr \d_strd, \d_strd, #1 // hword index for `st1h`
whilelt p1.h, wzr, \w // masking for writes
.endif
.align LOOP_ALIGN
4:
ldr q17, [\src]
ldr q19, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
.ifc \type, prep
movi v0.2d, #0
movi v1.2d, #0
movi v2.2d, #0
movi v3.2d, #0
.else
mov v0.16b, v6.16b
mov v1.16b, v6.16b
mov v2.16b, v6.16b
mov v3.16b, v6.16b
.endif
tbl v16.16b, {v17.16b}, v30.16b
tbl v17.16b, {v17.16b}, v31.16b
sdot z0.d, z16.h, z4.h[0]
sdot z1.d, z17.h, z4.h[0]
subs \h, \h, #2
tbl v18.16b, {v19.16b}, v30.16b
tbl v19.16b, {v19.16b}, v31.16b
sdot z2.d, z18.h, z4.h[0]
sdot z3.d, z19.h, z4.h[0]
uzp1 v0.4s, v0.4s, v1.4s
uzp1 v1.4s, v2.4s, v3.4s
.ifc \type, prep
srshl v0.4s, v0.4s, v5.4s
srshl v1.4s, v1.4s, v5.4s
uzp1 v0.8h, v0.8h, v1.8h
sub z0.h, z0.h, #PREP_BIAS
str q0, [\dst], #16
.else // put
sqshrun v0.4h, v0.4s, #6
sqshrun v1.4h, v1.4s, #6
umin v0.4h, v0.4h, v5.4h
umin v1.4h, v1.4h, v5.4h
st1h {z0.h}, p1, [\dst]
st1h {z1.h}, p1, [\dst, \d_strd, lsl #1]
add \dst, \dst, \d_strd, lsl #2
.endif
b.gt 4b
ret
.align JUMP_ALIGN
80: // H - 8xN
AARCH64_VALID_JUMP_TARGET
.align LOOP_ALIGN
8:
ldp q17, q21, [\src]
ldur q19, [\src, #8]
.ifc \type, prep
movi v0.2d, #0
movi v2.2d, #0
.else
mov v0.16b, v6.16b
mov v2.16b, v6.16b
.endif
tbl v16.16b, {v17.16b}, v30.16b
tbl v17.16b, {v17.16b}, v31.16b
add \src, \src, \s_strd
sdot z0.d, z16.h, z4.h[0]
sdot z2.d, z17.h, z4.h[0]
tbl v18.16b, {v19.16b}, v30.16b
tbl v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
movi v16.2d, #0
movi v17.2d, #0
.else
mov v16.16b, v6.16b
mov v17.16b, v6.16b
.endif
ldp q23, q27, [\src]
ldur q25, [\src, #8]
sdot z16.d, z18.h, z4.h[0]
sdot z17.d, z19.h, z4.h[0]
tbl v22.16b, {v23.16b}, v30.16b
tbl v23.16b, {v23.16b}, v31.16b
.ifc \type, prep
movi v1.2d, #0
movi v3.2d, #0
.else
mov v1.16b, v6.16b
mov v3.16b, v6.16b
.endif
add \src, \src, \s_strd
sdot z1.d, z22.h, z4.h[0]
sdot z3.d, z23.h, z4.h[0]
tbl v24.16b, {v25.16b}, v30.16b
tbl v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
movi v22.2d, #0
movi v23.2d, #0
.else
mov v22.16b, v6.16b
mov v23.16b, v6.16b
.endif
sdot z22.d, z24.h, z4.h[0]
sdot z23.d, z25.h, z4.h[0]
tbl v20.16b, {v21.16b}, v30.16b
tbl v21.16b, {v21.16b}, v31.16b
sdot z0.d, z18.h, z4.h[1]
sdot z2.d, z19.h, z4.h[1]
tbl v26.16b, {v27.16b}, v30.16b
tbl v27.16b, {v27.16b}, v31.16b
sdot z16.d, z20.h, z4.h[1]
sdot z17.d, z21.h, z4.h[1]
sdot z1.d, z24.h, z4.h[1]
sdot z3.d, z25.h, z4.h[1]
sdot z22.d, z26.h, z4.h[1]
sdot z23.d, z27.h, z4.h[1]
subs \h, \h, #2
uzp1 v0.4s, v0.4s, v2.4s
uzp1 v2.4s, v16.4s, v17.4s
uzp1 v1.4s, v1.4s, v3.4s
uzp1 v3.4s, v22.4s, v23.4s
.ifc \type, prep
srshl v0.4s, v0.4s, v5.4s
srshl v2.4s, v2.4s, v5.4s
srshl v1.4s, v1.4s, v5.4s
srshl v3.4s, v3.4s, v5.4s
uzp1 v0.8h, v0.8h, v2.8h
uzp1 v1.8h, v1.8h, v3.8h
sub z0.h, z0.h, #PREP_BIAS
sub z1.h, z1.h, #PREP_BIAS
stp q0, q1, [\dst], #32
.else // put
sqshrun v0.4h, v0.4s, #6
sqshrun2 v0.8h, v2.4s, #6
sqshrun v1.4h, v1.4s, #6
sqshrun2 v1.8h, v3.4s, #6
umin v0.8h, v0.8h, v5.8h
umin v1.8h, v1.8h, v5.8h
st1 {v0.16b}, [\dst], \d_strd
st1 {v1.16b}, [\dst], \d_strd
.endif
b.gt 8b
ret
.align JUMP_ALIGN
160: // H - 16xN
AARCH64_VALID_JUMP_TARGET
.align LOOP_ALIGN
16:
ldp q17, q21, [\src]
ldur q19, [\src, #8]
.ifc \type, prep
movi v0.2d, #0
movi v2.2d, #0
.else
mov v0.16b, v6.16b
mov v2.16b, v6.16b
.endif
tbl v16.16b, {v17.16b}, v30.16b
tbl v17.16b, {v17.16b}, v31.16b
sdot z0.d, z16.h, z4.h[0]
sdot z2.d, z17.h, z4.h[0]
tbl v18.16b, {v19.16b}, v30.16b
tbl v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
movi v16.2d, #0
movi v17.2d, #0
.else
mov v16.16b, v6.16b
mov v17.16b, v6.16b
.endif
ldur q25, [\src, #24]
ldr q27, [\src, #32]
sdot z16.d, z18.h, z4.h[0]
sdot z17.d, z19.h, z4.h[0]
tbl v22.16b, {v21.16b}, v30.16b
tbl v23.16b, {v21.16b}, v31.16b
.ifc \type, prep
movi v1.2d, #0
movi v3.2d, #0
.else
mov v1.16b, v6.16b
mov v3.16b, v6.16b
.endif
add \src, \src, \s_strd
sdot z1.d, z22.h, z4.h[0]
sdot z3.d, z23.h, z4.h[0]
tbl v24.16b, {v25.16b}, v30.16b
tbl v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
movi v22.2d, #0
movi v23.2d, #0
.else
mov v22.16b, v6.16b
mov v23.16b, v6.16b
.endif
sdot z22.d, z24.h, z4.h[0]
sdot z23.d, z25.h, z4.h[0]
tbl v20.16b, {v21.16b}, v30.16b
tbl v21.16b, {v21.16b}, v31.16b
sdot z0.d, z18.h, z4.h[1]
sdot z2.d, z19.h, z4.h[1]
tbl v26.16b, {v27.16b}, v30.16b
tbl v27.16b, {v27.16b}, v31.16b
sdot z16.d, z20.h, z4.h[1]
sdot z17.d, z21.h, z4.h[1]
sdot z1.d, z24.h, z4.h[1]
sdot z3.d, z25.h, z4.h[1]
sdot z22.d, z26.h, z4.h[1]
sdot z23.d, z27.h, z4.h[1]
subs \h, \h, #1
uzp1 v0.4s, v0.4s, v2.4s
uzp1 v2.4s, v16.4s, v17.4s
uzp1 v1.4s, v1.4s, v3.4s
uzp1 v3.4s, v22.4s, v23.4s
.ifc \type, prep
srshl v0.4s, v0.4s, v5.4s
srshl v2.4s, v2.4s, v5.4s
srshl v1.4s, v1.4s, v5.4s
srshl v3.4s, v3.4s, v5.4s
uzp1 v0.8h, v0.8h, v2.8h
uzp1 v1.8h, v1.8h, v3.8h
sub z0.h, z0.h, #PREP_BIAS
sub z1.h, z1.h, #PREP_BIAS
stp q0, q1, [\dst], #32
.else // put
sqshrun v0.4h, v0.4s, #6
sqshrun2 v0.8h, v2.4s, #6
sqshrun v1.4h, v1.4s, #6
sqshrun2 v1.8h, v3.4s, #6
umin v0.8h, v0.8h, v5.8h
umin v1.8h, v1.8h, v5.8h
st1 {v0.16b, v1.16b}, [\dst], \d_strd
.endif
b.gt 16b
ret
.align JUMP_ALIGN
320: // H - 32xN+
640:
1280:
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
sub \d_strd, \d_strd, \w, uxtw #1
.endif
sub \s_strd, \s_strd, \w, uxtw #1
mov w8, \w
.align LOOP_ALIGN
32:
ldp q17, q21, [\src]
ldur q19, [\src, #8]
.ifc \type, prep
movi v0.2d, #0
movi v2.2d, #0
.else
mov v0.16b, v6.16b
mov v2.16b, v6.16b
.endif
tbl v16.16b, {v17.16b}, v30.16b
tbl v17.16b, {v17.16b}, v31.16b
sdot z0.d, z16.h, z4.h[0]
sdot z2.d, z17.h, z4.h[0]
tbl v18.16b, {v19.16b}, v30.16b
tbl v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
movi v16.2d, #0
movi v17.2d, #0
.else
mov v16.16b, v6.16b
mov v17.16b, v6.16b
.endif
ldur q25, [\src, #24]
sdot z16.d, z18.h, z4.h[0]
sdot z17.d, z19.h, z4.h[0]
ldr q27, [\src, #32]!
tbl v22.16b, {v21.16b}, v30.16b
tbl v23.16b, {v21.16b}, v31.16b
.ifc \type, prep
movi v1.2d, #0
movi v3.2d, #0
.else
mov v1.16b, v6.16b
mov v3.16b, v6.16b
.endif
sdot z1.d, z22.h, z4.h[0]
sdot z3.d, z23.h, z4.h[0]
tbl v24.16b, {v25.16b}, v30.16b
tbl v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
movi v22.2d, #0
movi v23.2d, #0
.else
mov v22.16b, v6.16b
mov v23.16b, v6.16b
.endif
sdot z22.d, z24.h, z4.h[0]
sdot z23.d, z25.h, z4.h[0]
tbl v20.16b, {v21.16b}, v30.16b
tbl v21.16b, {v21.16b}, v31.16b
sdot z0.d, z18.h, z4.h[1]
sdot z2.d, z19.h, z4.h[1]
tbl v26.16b, {v27.16b}, v30.16b
tbl v27.16b, {v27.16b}, v31.16b
sdot z16.d, z20.h, z4.h[1]
sdot z17.d, z21.h, z4.h[1]
sdot z1.d, z24.h, z4.h[1]
sdot z3.d, z25.h, z4.h[1]
sdot z22.d, z26.h, z4.h[1]
sdot z23.d, z27.h, z4.h[1]
subs w8, w8, #16
uzp1 v0.4s, v0.4s, v2.4s
uzp1 v2.4s, v16.4s, v17.4s
uzp1 v1.4s, v1.4s, v3.4s
uzp1 v3.4s, v22.4s, v23.4s
.ifc \type, prep
srshl v0.4s, v0.4s, v5.4s
srshl v2.4s, v2.4s, v5.4s
srshl v1.4s, v1.4s, v5.4s
srshl v3.4s, v3.4s, v5.4s
uzp1 v0.8h, v0.8h, v2.8h
uzp1 v1.8h, v1.8h, v3.8h
sub z0.h, z0.h, #PREP_BIAS
sub z1.h, z1.h, #PREP_BIAS
.else // put
sqshrun v0.4h, v0.4s, #6
sqshrun2 v0.8h, v2.4s, #6
sqshrun v1.4h, v1.4s, #6
sqshrun2 v1.8h, v3.4s, #6
umin v0.8h, v0.8h, v5.8h
umin v1.8h, v1.8h, v5.8h
.endif
stp q0, q1, [\dst], #32
b.gt 32b
add \src, \src, \s_strd
.ifc \type, put
add \dst, \dst, \d_strd
.endif
subs \h, \h, #1
mov w8, \w
b.gt 32b
ret
endfunc
jumptable \type\()_8tap_h_\isa\()_tbl
.word 1280b - \type\()_8tap_h_\isa\()_tbl
.word 640b - \type\()_8tap_h_\isa\()_tbl
.word 320b - \type\()_8tap_h_\isa\()_tbl
.word 160b - \type\()_8tap_h_\isa\()_tbl
.word 80b - \type\()_8tap_h_\isa\()_tbl
.word 40b - \type\()_8tap_h_\isa\()_tbl
.ifc \type, put
.word 20b - \type\()_8tap_h_\isa\()_tbl
.endif
endjumptable
.endm
function prep_sve
movrel x9, prep_tbl
mov w6, #19
ldrsw x8, [x9, x8, lsl #2]
sub w6, w6, w7, lsr #8 // 19 - bdmax / 256
add x9, x9, x8
movi v30.8h, #PREP_BIAS_NEG
dup v29.8h, w6 // 10b: 1 << 4, 12b: 1 << 2
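// e.g. 10-bit: w7 == 0x3FF, so w6 == 19 - 3 == 16 and the predicated `mad`
// below computes px * 16 - 8192, i.e. (px << intermediate_bits) - PREP_BIAS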
br x9
.align JUMP_ALIGN
40: // prep - 4xN
AARCH64_VALID_JUMP_TARGET
.align LOOP_ALIGN
4:
ldr d0, [x1]
ldr d1, [x1, x2]
add x1, x1, x2, lsl #1
subs w4, w4, #2
mad z0.h, p0/m, z29.h, z30.h
mad z1.h, p0/m, z29.h, z30.h
stp d0, d1, [x0], #16
b.gt 4b
ret
.align JUMP_ALIGN
80: // prep - 8xN
AARCH64_VALID_JUMP_TARGET
.align LOOP_ALIGN
8:
ld1 {v0.8h}, [x1], x2
ld1 {v1.8h}, [x1], x2
subs w4, w4, #2
mad z0.h, p0/m, z29.h, z30.h
mad z1.h, p0/m, z29.h, z30.h
stp q0, q1, [x0], #32
b.gt 8b
ret
.align JUMP_ALIGN
160: // prep - 16xN
AARCH64_VALID_JUMP_TARGET
.align LOOP_ALIGN
16:
ld1 {v0.8h, v1.8h}, [x1], x2
mad z0.h, p0/m, z29.h, z30.h
mad z1.h, p0/m, z29.h, z30.h
subs w4, w4, #2
ld1 {v2.8h, v3.8h}, [x1], x2
mad z2.h, p0/m, z29.h, z30.h
mad z3.h, p0/m, z29.h, z30.h
stp q0, q1, [x0]
stp q2, q3, [x0, #32]
add x0, x0, #64
b.gt 16b
ret
.align JUMP_ALIGN
320: // prep - 32xN
AARCH64_VALID_JUMP_TARGET
.align LOOP_ALIGN
32:
ldp q0, q1, [x1]
mad z0.h, p0/m, z29.h, z30.h
mad z1.h, p0/m, z29.h, z30.h
ldp q2, q3, [x1, #32]
subs w4, w4, #1
mad z2.h, p0/m, z29.h, z30.h
mad z3.h, p0/m, z29.h, z30.h
add x1, x1, x2
stp q0, q1, [x0]
stp q2, q3, [x0, #32]
add x0, x0, #64
b.gt 32b
ret
.align JUMP_ALIGN
640: // prep - 64xN
AARCH64_VALID_JUMP_TARGET
.align LOOP_ALIGN
64:
ldp q0, q1, [x1]
mad z0.h, p0/m, z29.h, z30.h
mad z1.h, p0/m, z29.h, z30.h
ldp q2, q3, [x1, #32]
mad z2.h, p0/m, z29.h, z30.h
mad z3.h, p0/m, z29.h, z30.h
ldp q4, q5, [x1, #64]
mad z4.h, p0/m, z29.h, z30.h
mad z5.h, p0/m, z29.h, z30.h
ldp q6, q7, [x1, #96]
add x1, x1, x2
subs w4, w4, #1
mad z6.h, p0/m, z29.h, z30.h
mad z7.h, p0/m, z29.h, z30.h
stp q0, q1, [x0]
stp q2, q3, [x0, #32]
stp q4, q5, [x0, #64]
stp q6, q7, [x0, #96]
add x0, x0, #128
b.gt 64b
ret
.align JUMP_ALIGN
1280: // prep - 128xN
AARCH64_VALID_JUMP_TARGET
.align LOOP_ALIGN
128:
ldp q0, q1, [x1]
mad z0.h, p0/m, z29.h, z30.h
mad z1.h, p0/m, z29.h, z30.h
ldp q2, q3, [x1, #32]
mad z2.h, p0/m, z29.h, z30.h
mad z3.h, p0/m, z29.h, z30.h
ldp q4, q5, [x1, #64]
mad z4.h, p0/m, z29.h, z30.h
mad z5.h, p0/m, z29.h, z30.h
ldp q6, q7, [x1, #96]
mad z6.h, p0/m, z29.h, z30.h
mad z7.h, p0/m, z29.h, z30.h
ldp q16, q17, [x1, #128]
mad z16.h, p0/m, z29.h, z30.h
mad z17.h, p0/m, z29.h, z30.h
ldp q18, q19, [x1, #160]
mad z18.h, p0/m, z29.h, z30.h
mad z19.h, p0/m, z29.h, z30.h
ldp q20, q21, [x1, #192]
mad z20.h, p0/m, z29.h, z30.h
mad z21.h, p0/m, z29.h, z30.h
ldp q22, q23, [x1, #224]
add x1, x1, x2
mad z22.h, p0/m, z29.h, z30.h
mad z23.h, p0/m, z29.h, z30.h
subs w4, w4, #1
stp q0, q1, [x0]
stp q2, q3, [x0, #32]
stp q4, q5, [x0, #64]
stp q6, q7, [x0, #96]
stp q16, q17, [x0, #128]
stp q18, q19, [x0, #160]
stp q20, q21, [x0, #192]
stp q22, q23, [x0, #224]
add x0, x0, #256
b.gt 128b
ret
endfunc
jumptable prep_tbl
.word 1280b - prep_tbl
.word 640b - prep_tbl
.word 320b - prep_tbl
.word 160b - prep_tbl
.word 80b - prep_tbl
.word 40b - prep_tbl
endjumptable
// dst(x0), d_strd(x9), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6), bdmax(w7)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w9), ws_strd(w2)
filter_8tap_fn prep, sve2, x0, x9, x1, x2, w3, w4, w5, w6, w7, x5, x6, x5, x6, w9, w2
// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7), bdmax(w8)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1), ws_strd(w3)
filter_8tap_fn put, sve2, x0, x1, x2, x3, w4, w5, w6, w7, w8, x6, x7, x6, x7, w1, w3
DISABLE_SVE2
DISABLE_SVE
#endif // HAVE_SVE2