/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Janne Grunau
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
#define PREP_BIAS 8192
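// The avg/w_avg/mask macros below composite two intermediate (prep) buffers
// into pixels. As a hedged reference, roughly following dav1d's C templates
// (iclip_pixel and intermediate_bits are names from that reference, not
// defined here), avg computes per pixel:
//   sh  = intermediate_bits + 1
//   rnd = (1 << intermediate_bits) + 2*PREP_BIAS
//   dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh)
// In the macro, v28 holds -rnd and v29 holds -sh: smax/sqsub add the rounding
// with saturation (which also gives the clamp to [0, bitdepth_max] for free
// after the shift), and sshl by a negative amount is the arithmetic shift.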
.macro avg d0, d1, t0, t1, t2, t3
ld1 {\t0\().8h,\t1\().8h}, [x2], #32
ld1 {\t2\().8h,\t3\().8h}, [x3], #32
sqadd \t0\().8h, \t0\().8h, \t2\().8h
sqadd \t1\().8h, \t1\().8h, \t3\().8h
smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1)
sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1)
.endm
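// w_avg: weighted variant of avg; the weight applies to tmp1 and lies in
// [0, 16]. Hedged per-pixel sketch (rnd is the matching rounding constant):
//   dst[x] = iclip_pixel((tmp1[x]*weight + tmp2[x]*(16 - weight) + rnd)
//                        >> (intermediate_bits + 4))
// The macro computes this as tmp2 + (((tmp2 - tmp1) * -weight) >> 4), with
// v27 = -weight, before applying the bias, shift and clamp.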
.macro w_avg d0, d1, t0, t1, t2, t3
ld1 {\t0\().8h,\t1\().8h}, [x2], #32
ld1 {\t2\().8h,\t3\().8h}, [x3], #32
// This difference requires a 17 bit range, and all bits are
// significant for the following multiplication.
ssubl \d0\().4s, \t2\().4h, \t0\().4h
ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
ssubl \d1\().4s, \t3\().4h, \t1\().4h
ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
mul \d0\().4s, \d0\().4s, v27.4s
mul \t0\().4s, \t0\().4s, v27.4s
mul \d1\().4s, \d1\().4s, v27.4s
mul \t1\().4s, \t1\().4s, v27.4s
sshr \d0\().4s, \d0\().4s, #4
sshr \t0\().4s, \t0\().4s, #4
sshr \d1\().4s, \d1\().4s, #4
sshr \t1\().4s, \t1\().4s, #4
saddw \d0\().4s, \d0\().4s, \t2\().4h
saddw2 \t0\().4s, \t0\().4s, \t2\().8h
saddw \d1\().4s, \d1\().4s, \t3\().4h
saddw2 \t1\().4s, \t1\().4s, \t3\().8h
uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
smax \d0\().8h, \d0\().8h, v30.8h // 0
smax \d1\().8h, \d1\().8h, v30.8h // 0
.endm
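// mask: like w_avg, but with a per-pixel 6 bit weight m in [0, 64] loaded
// from the mask buffer (x6). Hedged per-pixel sketch:
//   dst[x] = iclip_pixel((tmp1[x]*m + tmp2[x]*(64 - m) + rnd)
//                        >> (intermediate_bits + 6))
// The mask is negated and widened to 32 bit so the (tmp2 - tmp1) * -m
// products have enough headroom.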
.macro mask d0, d1, t0, t1, t2, t3
ld1 {v27.16b}, [x6], #16
ld1 {\t0\().8h,\t1\().8h}, [x2], #32
neg v27.16b, v27.16b
ld1 {\t2\().8h,\t3\().8h}, [x3], #32
sxtl v26.8h, v27.8b
sxtl2 v27.8h, v27.16b
sxtl v24.4s, v26.4h
sxtl2 v25.4s, v26.8h
sxtl v26.4s, v27.4h
sxtl2 v27.4s, v27.8h
ssubl \d0\().4s, \t2\().4h, \t0\().4h
ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
ssubl \d1\().4s, \t3\().4h, \t1\().4h
ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
mul \d0\().4s, \d0\().4s, v24.4s
mul \t0\().4s, \t0\().4s, v25.4s
mul \d1\().4s, \d1\().4s, v26.4s
mul \t1\().4s, \t1\().4s, v27.4s
sshr \d0\().4s, \d0\().4s, #6
sshr \t0\().4s, \t0\().4s, #6
sshr \d1\().4s, \d1\().4s, #6
sshr \t1\().4s, \t1\().4s, #6
saddw \d0\().4s, \d0\().4s, \t2\().4h
saddw2 \t0\().4s, \t0\().4s, \t2\().8h
saddw \d1\().4s, \d1\().4s, \t3\().4h
saddw2 \t1\().4s, \t1\().4s, \t3\().8h
uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
smax \d0\().8h, \d0\().8h, v30.8h // 0
smax \d1\().8h, \d1\().8h, v30.8h // 0
.endm
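// bidir_fn instantiates the macros above as exported functions. The argument
// layout assumed here follows the dav1d mc signatures:
//   x0 = dst, x1 = dst_stride, x2 = tmp1, x3 = tmp2, w4 = w, w5 = h,
//   w6 = weight (w_avg) or x6 = mask pointer (mask), \bdmax = bitdepth_max.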
.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
clz w4, w4
.ifnc \type, avg
dup v31.8h, \bdmax // bitdepth_max
movi v30.8h, #0
.endif
clz w7, \bdmax
sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18
.ifc \type, avg
mov w9, #1
mov w8, #-2*PREP_BIAS
lsl w9, w9, w7 // 1 << intermediate_bits
add w7, w7, #1
sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits
neg w7, w7 // -(intermediate_bits+1)
dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits
dup v29.8h, w7 // -(intermediate_bits+1)
.else
mov w8, #PREP_BIAS
lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits
neg w7, w7 // -intermediate_bits
dup v28.8h, w8 // PREP_BIAS >> intermediate_bits
dup v29.8h, w7 // -intermediate_bits
.endif
.ifc \type, w_avg
dup v27.4s, w6
neg v27.4s, v27.4s
.endif
movrel x7, \type\()_tbl
sub w4, w4, #24
\type v4, v5, v0, v1, v2, v3
ldrsw x4, [x7, x4, lsl #2]
add x7, x7, x4
br x7
40:
AARCH64_VALID_JUMP_TARGET
add x7, x0, x1
lsl x1, x1, #1
4:
subs w5, w5, #4
st1 {v4.8b}, [x0], x1
st1 {v4.d}[1], [x7], x1
st1 {v5.8b}, [x0], x1
st1 {v5.d}[1], [x7], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 4b
80:
AARCH64_VALID_JUMP_TARGET
add x7, x0, x1
lsl x1, x1, #1
8:
st1 {v4.8h}, [x0], x1
subs w5, w5, #2
st1 {v5.8h}, [x7], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 8b
160:
AARCH64_VALID_JUMP_TARGET
16:
\type v6, v7, v0, v1, v2, v3
st1 {v4.8h, v5.8h}, [x0], x1
subs w5, w5, #2
st1 {v6.8h, v7.8h}, [x0], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 16b
320:
AARCH64_VALID_JUMP_TARGET
32:
\type v6, v7, v0, v1, v2, v3
subs w5, w5, #1
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 32b
640:
AARCH64_VALID_JUMP_TARGET
add x7, x0, #64
64:
\type v6, v7, v0, v1, v2, v3
\type v16, v17, v0, v1, v2, v3
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
\type v18, v19, v0, v1, v2, v3
subs w5, w5, #1
st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 64b
1280:
AARCH64_VALID_JUMP_TARGET
add x7, x0, #64
mov x8, #128
sub x1, x1, #128
128:
\type v6, v7, v0, v1, v2, v3
\type v16, v17, v0, v1, v2, v3
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8
\type v18, v19, v0, v1, v2, v3
st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
\type v4, v5, v0, v1, v2, v3
\type v6, v7, v0, v1, v2, v3
\type v16, v17, v0, v1, v2, v3
subs w5, w5, #1
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
\type v18, v19, v0, v1, v2, v3
st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 128b
0:
ret
endfunc
jumptable \type\()_tbl
.word 1280b - \type\()_tbl
.word 640b - \type\()_tbl
.word 320b - \type\()_tbl
.word 160b - \type\()_tbl
.word 80b - \type\()_tbl
.word 40b - \type\()_tbl
endjumptable
.endm
bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7
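// w_mask_444/422/420: blend tmp1/tmp2 like mask above, but derive the
// per-pixel weight from the sample difference and also store the resulting
// mask (full resolution for 444, halved horizontally for 422, halved in both
// directions for 420; w7 carries the sign used by the subsampled variants).
// Hedged sketch of the weight, following the C reference:
//   m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64)
// The code below tracks 64-m instead, computed as (27615 - sat(abs_diff)) >> 10,
// which folds the 38 offset, mask_rnd and the clamp at 64 into a single
// uqsub + ushr pair.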
.macro w_mask_fn type
function w_mask_\type\()_16bpc_neon, export=1
ldr w8, [sp]
clz w9, w4
movrel x10, w_mask_\type\()_tbl
dup v31.8h, w8 // bitdepth_max
sub w9, w9, #24
clz w8, w8 // clz(bitdepth_max)
ldrsw x9, [x10, x9, lsl #2]
add x10, x10, x9
sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
mov w9, #PREP_BIAS*64
neg w8, w8 // -sh
mov w11, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
dup v30.4s, w9 // PREP_BIAS*64
dup v29.4s, w8 // -sh
dup v0.8h, w11
.if \type == 444
movi v1.16b, #64
.elseif \type == 422
dup v2.8b, w7
movi v3.8b, #129
sub v3.8b, v3.8b, v2.8b
.elseif \type == 420
dup v2.8h, w7
movi v3.8h, #1, lsl #8
sub v3.8h, v3.8h, v2.8h
.endif
add x12, x0, x1
lsl x1, x1, #1
br x10
40:
AARCH64_VALID_JUMP_TARGET
4:
ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
subs w5, w5, #4
sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
sabd v21.8h, v5.8h, v7.8h
ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
ssubl2 v17.4s, v6.8h, v4.8h
ssubl v18.4s, v7.4h, v5.4h
ssubl2 v19.4s, v7.8h, v5.8h
uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
uqsub v21.8h, v0.8h, v21.8h
sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
sshll v6.4s, v5.4h, #6
sshll2 v5.4s, v4.8h, #6
sshll v4.4s, v4.4h, #6
ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
ushr v21.8h, v21.8h, #10
add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
add v5.4s, v5.4s, v30.4s
add v6.4s, v6.4s, v30.4s
add v7.4s, v7.4s, v30.4s
uxtl v22.4s, v20.4h
uxtl2 v23.4s, v20.8h
uxtl v24.4s, v21.4h
uxtl2 v25.4s, v21.8h
mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
mla v5.4s, v17.4s, v23.4s
mla v6.4s, v18.4s, v24.4s
mla v7.4s, v19.4s, v25.4s
srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
srshl v5.4s, v5.4s, v29.4s
srshl v6.4s, v6.4s, v29.4s
srshl v7.4s, v7.4s, v29.4s
sqxtun v4.4h, v4.4s // iclip_pixel
sqxtun2 v4.8h, v5.4s
sqxtun v5.4h, v6.4s
sqxtun2 v5.8h, v7.4s
umin v4.8h, v4.8h, v31.8h // iclip_pixel
umin v5.8h, v5.8h, v31.8h
.if \type == 444
uzp1 v20.16b, v20.16b, v21.16b // 64 - m
sub v20.16b, v1.16b, v20.16b // m
st1 {v20.16b}, [x6], #16
.elseif \type == 422
addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
xtn v20.8b, v20.8h
uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
st1 {v20.8b}, [x6], #8
.elseif \type == 420
trn1 v24.2d, v20.2d, v21.2d
trn2 v25.2d, v20.2d, v21.2d
add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition)
addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition)
sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
str s20, [x6], #4
.endif
st1 {v4.8b}, [x0], x1
st1 {v4.d}[1], [x12], x1
st1 {v5.8b}, [x0], x1
st1 {v5.d}[1], [x12], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
8:
ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2
subs w5, w5, #2
sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
sabd v21.8h, v5.8h, v7.8h
ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
ssubl2 v17.4s, v6.8h, v4.8h
ssubl v18.4s, v7.4h, v5.4h
ssubl2 v19.4s, v7.8h, v5.8h
uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
uqsub v21.8h, v0.8h, v21.8h
sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
sshll v6.4s, v5.4h, #6
sshll2 v5.4s, v4.8h, #6
sshll v4.4s, v4.4h, #6
ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
ushr v21.8h, v21.8h, #10
add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
add v5.4s, v5.4s, v30.4s
add v6.4s, v6.4s, v30.4s
add v7.4s, v7.4s, v30.4s
uxtl v22.4s, v20.4h
uxtl2 v23.4s, v20.8h
uxtl v24.4s, v21.4h
uxtl2 v25.4s, v21.8h
mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
mla v5.4s, v17.4s, v23.4s
mla v6.4s, v18.4s, v24.4s
mla v7.4s, v19.4s, v25.4s
srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
srshl v5.4s, v5.4s, v29.4s
srshl v6.4s, v6.4s, v29.4s
srshl v7.4s, v7.4s, v29.4s
sqxtun v4.4h, v4.4s // iclip_pixel
sqxtun2 v4.8h, v5.4s
sqxtun v5.4h, v6.4s
sqxtun2 v5.8h, v7.4s
umin v4.8h, v4.8h, v31.8h // iclip_pixel
umin v5.8h, v5.8h, v31.8h
.if \type == 444
uzp1 v20.16b, v20.16b, v21.16b // 64 - m
sub v20.16b, v1.16b, v20.16b // m
st1 {v20.16b}, [x6], #16
.elseif \type == 422
addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
xtn v20.8b, v20.8h
uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
st1 {v20.8b}, [x6], #8
.elseif \type == 420
add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition)
addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition)
sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
str s20, [x6], #4
.endif
st1 {v4.8h}, [x0], x1
st1 {v5.8h}, [x12], x1
b.gt 8b
ret
1280:
640:
320:
160:
AARCH64_VALID_JUMP_TARGET
mov w11, w4
sub x1, x1, w4, uxtw #1
.if \type == 444
add x10, x6, w4, uxtw
.elseif \type == 422
add x10, x6, x11, lsr #1
.endif
add x9, x3, w4, uxtw #1
add x7, x2, w4, uxtw #1
161:
mov w8, w4
16:
ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2
ld1 {v6.8h, v7.8h}, [x7], #32
ld1 {v18.8h, v19.8h}, [x9], #32
subs w8, w8, #16
sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2)
sabd v21.8h, v5.8h, v17.8h
ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
ssubl2 v23.4s, v16.8h, v4.8h
ssubl v24.4s, v17.4h, v5.4h
ssubl2 v25.4s, v17.8h, v5.8h
uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
uqsub v21.8h, v0.8h, v21.8h
sshll2 v27.4s, v5.8h, #6 // tmp1 << 6
sshll v26.4s, v5.4h, #6
sshll2 v5.4s, v4.8h, #6
sshll v4.4s, v4.4h, #6
ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
ushr v21.8h, v21.8h, #10
add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
add v5.4s, v5.4s, v30.4s
add v26.4s, v26.4s, v30.4s
add v27.4s, v27.4s, v30.4s
uxtl v16.4s, v20.4h
uxtl2 v17.4s, v20.8h
uxtl v28.4s, v21.4h
mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m)
uxtl2 v16.4s, v21.8h
mla v5.4s, v23.4s, v17.4s
mla v26.4s, v24.4s, v28.4s
mla v27.4s, v25.4s, v16.4s
srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
srshl v5.4s, v5.4s, v29.4s
srshl v26.4s, v26.4s, v29.4s
srshl v27.4s, v27.4s, v29.4s
sqxtun v4.4h, v4.4s // iclip_pixel
sqxtun2 v4.8h, v5.4s
sqxtun v5.4h, v26.4s
sqxtun2 v5.8h, v27.4s
// Start of other half
sabd v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2)
sabd v23.8h, v7.8h, v19.8h
umin v4.8h, v4.8h, v31.8h // iclip_pixel
umin v5.8h, v5.8h, v31.8h
ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit)
ssubl2 v17.4s, v18.8h, v6.8h
ssubl v18.4s, v19.4h, v7.4h
ssubl2 v19.4s, v19.8h, v7.8h
uqsub v22.8h, v0.8h, v22.8h // 27615 - abs()
uqsub v23.8h, v0.8h, v23.8h
sshll v24.4s, v6.4h, #6 // tmp1 << 6
sshll2 v25.4s, v6.8h, #6
sshll v26.4s, v7.4h, #6
sshll2 v27.4s, v7.8h, #6
ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
ushr v23.8h, v23.8h, #10
add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64
add v25.4s, v25.4s, v30.4s
add v26.4s, v26.4s, v30.4s
add v27.4s, v27.4s, v30.4s
uxtl v6.4s, v22.4h
uxtl2 v7.4s, v22.8h
uxtl v28.4s, v23.4h
mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m)
uxtl2 v6.4s, v23.8h
mla v25.4s, v17.4s, v7.4s
mla v26.4s, v18.4s, v28.4s
mla v27.4s, v19.4s, v6.4s
srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
srshl v25.4s, v25.4s, v29.4s
srshl v26.4s, v26.4s, v29.4s
srshl v27.4s, v27.4s, v29.4s
sqxtun v6.4h, v24.4s // iclip_pixel
sqxtun2 v6.8h, v25.4s
sqxtun v7.4h, v26.4s
sqxtun2 v7.8h, v27.4s
umin v6.8h, v6.8h, v31.8h // iclip_pixel
umin v7.8h, v7.8h, v31.8h
.if \type == 444
uzp1 v20.16b, v20.16b, v21.16b // 64 - m
uzp1 v21.16b, v22.16b, v23.16b
sub v20.16b, v1.16b, v20.16b // m
sub v21.16b, v1.16b, v21.16b
st1 {v20.16b}, [x6], #16
st1 {v21.16b}, [x10], #16
.elseif \type == 422
addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
addp v21.8h, v22.8h, v23.8h
xtn v20.8b, v20.8h
xtn v21.8b, v21.8h
uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
uhsub v21.8b, v3.8b, v21.8b
st1 {v20.8b}, [x6], #8
st1 {v21.8b}, [x10], #8
.elseif \type == 420
add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition)
add v21.8h, v21.8h, v23.8h
addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition)
sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n))
rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
st1 {v20.8b}, [x6], #8
.endif
st1 {v4.8h, v5.8h}, [x0], #32
st1 {v6.8h, v7.8h}, [x12], #32
b.gt 16b
subs w5, w5, #2
add x2, x2, w4, uxtw #1
add x3, x3, w4, uxtw #1
add x7, x7, w4, uxtw #1
add x9, x9, w4, uxtw #1
.if \type == 444
add x6, x6, w4, uxtw
add x10, x10, w4, uxtw
.elseif \type == 422
add x6, x6, x11, lsr #1
add x10, x10, x11, lsr #1
.endif
add x0, x0, x1
add x12, x12, x1
b.gt 161b
ret
endfunc
jumptable w_mask_\type\()_tbl
.word 1280b - w_mask_\type\()_tbl
.word 640b - w_mask_\type\()_tbl
.word 320b - w_mask_\type\()_tbl
.word 160b - w_mask_\type\()_tbl
.word 80b - w_mask_\type\()_tbl
.word 40b - w_mask_\type\()_tbl
endjumptable
.endm
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420
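// blend: masked blend of dst with a tmp buffer using a per-pixel weight m in
// [0, 64]; per pixel this is dst[x] = (dst[x]*(64 - m) + tmp[x]*m + 32) >> 6.
// Instead of widening, the code uses sqrdmulh with -m << 9:
//   sqrdmulh(a - b, -m << 9) = ((b - a)*m + 32) >> 6
// and adds the result back onto a (= dst). The same trick is reused by
// blend_h and blend_v below.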
function blend_16bpc_neon, export=1
movrel x6, blend_tbl
clz w3, w3
sub w3, w3, #26
ldrsw x3, [x6, x3, lsl #2]
add x6, x6, x3
add x8, x0, x1
br x6
40:
AARCH64_VALID_JUMP_TARGET
lsl x1, x1, #1
4:
ld1 {v2.8b}, [x5], #8
ld1 {v1.8h}, [x2], #16
ldr d0, [x0]
neg v2.8b, v2.8b // -m
subs w4, w4, #2
ld1 {v0.d}[1], [x8]
sxtl v2.8h, v2.8b
shl v2.8h, v2.8h, #9 // -m << 9
sub v1.8h, v0.8h, v1.8h // a - b
sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
add v0.8h, v0.8h, v1.8h
st1 {v0.8b}, [x0], x1
st1 {v0.d}[1], [x8], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
lsl x1, x1, #1
8:
ld1 {v4.16b}, [x5], #16
ld1 {v2.8h, v3.8h}, [x2], #32
neg v5.16b, v4.16b // -m
ld1 {v0.8h}, [x0]
ld1 {v1.8h}, [x8]
sxtl v4.8h, v5.8b
sxtl2 v5.8h, v5.16b
shl v4.8h, v4.8h, #9 // -m << 9
shl v5.8h, v5.8h, #9
sub v2.8h, v0.8h, v2.8h // a - b
sub v3.8h, v1.8h, v3.8h
subs w4, w4, #2
sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v3.8h, v3.8h, v5.8h
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
st1 {v0.8h}, [x0], x1
st1 {v1.8h}, [x8], x1
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
lsl x1, x1, #1
16:
ld1 {v16.16b, v17.16b}, [x5], #32
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
subs w4, w4, #2
neg v18.16b, v16.16b // -m
neg v19.16b, v17.16b
ld1 {v0.8h, v1.8h}, [x0]
sxtl v16.8h, v18.8b
sxtl2 v17.8h, v18.16b
sxtl v18.8h, v19.8b
sxtl2 v19.8h, v19.16b
ld1 {v2.8h, v3.8h}, [x8]
shl v16.8h, v16.8h, #9 // -m << 9
shl v17.8h, v17.8h, #9
shl v18.8h, v18.8h, #9
shl v19.8h, v19.8h, #9
sub v4.8h, v0.8h, v4.8h // a - b
sub v5.8h, v1.8h, v5.8h
sub v6.8h, v2.8h, v6.8h
sub v7.8h, v3.8h, v7.8h
sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v5.8h, v5.8h, v17.8h
sqrdmulh v6.8h, v6.8h, v18.8h
sqrdmulh v7.8h, v7.8h, v19.8h
add v0.8h, v0.8h, v4.8h
add v1.8h, v1.8h, v5.8h
add v2.8h, v2.8h, v6.8h
add v3.8h, v3.8h, v7.8h
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v2.8h, v3.8h}, [x8], x1
b.gt 16b
ret
320:
AARCH64_VALID_JUMP_TARGET
32:
ld1 {v16.16b, v17.16b}, [x5], #32
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
subs w4, w4, #1
neg v18.16b, v16.16b // -m
neg v19.16b, v17.16b
sxtl v16.8h, v18.8b
sxtl2 v17.8h, v18.16b
sxtl v18.8h, v19.8b
sxtl2 v19.8h, v19.16b
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
shl v16.8h, v16.8h, #9 // -m << 9
shl v17.8h, v17.8h, #9
shl v18.8h, v18.8h, #9
shl v19.8h, v19.8h, #9
sub v4.8h, v0.8h, v4.8h // a - b
sub v5.8h, v1.8h, v5.8h
sub v6.8h, v2.8h, v6.8h
sub v7.8h, v3.8h, v7.8h
sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v5.8h, v5.8h, v17.8h
sqrdmulh v6.8h, v6.8h, v18.8h
sqrdmulh v7.8h, v7.8h, v19.8h
add v0.8h, v0.8h, v4.8h
add v1.8h, v1.8h, v5.8h
add v2.8h, v2.8h, v6.8h
add v3.8h, v3.8h, v7.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
b.gt 32b
ret
endfunc
jumptable blend_tbl
.word 320b - blend_tbl
.word 160b - blend_tbl
.word 80b - blend_tbl
.word 40b - blend_tbl
endjumptable
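// blend_h: OBMC blend with the block above. The per-row weight comes from
// X(obmc_masks) indexed by the block height, and only the top h - h/4 rows
// are processed (hence the sub w4, w4, w4, lsr #2 below); the per-pixel math
// is the sqrdmulh trick described at blend.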
function blend_h_16bpc_neon, export=1
movrel x6, blend_h_tbl
movrel x5, X(obmc_masks)
add x5, x5, w4, uxtw
sub w4, w4, w4, lsr #2
clz w7, w3
add x8, x0, x1
lsl x1, x1, #1
sub w7, w7, #24
ldrsw x7, [x6, x7, lsl #2]
add x6, x6, x7
br x6
20:
AARCH64_VALID_JUMP_TARGET
2:
ld2r {v2.8b, v3.8b}, [x5], #2
ld1 {v1.4h}, [x2], #8
ext v2.8b, v2.8b, v3.8b, #6
subs w4, w4, #2
neg v2.8b, v2.8b // -m
ldr s0, [x0]
ld1 {v0.s}[1], [x8]
sxtl v2.8h, v2.8b
shl v2.4h, v2.4h, #9 // -m << 9
sub v1.4h, v0.4h, v1.4h // a - b
sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
add v0.4h, v0.4h, v1.4h
st1 {v0.s}[0], [x0], x1
st1 {v0.s}[1], [x8], x1
b.gt 2b
ret
40:
AARCH64_VALID_JUMP_TARGET
4:
ld2r {v2.8b, v3.8b}, [x5], #2
ld1 {v1.8h}, [x2], #16
ext v2.8b, v2.8b, v3.8b, #4
subs w4, w4, #2
neg v2.8b, v2.8b // -m
ldr d0, [x0]
ld1 {v0.d}[1], [x8]
sxtl v2.8h, v2.8b
shl v2.8h, v2.8h, #9 // -m << 9
sub v1.8h, v0.8h, v1.8h // a - b
sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
add v0.8h, v0.8h, v1.8h
st1 {v0.8b}, [x0], x1
st1 {v0.d}[1], [x8], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
8:
ld2r {v4.8b, v5.8b}, [x5], #2
ld1 {v2.8h, v3.8h}, [x2], #32
neg v4.8b, v4.8b // -m
neg v5.8b, v5.8b
ld1 {v0.8h}, [x0]
subs w4, w4, #2
sxtl v4.8h, v4.8b
sxtl v5.8h, v5.8b
ld1 {v1.8h}, [x8]
shl v4.8h, v4.8h, #9 // -m << 9
shl v5.8h, v5.8h, #9
sub v2.8h, v0.8h, v2.8h // a - b
sub v3.8h, v1.8h, v3.8h
sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v3.8h, v3.8h, v5.8h
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
st1 {v0.8h}, [x0], x1
st1 {v1.8h}, [x8], x1
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
16:
ld2r {v16.8b, v17.8b}, [x5], #2
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
neg v16.8b, v16.8b // -m
neg v17.8b, v17.8b
ld1 {v0.8h, v1.8h}, [x0]
ld1 {v2.8h, v3.8h}, [x8]
subs w4, w4, #2
sxtl v16.8h, v16.8b
sxtl v17.8h, v17.8b
shl v16.8h, v16.8h, #9 // -m << 9
shl v17.8h, v17.8h, #9
sub v4.8h, v0.8h, v4.8h // a - b
sub v5.8h, v1.8h, v5.8h
sub v6.8h, v2.8h, v6.8h
sub v7.8h, v3.8h, v7.8h
sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v5.8h, v5.8h, v16.8h
sqrdmulh v6.8h, v6.8h, v17.8h
sqrdmulh v7.8h, v7.8h, v17.8h
add v0.8h, v0.8h, v4.8h
add v1.8h, v1.8h, v5.8h
add v2.8h, v2.8h, v6.8h
add v3.8h, v3.8h, v7.8h
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v2.8h, v3.8h}, [x8], x1
b.gt 16b
ret
1280:
640:
320:
AARCH64_VALID_JUMP_TARGET
sub x1, x1, w3, uxtw #1
add x7, x2, w3, uxtw #1
321:
ld2r {v24.8b, v25.8b}, [x5], #2
mov w6, w3
neg v24.8b, v24.8b // -m
neg v25.8b, v25.8b
sxtl v24.8h, v24.8b
sxtl v25.8h, v25.8b
shl v24.8h, v24.8h, #9 // -m << 9
shl v25.8h, v25.8h, #9
32:
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
subs w6, w6, #32
sub v16.8h, v0.8h, v16.8h // a - b
sub v17.8h, v1.8h, v17.8h
sub v18.8h, v2.8h, v18.8h
sub v19.8h, v3.8h, v19.8h
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8]
sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v17.8h, v17.8h, v24.8h
sqrdmulh v18.8h, v18.8h, v24.8h
sqrdmulh v19.8h, v19.8h, v24.8h
sub v20.8h, v4.8h, v20.8h // a - b
sub v21.8h, v5.8h, v21.8h
sub v22.8h, v6.8h, v22.8h
sub v23.8h, v7.8h, v23.8h
add v0.8h, v0.8h, v16.8h
add v1.8h, v1.8h, v17.8h
add v2.8h, v2.8h, v18.8h
add v3.8h, v3.8h, v19.8h
sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v21.8h, v21.8h, v25.8h
sqrdmulh v22.8h, v22.8h, v25.8h
sqrdmulh v23.8h, v23.8h, v25.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
add v4.8h, v4.8h, v20.8h
add v5.8h, v5.8h, v21.8h
add v6.8h, v6.8h, v22.8h
add v7.8h, v7.8h, v23.8h
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64
b.gt 32b
subs w4, w4, #2
add x0, x0, x1
add x8, x8, x1
add x2, x2, w3, uxtw #1
add x7, x7, w3, uxtw #1
b.gt 321b
ret
endfunc
jumptable blend_h_tbl
.word 1280b - blend_h_tbl
.word 640b - blend_h_tbl
.word 320b - blend_h_tbl
.word 160b - blend_h_tbl
.word 80b - blend_h_tbl
.word 40b - blend_h_tbl
.word 20b - blend_h_tbl
endjumptable
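// blend_v: OBMC blend with the block to the left. The weight is constant per
// column, taken from X(obmc_masks) indexed by the block width, and only
// (roughly) the left 3/4 of each row is written (1 of 2, 3 of 4, 6 of 8,
// 12 of 16 and 24 of 32 pixels); the per-pixel math is the same as in blend.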
function blend_v_16bpc_neon, export=1
movrel x6, blend_v_tbl
movrel x5, X(obmc_masks)
add x5, x5, w3, uxtw
clz w3, w3
add x8, x0, x1
lsl x1, x1, #1
sub w3, w3, #26
ldrsw x3, [x6, x3, lsl #2]
add x6, x6, x3
br x6
20:
AARCH64_VALID_JUMP_TARGET
ld1r {v2.8b}, [x5]
neg v2.8b, v2.8b // -m
sxtl v2.8h, v2.8b
shl v2.4h, v2.4h, #9 // -m << 9
2:
ldr s1, [x2], #4
ldr h0, [x0]
subs w4, w4, #2
ld1 {v1.h}[1], [x2]
ld1 {v0.h}[1], [x8]
add x2, x2, #4
sub v1.4h, v0.4h, v1.4h // a - b
sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
add v0.4h, v0.4h, v1.4h
st1 {v0.h}[0], [x0], x1
st1 {v0.h}[1], [x8], x1
b.gt 2b
ret
40:
AARCH64_VALID_JUMP_TARGET
ld1r {v2.2s}, [x5]
sub x1, x1, #4
neg v2.8b, v2.8b // -m
sxtl v2.8h, v2.8b
shl v2.8h, v2.8h, #9 // -m << 9
4:
ld1 {v1.8h}, [x2], #16
ldr d0, [x0]
ld1 {v0.d}[1], [x8]
subs w4, w4, #2
sub v1.8h, v0.8h, v1.8h // a - b
sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
add v0.8h, v0.8h, v1.8h
str s0, [x0], #4
st1 {v0.s}[2], [x8], #4
st1 {v0.h}[2], [x0], x1
st1 {v0.h}[6], [x8], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
ld1 {v4.8b}, [x5]
sub x1, x1, #8
neg v4.8b, v4.8b // -m
sxtl v4.8h, v4.8b
shl v4.8h, v4.8h, #9 // -m << 9
8:
ld1 {v2.8h, v3.8h}, [x2], #32
ld1 {v0.8h}, [x0]
ld1 {v1.8h}, [x8]
subs w4, w4, #2
sub v2.8h, v0.8h, v2.8h // a - b
sub v3.8h, v1.8h, v3.8h
sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v3.8h, v3.8h, v4.8h
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
str d0, [x0], #8
str d1, [x8], #8
st1 {v0.s}[2], [x0], x1
st1 {v1.s}[2], [x8], x1
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
ld1 {v16.16b}, [x5]
sub x1, x1, #16
neg v17.16b, v16.16b // -m
sxtl v16.8h, v17.8b
sxtl2 v17.8h, v17.16b
shl v16.8h, v16.8h, #9 // -m << 9
shl v17.4h, v17.4h, #9
16:
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
ld1 {v0.8h, v1.8h}, [x0]
subs w4, w4, #2
ld1 {v2.8h, v3.8h}, [x8]
sub v4.8h, v0.8h, v4.8h // a - b
sub v5.4h, v1.4h, v5.4h
sub v6.8h, v2.8h, v6.8h
sub v7.4h, v3.4h, v7.4h
sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v5.4h, v5.4h, v17.4h
sqrdmulh v6.8h, v6.8h, v16.8h
sqrdmulh v7.4h, v7.4h, v17.4h
add v0.8h, v0.8h, v4.8h
add v1.4h, v1.4h, v5.4h
add v2.8h, v2.8h, v6.8h
add v3.4h, v3.4h, v7.4h
st1 {v0.8h}, [x0], #16
st1 {v2.8h}, [x8], #16
st1 {v1.4h}, [x0], x1
st1 {v3.4h}, [x8], x1
b.gt 16b
ret
320:
AARCH64_VALID_JUMP_TARGET
ld1 {v24.16b, v25.16b}, [x5]
neg v26.16b, v24.16b // -m
neg v27.8b, v25.8b
sxtl v24.8h, v26.8b
sxtl2 v25.8h, v26.16b
sxtl v26.8h, v27.8b
shl v24.8h, v24.8h, #9 // -m << 9
shl v25.8h, v25.8h, #9
shl v26.8h, v26.8h, #9
32:
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
ld1 {v0.8h, v1.8h, v2.8h}, [x0]
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
ld1 {v4.8h, v5.8h, v6.8h}, [x8]
subs w4, w4, #2
sub v16.8h, v0.8h, v16.8h // a - b
sub v17.8h, v1.8h, v17.8h
sub v18.8h, v2.8h, v18.8h
sub v20.8h, v4.8h, v20.8h
sub v21.8h, v5.8h, v21.8h
sub v22.8h, v6.8h, v22.8h
sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v17.8h, v17.8h, v25.8h
sqrdmulh v18.8h, v18.8h, v26.8h
sqrdmulh v20.8h, v20.8h, v24.8h
sqrdmulh v21.8h, v21.8h, v25.8h
sqrdmulh v22.8h, v22.8h, v26.8h
add v0.8h, v0.8h, v16.8h
add v1.8h, v1.8h, v17.8h
add v2.8h, v2.8h, v18.8h
add v4.8h, v4.8h, v20.8h
add v5.8h, v5.8h, v21.8h
add v6.8h, v6.8h, v22.8h
st1 {v0.8h, v1.8h, v2.8h}, [x0], x1
st1 {v4.8h, v5.8h, v6.8h}, [x8], x1
b.gt 32b
ret
endfunc
jumptable blend_v_tbl
.word 320b - blend_v_tbl
.word 160b - blend_v_tbl
.word 80b - blend_v_tbl
.word 40b - blend_v_tbl
.word 20b - blend_v_tbl
endjumptable
// This has got the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
function put_16bpc_neon, export=1
movrel x10, put_16bpc_tbl
ldrsw x9, [x10, x9, lsl #2]
add x10, x10, x9
br x10
20:
AARCH64_VALID_JUMP_TARGET
2:
ld1r {v0.4s}, [x2], x3
ld1r {v1.4s}, [x2], x3
subs w5, w5, #2
st1 {v0.s}[0], [x0], x1
st1 {v1.s}[0], [x0], x1
b.gt 2b
ret
40:
AARCH64_VALID_JUMP_TARGET
4:
ld1 {v0.4h}, [x2], x3
ld1 {v1.4h}, [x2], x3
subs w5, w5, #2
st1 {v0.4h}, [x0], x1
st1 {v1.4h}, [x0], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
add x8, x0, x1
lsl x1, x1, #1
add x9, x2, x3
lsl x3, x3, #1
8:
ld1 {v0.8h}, [x2], x3
ld1 {v1.8h}, [x9], x3
subs w5, w5, #2
st1 {v0.8h}, [x0], x1
st1 {v1.8h}, [x8], x1
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
16:
ldp x6, x7, [x2]
ldp x8, x9, [x2, #16]
stp x6, x7, [x0]
subs w5, w5, #1
stp x8, x9, [x0, #16]
add x2, x2, x3
add x0, x0, x1
b.gt 16b
ret
320:
AARCH64_VALID_JUMP_TARGET
32:
ldp x6, x7, [x2]
ldp x8, x9, [x2, #16]
stp x6, x7, [x0]
ldp x10, x11, [x2, #32]
stp x8, x9, [x0, #16]
subs w5, w5, #1
ldp x12, x13, [x2, #48]
stp x10, x11, [x0, #32]
stp x12, x13, [x0, #48]
add x2, x2, x3
add x0, x0, x1
b.gt 32b
ret
640:
AARCH64_VALID_JUMP_TARGET
64:
ldp q0, q1, [x2]
ldp q2, q3, [x2, #32]
stp q0, q1, [x0]
ldp q4, q5, [x2, #64]
stp q2, q3, [x0, #32]
ldp q6, q7, [x2, #96]
subs w5, w5, #1
stp q4, q5, [x0, #64]
stp q6, q7, [x0, #96]
add x2, x2, x3
add x0, x0, x1
b.gt 64b
ret
1280:
AARCH64_VALID_JUMP_TARGET
128:
ldp q0, q1, [x2]
ldp q2, q3, [x2, #32]
stp q0, q1, [x0]
ldp q4, q5, [x2, #64]
stp q2, q3, [x0, #32]
ldp q6, q7, [x2, #96]
subs w5, w5, #1
stp q4, q5, [x0, #64]
ldp q16, q17, [x2, #128]
stp q6, q7, [x0, #96]
ldp q18, q19, [x2, #160]
stp q16, q17, [x0, #128]
ldp q20, q21, [x2, #192]
stp q18, q19, [x0, #160]
ldp q22, q23, [x2, #224]
stp q20, q21, [x0, #192]
stp q22, q23, [x0, #224]
add x2, x2, x3
add x0, x0, x1
b.gt 128b
ret
endfunc
jumptable put_16bpc_tbl
.word 1280b - put_16bpc_tbl
.word 640b - put_16bpc_tbl
.word 320b - put_16bpc_tbl
.word 160b - put_16bpc_tbl
.word 80b - put_16bpc_tbl
.word 40b - put_16bpc_tbl
.word 20b - put_16bpc_tbl
endjumptable
// This has got the same signature as the prep_8tap functions,
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
// x8 to w*2.
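// prep_16bpc only applies the intermediate scaling while copying:
//   tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS
// i.e. the sshl by v31 followed by the v30 subtraction in each block below.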
function prep_16bpc_neon
movrel x10, prep_16bpc_tbl
ldrsw x9, [x10, x9, lsl #2]
dup v31.8h, w7 // intermediate_bits
movi v30.8h, #(PREP_BIAS >> 8), lsl #8
add x10, x10, x9
br x10
40:
AARCH64_VALID_JUMP_TARGET
add x9, x1, x2
lsl x2, x2, #1
4:
ld1 {v0.8b}, [x1], x2
ld1 {v0.d}[1], [x9], x2
subs w4, w4, #2
sshl v0.8h, v0.8h, v31.8h
sub v0.8h, v0.8h, v30.8h
st1 {v0.8h}, [x0], #16
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
add x9, x1, x2
lsl x2, x2, #1
8:
ld1 {v0.8h}, [x1], x2
ld1 {v1.8h}, [x9], x2
subs w4, w4, #2
sshl v0.8h, v0.8h, v31.8h
sshl v1.8h, v1.8h, v31.8h
sub v0.8h, v0.8h, v30.8h
sub v1.8h, v1.8h, v30.8h
st1 {v0.8h, v1.8h}, [x0], #32
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
16:
ldp q0, q1, [x1]
add x1, x1, x2
sshl v0.8h, v0.8h, v31.8h
ldp q2, q3, [x1]
add x1, x1, x2
subs w4, w4, #2
sshl v1.8h, v1.8h, v31.8h
sshl v2.8h, v2.8h, v31.8h
sshl v3.8h, v3.8h, v31.8h
sub v0.8h, v0.8h, v30.8h
sub v1.8h, v1.8h, v30.8h
sub v2.8h, v2.8h, v30.8h
sub v3.8h, v3.8h, v30.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
b.gt 16b
ret
320:
AARCH64_VALID_JUMP_TARGET
32:
ldp q0, q1, [x1]
sshl v0.8h, v0.8h, v31.8h
ldp q2, q3, [x1, #32]
add x1, x1, x2
sshl v1.8h, v1.8h, v31.8h
sshl v2.8h, v2.8h, v31.8h
sshl v3.8h, v3.8h, v31.8h
subs w4, w4, #1
sub v0.8h, v0.8h, v30.8h
sub v1.8h, v1.8h, v30.8h
sub v2.8h, v2.8h, v30.8h
sub v3.8h, v3.8h, v30.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
b.gt 32b
ret
640:
AARCH64_VALID_JUMP_TARGET
64:
ldp q0, q1, [x1]
subs w4, w4, #1
sshl v0.8h, v0.8h, v31.8h
ldp q2, q3, [x1, #32]
sshl v1.8h, v1.8h, v31.8h
ldp q4, q5, [x1, #64]
sshl v2.8h, v2.8h, v31.8h
sshl v3.8h, v3.8h, v31.8h
ldp q6, q7, [x1, #96]
add x1, x1, x2
sshl v4.8h, v4.8h, v31.8h
sshl v5.8h, v5.8h, v31.8h
sshl v6.8h, v6.8h, v31.8h
sshl v7.8h, v7.8h, v31.8h
sub v0.8h, v0.8h, v30.8h
sub v1.8h, v1.8h, v30.8h
sub v2.8h, v2.8h, v30.8h
sub v3.8h, v3.8h, v30.8h
stp q0, q1, [x0]
sub v4.8h, v4.8h, v30.8h
sub v5.8h, v5.8h, v30.8h
stp q2, q3, [x0, #32]
sub v6.8h, v6.8h, v30.8h
sub v7.8h, v7.8h, v30.8h
stp q4, q5, [x0, #64]
stp q6, q7, [x0, #96]
add x0, x0, x8
b.gt 64b
ret
1280:
AARCH64_VALID_JUMP_TARGET
128:
ldp q0, q1, [x1]
subs w4, w4, #1
sshl v0.8h, v0.8h, v31.8h
ldp q2, q3, [x1, #32]
sshl v1.8h, v1.8h, v31.8h
ldp q4, q5, [x1, #64]
sshl v2.8h, v2.8h, v31.8h
sshl v3.8h, v3.8h, v31.8h
ldp q6, q7, [x1, #96]
sshl v4.8h, v4.8h, v31.8h
sshl v5.8h, v5.8h, v31.8h
ldp q16, q17, [x1, #128]
sshl v6.8h, v6.8h, v31.8h
sshl v7.8h, v7.8h, v31.8h
ldp q18, q19, [x1, #160]
sshl v16.8h, v16.8h, v31.8h
sshl v17.8h, v17.8h, v31.8h
ldp q20, q21, [x1, #192]
sshl v18.8h, v18.8h, v31.8h
sshl v19.8h, v19.8h, v31.8h
ldp q22, q23, [x1, #224]
add x1, x1, x2
sshl v20.8h, v20.8h, v31.8h
sshl v21.8h, v21.8h, v31.8h
sshl v22.8h, v22.8h, v31.8h
sshl v23.8h, v23.8h, v31.8h
sub v0.8h, v0.8h, v30.8h
sub v1.8h, v1.8h, v30.8h
sub v2.8h, v2.8h, v30.8h
sub v3.8h, v3.8h, v30.8h
stp q0, q1, [x0]
sub v4.8h, v4.8h, v30.8h
sub v5.8h, v5.8h, v30.8h
stp q2, q3, [x0, #32]
sub v6.8h, v6.8h, v30.8h
sub v7.8h, v7.8h, v30.8h
stp q4, q5, [x0, #64]
sub v16.8h, v16.8h, v30.8h
sub v17.8h, v17.8h, v30.8h
stp q6, q7, [x0, #96]
sub v18.8h, v18.8h, v30.8h
sub v19.8h, v19.8h, v30.8h
stp q16, q17, [x0, #128]
sub v20.8h, v20.8h, v30.8h
sub v21.8h, v21.8h, v30.8h
stp q18, q19, [x0, #160]
sub v22.8h, v22.8h, v30.8h
sub v23.8h, v23.8h, v30.8h
stp q20, q21, [x0, #192]
stp q22, q23, [x0, #224]
add x0, x0, x8
b.gt 128b
ret
endfunc
jumptable prep_16bpc_tbl
.word 1280b - prep_16bpc_tbl
.word 640b - prep_16bpc_tbl
.word 320b - prep_16bpc_tbl
.word 160b - prep_16bpc_tbl
.word 80b - prep_16bpc_tbl
.word 40b - prep_16bpc_tbl
endjumptable
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
ld1 {\d0\wd}[0], [\s0], \strd
ld1 {\d1\wd}[0], [\s1], \strd
.ifnb \d2
ld1 {\d2\wd}[0], [\s0], \strd
ld1 {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
ld1 {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
ld1 {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
ld1 {\d6\wd}[0], [\s0], \strd
.endif
.endm
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
ld1 {\d0\wd}, [\s0], \strd
ld1 {\d1\wd}, [\s1], \strd
.ifnb \d2
ld1 {\d2\wd}, [\s0], \strd
ld1 {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
ld1 {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
ld1 {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
ld1 {\d6\wd}, [\s0], \strd
.endif
.endm
.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
ld1 {\d0\wd, \d1\wd}, [\s0], \strd
.ifnb \d2
ld1 {\d2\wd, \d3\wd}, [\s1], \strd
.endif
.ifnb \d4
ld1 {\d4\wd, \d5\wd}, [\s0], \strd
.endif
.endm
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
.endm
.macro interleave_1 wd, r0, r1, r2, r3, r4
trn1 \r0\wd, \r0\wd, \r1\wd
trn1 \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
trn1 \r2\wd, \r2\wd, \r3\wd
trn1 \r3\wd, \r3\wd, \r4\wd
.endif
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
.endm
.macro umin_h c, wd, r0, r1, r2, r3
umin \r0\wd, \r0\wd, \c\wd
.ifnb \r1
umin \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
umin \r2\wd, \r2\wd, \c\wd
umin \r3\wd, \r3\wd, \c\wd
.endif
.endm
.macro sub_h c, wd, r0, r1, r2, r3
sub \r0\wd, \r0\wd, \c\wd
.ifnb \r1
sub \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
sub \r2\wd, \r2\wd, \c\wd
sub \r3\wd, \r3\wd, \c\wd
.endif
.endm
.macro smull_smlal_4tap d, s0, s1, s2, s3
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
smlal \d\().4s, \s3\().4h, v0.h[3]
.endm
.macro smull2_smlal2_4tap d, s0, s1, s2, s3
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
smlal2 \d\().4s, \s3\().8h, v0.h[3]
.endm
.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
smlal \d\().4s, \s3\().4h, v0.h[3]
smlal \d\().4s, \s4\().4h, v0.h[4]
smlal \d\().4s, \s5\().4h, v0.h[5]
smlal \d\().4s, \s6\().4h, v0.h[6]
.endm
.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
smlal2 \d\().4s, \s3\().8h, v0.h[3]
smlal2 \d\().4s, \s4\().8h, v0.h[4]
smlal2 \d\().4s, \s5\().8h, v0.h[5]
smlal2 \d\().4s, \s6\().8h, v0.h[6]
.endm
.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
smlal \d\().4s, \s3\().4h, v0.h[3]
smlal \d\().4s, \s4\().4h, v0.h[4]
smlal \d\().4s, \s5\().4h, v0.h[5]
smlal \d\().4s, \s6\().4h, v0.h[6]
smlal \d\().4s, \s7\().4h, v0.h[7]
.endm
.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
smlal2 \d\().4s, \s3\().8h, v0.h[3]
smlal2 \d\().4s, \s4\().8h, v0.h[4]
smlal2 \d\().4s, \s5\().8h, v0.h[5]
smlal2 \d\().4s, \s6\().8h, v0.h[6]
smlal2 \d\().4s, \s7\().8h, v0.h[7]
.endm
.macro sqrshrun_h shift, r0, r1, r2, r3
sqrshrun \r0\().4h, \r0\().4s, #\shift
.ifnb \r1
sqrshrun2 \r0\().8h, \r1\().4s, #\shift
.endif
.ifnb \r2
sqrshrun \r2\().4h, \r2\().4s, #\shift
sqrshrun2 \r2\().8h, \r3\().4s, #\shift
.endif
.endm
.macro xtn_h r0, r1, r2, r3
uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2
.ifnb \r2
uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto
.endif
.endm
.macro srshl_s shift, r0, r1, r2, r3
srshl \r0\().4s, \r0\().4s, \shift\().4s
srshl \r1\().4s, \r1\().4s, \shift\().4s
.ifnb \r2
srshl \r2\().4s, \r2\().4s, \shift\().4s
srshl \r3\().4s, \r3\().4s, \shift\().4s
.endif
.endm
.macro st_s strd, reg, lanes
st1 {\reg\().s}[0], [x0], \strd
st1 {\reg\().s}[1], [x9], \strd
.if \lanes > 2
st1 {\reg\().s}[2], [x0], \strd
st1 {\reg\().s}[3], [x9], \strd
.endif
.endm
.macro st_d strd, r0, r1
st1 {\r0\().8b}, [x0], \strd
st1 {\r0\().d}[1], [x9], \strd
.ifnb \r1
st1 {\r1\().8b}, [x0], \strd
st1 {\r1\().d}[1], [x9], \strd
.endif
.endm
.macro shift_store_4 type, strd, r0, r1, r2, r3
.ifc \type, put
sqrshrun_h 6, \r0, \r1, \r2, \r3
umin_h v31, .8h, \r0, \r2
.else
srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
xtn_h \r0, \r1, \r2, \r3
sub_h v29, .8h, \r0, \r2 // PREP_BIAS
.endif
st_d \strd, \r0, \r2
.endm
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
st1 {\r0\wd}, [x0], \strd
st1 {\r1\wd}, [x9], \strd
.ifnb \r2
st1 {\r2\wd}, [x0], \strd
st1 {\r3\wd}, [x9], \strd
.endif
.ifnb \r4
st1 {\r4\wd}, [x0], \strd
st1 {\r5\wd}, [x9], \strd
st1 {\r6\wd}, [x0], \strd
st1 {\r7\wd}, [x9], \strd
.endif
.endm
.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
sqrshrun_h 6, \r0, \r1, \r2, \r3
umin_h v31, .8h, \r0, \r2
.else
srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
xtn_h \r0, \r1, \r2, \r3
sub_h v29, .8h, \r0, \r2 // PREP_BIAS
.endif
st_8h \strd, \r0, \r2
.endm
.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
.ifc \type, put
sqrshrun_h 6, \r0, \r1, \r2, \r3
umin \r0\().8h, \r0\().8h, v31.8h
umin \r1\().8h, \r2\().8h, v31.8h
.else
srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
xtn_h \r0, \r1, \r2, \r3
sub \r0\().8h, \r0\().8h, v29.8h
sub \r1\().8h, \r2\().8h, v29.8h
.endif
st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm
.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_16bpc_neon, export=1
mov w9, \type_h
mov w10, \type_v
b \op\()_\taps\()_neon
endfunc
.endm
// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
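// These constants pack two filter-set indices for X(mc_subpel_filters), each
// premultiplied by 15 (the number of subpel positions per set), into two
// 7 bit fields: bits 7-13 hold the set used for larger blocks and bits 0-6
// the 4-tap set used when w (or h) is <= 4. Multiplying mx/my by 0x4081
// replicates the subpel position into bits 0, 7 and 14, so a single add
// produces both candidate table offsets at once, and the copy at bit 14 and
// up doubles as the "any subpel filtering at all?" test below.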
.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
function \type\()_\taps\()_neon
.ifc \bdmax, w8
ldr w8, [sp]
.endif
mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
mul \mx, \mx, w11
mul \my, \my, w11
add \mx, \mx, w9 // mx, 8tap_h, 4tap_h
add \my, \my, w10 // my, 8tap_v, 4tap_v
.ifc \type, prep
uxtw \d_strd, \w
lsl \d_strd, \d_strd, #1
.endif
dup v31.8h, \bdmax // bitdepth_max
clz \bdmax, \bdmax
clz w9, \w
sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
mov w12, #6
tst \mx, #(0x7f << 14)
sub w9, w9, #24
add w13, w12, \bdmax // 6 + intermediate_bits
sub w12, w12, \bdmax // 6 - intermediate_bits
movrel x11, X(mc_subpel_filters), -8
b.ne L(\type\()_\taps\()_h)
tst \my, #(0x7f << 14)
b.ne L(\type\()_\taps\()_v)
b \type\()_16bpc_neon
L(\type\()_\taps\()_h):
cmp \w, #4
ubfx w10, \mx, #7, #7
and \mx, \mx, #0x7f
b.le 4f
mov \mx, w10
4:
tst \my, #(0x7f << 14)
add \xmx, x11, \mx, uxtw #3
b.ne L(\type\()_\taps\()_hv)
movrel x10, \type\()_\taps\()_h_tbl
ldrsw x9, [x10, x9, lsl #2]
.ifc \type, put
mov w12, #34 // rounding for 10-bit
mov w13, #40 // rounding for 12-bit
cmp \bdmax, #2 // 10-bit: 4, 12-bit: 2
csel w12, w12, w13, ne // select rounding based on \bdmax
.else
neg w12, w12 // -(6 - intermediate_bits)
movi v28.8h, #(PREP_BIAS >> 8), lsl #8
.endif
add x10, x10, x9
dup v30.4s, w12 // rounding or shift amount
br x10
20: // 2xN h
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
ldur s0, [\xmx, #2]
sub \src, \src, #2
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
sxtl v0.8h, v0.8b
2:
ld1 {v4.8h}, [\src], \s_strd
ld1 {v6.8h}, [\sr2], \s_strd
mov v2.16b, v30.16b
ext v5.16b, v4.16b, v4.16b, #2
ext v7.16b, v6.16b, v6.16b, #2
subs \h, \h, #2
trn1 v3.2s, v4.2s, v6.2s
trn2 v6.2s, v4.2s, v6.2s
trn1 v4.2s, v5.2s, v7.2s
trn2 v7.2s, v5.2s, v7.2s
smlal v2.4s, v3.4h, v0.h[0]
smlal v2.4s, v4.4h, v0.h[1]
smlal v2.4s, v6.4h, v0.h[2]
smlal v2.4s, v7.4h, v0.h[3]
sqshrun v2.4h, v2.4s, #6
umin v2.4h, v2.4h, v31.4h
st1 {v2.s}[0], [\dst], \d_strd
st1 {v2.s}[1], [\ds2], \d_strd
b.gt 2b
ret
.endif
40: // 4xN h
AARCH64_VALID_JUMP_TARGET
ldur s0, [\xmx, #2]
sub \src, \src, #2
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
sxtl v0.8h, v0.8b
4:
ld1 {v16.8h}, [\src], \s_strd
ld1 {v20.8h}, [\sr2], \s_strd
.ifc \type, put
mov v2.16b, v30.16b
mov v3.16b, v30.16b
.endif
ext v17.16b, v16.16b, v16.16b, #2
ext v18.16b, v16.16b, v16.16b, #4
ext v19.16b, v16.16b, v16.16b, #6
ext v21.16b, v20.16b, v20.16b, #2
ext v22.16b, v20.16b, v20.16b, #4
ext v23.16b, v20.16b, v20.16b, #6
subs \h, \h, #2
.ifc \type, put
smlal v2.4s, v16.4h, v0.h[0]
.else
smull v2.4s, v16.4h, v0.h[0]
.endif
smlal v2.4s, v17.4h, v0.h[1]
smlal v2.4s, v18.4h, v0.h[2]
smlal v2.4s, v19.4h, v0.h[3]
.ifc \type, put
smlal v3.4s, v20.4h, v0.h[0]
.else
smull v3.4s, v20.4h, v0.h[0]
.endif
smlal v3.4s, v21.4h, v0.h[1]
smlal v3.4s, v22.4h, v0.h[2]
smlal v3.4s, v23.4h, v0.h[3]
.ifc \type, put
sqshrun v16.4h, v2.4s, #6
sqshrun2 v16.8h, v3.4s, #6
umin v16.8h, v16.8h, v31.8h
.else
srshl v16.4s, v2.4s, v30.4s // -(6-intermediate_bits)
srshl v20.4s, v3.4s, v30.4s // -(6-intermediate_bits)
uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2
sub v16.8h, v16.8h, v28.8h // PREP_BIAS
.endif
st1 {v16.8b}, [\dst], \d_strd
st1 {v16.d}[1], [\ds2], \d_strd
b.gt 4b
ret
80:
160:
320:
640:
1280: // 8xN, 16xN, 32xN, ... h
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8b}, [\xmx]
.ifc \taps, 6tap
sub \src, \src, #4
.else
sub \src, \src, #6
.endif
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
sxtl v0.8h, v0.8b
sub \s_strd, \s_strd, \w, uxtw #1
sub \s_strd, \s_strd, #16
.ifc \type, put
lsl \d_strd, \d_strd, #1
sub \d_strd, \d_strd, \w, uxtw #1
.endif
81:
ld1 {v16.8h, v17.8h}, [\src], #32
ld1 {v20.8h, v21.8h}, [\sr2], #32
mov \mx, \w
8:
.ifc \taps, 6tap
.ifc \type, put
mov v18.16b, v30.16b
mov v19.16b, v30.16b
smlal v18.4s, v16.4h, v0.h[1]
smlal2 v19.4s, v16.8h, v0.h[1]
mov v22.16b, v30.16b
mov v23.16b, v30.16b
smlal v22.4s, v20.4h, v0.h[1]
smlal2 v23.4s, v20.8h, v0.h[1]
.else
smull v18.4s, v16.4h, v0.h[1]
smull2 v19.4s, v16.8h, v0.h[1]
smull v22.4s, v20.4h, v0.h[1]
smull2 v23.4s, v20.8h, v0.h[1]
.endif
.irpc i, 23456
ext v24.16b, v16.16b, v17.16b, #(2*\i-2)
ext v25.16b, v20.16b, v21.16b, #(2*\i-2)
smlal v18.4s, v24.4h, v0.h[\i]
smlal2 v19.4s, v24.8h, v0.h[\i]
smlal v22.4s, v25.4h, v0.h[\i]
smlal2 v23.4s, v25.8h, v0.h[\i]
.endr
.else // 8tap
.ifc \type, put
mov v18.16b, v30.16b
mov v19.16b, v30.16b
smlal v18.4s, v16.4h, v0.h[0]
smlal2 v19.4s, v16.8h, v0.h[0]
mov v22.16b, v30.16b
mov v23.16b, v30.16b
smlal v22.4s, v20.4h, v0.h[0]
smlal2 v23.4s, v20.8h, v0.h[0]
.else
smull v18.4s, v16.4h, v0.h[0]
smull2 v19.4s, v16.8h, v0.h[0]
smull v22.4s, v20.4h, v0.h[0]
smull2 v23.4s, v20.8h, v0.h[0]
.endif
.irpc i, 1234567
ext v24.16b, v16.16b, v17.16b, #(2*\i)
ext v25.16b, v20.16b, v21.16b, #(2*\i)
smlal v18.4s, v24.4h, v0.h[\i]
smlal2 v19.4s, v24.8h, v0.h[\i]
smlal v22.4s, v25.4h, v0.h[\i]
smlal2 v23.4s, v25.8h, v0.h[\i]
.endr
.endif
subs \mx, \mx, #8
.ifc \type, put
sqshrun v18.4h, v18.4s, #6
sqshrun2 v18.8h, v19.4s, #6
sqshrun v22.4h, v22.4s, #6
sqshrun2 v22.8h, v23.4s, #6
umin v18.8h, v18.8h, v31.8h
umin v22.8h, v22.8h, v31.8h
.else
srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits)
srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits)
uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2
uzp1 v22.8h, v22.8h, v23.8h // Ditto
sub v18.8h, v18.8h, v28.8h // PREP_BIAS
sub v22.8h, v22.8h, v28.8h // PREP_BIAS
.endif
st1 {v18.8h}, [\dst], #16
st1 {v22.8h}, [\ds2], #16
b.le 9f
mov v16.16b, v17.16b
mov v20.16b, v21.16b
ld1 {v17.8h}, [\src], #16
ld1 {v21.8h}, [\sr2], #16
b 8b
9:
add \dst, \dst, \d_strd
add \ds2, \ds2, \d_strd
add \src, \src, \s_strd
add \sr2, \sr2, \s_strd
subs \h, \h, #2
b.gt 81b
ret
endfunc
jumptable \type\()_\taps\()_h_tbl
.word 1280b - \type\()_\taps\()_h_tbl
.word 640b - \type\()_\taps\()_h_tbl
.word 320b - \type\()_\taps\()_h_tbl
.word 160b - \type\()_\taps\()_h_tbl
.word 80b - \type\()_\taps\()_h_tbl
.word 40b - \type\()_\taps\()_h_tbl
.word 20b - \type\()_\taps\()_h_tbl
endjumptable
function L(\type\()_\taps\()_v)
cmp \h, #4
ubfx w10, \my, #7, #7
and \my, \my, #0x7f
b.le 4f
mov \my, w10
4:
add \xmy, x11, \my, uxtw #3
.ifc \type, prep
dup v30.4s, w12 // 6 - intermediate_bits
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
movrel x10, \type\()_\taps\()_v_tbl
ldrsw x9, [x10, x9, lsl #2]
.ifc \type, prep
neg v30.4s, v30.4s // -(6-intermediate_bits)
.endif
add x10, x10, x9
br x10
20: // 2xN v
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
b.gt 28f
cmp \h, #2
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
sxtl v0.8h, v0.8b
// 2x2 v
load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
interleave_1_s v1, v2, v3, v4, v5
b.gt 24f
smull_smlal_4tap v6, v1, v2, v3, v4
sqrshrun_h 6, v6
umin_h v31, .8h, v6
st_s \d_strd, v6, 2
ret
24: // 2x4 v
load_s \sr2, \src, \s_strd, v6, v7
interleave_1_s v5, v6, v7
smull_smlal_4tap v16, v1, v2, v3, v4
smull_smlal_4tap v17, v3, v4, v5, v6
sqrshrun_h 6, v16, v17
umin_h v31, .8h, v16
st_s \d_strd, v16, 4
ret
28: // 2x6, 2x8, 2x12, 2x16 v
ld1 {v0.8b}, [\xmy]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
sub \src, \sr2, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
sxtl v0.8h, v0.8b
load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
interleave_1_s v1, v2, v3, v4, v5
interleave_1_s v5, v6, v7
216:
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v16, v17, v18, v19
interleave_1_s v7, v16, v17, v18, v19
smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18
sqrshrun_h 6, v24, v25
umin_h v31, .8h, v24
st_s \d_strd, v24, 4
b.le 0f
cmp \h, #2
mov v1.16b, v5.16b
mov v2.16b, v6.16b
mov v3.16b, v7.16b
mov v4.16b, v16.16b
mov v5.16b, v17.16b
mov v6.16b, v18.16b
mov v7.16b, v19.16b
b.eq 26f
b 216b
26:
load_s \sr2, \src, \s_strd, v16, v17
interleave_1_s v7, v16, v17
smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_h 6, v24
umin_h v31, .4h, v24
st_s \d_strd, v24, 2
0:
ret
.endif
40:
AARCH64_VALID_JUMP_TARGET
b.gt 480f
// 4x2, 4x4 v
cmp \h, #2
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
sxtl v0.8h, v0.8b
load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
smull_smlal_4tap v6, v1, v2, v3, v4
smull_smlal_4tap v7, v2, v3, v4, v5
shift_store_4 \type, \d_strd, v6, v7
b.le 0f
load_4h \sr2, \src, \s_strd, v6, v7
smull_smlal_4tap v1, v3, v4, v5, v6
smull_smlal_4tap v2, v4, v5, v6, v7
shift_store_4 \type, \d_strd, v1, v2
0:
ret
480: // 4x6, 4x8, 4x12, 4x16 v
ld1 {v0.8b}, [\xmy]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
sub \src, \sr2, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
sxtl v0.8h, v0.8b
load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
48:
subs \h, \h, #4
load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_4 \type, \d_strd, v1, v2, v3, v4
b.le 0f
cmp \h, #2
mov v16.8b, v20.8b
mov v17.8b, v21.8b
mov v18.8b, v22.8b
mov v19.8b, v23.8b
mov v20.8b, v24.8b
mov v21.8b, v25.8b
mov v22.8b, v26.8b
b.eq 46f
b 48b
46:
load_4h \sr2, \src, \s_strd, v23, v24
smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_4 \type, \d_strd, v1, v2
0:
ret
80:
AARCH64_VALID_JUMP_TARGET
b.gt 880f
// 8x2, 8x4 v
cmp \h, #2
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
sxtl v0.8h, v0.8b
load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
smull_smlal_4tap v16, v1, v2, v3, v4
smull2_smlal2_4tap v17, v1, v2, v3, v4
smull_smlal_4tap v18, v2, v3, v4, v5
smull2_smlal2_4tap v19, v2, v3, v4, v5
shift_store_8 \type, \d_strd, v16, v17, v18, v19
b.le 0f
load_8h \sr2, \src, \s_strd, v6, v7
smull_smlal_4tap v16, v3, v4, v5, v6
smull2_smlal2_4tap v17, v3, v4, v5, v6
smull_smlal_4tap v18, v4, v5, v6, v7
smull2_smlal2_4tap v19, v4, v5, v6, v7
shift_store_8 \type, \d_strd, v16, v17, v18, v19
0:
ret
880: // 8x6, 8x8, 8x16, 8x32 v
1680: // 16x8, 16x16, ...
320: // 32x8, 32x16, ...
640:
1280:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8b}, [\xmy]
sub \src, \src, \s_strd
sub \src, \src, \s_strd, lsl #1
sxtl v0.8h, v0.8b
mov \my, \h
168:
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
88:
subs \h, \h, #2
load_8h \sr2, \src, \s_strd, v23, v24
smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24
smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.le 9f
subs \h, \h, #2
load_8h \sr2, \src, \s_strd, v25, v26
smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25
smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26
smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.le 9f
mov v16.16b, v20.16b
mov v17.16b, v21.16b
mov v18.16b, v22.16b
mov v19.16b, v23.16b
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v22.16b, v26.16b
b 88b
9:
subs \w, \w, #8
b.le 0f
asr \s_strd, \s_strd, #1
asr \d_strd, \d_strd, #1
msub \src, \s_strd, \xmy, \src
msub \dst, \d_strd, \xmy, \dst
sub \src, \src, \s_strd, lsl #3
mov \h, \my
add \src, \src, #16
add \dst, \dst, #16
b 168b
0:
ret
160:
AARCH64_VALID_JUMP_TARGET
b.gt 1680b
// 16x2, 16x4 v
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
sxtl v0.8h, v0.8b
load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
16:
load_16h \src, \src, \s_strd, v22, v23
subs \h, \h, #1
smull_smlal_4tap v1, v16, v18, v20, v22
smull2_smlal2_4tap v2, v16, v18, v20, v22
smull_smlal_4tap v3, v17, v19, v21, v23
smull2_smlal2_4tap v4, v17, v19, v21, v23
shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4
b.le 0f
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
mov v21.16b, v23.16b
b 16b
0:
ret
endfunc
jumptable \type\()_\taps\()_v_tbl
.word 1280b - \type\()_\taps\()_v_tbl
.word 640b - \type\()_\taps\()_v_tbl
.word 320b - \type\()_\taps\()_v_tbl
.word 160b - \type\()_\taps\()_v_tbl
.word 80b - \type\()_\taps\()_v_tbl
.word 40b - \type\()_\taps\()_v_tbl
.word 20b - \type\()_\taps\()_v_tbl
endjumptable
function L(\type\()_\taps\()_hv)
cmp \h, #4
ubfx w10, \my, #7, #7
and \my, \my, #0x7f
b.le 4f
mov \my, w10
4:
add \xmy, x11, \my, uxtw #3
movrel x10, \type\()_\taps\()_hv_tbl
dup v30.4s, w12 // 6 - intermediate_bits
ldrsw x9, [x10, x9, lsl #2]
neg v30.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
dup v29.4s, w13 // 6 + intermediate_bits
.else
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
add x10, x10, x9
.ifc \type, put
neg v29.4s, v29.4s // -(6+intermediate_bits)
.endif
br x10
20:
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
ldur s0, [\xmx, #2]
b.gt 280f
ldur s1, [\xmy, #2]
// 2x2, 2x4 hv
sub \sr2, \src, #2
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
ld1 {v27.8h}, [\src], \s_strd
ext v28.16b, v27.16b, v27.16b, #2
smull v27.4s, v27.4h, v0.4h
smull v28.4s, v28.4h, v0.4h
addp v27.4s, v27.4s, v28.4s
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
bl L(\type\()_\taps\()_filter_2)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
xtn v16.4h, v16.4s
trn1 v16.2s, v16.2s, v24.2s
mov v17.8b, v24.8b
2:
bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v24.8b, #4
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal v2.4s, v24.4h, v1.h[3]
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
sqxtun v2.4h, v2.4s
umin v2.4h, v2.4h, v31.4h
subs \h, \h, #2
st1 {v2.s}[0], [\dst], \d_strd
st1 {v2.s}[1], [\ds2], \d_strd
b.le 0f
mov v16.8b, v18.8b
mov v17.8b, v24.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
ld1 {v1.8b}, [\xmy]
sub \src, \src, #2
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
ld1 {v27.8h}, [\src], \s_strd
ext v28.16b, v27.16b, v27.16b, #2
smull v27.4s, v27.4h, v0.4h
smull v28.4s, v28.4h, v0.4h
addp v27.4s, v27.4s, v28.4s
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
bl L(\type\()_\taps\()_filter_2)
xtn v16.4h, v16.4s
trn1 v16.2s, v16.2s, v24.2s
mov v17.8b, v24.8b
bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v24.8b, #4
mov v19.8b, v24.8b
bl L(\type\()_\taps\()_filter_2)
ext v20.8b, v19.8b, v24.8b, #4
mov v21.8b, v24.8b
28:
bl L(\type\()_\taps\()_filter_2)
ext v22.8b, v21.8b, v24.8b, #4
.ifc \taps, 6tap
smull v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
smlal v3.4s, v19.4h, v1.h[3]
smlal v3.4s, v20.4h, v1.h[4]
smlal v3.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[6]
.else // 8tap
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
smlal v3.4s, v19.4h, v1.h[3]
smlal v3.4s, v20.4h, v1.h[4]
smlal v3.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[6]
smlal v3.4s, v24.4h, v1.h[7]
.endif
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
sqxtun v3.4h, v3.4s
umin v3.4h, v3.4h, v31.4h
subs \h, \h, #2
st1 {v3.s}[0], [\dst], \d_strd
st1 {v3.s}[1], [\ds2], \d_strd
b.le 0f
mov v16.8b, v18.8b
mov v17.8b, v19.8b
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
mov v21.8b, v24.8b
b 28b
0:
ret x15
L(\type\()_\taps\()_filter_2):
ld1 {v25.8h}, [\sr2], \s_strd
ld1 {v27.8h}, [\src], \s_strd
ext v26.16b, v25.16b, v25.16b, #2
ext v28.16b, v27.16b, v27.16b, #2
trn1 v24.2s, v25.2s, v27.2s
trn2 v27.2s, v25.2s, v27.2s
trn1 v25.2s, v26.2s, v28.2s
trn2 v28.2s, v26.2s, v28.2s
smull v24.4s, v24.4h, v0.h[0]
smlal v24.4s, v25.4h, v0.h[1]
smlal v24.4s, v27.4h, v0.h[2]
smlal v24.4s, v28.4h, v0.h[3]
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
xtn v24.4h, v24.4s
ret
.endif
40:
AARCH64_VALID_JUMP_TARGET
ldur s0, [\xmx, #2]
b.gt 480f
ldur s1, [\xmy, #2]
sub \sr2, \src, #2
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
// 4x2, 4x4 hv
ld1 {v25.8h}, [\src], \s_strd
ext v26.16b, v25.16b, v25.16b, #2
ext v27.16b, v25.16b, v25.16b, #4
ext v28.16b, v25.16b, v25.16b, #6
smull v25.4s, v25.4h, v0.h[0]
smlal v25.4s, v26.4h, v0.h[1]
smlal v25.4s, v27.4h, v0.h[2]
smlal v25.4s, v28.4h, v0.h[3]
srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out-of-order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
xtn v16.4h, v16.4s
bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v24.8b
mov v18.8b, v25.8b
4:
bl L(\type\()_\taps\()_filter_4)
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal v2.4s, v24.4h, v1.h[3]
smull v3.4s, v17.4h, v1.h[0]
smlal v3.4s, v18.4h, v1.h[1]
smlal v3.4s, v24.4h, v1.h[2]
smlal v3.4s, v25.4h, v1.h[3]
.ifc \type, put
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
sqxtun v2.4h, v2.4s
sqxtun2 v2.8h, v3.4s
umin v2.8h, v2.8h, v31.8h
.else
rshrn v2.4h, v2.4s, #6
rshrn2 v2.8h, v3.4s, #6
sub v2.8h, v2.8h, v29.8h // PREP_BIAS
.endif
subs \h, \h, #2
st1 {v2.8b}, [\dst], \d_strd
st1 {v2.d}[1], [\ds2], \d_strd
b.le 0f
mov v16.8b, v18.8b
mov v17.8b, v24.8b
mov v18.8b, v25.8b
b 4b
480: // 4x8, 4x16, 4x32 hv
ld1 {v1.8b}, [\xmy]
sub \src, \src, #2
.ifc \taps, 6tap
sub \sr2, \src, \s_strd
sub \src, \src, \s_strd, lsl #1
.else
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
.endif
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
ld1 {v25.8h}, [\src], \s_strd
ext v26.16b, v25.16b, v25.16b, #2
ext v27.16b, v25.16b, v25.16b, #4
ext v28.16b, v25.16b, v25.16b, #6
smull v25.4s, v25.4h, v0.h[0]
smlal v25.4s, v26.4h, v0.h[1]
smlal v25.4s, v27.4h, v0.h[2]
smlal v25.4s, v28.4h, v0.h[3]
srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out-of-order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
.ifc \taps, 6tap
xtn v18.4h, v16.4s
.else
xtn v16.4h, v16.4s
bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v24.8b
mov v18.8b, v25.8b
.endif
bl L(\type\()_\taps\()_filter_4)
mov v19.8b, v24.8b
mov v20.8b, v25.8b
bl L(\type\()_\taps\()_filter_4)
mov v21.8b, v24.8b
mov v22.8b, v25.8b
48:
bl L(\type\()_\taps\()_filter_4)
.ifc \taps, 6tap
smull v3.4s, v18.4h, v1.h[1]
smlal v3.4s, v19.4h, v1.h[2]
smlal v3.4s, v20.4h, v1.h[3]
smlal v3.4s, v21.4h, v1.h[4]
smlal v3.4s, v22.4h, v1.h[5]
smlal v3.4s, v24.4h, v1.h[6]
smull v4.4s, v19.4h, v1.h[1]
smlal v4.4s, v20.4h, v1.h[2]
smlal v4.4s, v21.4h, v1.h[3]
smlal v4.4s, v22.4h, v1.h[4]
smlal v4.4s, v24.4h, v1.h[5]
smlal v4.4s, v25.4h, v1.h[6]
.else // 8tap
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
smlal v3.4s, v19.4h, v1.h[3]
smlal v3.4s, v20.4h, v1.h[4]
smlal v3.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[6]
smlal v3.4s, v24.4h, v1.h[7]
smull v4.4s, v17.4h, v1.h[0]
smlal v4.4s, v18.4h, v1.h[1]
smlal v4.4s, v19.4h, v1.h[2]
smlal v4.4s, v20.4h, v1.h[3]
smlal v4.4s, v21.4h, v1.h[4]
smlal v4.4s, v22.4h, v1.h[5]
smlal v4.4s, v24.4h, v1.h[6]
smlal v4.4s, v25.4h, v1.h[7]
.endif
.ifc \type, put
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
sqxtun v3.4h, v3.4s
sqxtun2 v3.8h, v4.4s
umin v3.8h, v3.8h, v31.8h
.else
rshrn v3.4h, v3.4s, #6
rshrn2 v3.8h, v4.4s, #6
sub v3.8h, v3.8h, v29.8h // PREP_BIAS
.endif
subs \h, \h, #2
st1 {v3.8b}, [\dst], \d_strd
st1 {v3.d}[1], [\ds2], \d_strd
b.le 0f
.ifc \taps, 8tap
mov v16.8b, v18.8b
mov v17.8b, v19.8b
.endif
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
mov v21.8b, v24.8b
mov v22.8b, v25.8b
b 48b
0:
ret x15
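// Horizontal 4-tap helper for the 4-pixel-wide hv paths: filters one row from
// \sr2 and one from \src with the coefficients in v0.h[0-3] and returns them,
// narrowed to 16 bit, in v24.4h (the \sr2 row) and v25.4h (the \src row).
// Clobbers v26-v28.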
L(\type\()_\taps\()_filter_4):
ld1 {v24.8h}, [\sr2], \s_strd
ld1 {v25.8h}, [\src], \s_strd
ext v26.16b, v24.16b, v24.16b, #2
ext v27.16b, v24.16b, v24.16b, #4
ext v28.16b, v24.16b, v24.16b, #6
smull v24.4s, v24.4h, v0.h[0]
smlal v24.4s, v26.4h, v0.h[1]
smlal v24.4s, v27.4h, v0.h[2]
smlal v24.4s, v28.4h, v0.h[3]
ext v26.16b, v25.16b, v25.16b, #2
ext v27.16b, v25.16b, v25.16b, #4
ext v28.16b, v25.16b, v25.16b, #6
smull v25.4s, v25.4h, v0.h[0]
smlal v25.4s, v26.4h, v0.h[1]
smlal v25.4s, v27.4h, v0.h[2]
smlal v25.4s, v28.4h, v0.h[3]
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
xtn v24.4h, v24.4s
xtn v25.4h, v25.4s
ret
80:
160:
320:
AARCH64_VALID_JUMP_TARGET
b.gt 880f
ld1 {v0.8b}, [\xmx]
ldur s1, [\xmy, #2]
.ifc \taps, 6tap
sub \src, \src, #4
.else
sub \src, \src, #6
.endif
sub \src, \src, \s_strd
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
mov \my, \h
164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
ld1 {v27.8h, v28.8h}, [\src], \s_strd
.ifc \taps, 6tap
smull v24.4s, v27.4h, v0.h[1]
smull2 v25.4s, v27.8h, v0.h[1]
.irpc i, 23456
ext v26.16b, v27.16b, v28.16b, #(2*\i-2)
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
.else
smull v24.4s, v27.4h, v0.h[0]
smull2 v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
ext v26.16b, v27.16b, v28.16b, #(2*\i)
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
.endif
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out-of-order cores
// (at the cost of a smaller slowdown on in-order cores such as A53),
// and conserves register space (no need to clobber v8-v15).
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v23.16b
mov v18.16b, v24.16b
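// Main loop for the 4-tap vertical case: each iteration produces two output
// rows using the 4-tap vertical filter in v1.h[0-3]. v16-v18 hold the last
// three horizontally filtered rows, L(\type\()_\taps\()_filter_8) delivers
// two more in v23/v24, and the history is shifted down after the stores.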
8:
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal2 v3.4s, v17.8h, v1.h[1]
smlal v4.4s, v18.4h, v1.h[1]
smlal2 v5.4s, v18.8h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal2 v3.4s, v18.8h, v1.h[2]
smlal v4.4s, v23.4h, v1.h[2]
smlal2 v5.4s, v23.8h, v1.h[2]
smlal v2.4s, v23.4h, v1.h[3]
smlal2 v3.4s, v23.8h, v1.h[3]
smlal v4.4s, v24.4h, v1.h[3]
smlal2 v5.4s, v24.8h, v1.h[3]
.ifc \type, put
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
sqxtun v2.4h, v2.4s
sqxtun2 v2.8h, v3.4s
sqxtun v3.4h, v4.4s
sqxtun2 v3.8h, v5.4s
umin v2.8h, v2.8h, v31.8h
umin v3.8h, v3.8h, v31.8h
.else
rshrn v2.4h, v2.4s, #6
rshrn2 v2.8h, v3.4s, #6
rshrn v3.4h, v4.4s, #6
rshrn2 v3.8h, v5.4s, #6
sub v2.8h, v2.8h, v29.8h // PREP_BIAS
sub v3.8h, v3.8h, v29.8h // PREP_BIAS
.endif
subs \h, \h, #2
st1 {v2.8h}, [\dst], \d_strd
st1 {v3.8h}, [\ds2], \d_strd
b.le 9f
mov v16.16b, v18.16b
mov v17.16b, v23.16b
mov v18.16b, v24.16b
b 8b
9:
subs \w, \w, #8
b.le 0f
asr \s_strd, \s_strd, #1
asr \d_strd, \d_strd, #1
msub \src, \s_strd, \xmy, \src
msub \dst, \d_strd, \xmy, \dst
sub \src, \src, \s_strd, lsl #2
mov \h, \my
add \src, \src, #16
add \dst, \dst, #16
b 164b
880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
640:
1280:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
.ifc \taps, 6tap
sub \src, \src, #4
.else
sub \src, \src, #6
sub \src, \src, \s_strd
.endif
sub \src, \src, \s_strd, lsl #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
mov \my, \h
168:
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
ld1 {v27.8h, v28.8h}, [\src], \s_strd
.ifc \taps, 6tap
smull v24.4s, v27.4h, v0.h[1]
smull2 v25.4s, v27.8h, v0.h[1]
.irpc i, 23456
ext v26.16b, v27.16b, v28.16b, #(2*\i-2)
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
.else // 8tap
smull v24.4s, v27.4h, v0.h[0]
smull2 v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
ext v26.16b, v27.16b, v28.16b, #(2*\i)
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
.endif
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out-of-order cores
// (at the cost of a smaller slowdown on in-order cores such as A53),
// and conserves register space (no need to clobber v8-v15).
.ifc \taps, 6tap
uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2
.else
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v23.16b
mov v18.16b, v24.16b
.endif
bl L(\type\()_\taps\()_filter_8)
mov v19.16b, v23.16b
mov v20.16b, v24.16b
bl L(\type\()_\taps\()_filter_8)
mov v21.16b, v23.16b
mov v22.16b, v24.16b
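// Main loop for the 6/8-tap vertical case: each iteration produces two output
// rows. The horizontal helper supplies two fresh rows in v23/v24 per call;
// v16-v22 (8tap) or v18-v22 (6tap) hold the older rows and are shifted down
// after the stores.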
88:
.ifc \taps, 6tap
smull v2.4s, v18.4h, v1.h[1]
smull2 v3.4s, v18.8h, v1.h[1]
bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v19.4h, v1.h[1]
smull2 v5.4s, v19.8h, v1.h[1]
smlal v2.4s, v19.4h, v1.h[2]
smlal2 v3.4s, v19.8h, v1.h[2]
smlal v4.4s, v20.4h, v1.h[2]
smlal2 v5.4s, v20.8h, v1.h[2]
smlal v2.4s, v20.4h, v1.h[3]
smlal2 v3.4s, v20.8h, v1.h[3]
smlal v4.4s, v21.4h, v1.h[3]
smlal2 v5.4s, v21.8h, v1.h[3]
smlal v2.4s, v21.4h, v1.h[4]
smlal2 v3.4s, v21.8h, v1.h[4]
smlal v4.4s, v22.4h, v1.h[4]
smlal2 v5.4s, v22.8h, v1.h[4]
smlal v2.4s, v22.4h, v1.h[5]
smlal2 v3.4s, v22.8h, v1.h[5]
smlal v4.4s, v23.4h, v1.h[5]
smlal2 v5.4s, v23.8h, v1.h[5]
smlal v2.4s, v23.4h, v1.h[6]
smlal2 v3.4s, v23.8h, v1.h[6]
smlal v4.4s, v24.4h, v1.h[6]
smlal2 v5.4s, v24.8h, v1.h[6]
.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal2 v3.4s, v17.8h, v1.h[1]
smlal v4.4s, v18.4h, v1.h[1]
smlal2 v5.4s, v18.8h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal2 v3.4s, v18.8h, v1.h[2]
smlal v4.4s, v19.4h, v1.h[2]
smlal2 v5.4s, v19.8h, v1.h[2]
smlal v2.4s, v19.4h, v1.h[3]
smlal2 v3.4s, v19.8h, v1.h[3]
smlal v4.4s, v20.4h, v1.h[3]
smlal2 v5.4s, v20.8h, v1.h[3]
smlal v2.4s, v20.4h, v1.h[4]
smlal2 v3.4s, v20.8h, v1.h[4]
smlal v4.4s, v21.4h, v1.h[4]
smlal2 v5.4s, v21.8h, v1.h[4]
smlal v2.4s, v21.4h, v1.h[5]
smlal2 v3.4s, v21.8h, v1.h[5]
smlal v4.4s, v22.4h, v1.h[5]
smlal2 v5.4s, v22.8h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
smlal2 v3.4s, v22.8h, v1.h[6]
smlal v4.4s, v23.4h, v1.h[6]
smlal2 v5.4s, v23.8h, v1.h[6]
smlal v2.4s, v23.4h, v1.h[7]
smlal2 v3.4s, v23.8h, v1.h[7]
smlal v4.4s, v24.4h, v1.h[7]
smlal2 v5.4s, v24.8h, v1.h[7]
.endif
.ifc \type, put
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
sqxtun v2.4h, v2.4s
sqxtun2 v2.8h, v3.4s
sqxtun v3.4h, v4.4s
sqxtun2 v3.8h, v5.4s
umin v2.8h, v2.8h, v31.8h
umin v3.8h, v3.8h, v31.8h
.else
rshrn v2.4h, v2.4s, #6
rshrn2 v2.8h, v3.4s, #6
rshrn v3.4h, v4.4s, #6
rshrn2 v3.8h, v5.4s, #6
sub v2.8h, v2.8h, v29.8h // PREP_BIAS
sub v3.8h, v3.8h, v29.8h // PREP_BIAS
.endif
subs \h, \h, #2
st1 {v2.8h}, [\dst], \d_strd
st1 {v3.8h}, [\ds2], \d_strd
b.le 9f
.ifc \taps, 8tap
mov v16.16b, v18.16b
mov v17.16b, v19.16b
.endif
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
mov v21.16b, v23.16b
mov v22.16b, v24.16b
b 88b
9:
subs \w, \w, #8
b.le 0f
asr \s_strd, \s_strd, #1
asr \d_strd, \d_strd, #1
msub \src, \s_strd, \xmy, \src
msub \dst, \d_strd, \xmy, \dst
sub \src, \src, \s_strd, lsl #3
mov \h, \my
add \src, \src, #16
add \dst, \dst, #16
.ifc \taps, 6tap
add \src, \src, \s_strd, lsl #1
.endif
b 168b
0:
ret x15
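// Horizontal 6/8-tap helper for the 8-pixel-wide hv paths: filters one row
// from \sr2 and one from \src and returns them, narrowed to 16 bit, in
// v23.8h (the \sr2 row) and v24.8h (the \src row). Clobbers v4-v7 and
// v25-v28.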
L(\type\()_\taps\()_filter_8):
ld1 {v4.8h, v5.8h}, [\sr2], \s_strd
ld1 {v6.8h, v7.8h}, [\src], \s_strd
.ifc \taps, 6tap
smull v25.4s, v4.4h, v0.h[1]
smull2 v26.4s, v4.8h, v0.h[1]
smull v27.4s, v6.4h, v0.h[1]
smull2 v28.4s, v6.8h, v0.h[1]
.irpc i, 23456
ext v23.16b, v4.16b, v5.16b, #(2*\i-2)
ext v24.16b, v6.16b, v7.16b, #(2*\i-2)
smlal v25.4s, v23.4h, v0.h[\i]
smlal2 v26.4s, v23.8h, v0.h[\i]
smlal v27.4s, v24.4h, v0.h[\i]
smlal2 v28.4s, v24.8h, v0.h[\i]
.endr
.else // 8tap
smull v25.4s, v4.4h, v0.h[0]
smull2 v26.4s, v4.8h, v0.h[0]
smull v27.4s, v6.4h, v0.h[0]
smull2 v28.4s, v6.8h, v0.h[0]
.irpc i, 1234567
ext v23.16b, v4.16b, v5.16b, #(2*\i)
ext v24.16b, v6.16b, v7.16b, #(2*\i)
smlal v25.4s, v23.4h, v0.h[\i]
smlal2 v26.4s, v23.8h, v0.h[\i]
smlal v27.4s, v24.4h, v0.h[\i]
smlal2 v28.4s, v24.8h, v0.h[\i]
.endr
.endif
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits)
uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2
uzp1 v24.8h, v27.8h, v28.8h // Ditto
ret
endfunc
jumptable \type\()_\taps\()_hv_tbl
.word 1280b - \type\()_\taps\()_hv_tbl
.word 640b - \type\()_\taps\()_hv_tbl
.word 320b - \type\()_\taps\()_hv_tbl
.word 160b - \type\()_\taps\()_hv_tbl
.word 80b - \type\()_\taps\()_hv_tbl
.word 40b - \type\()_\taps\()_hv_tbl
.word 20b - \type\()_\taps\()_hv_tbl
endjumptable
.endm
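// Bilinear ("bilin") motion compensation for 16 bpc, parameterized the same
// way as filter_fn above. Judging from the register assignments in the
// expansions at the end of this file, the generated functions should
// correspond to:
// void dav1d_put_bilin_16bpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                                 const pixel *src, ptrdiff_t src_stride,
//                                 int w, int h, int mx, int my,
//                                 int bitdepth_max);
// void dav1d_prep_bilin_16bpc_neon(int16_t *tmp, const pixel *src,
//                                  ptrdiff_t src_stride, int w, int h,
//                                  int mx, int my, int bitdepth_max);
// mx and my are 4-bit subpel fractions; when both are zero, the call is
// forwarded to the plain copy/prep function.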
.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
ldr w8, [sp]
.endif
dup v1.8h, \mx
dup v3.8h, \my
mov w10, #16
sub w9, w10, \mx
sub w10, w10, \my
dup v0.8h, w9
dup v2.8h, w10
.ifc \type, prep
uxtw \d_strd, \w
lsl \d_strd, \d_strd, #1
.endif
clz \bdmax, \bdmax // clz(bitdepth_max)
clz w9, \w
sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
mov w11, #4
sub w9, w9, #24
sub w11, w11, \bdmax // 4 - intermediate_bits
add w12, \bdmax, #4 // 4 + intermediate_bits
cbnz \mx, L(\type\()_bilin_h)
cbnz \my, L(\type\()_bilin_v)
b \type\()_16bpc_neon
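// Horizontal-only bilinear filtering:
//   t = (16 - mx) * src[x] + mx * src[x + 1]
//   put:  dst[x] = t, rounded-right-shifted by (4 - intermediate_bits) and
//         then by intermediate_bits (4 bits in total); no clamp is needed
//         since the weights sum to 16.
//   prep: tmp[x] = (t rounded-right-shifted by (4 - intermediate_bits))
//         - PREP_BIAS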
L(\type\()_bilin_h):
cbnz \my, L(\type\()_bilin_hv)
movrel x10, \type\()_bilin_h_tbl
dup v31.8h, w11 // 4 - intermediate_bits
ldrsw x9, [x10, x9, lsl #2]
neg v31.8h, v31.8h // -(4-intermediate_bits)
.ifc \type, put
dup v30.8h, \bdmax // intermediate_bits
.else
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
add x10, x10, x9
.ifc \type, put
neg v30.8h, v30.8h // -intermediate_bits
.endif
br x10
20: // 2xN h
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
2:
ld1 {v4.4h}, [\src], \s_strd
ld1 {v6.4h}, [\sr2], \s_strd
ext v5.8b, v4.8b, v4.8b, #2
ext v7.8b, v6.8b, v6.8b, #2
trn1 v4.2s, v4.2s, v6.2s
trn1 v5.2s, v5.2s, v7.2s
subs \h, \h, #2
mul v4.4h, v4.4h, v0.4h
mla v4.4h, v5.4h, v1.4h
urshl v4.4h, v4.4h, v31.4h
urshl v4.4h, v4.4h, v30.4h
st1 {v4.s}[0], [\dst], \d_strd
st1 {v4.s}[1], [\ds2], \d_strd
b.gt 2b
ret
.endif
40: // 4xN h
AARCH64_VALID_JUMP_TARGET
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
4:
ld1 {v4.8h}, [\src], \s_strd
ld1 {v6.8h}, [\sr2], \s_strd
ext v5.16b, v4.16b, v4.16b, #2
ext v7.16b, v6.16b, v6.16b, #2
trn1 v4.2d, v4.2d, v6.2d
trn1 v5.2d, v5.2d, v7.2d
subs \h, \h, #2
mul v4.8h, v4.8h, v0.8h
mla v4.8h, v5.8h, v1.8h
urshl v4.8h, v4.8h, v31.8h
.ifc \type, put
urshl v4.8h, v4.8h, v30.8h
.else
sub v4.8h, v4.8h, v29.8h
.endif
st1 {v4.8b}, [\dst], \d_strd
st1 {v4.d}[1], [\ds2], \d_strd
b.gt 4b
ret
80: // 8xN h
AARCH64_VALID_JUMP_TARGET
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
8:
ldr h5, [\src, #16]
ldr h7, [\sr2, #16]
ld1 {v4.8h}, [\src], \s_strd
ld1 {v6.8h}, [\sr2], \s_strd
ext v5.16b, v4.16b, v5.16b, #2
ext v7.16b, v6.16b, v7.16b, #2
subs \h, \h, #2
mul v4.8h, v4.8h, v0.8h
mla v4.8h, v5.8h, v1.8h
mul v6.8h, v6.8h, v0.8h
mla v6.8h, v7.8h, v1.8h
urshl v4.8h, v4.8h, v31.8h
urshl v6.8h, v6.8h, v31.8h
.ifc \type, put
urshl v4.8h, v4.8h, v30.8h
urshl v6.8h, v6.8h, v30.8h
.else
sub v4.8h, v4.8h, v29.8h
sub v6.8h, v6.8h, v29.8h
.endif
st1 {v4.8h}, [\dst], \d_strd
st1 {v6.8h}, [\ds2], \d_strd
b.gt 8b
ret
160:
320:
640:
1280: // 16xN, 32xN, ... h
AARCH64_VALID_JUMP_TARGET
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
sub \s_strd, \s_strd, \w, uxtw #1
sub \s_strd, \s_strd, #16
.ifc \type, put
lsl \d_strd, \d_strd, #1
sub \d_strd, \d_strd, \w, uxtw #1
.endif
161:
ld1 {v16.8h}, [\src], #16
ld1 {v21.8h}, [\sr2], #16
mov \mx, \w
16:
ld1 {v17.8h, v18.8h}, [\src], #32
ld1 {v22.8h, v23.8h}, [\sr2], #32
ext v19.16b, v16.16b, v17.16b, #2
ext v20.16b, v17.16b, v18.16b, #2
ext v24.16b, v21.16b, v22.16b, #2
ext v25.16b, v22.16b, v23.16b, #2
mul v16.8h, v16.8h, v0.8h
mla v16.8h, v19.8h, v1.8h
mul v17.8h, v17.8h, v0.8h
mla v17.8h, v20.8h, v1.8h
mul v21.8h, v21.8h, v0.8h
mla v21.8h, v24.8h, v1.8h
mul v22.8h, v22.8h, v0.8h
mla v22.8h, v25.8h, v1.8h
urshl v16.8h, v16.8h, v31.8h
urshl v17.8h, v17.8h, v31.8h
urshl v21.8h, v21.8h, v31.8h
urshl v22.8h, v22.8h, v31.8h
subs \mx, \mx, #16
.ifc \type, put
urshl v16.8h, v16.8h, v30.8h
urshl v17.8h, v17.8h, v30.8h
urshl v21.8h, v21.8h, v30.8h
urshl v22.8h, v22.8h, v30.8h
.else
sub v16.8h, v16.8h, v29.8h
sub v17.8h, v17.8h, v29.8h
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v29.8h
.endif
st1 {v16.8h, v17.8h}, [\dst], #32
st1 {v21.8h, v22.8h}, [\ds2], #32
b.le 9f
mov v16.16b, v18.16b
mov v21.16b, v23.16b
b 16b
9:
add \dst, \dst, \d_strd
add \ds2, \ds2, \d_strd
add \src, \src, \s_strd
add \sr2, \sr2, \s_strd
subs \h, \h, #2
b.gt 161b
ret
endfunc
jumptable \type\()_bilin_h_tbl
.word 1280b - \type\()_bilin_h_tbl
.word 640b - \type\()_bilin_h_tbl
.word 320b - \type\()_bilin_h_tbl
.word 160b - \type\()_bilin_h_tbl
.word 80b - \type\()_bilin_h_tbl
.word 40b - \type\()_bilin_h_tbl
.word 20b - \type\()_bilin_h_tbl
endjumptable
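// Vertical-only bilinear filtering:
//   t = (16 - my) * src[x] + my * src[x + src_stride]
//   put:  dst[x] = (t + 8) >> 4
//   prep: tmp[x] = (t rounded-right-shifted by (4 - intermediate_bits))
//         - PREP_BIAS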
function L(\type\()_bilin_v)
cmp \h, #4
movrel x10, \type\()_bilin_v_tbl
.ifc \type, prep
dup v31.8h, w11 // 4 - intermediate_bits
.endif
ldrsw x9, [x10, x9, lsl #2]
.ifc \type, prep
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
neg v31.8h, v31.8h // -(4-intermediate_bits)
.endif
add x10, x10, x9
br x10
20: // 2xN v
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
cmp \h, #2
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
// 2x2 v
ld1r {v16.4s}, [\src], \s_strd
b.gt 24f
22:
ld1r {v17.4s}, [\sr2], \s_strd
ld1r {v18.4s}, [\src], \s_strd
trn1 v16.2s, v16.2s, v17.2s
trn1 v17.2s, v17.2s, v18.2s
mul v4.4h, v16.4h, v2.4h
mla v4.4h, v17.4h, v3.4h
urshr v4.8h, v4.8h, #4
str s4, [\dst]
st1 {v4.s}[1], [\ds2]
ret
24: // 2x4, 2x6, 2x8, ... v
ld1r {v17.4s}, [\sr2], \s_strd
ld1r {v18.4s}, [\src], \s_strd
ld1r {v19.4s}, [\sr2], \s_strd
ld1r {v20.4s}, [\src], \s_strd
sub \h, \h, #4
trn1 v16.2s, v16.2s, v17.2s
trn1 v17.2s, v17.2s, v18.2s
trn1 v18.2s, v18.2s, v19.2s
trn1 v19.2s, v19.2s, v20.2s
trn1 v16.2d, v16.2d, v18.2d
trn1 v17.2d, v17.2d, v19.2d
mul v4.8h, v16.8h, v2.8h
mla v4.8h, v17.8h, v3.8h
cmp \h, #2
urshr v4.8h, v4.8h, #4
st1 {v4.s}[0], [\dst], \d_strd
st1 {v4.s}[1], [\ds2], \d_strd
st1 {v4.s}[2], [\dst], \d_strd
st1 {v4.s}[3], [\ds2], \d_strd
b.lt 0f
mov v16.8b, v20.8b
b.eq 22b
b 24b
0:
ret
.endif
40: // 4xN v
AARCH64_VALID_JUMP_TARGET
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
ld1 {v16.4h}, [\src], \s_strd
4:
ld1 {v17.4h}, [\sr2], \s_strd
ld1 {v18.4h}, [\src], \s_strd
trn1 v16.2d, v16.2d, v17.2d
trn1 v17.2d, v17.2d, v18.2d
mul v4.8h, v16.8h, v2.8h
mla v4.8h, v17.8h, v3.8h
subs \h, \h, #2
.ifc \type, put
urshr v4.8h, v4.8h, #4
.else
urshl v4.8h, v4.8h, v31.8h
sub v4.8h, v4.8h, v29.8h
.endif
st1 {v4.8b}, [\dst], \d_strd
st1 {v4.d}[1], [\ds2], \d_strd
b.le 0f
mov v16.8b, v18.8b
b 4b
0:
ret
80: // 8xN v
AARCH64_VALID_JUMP_TARGET
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
ld1 {v16.8h}, [\src], \s_strd
8:
ld1 {v17.8h}, [\sr2], \s_strd
ld1 {v18.8h}, [\src], \s_strd
mul v4.8h, v16.8h, v2.8h
mla v4.8h, v17.8h, v3.8h
mul v5.8h, v17.8h, v2.8h
mla v5.8h, v18.8h, v3.8h
subs \h, \h, #2
.ifc \type, put
urshr v4.8h, v4.8h, #4
urshr v5.8h, v5.8h, #4
.else
urshl v4.8h, v4.8h, v31.8h
urshl v5.8h, v5.8h, v31.8h
sub v4.8h, v4.8h, v29.8h
sub v5.8h, v5.8h, v29.8h
.endif
st1 {v4.8h}, [\dst], \d_strd
st1 {v5.8h}, [\ds2], \d_strd
b.le 0f
mov v16.16b, v18.16b
b 8b
0:
ret
160: // 16xN, 32xN, ...
320:
640:
1280:
AARCH64_VALID_JUMP_TARGET
mov \my, \h
1:
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
ld1 {v16.8h, v17.8h}, [\src], \s_strd
2:
ld1 {v18.8h, v19.8h}, [\sr2], \s_strd
ld1 {v20.8h, v21.8h}, [\src], \s_strd
mul v4.8h, v16.8h, v2.8h
mla v4.8h, v18.8h, v3.8h
mul v5.8h, v17.8h, v2.8h
mla v5.8h, v19.8h, v3.8h
mul v6.8h, v18.8h, v2.8h
mla v6.8h, v20.8h, v3.8h
mul v7.8h, v19.8h, v2.8h
mla v7.8h, v21.8h, v3.8h
subs \h, \h, #2
.ifc \type, put
urshr v4.8h, v4.8h, #4
urshr v5.8h, v5.8h, #4
urshr v6.8h, v6.8h, #4
urshr v7.8h, v7.8h, #4
.else
urshl v4.8h, v4.8h, v31.8h
urshl v5.8h, v5.8h, v31.8h
urshl v6.8h, v6.8h, v31.8h
urshl v7.8h, v7.8h, v31.8h
sub v4.8h, v4.8h, v29.8h
sub v5.8h, v5.8h, v29.8h
sub v6.8h, v6.8h, v29.8h
sub v7.8h, v7.8h, v29.8h
.endif
st1 {v4.8h, v5.8h}, [\dst], \d_strd
st1 {v6.8h, v7.8h}, [\ds2], \d_strd
b.le 9f
mov v16.16b, v20.16b
mov v17.16b, v21.16b
b 2b
9:
subs \w, \w, #16
b.le 0f
asr \s_strd, \s_strd, #1
asr \d_strd, \d_strd, #1
msub \src, \s_strd, \xmy, \src
msub \dst, \d_strd, \xmy, \dst
sub \src, \src, \s_strd, lsl #1
mov \h, \my
add \src, \src, #32
add \dst, \dst, #32
b 1b
0:
ret
endfunc
jumptable \type\()_bilin_v_tbl
.word 1280b - \type\()_bilin_v_tbl
.word 640b - \type\()_bilin_v_tbl
.word 320b - \type\()_bilin_v_tbl
.word 160b - \type\()_bilin_v_tbl
.word 80b - \type\()_bilin_v_tbl
.word 40b - \type\()_bilin_v_tbl
.word 20b - \type\()_bilin_v_tbl
endjumptable
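// Bilinear filtering in both directions: the horizontal pass produces
// intermediates rounded-right-shifted by (4 - intermediate_bits); the
// vertical pass widens to 32 bit and shifts by (4 + intermediate_bits) for
// put (8 bits in total), or by 4 (minus PREP_BIAS) for prep.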
function L(\type\()_bilin_hv)
movrel x10, \type\()_bilin_hv_tbl
dup v31.8h, w11 // 4 - intermediate_bits
ldrsw x9, [x10, x9, lsl #2]
neg v31.8h, v31.8h // -(4-intermediate_bits)
.ifc \type, put
dup v30.4s, w12 // 4 + intermediate_bits
.else
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
add x10, x10, x9
.ifc \type, put
neg v30.4s, v30.4s // -(4+intermediate_bits)
.endif
br x10
20: // 2xN hv
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
add \sr2, \src, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
ld1 {v20.4h}, [\src], \s_strd
ext v21.8b, v20.8b, v20.8b, #2
mul v16.4h, v20.4h, v0.4h
mla v16.4h, v21.4h, v1.4h
urshl v16.4h, v16.4h, v31.4h
2:
ld1 {v22.4h}, [\sr2], \s_strd
ld1 {v24.4h}, [\src], \s_strd
ext v23.8b, v22.8b, v22.8b, #2
ext v25.8b, v24.8b, v24.8b, #2
trn1 v22.2s, v22.2s, v24.2s
trn1 v23.2s, v23.2s, v25.2s
mul v17.4h, v22.4h, v0.4h
mla v17.4h, v23.4h, v1.4h
urshl v17.4h, v17.4h, v31.4h
trn1 v16.2s, v16.2s, v17.2s
umull v4.4s, v16.4h, v2.4h
umlal v4.4s, v17.4h, v3.4h
urshl v4.4s, v4.4s, v30.4s
xtn v4.4h, v4.4s
subs \h, \h, #2
st1 {v4.s}[0], [\dst], \d_strd
st1 {v4.s}[1], [\ds2], \d_strd
b.le 0f
trn2 v16.2s, v17.2s, v17.2s
b 2b
0:
ret
.endif
40: // 4xN hv
AARCH64_VALID_JUMP_TARGET
add \sr2, \src, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
ld1 {v20.8h}, [\src], \s_strd
ext v21.16b, v20.16b, v20.16b, #2
mul v16.4h, v20.4h, v0.4h
mla v16.4h, v21.4h, v1.4h
urshl v16.4h, v16.4h, v31.4h
4:
ld1 {v22.8h}, [\sr2], \s_strd
ld1 {v24.8h}, [\src], \s_strd
ext v23.16b, v22.16b, v22.16b, #2
ext v25.16b, v24.16b, v24.16b, #2
trn1 v22.2d, v22.2d, v24.2d
trn1 v23.2d, v23.2d, v25.2d
mul v17.8h, v22.8h, v0.8h
mla v17.8h, v23.8h, v1.8h
urshl v17.8h, v17.8h, v31.8h
trn1 v16.2d, v16.2d, v17.2d
umull v4.4s, v16.4h, v2.4h
umlal v4.4s, v17.4h, v3.4h
umull2 v5.4s, v16.8h, v2.8h
umlal2 v5.4s, v17.8h, v3.8h
.ifc \type, put
urshl v4.4s, v4.4s, v30.4s
urshl v5.4s, v5.4s, v30.4s
uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
.else
rshrn v4.4h, v4.4s, #4
rshrn2 v4.8h, v5.4s, #4
sub v4.8h, v4.8h, v29.8h
.endif
subs \h, \h, #2
st1 {v4.8b}, [\dst], \d_strd
st1 {v4.d}[1], [\ds2], \d_strd
b.le 0f
trn2 v16.2d, v17.2d, v17.2d
b 4b
0:
ret
80: // 8xN, 16xN, ... hv
160:
320:
640:
1280:
AARCH64_VALID_JUMP_TARGET
mov \my, \h
1:
add \sr2, \src, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
ldr h21, [\src, #16]
ld1 {v20.8h}, [\src], \s_strd
ext v21.16b, v20.16b, v21.16b, #2
mul v16.8h, v20.8h, v0.8h
mla v16.8h, v21.8h, v1.8h
urshl v16.8h, v16.8h, v31.8h
2:
ldr h23, [\sr2, #16]
ld1 {v22.8h}, [\sr2], \s_strd
ldr h25, [\src, #16]
ld1 {v24.8h}, [\src], \s_strd
ext v23.16b, v22.16b, v23.16b, #2
ext v25.16b, v24.16b, v25.16b, #2
mul v17.8h, v22.8h, v0.8h
mla v17.8h, v23.8h, v1.8h
mul v18.8h, v24.8h, v0.8h
mla v18.8h, v25.8h, v1.8h
urshl v17.8h, v17.8h, v31.8h
urshl v18.8h, v18.8h, v31.8h
umull v4.4s, v16.4h, v2.4h
umlal v4.4s, v17.4h, v3.4h
umull2 v5.4s, v16.8h, v2.8h
umlal2 v5.4s, v17.8h, v3.8h
umull v6.4s, v17.4h, v2.4h
umlal v6.4s, v18.4h, v3.4h
umull2 v7.4s, v17.8h, v2.8h
umlal2 v7.4s, v18.8h, v3.8h
.ifc \type, put
urshl v4.4s, v4.4s, v30.4s
urshl v5.4s, v5.4s, v30.4s
urshl v6.4s, v6.4s, v30.4s
urshl v7.4s, v7.4s, v30.4s
uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
uzp1 v5.8h, v6.8h, v7.8h // Ditto
.else
rshrn v4.4h, v4.4s, #4
rshrn2 v4.8h, v5.4s, #4
rshrn v5.4h, v6.4s, #4
rshrn2 v5.8h, v7.4s, #4
sub v4.8h, v4.8h, v29.8h
sub v5.8h, v5.8h, v29.8h
.endif
subs \h, \h, #2
st1 {v4.8h}, [\dst], \d_strd
st1 {v5.8h}, [\ds2], \d_strd
b.le 9f
mov v16.16b, v18.16b
b 2b
9:
subs \w, \w, #8
b.le 0f
asr \s_strd, \s_strd, #1
asr \d_strd, \d_strd, #1
msub \src, \s_strd, \xmy, \src
msub \dst, \d_strd, \xmy, \dst
sub \src, \src, \s_strd, lsl #1
mov \h, \my
add \src, \src, #16
add \dst, \dst, #16
b 1b
0:
ret
endfunc
jumptable \type\()_bilin_hv_tbl
.word 1280b - \type\()_bilin_hv_tbl
.word 640b - \type\()_bilin_hv_tbl
.word 320b - \type\()_bilin_hv_tbl
.word 160b - \type\()_bilin_hv_tbl
.word 80b - \type\()_bilin_hv_tbl
.word 40b - \type\()_bilin_hv_tbl
.word 20b - \type\()_bilin_hv_tbl
endjumptable
.endm
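// Instantiate the 8tap/6tap put/prep functions. Combinations that include a
// SHARP filter in either direction use the full 8tap code; the
// REGULAR/SMOOTH-only combinations go through the 6tap code, which skips the
// outermost taps (zero for those filters).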
make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap
make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap
make_8tap_fn put, sharp, SHARP, SHARP, 8tap
make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap
make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
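// Load one 8-tap warp filter (8 int8 coefficients) into \dst: the filter
// index is the position \src >> 10, looked up relative to x11, which points
// 64 entries into mc_warp_filter so that negative indices are valid; \src is
// then advanced by \inc.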
.macro load_filter_row dst, src, inc
asr w13, \src, #10
add \src, \src, \inc
ldr \dst, [x11, w13, sxtw #3]
.endm
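// Filter one source row horizontally for warp: each of the 8 output pixels
// uses its own 8-tap filter, selected from the x position in w5, which steps
// by w7 (abcd[0]) per pixel; w5 itself is advanced by w8 (abcd[1]) before
// returning. The results, rounded-right-shifted by (7 - intermediate_bits),
// are returned in v16.4s/v17.4s. Clobbers v0-v11 and v18-v23.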
function warp_filter_horz_neon
add w12, w5, #512
ld1 {v16.8h, v17.8h}, [x2], x3
load_filter_row d0, w12, w7
load_filter_row d1, w12, w7
load_filter_row d2, w12, w7
sxtl v0.8h, v0.8b
load_filter_row d3, w12, w7
sxtl v1.8h, v1.8b
load_filter_row d4, w12, w7
sxtl v2.8h, v2.8b
load_filter_row d5, w12, w7
sxtl v3.8h, v3.8b
load_filter_row d6, w12, w7
sxtl v4.8h, v4.8b
load_filter_row d7, w12, w7
sxtl v5.8h, v5.8b
ext v18.16b, v16.16b, v17.16b, #2*1
smull v8.4s, v16.4h, v0.4h
smull2 v9.4s, v16.8h, v0.8h
sxtl v6.8h, v6.8b
ext v19.16b, v16.16b, v17.16b, #2*2
smull v10.4s, v18.4h, v1.4h
smull2 v11.4s, v18.8h, v1.8h
sxtl v7.8h, v7.8b
ext v20.16b, v16.16b, v17.16b, #2*3
smull v0.4s, v19.4h, v2.4h
smull2 v1.4s, v19.8h, v2.8h
ext v21.16b, v16.16b, v17.16b, #2*4
addp v8.4s, v8.4s, v9.4s
smull v2.4s, v20.4h, v3.4h
smull2 v3.4s, v20.8h, v3.8h
ext v22.16b, v16.16b, v17.16b, #2*5
addp v9.4s, v10.4s, v11.4s
smull v10.4s, v21.4h, v4.4h
smull2 v11.4s, v21.8h, v4.8h
ext v23.16b, v16.16b, v17.16b, #2*6
addp v0.4s, v0.4s, v1.4s
smull v18.4s, v22.4h, v5.4h
smull2 v19.4s, v22.8h, v5.8h
ext v16.16b, v16.16b, v17.16b, #2*7
addp v1.4s, v2.4s, v3.4s
addp v2.4s, v10.4s, v11.4s
smull v20.4s, v23.4h, v6.4h
smull2 v21.4s, v23.8h, v6.8h
addp v3.4s, v18.4s, v19.4s
smull v22.4s, v16.4h, v7.4h
smull2 v23.4s, v16.8h, v7.8h
addp v4.4s, v20.4s, v21.4s
addp v5.4s, v22.4s, v23.4s
addp v8.4s, v8.4s, v9.4s
addp v0.4s, v0.4s, v1.4s
addp v2.4s, v2.4s, v3.4s
addp v4.4s, v4.4s, v5.4s
addp v16.4s, v8.4s, v0.4s
addp v17.4s, v2.4s, v4.4s
add w5, w5, w8
srshl v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits)
srshl v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits)
ret
endfunc
// void dav1d_warp_affine_8x8_16bpc_neon(
// pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *const abcd, int mx, int my,
// const int bitdepth_max)
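// Expanded twice below: once without a suffix for the put variant, which
// rounds by (7 + intermediate_bits) and clamps to bitdepth_max, and once
// with t for the prep/tmp variant (warp_affine_8x8t), which rounds by 7 and
// subtracts PREP_BIAS, presumably with the tmp stride given in elements
// (hence the lsl #1).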
.macro warp t
function warp_affine_8x8\t\()_16bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
.ifb \t
dup v15.8h, w7 // bitdepth_max
.else
movi v15.8h, #(PREP_BIAS >> 8), lsl #8
.endif
clz w7, w7
// intermediate_bits = clz(bitdepth_max) - 18
.ifb \t
sub w8, w7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
.endif
sub w7, w7, #25 // -(7 - intermediate_bits)
.ifb \t
neg w8, w8 // -(7 + intermediate_bits)
.endif
dup v14.4s, w7 // -(7 - intermediate_bits)
.ifb \t
dup v13.4s, w8 // -(7 + intermediate_bits)
.endif
ldr x4, [x4]
sbfx x7, x4, #0, #16
sbfx x8, x4, #16, #16
sbfx x9, x4, #32, #16
sbfx x4, x4, #48, #16
mov w10, #8
sub x2, x2, x3, lsl #1
sub x2, x2, x3
sub x2, x2, #6
movrel x11, X(mc_warp_filter), 64*8
mov x15, x30
.ifnb \t
lsl x1, x1, #1
.endif
bl warp_filter_horz_neon
uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2
bl warp_filter_horz_neon
uzp1 v25.8h, v16.8h, v17.8h // Ditto
bl warp_filter_horz_neon
uzp1 v26.8h, v16.8h, v17.8h // Ditto
bl warp_filter_horz_neon
uzp1 v27.8h, v16.8h, v17.8h // Ditto
bl warp_filter_horz_neon
uzp1 v28.8h, v16.8h, v17.8h // Ditto
bl warp_filter_horz_neon
uzp1 v29.8h, v16.8h, v17.8h // Ditto
bl warp_filter_horz_neon
uzp1 v30.8h, v16.8h, v17.8h // Ditto
1:
add w14, w6, #512
bl warp_filter_horz_neon
uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2
load_filter_row d0, w14, w9
load_filter_row d1, w14, w9
load_filter_row d2, w14, w9
load_filter_row d3, w14, w9
load_filter_row d4, w14, w9
load_filter_row d5, w14, w9
load_filter_row d6, w14, w9
load_filter_row d7, w14, w9
transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
// This ordering of smull/smlal/smull2/smlal2 is highly
// beneficial for Cortex A53 here.
smull v16.4s, v24.4h, v0.4h
smlal v16.4s, v25.4h, v1.4h
smlal v16.4s, v26.4h, v2.4h
smlal v16.4s, v27.4h, v3.4h
smlal v16.4s, v28.4h, v4.4h
smlal v16.4s, v29.4h, v5.4h
smlal v16.4s, v30.4h, v6.4h
smlal v16.4s, v31.4h, v7.4h
smull2 v17.4s, v24.8h, v0.8h
smlal2 v17.4s, v25.8h, v1.8h
smlal2 v17.4s, v26.8h, v2.8h
smlal2 v17.4s, v27.8h, v3.8h
smlal2 v17.4s, v28.8h, v4.8h
smlal2 v17.4s, v29.8h, v5.8h
smlal2 v17.4s, v30.8h, v6.8h
smlal2 v17.4s, v31.8h, v7.8h
mov v24.16b, v25.16b
mov v25.16b, v26.16b
.ifb \t
srshl v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits)
srshl v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits)
.else
rshrn v16.4h, v16.4s, #7
rshrn2 v16.8h, v17.4s, #7
.endif
mov v26.16b, v27.16b
.ifb \t
sqxtun v16.4h, v16.4s
sqxtun2 v16.8h, v17.4s
.else
sub v16.8h, v16.8h, v15.8h // PREP_BIAS
.endif
mov v27.16b, v28.16b
mov v28.16b, v29.16b
.ifb \t
umin v16.8h, v16.8h, v15.8h // bitdepth_max
.endif
mov v29.16b, v30.16b
mov v30.16b, v31.16b
subs w10, w10, #1
st1 {v16.8h}, [x0], x1
add w6, w6, w4
b.gt 1b
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret x15
endfunc
.endm
warp
warp t
// void dav1d_emu_edge_16bpc_neon(
// const intptr_t bw, const intptr_t bh,
// const intptr_t iw, const intptr_t ih,
// const intptr_t x, const intptr_t y,
// pixel *dst, const ptrdiff_t dst_stride,
// const pixel *ref, const ptrdiff_t ref_stride)
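// Rough model of the behaviour (for reference only): the source position is
// clamped into the reference as per the comments below, then the
// center_w x center_h block is copied row by row, each row padded on the
// left and right by replicating its edge pixels, and finally the first and
// last written rows are replicated upwards (top_ext rows) and downwards
// (bottom_ext rows).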
function emu_edge_16bpc_neon, export=1
ldp x8, x9, [sp]
// ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
// ref += iclip(x, 0, iw - 1)
sub x12, x3, #1 // ih - 1
cmp x5, x3
sub x13, x2, #1 // iw - 1
csel x12, x12, x5, ge // min(y, ih - 1)
cmp x4, x2
bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
csel x13, x13, x4, ge // min(x, iw - 1)
bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
madd x8, x12, x9, x8 // ref += iclip() * stride
add x8, x8, x13, lsl #1 // ref += iclip()
// bottom_ext = iclip(y + bh - ih, 0, bh - 1)
// top_ext = iclip(-y, 0, bh - 1)
add x10, x5, x1 // y + bh
neg x5, x5 // -y
sub x10, x10, x3 // y + bh - ih
sub x12, x1, #1 // bh - 1
cmp x10, x1
bic x5, x5, x5, asr #63 // max(-y, 0)
csel x10, x10, x12, lt // min(y + bh - ih, bh-1)
cmp x5, x1
bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
csel x5, x5, x12, lt // min(max(-y, 0), bh-1)
// right_ext = iclip(x + bw - iw, 0, bw - 1)
// left_ext = iclip(-x, 0, bw - 1)
add x11, x4, x0 // x + bw
neg x4, x4 // -x
sub x11, x11, x2 // x + bw - iw
sub x13, x0, #1 // bw - 1
cmp x11, x0
bic x4, x4, x4, asr #63 // max(-x, 0)
csel x11, x11, x13, lt // min(x + bw - iw, bw-1)
cmp x4, x0
bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
csel x4, x4, x13, lt // min(max(-x, 0), bw - 1)
// center_h = bh - top_ext - bottom_ext
// dst += top_ext * PXSTRIDE(dst_stride)
// center_w = bw - left_ext - right_ext
sub x1, x1, x5 // bh - top_ext
madd x6, x5, x7, x6
sub x2, x0, x4 // bw - left_ext
sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext
sub x2, x2, x11 // center_w = bw - left_ext - right_ext
mov x14, x6 // backup of dst
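// Copy the center_h rows: optionally replicate the leftmost source pixel
// left_ext times, copy center_w pixels, and optionally replicate the
// rightmost pixel right_ext times. The stores round each count up to a whole
// number of vectors, which presumes enough slack in the destination buffer.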
.macro v_loop need_left, need_right
0:
.if \need_left
ld1r {v0.8h}, [x8]
mov x12, x6 // out = dst
mov x3, x4
mov v1.16b, v0.16b
1:
subs x3, x3, #16
st1 {v0.8h, v1.8h}, [x12], #32
b.gt 1b
.endif
mov x13, x8
add x12, x6, x4, lsl #1 // out = dst + left_ext
mov x3, x2
1:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
subs x3, x3, #32
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
b.gt 1b
.if \need_right
add x3, x8, x2, lsl #1 // in + center_w
sub x3, x3, #2 // in + center_w - 1
add x12, x6, x4, lsl #1 // dst + left_ext
ld1r {v0.8h}, [x3]
add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w
mov x3, x11
mov v1.16b, v0.16b
1:
subs x3, x3, #16
st1 {v0.8h, v1.8h}, [x12], #32
b.gt 1b
.endif
subs x1, x1, #1 // center_h--
add x6, x6, x7
add x8, x8, x9
b.gt 0b
.endm
cbz x4, 2f
// need_left
cbz x11, 3f
// need_left + need_right
v_loop 1, 1
b 5f
2:
// !need_left
cbz x11, 4f
// !need_left + need_right
v_loop 0, 1
b 5f
3:
// need_left + !need_right
v_loop 1, 0
b 5f
4:
// !need_left + !need_right
v_loop 0, 0
5:
cbz x10, 3f
// need_bottom
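// Replicate the last written row downwards bottom_ext times, one strip of
// 32 pixels at a time.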
sub x8, x6, x7 // ref = dst - stride
mov x4, x0
1:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
mov x3, x10
2:
subs x3, x3, #1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
b.gt 2b
msub x6, x7, x10, x6 // dst -= bottom_ext * stride
subs x4, x4, #32 // bw -= 32
add x6, x6, #64 // dst += 32
b.gt 1b
3:
cbz x5, 3f
// need_top
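// Replicate the first center row (still saved in x14) upwards top_ext times,
// one strip of 32 pixels at a time.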
msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
1:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
mov x3, x5
2:
subs x3, x3, #1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
b.gt 2b
msub x6, x7, x5, x6 // dst -= top_ext * stride
subs x0, x0, #32 // bw -= 32
add x6, x6, #64 // dst += 32
b.gt 1b
3:
ret
endfunc