msac.S - mozsearch

/*

 * Copyright © 2019, VideoLAN and dav1d authors

 * Copyright © 2019, Martin Storsjo

 * All rights reserved.

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 * 1. Redistributions of source code must retain the above copyright notice, this

 *    list of conditions and the following disclaimer.

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 *    this list of conditions and the following disclaimer in the documentation

 *    and/or other materials provided with the distribution.

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR

 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "src/arm/asm.S"

#include "util.S"

// Byte offsets of the context fields that the routines below access
// relative to x0 (the MsacContext pointer). BUF_POS/BUF_END and DIF are
// accessed as 64-bit values, RNG and CNT as adjacent 32-bit values.
#define BUF_POS 0
#define BUF_END 8
#define DIF 16
#define RNG 24
#define CNT 28
#define ALLOW_UPDATE_CDF 32

// Byte offset into the coeffs table that is used as the lookup base:
// 30 bytes = 15 shorts, i.e. the last entry (0) of the first row.
#define COEFFS_BASE_OFFSET 30
// Distance from that base to the masks8 row, which starts 64 bytes
// (two rows of 16 shorts) into coeffs.
#define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET)

// Lookup tables, addressed relative to coeffs + COEFFS_BASE_OFFSET.
const coeffs
        // Row 0: 4*k for k = 15..0. The symbol decoders load from
        // (base - 2*n_symbols), so lane i receives 4*(n_symbols - i).
        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
        // Row 1: zeros, read by the lanes that lie past n_symbols.
        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
        // masks8: ANDed with the compare mask and summed with addv in the
        // n == 8 path to build a two-byte tbl index for the decoded lane.
        .short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E
endconst

// ld1 wrapper: loads register d0 from [src]; when n is greater than 8 the
// load list also includes d1, so two consecutive registers are filled.
.macro ld1_n d0, d1, src, sz, n
.if \n > 8
        ld1             {\d0\sz, \d1\sz},  [\src]
.else
        ld1             {\d0\sz},  [\src]
.endif
.endm

// st1 wrapper: stores register s0 to [dst]; when n is greater than 8 the
// store list also includes s1, so two consecutive registers are written.
.macro st1_n s0, s1, dst, sz, n
.if \n > 8
        st1             {\s0\sz, \s1\sz},  [\dst]
.else
        st1             {\s0\sz},  [\dst]
.endif
.endm

// ushr wrapper: d0 = s0 >> shift; for n == 16 the second register pair
// (d1 = s1 >> shift) is processed as well.
.macro ushr_n d0, d1, s0, s1, shift, sz, n
.if \n == 16
        ushr            \d0\sz,  \s0\sz,  \shift
        ushr            \d1\sz,  \s1\sz,  \shift
.else
        ushr            \d0\sz,  \s0\sz,  \shift
.endif
.endm

// add wrapper: d0 = s0 + s2; for n == 16 also d1 = s1 + s3.
.macro add_n d0, d1, s0, s1, s2, s3, sz, n
.if \n == 16
        add             \d0\sz,  \s0\sz,  \s2\sz
        add             \d1\sz,  \s1\sz,  \s3\sz
.else
        add             \d0\sz,  \s0\sz,  \s2\sz
.endif
.endm

// sub wrapper: d0 = s0 - s2; for n == 16 also d1 = s1 - s3.
.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
.if \n == 16
        sub             \d0\sz,  \s0\sz,  \s2\sz
        sub             \d1\sz,  \s1\sz,  \s3\sz
.else
        sub             \d0\sz,  \s0\sz,  \s2\sz
.endif
.endm

// and wrapper: d0 = s0 & s2; for n == 16 also d1 = s1 & s3.
.macro and_n d0, d1, s0, s1, s2, s3, sz, n
.if \n == 16
        and             \d0\sz,  \s0\sz,  \s2\sz
        and             \d1\sz,  \s1\sz,  \s3\sz
.else
        and             \d0\sz,  \s0\sz,  \s2\sz
.endif
.endm

// cmhs wrapper (unsigned >= compare, all-ones lane on true):
// d0 = (s0 >= s2); for n == 16 also d1 = (s1 >= s3).
.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
.if \n == 16
        cmhs            \d0\sz,  \s0\sz,  \s2\sz
        cmhs            \d1\sz,  \s1\sz,  \s3\sz
.else
        cmhs            \d0\sz,  \s0\sz,  \s2\sz
.endif
.endm

// sshl wrapper (signed shift by a per-lane register amount):
// d0 = s0 shifted by s2; for n == 16 also d1 = s1 shifted by s3.
.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
.if \n == 16
        sshl            \d0\sz,  \s0\sz,  \s2\sz
        sshl            \d1\sz,  \s1\sz,  \s3\sz
.else
        sshl            \d0\sz,  \s0\sz,  \s2\sz
.endif
.endm

// sqdmulh wrapper (saturating doubling multiply, high half):
// d0 = sqdmulh(s0, s2); for n == 16 also d1 = sqdmulh(s1, s3).
.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
.if \n == 16
        sqdmulh         \d0\sz,  \s0\sz,  \s2\sz
        sqdmulh         \d1\sz,  \s1\sz,  \s3\sz
.else
        sqdmulh         \d0\sz,  \s0\sz,  \s2\sz
.endif
.endm

// str wrapper: stores idx0 at [dstreg, dstoff]; for n == 16 idx1 is also
// stored, 16 bytes further on.
.macro str_n idx0, idx1, dstreg, dstoff, n
.if \n == 16
        str             \idx0,  [\dstreg, \dstoff]
        str             \idx1,  [\dstreg, \dstoff + 16]
.else
        str             \idx0,  [\dstreg, \dstoff]
.endif
.endm

// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
//                                               size_t n_symbols);
//
// In:  x0 = s, x1 = cdf, x2 = n_symbols
// Out: w0 = decoded symbol index
//
// The body is produced by the decode_update macro defined inside this
// function; the same macro is instantiated for the adapt8 and adapt16 entry
// points. The L(refill) tail that follows the macro instantiation is also
// branched to from the other decoders in this file.
function msac_decode_symbol_adapt4_neon, export=1
// decode_update sz, szb, n
//   sz  - vector arrangement used for the 16-bit lane operations
//   szb - byte arrangement of the same registers (used by and/ext)
//   n   - number of cdf lanes processed: 4, 8 or 16
// Register use as established by the code below:
//   v0/v1 = cdf, v4/v5 = v, v2/v3 = (c >= v) mask,
//   v29 = rng (broadcast; becomes u - v for n == 4 and n == 8),
//   v30 = top 16 bits of dif (broadcast), d28 = dif,
//   w15 = encoded result: 16*ret (n == 4), 2*ret (n == 8), ret - n (n == 16),
//   w6 = cnt, x7 = dif (for n == 8 dif stays in d28 until the refill path).
.macro decode_update sz, szb, n
.if \n == 16
        // Reserve 48 bytes of scratch: [sp, #14] receives rng and
        // [sp, #16] onwards receives the 16 v values (see the stores below).
        sub             sp,  sp,  #48
.endif
        add             x8,  x0,  #RNG
        ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
        ld1r            {v29\sz}, [x8]                            // rng
        movrel          x9,  coeffs, COEFFS_BASE_OFFSET
        movi            v31\sz, #0x7f, lsl #8                     // 0x7f00
        sub             x10, x9,  x2, lsl #1                      // table base - 2*n_symbols
        mvni            v30\sz, #0x3f                             // 0xffc0
        and             v7\szb, v29\szb, v31\szb                  // rng & 0x7f00
.if \n == 16
        str             h29, [sp, #14]                            // store original u = s->rng
.endif
        and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n    // cdf & 0xffc0
        ld1_n           v4,  v5,  x10, \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
        sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n     // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        ldr             d28, [x0, #DIF]
        add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n     // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n     // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
        dup             v30\sz, v28.h[3]                          // dif >> (EC_WIN_SIZE - 16)
.if \n == 8
        ldur            q31, [x9, #MASKS8_OFFSET]
.elseif \n == 16
        str_n           q4,  q5,  sp, #16, \n                     // store v values to allow indexed access
.endif

        // After the condition starts being true it continues, such that the vector looks like:
        //   0, 0, 0 ... -1, -1
        cmhs_n          v2,  v3,  v30, v30, v4,  v5,  \sz,  \n    // c >= v
.if \n == 4
        // Shift rng in as lane 0 ahead of v[0..2], so lane i holds the
        // upper bound u for symbol i.
        ext             v29\szb, v29\szb, v4\szb, #6              // u
        umov            x15, v2.d[0]
        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
        rev             x15, x15
        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
        // rev + clz = count trailing zeros
        clz             x15, x15                                  // 16*ret
.elseif \n == 8
        // The final short of the compare is always set.
        // Using addv, subtract -0x202*ret from this value to create a lookup table for a short.
        //  For n == 8:
        // -0x202 + -0x202 + ... + 0xF0E
        //                    (0x202*7) | (1 << 8)
        //                                    ^-------offset for second byte of the short
        and             v31\szb, v31\szb, v2\szb
        ext             v29\szb, v29\szb, v4\szb, #14             // u
        addv            h31, v31\sz                               // ((2*ret + 1) << 8) | (2*ret)
        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
        sub             v30\sz, v30\sz, v4\sz                     // (dif >> 48) - v
        smov            w15, v31.b[0]                             // 2*ret
        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
.elseif \n == 16
        // Every true lane contributes -1, so the horizontal sum is ret - n.
        add             v6\sz,  v2\sz,  v3\sz
        addv            h31, v6\sz                                // -n + ret
        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
        smov            w15, v31.h[0]
.endif

        // Skip the adaptation step when s->allow_update_cdf is zero.
        cbz             w4,  0f
        // update_cdf
        ldrh            w3,  [x1, x2, lsl #1]                     // count = cdf[n_symbols]
.if \n == 16
        // 16 case has a lower bound that guarantees n_symbols > 2
        mov             w4,  #-5
.elseif \n == 8
        mvn             w14, w2
        mov             w4,  #-4
        // The carry produced here is consumed by the sbc further down; no
        // flag-setting instruction may be placed in between.
        cmn             w14, #3                                   // set C if n_symbols <= 2
.else
        // if n_symbols < 4 (or < 6 even) then
        //   (1 + n_symbols) >> 2 == n_symbols > 2
        add             w14, w2,  #17                             // (1 + n_symbols) + (4 << 2)
.endif
        sub_n           v16, v17, v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
        orr             v2\sz, #0x80, lsl #8
.if \n == 16
        orr             v3\sz, #0x80, lsl #8
.endif
.if \n == 16
        sub             w4,  w4,  w3, lsr #4                      // -((count >> 4) + 5)
.elseif \n == 8
        lsr             w14, w3,  #4                              // count >> 4
        sbc             w4,  w4,  w14                             // -((count >> 4) + (n_symbols > 2) + 4)
.else
        neg             w4, w14, lsr #2                           // -((n_symbols > 2) + 4)
        sub             w4,  w4,  w3,  lsr #4                     // -((count >> 4) + (n_symbols > 2) + 4)
.endif
        sub_n           v2,  v3,  v2,  v3,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
        // A negative sshl amount shifts right, hence the negated rate.
        dup             v6\sz,    w4                              // -rate

        sub             w3,  w3,  w3, lsr #5                      // count - (count == 32)
        sshl_n          v2,  v3,  v2,  v3,  v6,  v6,  \sz, \n     // ({32768,-1} - cdf[i]) >> rate
        add             w3,  w3,  #1                              // count + (count < 32)
        add_n           v0,  v1,  v16, v17, v2,  v3,  \sz, \n     // cdf + (32768 - cdf[i]) >> rate
        st1_n           v0,  v1,  x1,  \sz, \n
        strh            w3,  [x1, x2, lsl #1]

0:
        // renorm
.if \n == 4
        ldr             w6,  [x0, #CNT]
        ldr             x7,  [x0, #DIF]
        mov             x4,  v29.d[0]          // rng (packed)
        mov             x3,  v4.d[0]           // v (packed)

        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
        //  garbage in the remaining bits, but we can work around this.
        lsr             x4,  x4,  x15          // rng
        lsr             x3,  x3,  x15          // v
        lsl             w5,  w4,  #16          // rng << 16
        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
        clz             w5,  w5                // d = clz(rng << 16)
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        // Halfword store: only the low 16 bits of w4 are meaningful here.
        strh            w4,  [x0, #RNG]
        // Borrow from the subs above (cnt < d): more input bits are needed.
        b.lo            1f

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        lsr             w0,  w15, #4
        ret
1:
        // L(refill) returns w15, so convert 16*ret to ret first.
        lsr             w15, w15, #4
        b L(refill)
.elseif \n == 8
        ldr             w6,  [x0, #CNT]
        // v31.h[0] holds the byte index pair of lane ret, so these table
        // lookups move lane ret of each vector into h[0].
        tbl             v30.8b, {v30.16b}, v31.8b
        tbl             v29.8b, {v29.16b}, v31.8b
        ins             v28.h[3], v30.h[0]     // dif - (v << 48)
        clz             v0.4h,  v29.4h         // d = clz(rng)
        umov            w5,  v0.h[0]
        ushl            v29.4h, v29.4h, v0.4h  // rng << d

        // The vec for clz(rng) is filled with garbage after the first short,
        //  but ushl/sshl conveniently uses only the first byte for the shift
        //  amount.
        ushl            d28, d28, d0           // (dif - (v << 48)) << d

        subs            w6,  w6,  w5           // cnt -= d
        str             h29, [x0, #RNG]
        // Borrow from the subs above (cnt < d): more input bits are needed.
        b.lo            1f

        str             w6,  [x0, #CNT]
        str             d28, [x0, #DIF]
        lsr             w0,  w15, #1           // ret
        ret
1:
        lsr             w15, w15, #1           // ret
        // L(refill) works on dif in x7, so move it out of the vector register.
        mov             x7, v28.d[0]
        b L(refill)
.elseif \n == 16
        // w15 = ret - 16, so x8 = sp + 2*ret - 32. With the offsets below
        // this addresses v[ret] at sp + 16 + 2*ret and the halfword just
        // before it (v[ret - 1], or the saved rng at [sp, #14] when ret == 0).
        add             x8,  sp,  w15, sxtw #1
        ldrh            w3,  [x8, #48]         // v
        ldurh           w4,  [x8, #46]         // u
        ldr             w6,  [x0, #CNT]
        ldr             x7,  [x0, #DIF]
        sub             w4,  w4,  w3           // rng = u - v
        clz             w5,  w4                // clz(rng)
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        // Release the scratch area; add does not modify the flags, so the
        // b.lo below still tests the result of the subs above.
        add             sp,  sp,  #48
        b.lo            1f

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        add             w0,  w15, #\n          // ret
        ret
1:
        add             w15, w15, #\n          // ret
        b L(refill)
.endif
.endm

        decode_update   .4h, .8b, 4

// Shared refill tail. As read by the code below, it expects:
//   x0 = context pointer, w6 = cnt after the "cnt -= d" subtraction,
//   x7 = shifted dif, w15 = value to return in w0.
// It ORs newly read bits into dif, stores BUF_POS (when input was consumed),
// CNT and DIF, and returns w15.
L(refill):
        // refill
        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
        add             x5,  x3,  #8
        subs            x5,  x5,  x4
        // buf_pos + 8 > buf_end: fewer than 8 bytes remain in the buffer.
        b.hi            6f

        ldr             x8,  [x3]              // next_bits
        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
        mvn             x8,  x8
        neg             w5,  w4
        rev             x8,  x8                // next_bits = bswap(next_bits)
        lsr             w5,  w5,  #3           // num_bytes_read
        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)

2:      // refill_end
        add             x3,  x3,  x5
        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
        str             x3,  [x0, #BUF_POS]

3:      // refill_end2
        orr             x7,  x7,  x8           // dif |= next_bits

4:      // end
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]

        mov             w0,  w15
        ret

5:      // pad_with_ones
        // Reached when buf_pos >= buf_end, i.e. no input bytes are left.
        add             w8,  w6,  #-16
        ror             x8,  x8,  x8
        b               3b

6:      // refill_eob
        cmp             x3,  x4
        b.hs            5b

        // Load the last 8 bytes of the buffer and shift out the bytes that
        // precede buf_pos (x5 still holds buf_pos + 8 - buf_end).
        ldr             x8,  [x4, #-8]
        lsl             w5,  w5,  #3
        lsr             x8,  x8,  x5
        add             w5,  w6,  #-48
        mvn             x8,  x8
        sub             w4,  w4,  w3           // num_bytes_left
        rev             x8,  x8
        lsr             x8,  x8,  x5
        neg             w5,  w5
        lsr             w5,  w5,  #3
        // Do not advance past the end: take the smaller of the two counts.
        cmp             w5,  w4
        csel            w5,  w5,  w4,  lo      // num_bytes_read
        b               2b
endfunc

// Symbol decoder for up to 8 cdf lanes: instantiates decode_update with
// .8h/.16b arrangements and n = 8. The macro reads x0 = context pointer,
// x1 = cdf and x2 = n_symbols, and returns the symbol index in w0. On the
// refill path it branches to L(refill).
function msac_decode_symbol_adapt8_neon, export=1
        decode_update   .8h, .16b, 8
endfunc

// Symbol decoder for up to 16 cdf lanes: instantiates decode_update with
// .8h/.16b arrangements and n = 16, which processes two vector registers
// per step and uses 48 bytes of temporary stack space. The macro reads
// x0 = context pointer, x1 = cdf and x2 = n_symbols, and returns the symbol
// index in w0. On the refill path it branches to L(refill).
function msac_decode_symbol_adapt16_neon, export=1
        decode_update   .8h, .16b, 16
endfunc

// Decodes a "hi tok" value by repeatedly decoding a 3-symbol adaptive cdf.
//
// In:  x0 = context pointer, x1 = cdf (three entries followed by the
//      adaptation count at cdf[3])
// Out: w0 = 3 + the sum of the decoded symbols. The loop runs again only
//      while the decoded symbol equals 3, and stops after at most four
//      iterations (result 15).
//
// Register use as established by the code below:
//   v0 = cdf, v17 = cdf & 0xffc0, v3 = rng (broadcast),
//   v1 = top 16 bits of dif (broadcast), v29 = per-lane minimum-probability
//   offsets, w9 = count, w10 = allow_update_cdf, w6 = cnt, x7 = dif,
//   w13 = loop/result accumulator (scaled by 16), w15 = 16*ret.
// This function has its own inline copy of the refill sequence (labels 3-7).
function msac_decode_hi_tok_neon, export=1
        ld1             {v0.4h},  [x1]            // cdf
        add             x16, x0,  #RNG
        movi            v31.4h, #0x7f, lsl #8     // 0x7f00
        // n_symbols is fixed at 3 here, hence the constant 2*3 byte offset.
        movrel          x17, coeffs, COEFFS_BASE_OFFSET-2*3
        mvni            v30.4h, #0x3f             // 0xffc0
        ldrh            w9,  [x1, #6]             // count = cdf[n_symbols]
        ld1r            {v3.4h},  [x16]           // rng
        ld1             {v29.4h}, [x17]           // EC_MIN_PROB * (n_symbols - ret)
        add             x17, x0,  #DIF + 6
        // Accumulator start value; each iteration adds 5*8 and 16*ret - 5*8,
        // and 30*8 is added back before the final shift.
        mov             w13, #-24*8
        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
        ld1r            {v1.8h},  [x17]           // dif >> (EC_WIN_SIZE - 16)
        ldr             w6,  [x0, #CNT]
        ldr             x7,  [x0, #DIF]
1:
        and             v7.8b,   v3.8b,   v31.8b  // rng & 0x7f00
        sqdmulh         v6.4h,   v17.4h,  v7.4h   // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add             v4.4h,   v17.4h,  v29.4h  // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add             v4.4h,   v6.4h,   v4.4h   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
        cmhs            v2.4h,   v1.4h,   v4.4h   // c >= v
        add             w13, w13, #5*8
        ext             v18.8b, v3.8b,  v4.8b, #6 // u
        umov            x15, v2.d[0]
        rev             x15, x15
        sub             v18.4h, v18.4h, v4.4h     // rng = u-v
        // rev + clz = count trailing zeros
        clz             x15, x15                  // 16*ret

        // Skip the adaptation step when allow_update_cdf is zero.
        cbz             w10, 2f
        // update_cdf
        sub             v5.4h,   v0.4h,   v2.4h   // cdf[i] + (i >= val ? 1 : 0)
        mov             w4,  #-5
        orr             v2.4h, #0x80, lsl #8      // i >= val ? -1 : 32768
        sub             w4,  w4,  w9, lsr #4      // -((count >> 4) + 5)
        sub             v2.4h,   v2.4h,   v0.4h   // (32768 - cdf[i]) or (-1 - cdf[i])
        dup             v6.4h,    w4              // -rate

        sub             w9,  w9,  w9, lsr #5      // count - (count == 32)
        sshl            v2.4h,   v2.4h,   v6.4h   // ({32768,-1} - cdf[i]) >> rate
        add             w9,  w9,  #1              // count + (count < 32)
        add             v0.4h,   v5.4h,   v2.4h   // cdf[i] + (32768 - cdf[i]) >> rate
        st1             {v0.4h},  [x1]
        // Refresh the masked cdf for the next loop iteration.
        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
        strh            w9,  [x1, #6]

2:
        mov             x4,  v18.d[0]          // rng (packed)
        mov             x3,  v4.d[0]           // v (packed)

        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
        //  garbage in the remaining bits, but we can work around this.
        lsr             x4,  x4,  x15          // rng
        lsr             x3,  x3,  x15          // v
        lsl             w5,  w4,  #16          // rng << 16
        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
        clz             w5,  w5                // d = clz(rng << 16)
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        strh            w4,  [x0, #RNG]
        // Re-broadcast the new rng for the next loop iteration.
        dup             v3.4h,   w4
        // No borrow from the subs above: cnt is still non-negative, skip refill.
        b.hs            5f

        // refill
        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
        add             x5,  x3,  #8
        subs            x5,  x5,  x4
        // buf_pos + 8 > buf_end: fewer than 8 bytes remain in the buffer.
        b.hi            7f

        ldr             x8,  [x3]              // next_bits
        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
        mvn             x8,  x8
        neg             w5,  w4
        rev             x8,  x8                // next_bits = bswap(next_bits)
        lsr             w5,  w5,  #3           // num_bytes_read
        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)

3:      // refill_end
        add             x3,  x3,  x5
        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
        str             x3,  [x0, #BUF_POS]

4:      // refill_end2
        orr             x7,  x7,  x8           // dif |= next_bits

5:      // end
        sub             w15, w15, #5*8
        lsr             x12, x7,  #48
        adds            w13, w13, w15          // carry = tok_br < 3 || tok == 15
        // Broadcast the new top 16 bits of dif for the next loop iteration.
        dup             v1.8h,   w12
        b.cc            1b                     // loop if !carry

        add             w13, w13, #30*8
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        lsr             w0,  w13, #4
        ret

6:      // pad_with_ones
        // Reached when buf_pos >= buf_end, i.e. no input bytes are left.
        add             w8,  w6,  #-16
        ror             x8,  x8,  x8
        b               4b

7:      // refill_eob
        cmp             x3,  x4
        b.hs            6b

        // Load the last 8 bytes of the buffer and shift out the bytes that
        // precede buf_pos (x5 still holds buf_pos + 8 - buf_end).
        ldr             x8,  [x4, #-8]
        lsl             w5,  w5,  #3
        lsr             x8,  x8,  x5
        add             w5,  w6,  #-48
        mvn             x8,  x8
        sub             w4,  w4,  w3           // num_bytes_left
        rev             x8,  x8
        lsr             x8,  x8,  x5
        neg             w5,  w5
        lsr             w5,  w5,  #3
        // Do not advance past the end: take the smaller of the two counts.
        cmp             w5,  w4
        csel            w5,  w5,  w4,  lo      // num_bytes_read
        b               3b
endfunc

// Decodes one bit using a fixed split of the range: v = ((rng >> 8) << 7) + 4.
//
// In:  x0 = context pointer
// Out: w0 = 1 if dif < (v << 48), else 0
//
// Updates RNG, CNT and DIF in the context; branches to L(refill) when the
// renormalisation shift exceeds cnt.
function msac_decode_bool_equi_neon, export=1
        ldp             w5,  w6,  [x0, #RNG]   // + CNT
        ldr             x7,  [x0, #DIF]
        bic             w4,  w5,  #0xff        // r &= 0xff00
        add             w4,  w4,  #8
        // w4 still holds 2*v here, so shifting by 47 yields v << 48.
        subs            x8,  x7,  x4, lsl #47  // dif - vw
        lsr             w4,  w4,  #1           // v
        sub             w5,  w5,  w4           // r - v
        // The flags from the subs above drive cset and both csel instructions.
        cset            w15, lo
        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

        clz             w5,  w4                // clz(rng)
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        // Borrow (cnt < d): L(refill) stores CNT/DIF and returns w15.
        b.lo            L(refill)

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        mov             w0,  w15
        ret
endfunc

// Decodes one bit with a caller-supplied probability f:
//   v = (((rng >> 8) * (f & ~63)) >> 7) + 4
//
// In:  x0 = context pointer, w1 = f
// Out: w0 = 1 if dif < (v << 48), else 0
//
// Updates RNG, CNT and DIF in the context; branches to L(refill) when the
// renormalisation shift exceeds cnt.
function msac_decode_bool_neon, export=1
        ldp             w5,  w6,  [x0, #RNG]   // + CNT
        ldr             x7,  [x0, #DIF]
        lsr             w4,  w5,  #8           // r >> 8
        bic             w1,  w1,  #0x3f        // f &= ~63
        mul             w4,  w4,  w1
        lsr             w4,  w4,  #7
        add             w4,  w4,  #4           // v
        subs            x8,  x7,  x4, lsl #48  // dif - vw
        sub             w5,  w5,  w4           // r - v
        // The flags from the subs above drive cset and both csel instructions.
        cset            w15, lo
        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

        clz             w5,  w4                // clz(rng)
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        // Borrow (cnt < d): L(refill) stores CNT/DIF and returns w15.
        b.lo            L(refill)

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        mov             w0,  w15
        ret
endfunc

// Decodes one bit using the probability stored in cdf[0], and adapts the
// cdf afterwards when the context's allow_update_cdf field is non-zero.
//
// In:  x0 = context pointer, x1 = cdf (cdf[0] = probability, cdf[1] = count)
// Out: w0 = 1 if dif < (v << 48), else 0
//
// Updates RNG, CNT and DIF in the context (and cdf[0]/cdf[1] when adapting);
// branches to L(refill) when the renormalisation shift exceeds cnt.
function msac_decode_bool_adapt_neon, export=1
        // A single 32-bit load fetches cdf[0] (low half) and cdf[1] (high half).
        ldr             w9,  [x1]              // cdf[0-1]
        ldp             w5,  w6,  [x0, #RNG]   // + CNT
        ldr             x7,  [x0, #DIF]
        lsr             w4,  w5,  #8           // r >> 8
        and             w2,  w9,  #0xffc0      // f &= ~63
        mul             w4,  w4,  w2
        lsr             w4,  w4,  #7
        add             w4,  w4,  #4           // v
        subs            x8,  x7,  x4, lsl #48  // dif - vw
        sub             w5,  w5,  w4           // r - v
        // The flags from the subs above drive cset and both csel instructions.
        cset            w15, lo
        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

        ldr             w10, [x0, #ALLOW_UPDATE_CDF]

        clz             w5,  w4                // clz(rng)
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16

        // Skip the adaptation step when allow_update_cdf is zero.
        cbz             w10, 1f

        lsr             w2,  w9,  #16          // count = cdf[1]
        and             w9,  w9,  #0xffff      // cdf[0]

        sub             w3,  w2,  w2, lsr #5   // count - (count >= 32)
        lsr             w2,  w2,  #4           // count >> 4
        add             w10, w3,  #1           // count + (count < 32)
        add             w2,  w2,  #4           // rate = (count >> 4) | 4

        sub             w9,  w9,  w15          // cdf[0] -= bit
        sub             w11, w9,  w15, lsl #15 // {cdf[0], cdf[0] - 32769}
        asr             w11, w11, w2           // {cdf[0], cdf[0] - 32769} >> rate
        sub             w9,  w9,  w11          // cdf[0]

        strh            w9,  [x1]
        strh            w10, [x1, #2]

1:
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        // Borrow (cnt < d): L(refill) stores CNT/DIF and returns w15.
        b.lo            L(refill)

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        mov             w0,  w15
        ret
endfunc

Source code

Revision control

Copy as Markdown

Other Tools