// AArch64 NEON implementations of the msac_decode_* entropy decoding routines.
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// Byte offsets of the context fields that this file accesses through x0.
// BUF_POS and BUF_END are 64-bit addresses, loaded together as a pair by the
// refill code and dereferenced to fetch input bytes.
#define BUF_POS 0
#define BUF_END 8
// DIF is accessed as a 64-bit value; the symbol decoders compare against its
// top 16 bits.
#define DIF 16
// RNG and CNT are adjacent 32-bit fields (loaded together with ldp w5, w6 in
// the bool decoders); RNG is also accessed as a halfword in the symbol decoders.
#define RNG 24
#define CNT 28
// 32-bit flag; CDF adaptation is skipped when it reads as zero.
#define ALLOW_UPDATE_CDF 32
// Byte offset into the coeffs table of the last halfword of its first row.
// The symbol decoders step back n_symbols halfwords from this address.
#define COEFFS_BASE_OFFSET 30
// Distance in bytes from coeffs + COEFFS_BASE_OFFSET to the masks8 row, which
// starts 64 bytes into the table.
#define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET)
// Constant table shared by the symbol decoders. The layout is addressed with
// hard-coded byte offsets (COEFFS_BASE_OFFSET, MASKS8_OFFSET), so rows must
// not be reordered or resized.
//   Row 0 (bytes 0-31):  4 * (15 - i) for i = 0..15. A vector load that starts
//                        n_symbols halfwords before the final entry yields
//                        4 * n_symbols, ..., 8, 4, 0.
//   Row 1 (bytes 32-63): zeros, read by the lanes of such a load that extend
//                        past the end of row 0.
//   Row 2 (bytes 64-79): masks8, loaded only by the n == 8 decoder.
const coeffs
.short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
.short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
// masks8
.short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E
endconst
// ld1_n d0, d1, src, sz, n
// Vector load from the address in src using arrangement sz.
//   n > 8  : two-register load into the list {d0, d1}
//   n <= 8 : single-register load into d0 (d1 is left untouched)
// With the arrangements passed in this file (.4h for n = 4, .8h for n = 8 and
// n = 16) this reads n halfwords in total.
.macro ld1_n d0, d1, src, sz, n
.if \n > 8
        ld1             {\d0\sz, \d1\sz}, [\src]
.else
        ld1             {\d0\sz}, [\src]
.endif
.endm
// st1_n s0, s1, dst, sz, n
// Vector store to the address in dst using arrangement sz; the counterpart of
// ld1_n.
//   n > 8  : two-register store of the list {s0, s1}
//   n <= 8 : single-register store of s0 (s1 is ignored)
.macro st1_n s0, s1, dst, sz, n
.if \n > 8
        st1             {\s0\sz, \s1\sz}, [\dst]
.else
        st1             {\s0\sz}, [\dst]
.endif
.endm
// ushr_n d0, d1, s0, s1, shift, sz, n
// Unsigned shift right by an immediate: d0 = s0 >> shift, and for n == 16 also
// d1 = s1 >> shift. The d0 instruction is always emitted first.
// No invocation of this macro appears in this file.
.macro ushr_n d0, d1, s0, s1, shift, sz, n
ushr \d0\sz, \s0\sz, \shift
.if \n == 16
ushr \d1\sz, \s1\sz, \shift
.endif
.endm
// add_n d0, d1, s0, s1, s2, s3, sz, n
// Lane-wise add: d0 = s0 + s2, and for n == 16 also d1 = s1 + s3.
// The d0 instruction is always emitted first; for n of 4 or 8 the second
// register of each pair is neither read nor written.
.macro add_n d0, d1, s0, s1, s2, s3, sz, n
add \d0\sz, \s0\sz, \s2\sz
.if \n == 16
add \d1\sz, \s1\sz, \s3\sz
.endif
.endm
// sub_n d0, d1, s0, s1, s2, s3, sz, n
// Lane-wise subtract: d0 = s0 - s2, and for n == 16 also d1 = s1 - s3.
// The d0 instruction is always emitted first; for n of 4 or 8 the second
// register of each pair is neither read nor written.
.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
sub \d0\sz, \s0\sz, \s2\sz
.if \n == 16
sub \d1\sz, \s1\sz, \s3\sz
.endif
.endm
// and_n d0, d1, s0, s1, s2, s3, sz, n
// Bitwise AND: d0 = s0 & s2, and for n == 16 also d1 = s1 & s3.
// Callers in this file pass a byte arrangement (.8b or .16b) as sz.
// The d0 instruction is always emitted first.
.macro and_n d0, d1, s0, s1, s2, s3, sz, n
and \d0\sz, \s0\sz, \s2\sz
.if \n == 16
and \d1\sz, \s1\sz, \s3\sz
.endif
.endm
// cmhs_n d0, d1, s0, s1, s2, s3, sz, n
// Unsigned lane-wise compare: each lane of d0 becomes all ones when
// s0 >= s2 and zero otherwise; for n == 16 d1 is set likewise from s1 >= s3.
// The d0 instruction is always emitted first.
.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
cmhs \d0\sz, \s0\sz, \s2\sz
.if \n == 16
cmhs \d1\sz, \s1\sz, \s3\sz
.endif
.endm
// sshl_n d0, d1, s0, s1, s2, s3, sz, n
// Signed shift by a per-lane register amount: d0 = s0 shifted by s2, and for
// n == 16 also d1 = s1 shifted by s3. The caller in this file passes a negative
// amount (-rate), which makes this an arithmetic shift right.
// The d0 instruction is always emitted first.
.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
sshl \d0\sz, \s0\sz, \s2\sz
.if \n == 16
sshl \d1\sz, \s1\sz, \s3\sz
.endif
.endm
// sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
// Signed saturating doubling multiply returning the high half:
// d0 = sqdmulh(s0, s2), and for n == 16 also d1 = sqdmulh(s1, s3).
// Ordering is load-bearing: decode_update invokes this with d1, s2 and s3 all
// set to v7, which is only correct because the d0 instruction reads s2 before
// the d1 instruction overwrites it. Do not swap the two instructions.
.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
sqdmulh \d0\sz, \s0\sz, \s2\sz
.if \n == 16
sqdmulh \d1\sz, \s1\sz, \s3\sz
.endif
.endm
// str_n idx0, idx1, dstreg, dstoff, n
// Store register idx0 at [dstreg, dstoff], and for n == 16 also store idx1
// 16 bytes further on. dstoff is passed with its immediate marker (for example
// #16), so the second offset is formed by appending "+ 16" to that text.
.macro str_n idx0, idx1, dstreg, dstoff, n
str \idx0, [\dstreg, \dstoff]
.if \n == 16
str \idx1, [\dstreg, \dstoff + 16]
.endif
.endm
// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
// size_t n_symbols);
//
// Register interface as used by the code below:
//   x0 = s, x1 = cdf, x2 = n_symbols; the decoded symbol index is returned in w0.
// cdf[n_symbols] holds the adaptation counter. It and the CDF entries are
// rewritten only when the ALLOW_UPDATE_CDF field of the context is non-zero.
//
// The function body is one expansion of the decode_update macro with n = 4.
// The macro is defined here and expanded again by the adapt8 and adapt16 entry
// points. The L(refill) tail that follows the expansion is shared: the other
// functions in this file branch to it as well.
function msac_decode_symbol_adapt4_neon, export=1
// decode_update sz, szb, n
//   sz  = halfword arrangement suffix (.4h or .8h)
//   szb = matching byte arrangement suffix (.8b or .16b)
//   n   = number of 16-bit lanes processed: 4, 8 or 16
// Scratch registers written (not all by every variant):
//   x3-x10, x14, x15, v0-v7, v16, v17, v28-v31.
// Every variant ends either with ret or with a branch to L(refill), having
// set up w6 = cnt, x7 = dif and w15 = return value for the latter.
.macro decode_update sz, szb, n
.if \n == 16
// The n = 16 variant keeps u (halfword at sp + 14) and the 16 v values
// (sp + 16 to sp + 47) on the stack so the renorm step can fetch the selected
// pair by index.
sub sp, sp, #48
.endif
add x8, x0, #RNG
ld1_n v0, v1, x1, \sz, \n // cdf
ld1r {v29\sz}, [x8] // rng
// x9 = address of the last halfword of the first coeffs row. Stepping back
// n_symbols halfwords gives the start of the EC_MIN_PROB * (n_symbols - i)
// vector that is loaded through x10.
movrel x9, coeffs, COEFFS_BASE_OFFSET
movi v31\sz, #0x7f, lsl #8 // 0x7f00
sub x10, x9, x2, lsl #1
mvni v30\sz, #0x3f // 0xffc0
and v7\szb, v29\szb, v31\szb // rng & 0x7f00
.if \n == 16
str h29, [sp, #14] // store original u = s->rng
.endif
and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0
ld1_n v4, v5, x10, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
// v7 is both the multiplier and the high-half destination here; this relies
// on sqdmulh_n emitting the low-half instruction first.
sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
ldr d28, [x0, #DIF]
add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
dup v30\sz, v28.h[3] // dif >> (EC_WIN_SIZE - 16)
.if \n == 8
ldur q31, [x9, #MASKS8_OFFSET]
.elseif \n == 16
str_n q4, q5, sp, #16, \n // store v values to allow indexed access
.endif
// After the condition starts being true it continues, such that the vector looks like:
// 0, 0, 0 ... -1, -1
cmhs_n v2, v3, v30, v30, v4, v5, \sz, \n // c >= v
// Convert the compare mask into an index. Each variant leaves a differently
// scaled value in w15: 16*ret (n = 4), 2*ret (n = 8), ret - 16 (n = 16).
// w4 receives the allow_update_cdf flag in all variants.
.if \n == 4
ext v29\szb, v29\szb, v4\szb, #6 // u
umov x15, v2.d[0]
ldr w4, [x0, #ALLOW_UPDATE_CDF]
rev x15, x15
sub v29\sz, v29\sz, v4\sz // rng = u-v
// rev + clz = count trailing zeros
clz x15, x15 // 16*ret
.elseif \n == 8
// The final short of the compare is always set.
// Using addv, subtract -0x202*ret from this value to create a lookup table for a short.
// For n == 8:
// -0x202 + -0x202 + ... + 0xF0E
// (0x202*7) | (1 << 8)
// ^-------offset for second byte of the short
and v31\szb, v31\szb, v2\szb
ext v29\szb, v29\szb, v4\szb, #14 // u
addv h31, v31\sz // ((2*ret + 1) << 8) | (2*ret)
ldr w4, [x0, #ALLOW_UPDATE_CDF]
sub v30\sz, v30\sz, v4\sz // (dif >> 48) - v
smov w15, v31.b[0] // 2*ret
sub v29\sz, v29\sz, v4\sz // rng = u-v
.elseif \n == 16
// Every true lane is -1, so the horizontal sum of both mask halves is minus
// the number of true lanes.
add v6\sz, v2\sz, v3\sz
addv h31, v6\sz // -n + ret
ldr w4, [x0, #ALLOW_UPDATE_CDF]
smov w15, v31.h[0]
.endif
// Skip the adaptation when the allow_update_cdf flag is zero.
cbz w4, 0f
// update_cdf
ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols]
// Compute -rate in w4. The three variants differ only in how the
// (n_symbols > 2) term is obtained.
.if \n == 16
// 16 case has a lower bound that guarantees n_symbols > 2
mov w4, #-5
.elseif \n == 8
mvn w14, w2
mov w4, #-4
cmn w14, #3 // set C if n_symbols <= 2
.else
// if n_symbols < 4 (or < 6 even) then
// (1 + n_symbols) >> 2 == n_symbols > 2
add w14, w2, #17 // (1 + n_symbols) + (4 << 2)
.endif
sub_n v16, v17, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
// Setting bit 15 turns the mask into -1 for true lanes and 0x8000 for false
// lanes.
orr v2\sz, #0x80, lsl #8
.if \n == 16
orr v3\sz, #0x80, lsl #8
.endif
.if \n == 16
sub w4, w4, w3, lsr #4 // -((count >> 4) + 5)
.elseif \n == 8
lsr w14, w3, #4 // count >> 4
// sbc consumes the carry produced by the cmn above; no instruction in
// between may alter the flags.
sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4)
.else
neg w4, w14, lsr #2 // -((n_symbols > 2) + 4)
sub w4, w4, w3, lsr #4 // -((count >> 4) + (n_symbols > 2) + 4)
.endif
sub_n v2, v3, v2, v3, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
dup v6\sz, w4 // -rate
sub w3, w3, w3, lsr #5 // count - (count == 32)
sshl_n v2, v3, v2, v3, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
add w3, w3, #1 // count + (count < 32)
add_n v0, v1, v16, v17, v2, v3, \sz, \n // cdf + (32768 - cdf[i]) >> rate
// The vector store writes whole registers starting at cdf, so the counter
// slot is rewritten by the strh that follows it.
st1_n v0, v1, x1, \sz, \n
strh w3, [x1, x2, lsl #1]
0:
// renorm
// Select rng and v for the decoded symbol, normalise rng by d leading zero
// bits, subtract v from the top of dif and shift dif by d. When cnt -= d
// borrows, control passes to L(refill); otherwise CNT and DIF are stored and
// the function returns directly.
.if \n == 4
ldr w6, [x0, #CNT]
ldr x7, [x0, #DIF]
mov x4, v29.d[0] // rng (packed)
mov x3, v4.d[0] // v (packed)
// Shift 'v'/'rng' for ret into the 16 least sig bits. There is
// garbage in the remaining bits, but we can work around this.
lsr x4, x4, x15 // rng
lsr x3, x3, x15 // v
lsl w5, w4, #16 // rng << 16
sub x7, x7, x3, lsl #48 // dif - (v << 48)
clz w5, w5 // d = clz(rng << 16)
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
strh w4, [x0, #RNG]
b.lo 1f
str w6, [x0, #CNT]
str x7, [x0, #DIF]
lsr w0, w15, #4
ret
1:
lsr w15, w15, #4
b L(refill)
.elseif \n == 8
ldr w6, [x0, #CNT]
// v31 holds the byte indices 2*ret and 2*ret + 1 in its first halfword, so
// these lookups move the selected halfword of each source into lane 0.
tbl v30.8b, {v30.16b}, v31.8b
tbl v29.8b, {v29.16b}, v31.8b
ins v28.h[3], v30.h[0] // dif - (v << 48)
clz v0.4h, v29.4h // d = clz(rng)
umov w5, v0.h[0]
ushl v29.4h, v29.4h, v0.4h // rng << d
// The vec for clz(rng) is filled with garbage after the first short,
// but ushl/sshl conveniently uses only the first byte for the shift
// amount.
ushl d28, d28, d0 // (dif - (v << 48)) << d
subs w6, w6, w5 // cnt -= d
str h29, [x0, #RNG]
b.lo 1f
str w6, [x0, #CNT]
str d28, [x0, #DIF]
lsr w0, w15, #1 // ret
ret
1:
lsr w15, w15, #1 // ret
// L(refill) expects dif in x7.
mov x7, v28.d[0]
b L(refill)
.elseif \n == 16
// w15 = ret - 16, so x8 + 48 addresses v[ret] (stored from sp + 16) and
// x8 + 46 addresses the halfword before it: v[ret - 1], or the saved rng at
// sp + 14 when ret is 0.
add x8, sp, w15, sxtw #1
ldrh w3, [x8, #48] // v
ldurh w4, [x8, #46] // u
ldr w6, [x0, #CNT]
ldr x7, [x0, #DIF]
sub w4, w4, w3 // rng = u - v
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
sub x7, x7, x3, lsl #48 // dif - (v << 48)
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
// Release the scratch area before either exit path; add does not alter the
// flags that b.lo tests.
add sp, sp, #48
b.lo 1f
str w6, [x0, #CNT]
str x7, [x0, #DIF]
add w0, w15, #\n // ret
ret
1:
add w15, w15, #\n // ret
b L(refill)
.endif
.endm
decode_update .4h, .8b, 4
// Shared refill tail, entered by branch when the cnt -= d subtraction in a
// renorm step borrowed. Expected on entry:
//   x0  = context
//   w6  = cnt after the subtraction
//   x7  = dif after the renorm shift
//   w15 = value to return
// It ORs new input bits into dif, stores CNT and DIF (and BUF_POS when bytes
// were consumed) and returns w15 in w0. Scratch: x3, x4, x5, x8.
L(refill):
// refill
ldp x3, x4, [x0] // BUF_POS, BUF_END
// x5 = buf_pos + 8 - buf_end; a positive result means an 8-byte read at
// buf_pos would run past buf_end, which is handled at label 6.
add x5, x3, #8
subs x5, x5, x4
b.hi 6f
ldr x8, [x3] // next_bits
add w4, w6, #-48 // shift_bits = cnt + 16 (- 64)
mvn x8, x8
neg w5, w4
rev x8, x8 // next_bits = bswap(next_bits)
lsr w5, w5, #3 // num_bytes_read
lsr x8, x8, x4 // next_bits >>= (shift_bits & 63)
2: // refill_end
add x3, x3, x5
add w6, w6, w5, lsl #3 // cnt += num_bits_read
str x3, [x0, #BUF_POS]
3: // refill_end2
orr x7, x7, x8 // dif |= next_bits
4: // end
str w6, [x0, #CNT]
str x7, [x0, #DIF]
mov w0, w15
ret
// Reached when buf_pos >= buf_end, i.e. no input bytes remain. The value
// ORed into dif is derived from cnt alone; BUF_POS and cnt are not changed.
5: // pad_with_ones
add w8, w6, #-16
ror x8, x8, x8
b 3b
// Fewer than 8 bytes remain. The last 8 bytes of the buffer are loaded and
// the bytes that precede buf_pos are shifted out (x5 holds the overrun in
// bytes on entry), then the number of bytes consumed is capped at the number
// of bytes left.
6: // refill_eob
cmp x3, x4
b.hs 5b
ldr x8, [x4, #-8]
lsl w5, w5, #3
lsr x8, x8, x5
add w5, w6, #-48
mvn x8, x8
sub w4, w4, w3 // num_bytes_left
rev x8, x8
lsr x8, x8, x5
neg w5, w5
lsr w5, w5, #3
cmp w5, w4
csel w5, w5, w4, lo // num_bytes_read
b 2b
endfunc
// msac_decode_symbol_adapt8_neon
// Expansion of the decode_update macro for n = 8, using the .8h and .16b
// arrangements. The macro body reads x0 as the context pointer, x1 as the cdf
// pointer and x2 as n_symbols, and returns the decoded symbol index in w0
// (either directly or through the shared L(refill) tail).
function msac_decode_symbol_adapt8_neon, export=1
decode_update .8h, .16b, 8
endfunc
// msac_decode_symbol_adapt16_neon
// Expansion of the decode_update macro for n = 16, using two .8h/.16b
// registers per vector. The macro body reads x0 as the context pointer, x1 as
// the cdf pointer and x2 as n_symbols, and returns the decoded symbol index in
// w0 (either directly or through the shared L(refill) tail). This variant
// temporarily lowers sp by 48 bytes for scratch storage and restores it before
// leaving.
function msac_decode_symbol_adapt16_neon, export=1
decode_update .8h, .16b, 16
endfunc
// msac_decode_hi_tok_neon
// In:  x0 = context
//      x1 = cdf: four halfwords, of which the one at byte offset 6 is the
//           adaptation counter
// Out: w0 = token value
//
// Runs a 4-lane symbol decode (n_symbols fixed at 3) in a loop of at most
// four passes, with renormalisation and refill inlined in each pass.
// Long-lived registers:
//   w6  = cnt, x7 = dif (stored back to the context only on exit)
//   w9  = count, w10 = allow_update_cdf flag
//   w13 = biased running result scaled by 16
//   v0  = cdf, v17 = cdf & 0xffc0, v29 = EC_MIN_PROB terms
//   v3  = rng broadcast, v1 = top 16 bits of dif broadcast
// Each pass adds 16*ret to w13. The adds at label 5 sets carry when ret < 3
// or when the fourth pass completes, which ends the loop; the result is then
// (w13 + 30*8) >> 4, a value from 3 to 15.
// RNG is stored on every pass; BUF_POS is stored whenever a refill consumes
// bytes. x16 and x17 are used as scratch.
function msac_decode_hi_tok_neon, export=1
ld1 {v0.4h}, [x1] // cdf
add x16, x0, #RNG
movi v31.4h, #0x7f, lsl #8 // 0x7f00
// Three halfwords before the coeffs base: the vector 12, 8, 4, 0.
movrel x17, coeffs, COEFFS_BASE_OFFSET-2*3
mvni v30.4h, #0x3f // 0xffc0
ldrh w9, [x1, #6] // count = cdf[n_symbols]
ld1r {v3.4h}, [x16] // rng
ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret)
add x17, x0, #DIF + 6
mov w13, #-24*8
and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
ldr w10, [x0, #ALLOW_UPDATE_CDF]
ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16)
ldr w6, [x0, #CNT]
ldr x7, [x0, #DIF]
// Loop head: decode one symbol from the current cdf, rng and dif.
1:
and v7.8b, v3.8b, v31.8b // rng & 0x7f00
sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret)
add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
cmhs v2.4h, v1.4h, v4.4h // c >= v
add w13, w13, #5*8
ext v18.8b, v3.8b, v4.8b, #6 // u
umov x15, v2.d[0]
rev x15, x15
sub v18.4h, v18.4h, v4.4h // rng = u-v
// rev + clz = count trailing zeros
clz x15, x15 // 16*ret
// Skip the adaptation when the allow_update_cdf flag is zero.
cbz w10, 2f
// update_cdf
sub v5.4h, v0.4h, v2.4h // cdf[i] + (i >= val ? 1 : 0)
mov w4, #-5
orr v2.4h, #0x80, lsl #8 // i >= val ? -1 : 32768
sub w4, w4, w9, lsr #4 // -((count >> 4) + 5)
sub v2.4h, v2.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i])
dup v6.4h, w4 // -rate
sub w9, w9, w9, lsr #5 // count - (count == 32)
sshl v2.4h, v2.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate
add w9, w9, #1 // count + (count < 32)
add v0.4h, v5.4h, v2.4h // cdf[i] + (32768 - cdf[i]) >> rate
// The vector store covers all four halfwords, so the counter slot is
// rewritten by the strh below. v17 is refreshed for the next pass.
st1 {v0.4h}, [x1]
and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
strh w9, [x1, #6]
2:
mov x4, v18.d[0] // rng (packed)
mov x3, v4.d[0] // v (packed)
// Shift 'v'/'rng' for ret into the 16 least sig bits. There is
// garbage in the remaining bits, but we can work around this.
lsr x4, x4, x15 // rng
lsr x3, x3, x15 // v
lsl w5, w4, #16 // rng << 16
sub x7, x7, x3, lsl #48 // dif - (v << 48)
clz w5, w5 // d = clz(rng << 16)
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
strh w4, [x0, #RNG]
// dup does not alter the flags, so b.hs still tests the subs above and
// skips the refill when the subtraction did not borrow.
dup v3.4h, w4
b.hs 5f
// refill
ldp x3, x4, [x0] // BUF_POS, BUF_END
add x5, x3, #8
subs x5, x5, x4
b.hi 7f
ldr x8, [x3] // next_bits
add w4, w6, #-48 // shift_bits = cnt + 16 (- 64)
mvn x8, x8
neg w5, w4
rev x8, x8 // next_bits = bswap(next_bits)
lsr w5, w5, #3 // num_bytes_read
lsr x8, x8, x4 // next_bits >>= (shift_bits & 63)
3: // refill_end
add x3, x3, x5
add w6, w6, w5, lsl #3 // cnt += num_bits_read
str x3, [x0, #BUF_POS]
4: // refill_end2
orr x7, x7, x8 // dif |= next_bits
5: // end
// Fold this pass into w13 (net effect: w13 += 16*ret) and refresh the
// broadcast of the top 16 bits of dif for a possible next pass.
sub w15, w15, #5*8
lsr x12, x7, #48
adds w13, w13, w15 // carry = tok_br < 3 || tok == 15
dup v1.8h, w12
b.cc 1b // loop if !carry
add w13, w13, #30*8
str w6, [x0, #CNT]
str x7, [x0, #DIF]
lsr w0, w13, #4
ret
// Reached when buf_pos >= buf_end; the value ORed into dif is derived from
// cnt alone and neither BUF_POS nor cnt is changed.
6: // pad_with_ones
add w8, w6, #-16
ror x8, x8, x8
b 4b
// Fewer than 8 bytes remain: load the last 8 bytes of the buffer, shift out
// the bytes that precede buf_pos and cap the bytes consumed at the bytes left.
7: // refill_eob
cmp x3, x4
b.hs 6b
ldr x8, [x4, #-8]
lsl w5, w5, #3
lsr x8, x8, x5
add w5, w6, #-48
mvn x8, x8
sub w4, w4, w3 // num_bytes_left
rev x8, x8
lsr x8, x8, x5
neg w5, w5
lsr w5, w5, #3
cmp w5, w4
csel w5, w5, w4, lo // num_bytes_read
b 3b
endfunc
// msac_decode_bool_equi_neon
// In:  x0 = context
// Out: w0 = decoded bit
//
// The split point is v = ((rng & ~0xff) + 8) >> 1, with no probability
// operand. The returned bit (held in w15) is 1 when dif < (v << 48). Note that
// the "ret" named in the inline comments is the opposite condition,
// dif >= (v << 48), under which rng becomes r - v and v << 48 is subtracted
// from dif.
// Scratch: w4, w5, w6, x7, x8, w15. Ends with ret or a branch to L(refill),
// which expects w6 = cnt, x7 = dif and w15 = return value.
function msac_decode_bool_equi_neon, export=1
ldp w5, w6, [x0, #RNG] // + CNT
ldr x7, [x0, #DIF]
bic w4, w5, #0xff // r &= 0xff00
add w4, w4, #8
// w4 still holds 2*v here, hence the shift by 47 rather than 48.
subs x8, x7, x4, lsl #47 // dif - vw
lsr w4, w4, #1 // v
sub w5, w5, w4 // r - v
// lsr and sub above leave the flags of the subs intact.
cset w15, lo
csel w4, w5, w4, hs // if (ret) v = r - v;
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
// Refill when the cnt subtraction borrowed.
b.lo L(refill)
str w6, [x0, #CNT]
str x7, [x0, #DIF]
mov w0, w15
ret
endfunc
// msac_decode_bool_neon
// In:  x0 = context
//      w1 = f; its low 6 bits are cleared before use (w1 is modified)
// Out: w0 = decoded bit
//
// The split point is v = (((rng >> 8) * (f & ~63)) >> 7) + 4. The returned bit
// (held in w15) is 1 when dif < (v << 48). Note that the "ret" named in the
// inline comments is the opposite condition, dif >= (v << 48), under which rng
// becomes r - v and v << 48 is subtracted from dif.
// Scratch: w1, w4, w5, w6, x7, x8, w15. Ends with ret or a branch to
// L(refill), which expects w6 = cnt, x7 = dif and w15 = return value.
function msac_decode_bool_neon, export=1
ldp w5, w6, [x0, #RNG] // + CNT
ldr x7, [x0, #DIF]
lsr w4, w5, #8 // r >> 8
bic w1, w1, #0x3f // f &= ~63
mul w4, w4, w1
lsr w4, w4, #7
add w4, w4, #4 // v
subs x8, x7, x4, lsl #48 // dif - vw
sub w5, w5, w4 // r - v
// The sub above leaves the flags of the subs intact.
cset w15, lo
csel w4, w5, w4, hs // if (ret) v = r - v;
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
// Refill when the cnt subtraction borrowed.
b.lo L(refill)
str w6, [x0, #CNT]
str x7, [x0, #DIF]
mov w0, w15
ret
endfunc
// msac_decode_bool_adapt_neon
// In:  x0 = context
//      x1 = cdf: two halfwords, cdf[0] (the probability) and cdf[1] (the
//           adaptation counter), read with a single 32-bit load
// Out: w0 = decoded bit
//
// Decodes a bit exactly as msac_decode_bool_neon does, with f taken from
// cdf[0]. When the ALLOW_UPDATE_CDF field of the context is non-zero, cdf[0]
// is then moved towards the decoded bit by a shift of 4 + (count >> 4) and the
// counter is incremented up to a ceiling of 32.
// The returned bit (held in w15) is 1 when dif < (v << 48). Note that the
// "ret" named in the inline comments is the opposite condition.
// Scratch: w2, w3, w4, w5, w6, x7, x8, w9, w10, w11, w15. Ends with ret or a
// branch to L(refill), which expects w6 = cnt, x7 = dif and w15 = return value.
function msac_decode_bool_adapt_neon, export=1
ldr w9, [x1] // cdf[0-1]
ldp w5, w6, [x0, #RNG] // + CNT
ldr x7, [x0, #DIF]
lsr w4, w5, #8 // r >> 8
// The mask keeps only bits 6-15, which also discards the counter in the
// upper halfword of w9.
and w2, w9, #0xffc0 // f &= ~63
mul w4, w4, w2
lsr w4, w4, #7
add w4, w4, #4 // v
subs x8, x7, x4, lsl #48 // dif - vw
sub w5, w5, w4 // r - v
cset w15, lo
csel w4, w5, w4, hs // if (ret) v = r - v;
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
ldr w10, [x0, #ALLOW_UPDATE_CDF]
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
// Skip the adaptation when the allow_update_cdf flag is zero.
cbz w10, 1f
lsr w2, w9, #16 // count = cdf[1]
and w9, w9, #0xffff // cdf[0]
sub w3, w2, w2, lsr #5 // count - (count >= 32)
lsr w2, w2, #4 // count >> 4
add w10, w3, #1 // count + (count < 32)
add w2, w2, #4 // rate = (count >> 4) | 4
sub w9, w9, w15 // cdf[0] -= bit
sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769}
asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate
sub w9, w9, w11 // cdf[0]
strh w9, [x1]
strh w10, [x1, #2]
1:
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
// Refill when the cnt subtraction borrowed.
b.lo L(refill)
str w6, [x0, #CNT]
str x7, [x0, #DIF]
mov w0, w15
ret
endfunc