/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2024, Bogdan Gligorijevic
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "src/riscv/asm.S"
.macro constrain_vectors vec1, vec2, vec_sub, strength, shift, vec_tmp1, vec_tmp2
vmslt.vx v0, \vec_tmp1, zero
vneg.v \vec_tmp1, \vec_tmp1, v0.t
vmmv.m v1, v0
vmslt.vx v0, \vec_tmp2, zero
vneg.v \vec_tmp2, \vec_tmp2, v0.t
vsra.vx \vec1, \vec_tmp1, \shift
vsra.vx \vec2, \vec_tmp2, \shift
vrsub.vx \vec1, \vec1, \strength
vrsub.vx \vec2, \vec2, \strength
vmax.vx \vec1, \vec1, zero
vmax.vx \vec2, \vec2, zero
vmin.vv \vec_tmp1, \vec1, \vec_tmp1
vmin.vv \vec_tmp2, \vec2, \vec_tmp2
vneg.v \vec_tmp2, \vec_tmp2, v0.t
vmmv.m v0, v1
vneg.v \vec_tmp1, \vec_tmp1, v0.t
.endm
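/* padding_fn: expand the w x h block into a 2-pixel-padded int16 buffer
 * (tmp_stride = 12).  Padding pixels come from the left/top/bottom buffers
 * when the matching CDEF_HAVE_* bit is set in edges; otherwise they are
 * filled with INT16_MIN to mark them as unavailable.
 * On entry: a0 = tmp, a1 = tmp_stride, a2 = src, a3 = src_stride, a4 = left,
 * a5 = top, a6 = bottom, a7 = edges. */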
.macro padding_fn w, h
li t5, -32768 # INT16_MIN
andi t4, a7, 4 # CDEF_HAVE_TOP
li t2, -2 # y_start
.if \w == 4
vsetivli zero, \w + 4, e16, m1, ta, ma
.else
vsetivli zero, \w + 4, e16, m2, ta, ma
.endif
vmv.v.x v0, t5
bnez t4, L(top_done_\w\()x\h)
slli t5, a1, 1
addi t5, t5, 2
slli t5, t5, 1
sub t5, a0, t5
sh1add t4, a1, t5
vse16.v v0, (t5)
vse16.v v0, (t4)
li t2, 0
L(top_done_\w\()x\h):
andi t4, a7, 8 # CDEF_HAVE_BOTTOM
li t3, 2 + \h # y_end
bnez t4, L(bottom_done_\w\()x\h)
li t5, \h
mul t5, a1, t5
addi t5, t5, -2
sh1add t5, t5, a0
sh1add t4, a1, t5
vse16.v v0, (t5)
vse16.v v0, (t4)
addi t3, t3, -2
L(bottom_done_\w\()x\h):
andi t4, a7, 1 # CDEF_HAVE_LEFT
li t0, -2 # x_start
.if \w == 4
vsetivli zero, 2, e16, m1, ta, ma
.else
vsetivli zero, 2, e16, m2, ta, ma
.endif
bnez t4, L(left_done_\w\()x\h)
mul t5, a1, t2
addi t5, t5, -2
sh1add t5, t5, a0
sub t0, t3, t2
3:
vse16.v v0, (t5)
sh1add t5, a1, t5
addi t0, t0, -1
bnez t0, 3b
L(left_done_\w\()x\h):
andi t4, a7, 2 # CDEF_HAVE_RIGHT
li t1, 2 + \w # x_end
bnez t4, L(right_done_\w\()x\h)
mul t5, t2, a1
addi t5, t5, \w
sh1add t5, t5, a0
sub t1, t3, t2
4:
vse16.v v0, (t5)
sh1add t5, a1, t5
addi t1, t1, -1
bnez t1, 4b
li t1, \w
L(right_done_\w\()x\h):
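/* Copy the rows above the block (y_start..-1) from top (a5), if any. */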
beqz t2, L(top_skip_\w\()x\h)
mul t5, a1, t2
add t5, t0, t5
sh1add a0, t5, a0 # tmp += y_start * tmp_stride + x_start
add a5, a5, t0
sub t5, t1, t0 # x_end - x_start
slli t6, t0, 1
.if \w == 4
vsetvli zero, t5, e16, m1, ta, ma
.else
vsetvli zero, t5, e16, m2, ta, ma
.endif
5:
vle8.v v0, (a5)
addi t2, t2, 1
vzext.vf2 v2, v0
add a5, a3, a5
vse16.v v2, (a0)
sh1add a0, a1, a0
bnez t2, 5b
sub a0, a0, t6 # tmp -= x_start
L(top_skip_\w\()x\h):
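/* Copy the 2-pixel left border for each row from left (a4), if present. */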
li a5, \h
beqz t0, L(left_skip_\w\()x\h)
sh1add a0, t0, a0 # tmp += x_start
7:
.if \w == 4
vsetivli zero, 2, e16, m1, ta, ma
.else
vsetivli zero, 2, e16, m2, ta, ma
.endif
vle8.v v0, (a4)
addi a5, a5, -1
vzext.vf2 v2, v0
addi a4, a4, 2
vse16.v v2, (a0)
sh1add a0, a1, a0
bnez a5, 7b
li a5, \h
mul t5, a1, a5
add t5, t5, t0
slli t5, t5, 1
sub a0, a0, t5 # tmp -= h * tmp_stride + x_start
L(left_skip_\w\()x\h):
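/* Copy x_end columns of each of the h block rows from src (a2),
 * widening u8 to i16. */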
8:
.if \w == 4
vsetvli zero, t1, e16, m1, ta, ma
.else
vsetvli zero, t1, e16, m2, ta, ma
.endif
vle8.v v0, (a2)
vzext.vf2 v2, v0
vse16.v v2, (a0)
add a2, a3, a2
sh1add a0, a1, a0
addi a5, a5, -1
bnez a5, 8b
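/* Copy the rows below the block (h..y_end-1) from bottom (a6), if any. */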
li a5, \h
sh1add a0, t0, a0 # tmp += x_start
add a6, a6, t0 # bottom += x_start
beq a5, t3, L(bottom_skip_\w\()x\h)
sub t5, t1, t0
.if \w == 4
vsetvli zero, t5, e16, m1, ta, ma
.else
vsetvli zero, t5, e16, m2, ta, ma
.endif
9:
vle8.v v0, (a6)
add a6, a3, a6
vzext.vf2 v2, v0
addi a5, a5, 1
vse16.v v2, (a0)
sh1add a0, a1, a0
bne a5, t3, 9b
L(bottom_skip_\w\()x\h):
li t6, \h
mul t6, a3, t6
sub a2, a2, t6 # src -= h * src_stride
mul t5, a1, t3
add t5, t5, t0
slli t5, t5, 1
sub a0, a0, t5 # tmp -= y_end * tmp_stride + x_start
.endm
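/* cdef_fn: emit cdef_filter_block_wxh_8bpc_rvv(dst, dst_stride, left, top,
 * bottom, pri_strength, sec_strength, dir, damping, edges); damping and
 * edges are passed on the stack.  The block is first expanded into an
 * on-stack 12x12 int16 buffer by padding_fn, then filtered by one of three
 * paths: primary taps only, secondary taps only, or both. */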
.macro cdef_fn w, h
function cdef_filter_block_\w\()x\h\()_8bpc_rvv, export=1, ext="v,zba,zbb"
csrw vxrm, zero # fixed-point rounding mode: round-to-nearest-up
addi sp, sp, -32 - 144*2 # spill area + 12x12 int16 tmp buffer
sd a5, 24(sp) # pri_strength
sd a6, 16(sp) # sec_strength
sd a7, 8(sp) # dir
ld a7, 8 + 32 + 144*2(sp) # edges
mv a6, a4 # bottom
mv a5, a3 # top
mv a4, a2 # left
mv a3, a1 # dst_stride
mv a2, a0 # dst
li a1, 12 # tmp_stride
addi a0, sp, 32 + 2*(2*12+2) # tmp = buf + 2*tmp_stride + 2 (skip the padding)
padding_fn \w, \h
ld a4, 32 + 2*144(sp) # damping
ld a5, 24(sp) # pri_strength
ld a6, 16(sp) # sec_strength
ld a7, 8(sp) # dir
beqz a5, cdef_filter_sec_only_\w\()x\h
bnez a6, cdef_filter_pri_sec_\w\()x\h
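/* Primary strength only. */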
andi t0, a5, 1
li t1, 4
sub t4, t1, t0 # pri_tap 0 = 4 - (pri_strength & 1)
li t1, 63
clz t2, a5
sub t1, t1, t2 # ulog2(pri_strength)
sub t1, a4, t1 # pri_shift = damping - ulog2(pri_strength)
li t0, \h
la t2, dav1d_cdef_directions
addi t3, a7, 2
sh1add t2, t3, t2 # &dav1d_cdef_directions[dir + 2]
blt zero, t1, 1f
mv t1, zero # pri_shift = max(pri_shift, 0)
1:
vsetivli zero, \w, e16, m1, ta, mu
lb t3, 0(t2)
vle8.v v0, (a2)
vzext.vf2 v2, v0
sh1add t6, t3, a0
slli t3, t3, 1
sub t3, a0, t3
vle16.v v4, (t6)
vle16.v v6, (t3)
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a5, t1, v8, v16
vmul.vx v28, v16, t4
vmacc.vx v28, t4, v8
lb t3, 1(t2)
andi t5, t4, 3
ori t5, t5, 2 # pri_tap 1: 2 if pri_tap 0 is 4, else 3
sh1add t6, t3, a0
slli t3, t3, 1
sub t3, a0, t3
vsetvli zero, zero, e16, m1, ta, mu
vle16.v v4, (t6)
vle16.v v6, (t3)
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a5, t1, v8, v16
vmacc.vx v28, t5, v16
vmacc.vx v28, t5, v8
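/* dst = px + ((8 + sum - (sum < 0)) >> 4); vnclip with vxrm=RNU adds the 8. */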
vmslt.vx v0, v28, zero
vadd.vi v28, v28, -1, v0.t
vsetvli zero, zero, e16, m1, ta, ma
vnclip.wi v24, v28, 4
vadd.vv v28, v2, v24
vsetvli zero, zero, e8, mf2, ta, ma
vnclipu.wi v24, v28, 0
vse8.v v24, (a2)
addi t0, t0, -1
add a2, a2, a3
sh1add a0, a1, a0
bnez t0, 1b
addi sp, sp, 32 + 144*2
ret
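/* Secondary strength only. */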
cdef_filter_sec_only_\w\()x\h:
li t1, 63
clz t2, a6
sub t1, t1, t2 # ulog2(sec_strength)
sub t1, a4, t1 # sec_shift = damping - ulog2(sec_strength)
li t0, \h
la t2, dav1d_cdef_directions
addi t3, a7, 4
sh1add t3, t3, t2 # &dav1d_cdef_directions[dir + 4]
sh1add t2, a7, t2 # &dav1d_cdef_directions[dir]
2:
vsetivli zero, \w, e16, m1, ta, mu
lb t4, 0(t3)
lb t5, 0(t2)
vle8.v v0, (a2)
vzext.vf2 v2, v0
sh1add t6, t4, a0
slli t4, t4, 1
sub t4, a0, t4
vle16.v v4, (t6)
vle16.v v6, (t4)
sh1add t4, t5, a0
slli t5, t5, 1
sub t5, a0, t5
vle16.v v8, (t4)
vle16.v v10, (t5)
vwsub.vv v12, v4, v2
vwsub.vv v14, v6, v2
vwsub.vv v16, v8, v2
vwsub.vv v18, v10, v2
vsetvli zero, zero, e32, m2, ta, mu
li t4, 2 # sec_tap 0 (sec_tap 1 is 1, added unscaled below)
constrain_vectors v4, v6, v12, a6, t1, v12, v14
constrain_vectors v8, v10, v14, a6, t1, v16, v18
vmul.vx v28, v18, t4
vmacc.vx v28, t4, v16
vmacc.vx v28, t4, v14
vmacc.vx v28, t4, v12
lb t4, 1(t3)
lb t5, 1(t2)
sh1add t6, t4, a0
slli t4, t4, 1
sub t4, a0, t4
vsetvli zero, zero, e16, m1, ta, mu
vle16.v v4, (t6)
vle16.v v6, (t4)
sh1add t4, t5, a0
slli t5, t5, 1
sub t5, a0, t5
vle16.v v8, (t4)
vle16.v v10, (t5)
vwsub.vv v12, v4, v2
vwsub.vv v14, v6, v2
vwsub.vv v16, v8, v2
vwsub.vv v18, v10, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a6, t1, v12, v14
constrain_vectors v8, v10, v14, a6, t1, v16, v18
vadd.vv v4, v28, v12
vadd.vv v28, v4, v14
vadd.vv v4, v28, v16
vadd.vv v28, v4, v18
vmslt.vx v0, v28, zero
vadd.vi v28, v28, -1, v0.t
vsetvli zero, zero, e16, m1, ta, ma
vnclip.wi v24, v28, 4
vadd.vv v28, v2, v24
vsetvli zero, zero, e8, mf2, ta, ma
vnclipu.wi v24, v28, 0
vse8.v v24, (a2)
addi t0, t0, -1
add a2, a2, a3
sh1add a0, a1, a0
bnez t0, 2b
addi sp, sp, 32 + 144*2
ret
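/* Primary and secondary strengths combined; the result is clamped to the
 * min/max of the center pixel and all sampled taps. */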
cdef_filter_pri_sec_\w\()x\h:
li t1, 63
clz t2, a5
clz t3, a6
sub t2, t1, t2 # ulog2(pri_strength)
sub t3, t1, t3 # ulog2(sec_strength)
sub t1, a4, t2 # pri_shift = damping - ulog2(pri_strength)
sub t2, a4, t3 # sec_shift = damping - ulog2(sec_strength)
li t0, \h
la t3, dav1d_cdef_directions
blt zero, t1, 3f
mv t1, zero # pri_shift = max(pri_shift, 0)
3:
vsetivli zero, \w, e16, m1, ta, ma
li t4, 4
andi t6, a5, 1
addi t5, a7, 2
sub t4, t4, t6
sh1add t5, t5, t3
vle8.v v0, (a2)
lb t6, 0(t5)
vzext.vf2 v2, v0
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v2 # track min as unsigned, max as signed, so that
vmax.vv v24, v4, v2 # INT16_MIN (missing pixels) never wins either
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a5, t1, v8, v16
vmul.vx v28, v16, t4
vmacc.vx v28, t4, v8
lb t6, 1(t5)
andi t4, t4, 3
ori t4, t4, 2 # pri_tap 1: 2 if pri_tap 0 is 4, else 3
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a5, t1, v8, v16
addi t5, a7, 4
vmacc.vx v28, t4, v16
vmacc.vx v28, t4, v8
sh1add t5, t5, t3
lb t6, 0(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
li t6, 2
constrain_vectors v4, v6, v12, a6, t2, v8, v16
vmacc.vx v28, t6, v16
vmacc.vx v28, t6, v8
lb t6, 1(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a6, t2, v8, v16
sh1add t5, a7, t3
vadd.vv v4, v28, v8
vadd.vv v28, v4, v16
vsetvli zero, zero, e16, m1, ta, ma
lb t6, 0(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
li t6, 2
constrain_vectors v4, v6, v12, a6, t2, v8, v16
vmacc.vx v28, t6, v16
vmacc.vx v28, t6, v8
lb t6, 1(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a6, t2, v8, v16
vadd.vv v4, v28, v8
vadd.vv v28, v4, v16
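/* Round as in the other paths, then clamp to the min/max in v20/v24. */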
vmslt.vx v0, v28, zero
vadd.vi v28, v28, -1, v0.t
vsetvli zero, zero, e16, m1, ta, mu
vnclip.wi v16, v28, 4
vadd.vv v28, v2, v16
vmslt.vv v0, v20, v28
vmerge.vvm v4, v20, v28, v0
vmslt.vv v0, v4, v24
vmerge.vvm v28, v24, v4, v0
vsetvli zero, zero, e8, mf2, ta, ma
vnclipu.wi v24, v28, 0
vse8.v v24, (a2)
addi t0, t0, -1
add a2, a2, a3
sh1add a0, a1, a0
bnez t0, 3b
addi sp, sp, 32 + 144*2
ret
endfunc
.endm
cdef_fn 4, 4
cdef_fn 4, 8
cdef_fn 8, 8