/*
* Copyright © 2024, VideoLAN and dav1d authors
* Copyright © 2024, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/loongarch/loongson_asm.S"
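// ipred_dc_gen: computes the DC value of a width x height block as the
// rounded average of the top and left edge pixels. Roughly, in C (a
// sketch of what the macro computes, not the dav1d reference source):
//   unsigned dc = (width + height) >> 1;
//   for (int i = 0; i < width; i++)  dc += topleft[1 + i];
//   for (int i = 0; i < height; i++) dc += topleft[-(1 + i)];
//   dc >>= ctz(width + height);
//   if (width != height) // rectangular blocks: fixed-point rescale
//       dc = (dc * (2 * height < width || 2 * width < height
//                   ? 0x3334 : 0x5556)) >> 16;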
.macro ipred_dc_gen topleft, width, height
add.d t0, \width, \height //dc
srai.d t0, t0, 1
addi.d t3, \topleft, 1
or t1, zero, zero //data index
srai.d t2, \width, 4 //loop param
beqz t2, 2f
1: // width/16
vldx vr0, t3, t1
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vhaddw.du.wu vr0, vr0, vr0
vhaddw.qu.du vr0, vr0, vr0
vpickve2gr.du t4, vr0, 0
add.d t0, t0, t4
addi.d t1, t1, 16
addi.d t2, t2, -1
bnez t2, 1b
b 4f
2: // &8
andi t2, \width, 8
beqz t2, 3f
vxor.v vr0, vr0, vr0
fldx.d f0, t3, t1
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vhaddw.du.wu vr0, vr0, vr0
vpickve2gr.du t4, vr0, 0
add.d t0, t0, t4
addi.d t1, t1, 8
b 4f
3: // &4
andi t2, \width, 4
beqz t2, 4f
vxor.v vr0, vr0, vr0
fldx.s f0, t3, t1
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vpickve2gr.wu t4, vr0, 0
add.d t0, t0, t4
addi.d t1, t1, 4
4:
addi.d t3, \topleft, 0
srai.d t2, \height, 4 //loop param
beqz t2, 8f
7: // height/16
addi.d t3, t3, -16
vld vr0, t3, 0
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vhaddw.du.wu vr0, vr0, vr0
vhaddw.qu.du vr0, vr0, vr0
vpickve2gr.du t4, vr0, 0
add.d t0, t0, t4
addi.d t2, t2, -1
bnez t2, 7b
b 10f
8: // &8
andi t2, \height, 8
beqz t2, 9f
addi.d t3, t3, -8
vxor.v vr0, vr0, vr0
fld.d f0, t3, 0
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vhaddw.du.wu vr0, vr0, vr0
vpickve2gr.du t4, vr0, 0
add.d t0, t0, t4
b 10f
9: // &4
andi t2, \height, 4
beqz t2, 10f
addi.d t3, t3, -4
vxor.v vr0, vr0, vr0
fld.s f0, t3, 0
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vpickve2gr.wu t4, vr0, 0
add.d t0, t0, t4
10:
add.d t1, \width, \height
ctz.w t1, t1
sra.w t0, t0, t1
// w != h
beq \width, \height, 16f
add.d t2, \height, \height
add.d t3, \width, \width
slt t2, t2, \width
slt t3, t3, \height
or t2, t2, t3
li.w t3, 0x3334
maskeqz t1, t3, t2
li.w t3, 0x5556
masknez t2, t3, t2
or t1, t1, t2
mul.w t0, t0, t1
srai.w t0, t0, 16
16:
.endm
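// ipred_splat_dc: fills the width x height destination with the DC byte,
// roughly: for (y = 0; y < height; y++, dst += stride) memset(dst, dc, width);
// Rows of width <= 4 are written as one 32-bit word (dc * 0x01010101);
// wider rows use 8-byte or 16-byte vector stores.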
.macro ipred_splat_dc dst, stride, width, height, dc
li.w t1, 4
blt t1, \width, 2f
li.w t1, 0x01010101
mulw.d.wu t1, \dc, t1
beqz \height, 7f
or t2, \dst, \dst
1: // width <= 4
st.w t1, t2, 0
add.d t2, t2, \stride
addi.d \height, \height, -1
bnez \height, 1b
b 7f
2: //width > 4
li.d t1, 0x0101010101010101
mul.d t1, \dc, t1
vreplgr2vr.d vr0, t1
or t4, \dst, \dst
beqz \height, 7f
3:
andi t5, \width, 64
beqz t5, 4f
vst vr0, t4, 0
vst vr0, t4, 16
vst vr0, t4, 32
vst vr0, t4, 48
b 6f
4:
andi t5, \width, 32
beqz t5, 41f
vst vr0, t4, 0
vst vr0, t4, 16
b 6f
41:
andi t5, \width, 16
beqz t5, 5f
vst vr0, t4, 0
b 6f
5:
fst.d f0, t4, 0
6:
add.d t4, t4, \stride
addi.d \height, \height, -1
bnez \height, 3b
7:
.endm
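// ipred_dc_gen_top: DC from the top edge only. Roughly:
//   unsigned dc = width >> 1;
//   for (int i = 0; i < width; i++) dc += topleft[1 + i];
//   dc >>= ctz(width);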
.macro ipred_dc_gen_top topleft, width
srai.d t0, \width, 1
addi.d t1, \topleft, 1
srai.d t2, \width, 4
beqz t2, 2f
1:
vld vr0, t1, 0
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vhaddw.du.wu vr0, vr0, vr0
vhaddw.qu.du vr0, vr0, vr0
vpickve2gr.du t3, vr0, 0
add.d t0, t0, t3
addi.d t1, t1, 16
addi.d t2, t2, -1
bnez t2, 1b
b 4f
2: // &8
andi t2, \width, 8
beqz t2, 3f
vxor.v vr0, vr0, vr0
fld.d f0, t1, 0
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vhaddw.du.wu vr0, vr0, vr0
vpickve2gr.du t2, vr0, 0
add.d t0, t0, t2
addi.d t1, t1, 8
b 4f
3: // &4
andi t2, \width, 4
beqz t2, 4f
vxor.v vr0, vr0, vr0
fld.s f0, t1, 0
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vpickve2gr.du t2, vr0, 0
add.d t0, t0, t2
addi.d t1, t1, 4
4:
ctz.w t1, \width
sra.w t0, t0, t1
.endm
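// ipred_dc_gen_left: DC from the left edge only; the left pixels precede
// topleft in memory, so the pointer steps backwards. Roughly:
//   unsigned dc = height >> 1;
//   for (int i = 0; i < height; i++) dc += topleft[-(1 + i)];
//   dc >>= ctz(height);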
.macro ipred_dc_gen_left topleft, height
srai.d t0, \height, 1
srai.d t2, \height, 4 //loop param
beqz t2, 8f
7: // height/16
addi.d \topleft, \topleft, -16
vld vr0, \topleft, 0
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vhaddw.du.wu vr0, vr0, vr0
vhaddw.qu.du vr0, vr0, vr0
vpickve2gr.du t4, vr0, 0
add.d t0, t0, t4
addi.d t2, t2, -1
bnez t2, 7b
b 10f
8: // &8
andi t2, \height, 8
beqz t2, 9f
addi.d \topleft, \topleft, -8
vxor.v vr0, vr0, vr0
fld.d f0, \topleft, 0
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vhaddw.du.wu vr0, vr0, vr0
vpickve2gr.du t4, vr0, 0
add.d t0, t0, t4
b 10f
9: // &4
andi t2, \height, 4
beqz t2, 10f
addi.d \topleft, \topleft, -4
vxor.v vr0, vr0, vr0
fld.s f0, \topleft, 0
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.wu.hu vr0, vr0, vr0
vpickve2gr.wu t4, vr0, 0
add.d t0, t0, t4
10:
ctz.w t1, \height
sra.w t0, t0, t1
.endm
// void ipred_dc_lsx(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_dc_8bpc_lsx
ipred_dc_gen a2, a3, a4
ipred_splat_dc a0, a1, a3, a4, t0
endfunc
// void ipred_dc_128_lsx(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_dc_128_8bpc_lsx
li.w t0, 128
ipred_splat_dc a0, a1, a3, a4, t0
endfunc
// void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_dc_top_8bpc_lsx
ipred_dc_gen_top a2, a3
ipred_splat_dc a0, a1, a3, a4, t0
endfunc
// void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_dc_left_8bpc_lsx
ipred_dc_gen_left a2, a4
ipred_splat_dc a0, a1, a3, a4, t0
endfunc
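// pixel_set_8bpc: broadcasts the byte at *src_ptr across one row of
// width pixels (width a power of two, 4..64), roughly
// memset(dst_ptr, *src_ptr, width); clobbers a5.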
.macro pixel_set_8bpc dst_ptr, src_ptr, width
vldrepl.b vr0, \src_ptr, 0
1:
andi a5, \width, 64
beqz a5, 2f
vst vr0, \dst_ptr, 0
vst vr0, \dst_ptr, 16
vst vr0, \dst_ptr, 32
vst vr0, \dst_ptr, 48
b 6f
2:
andi a5, \width, 32
beqz a5, 3f
vst vr0, \dst_ptr, 0
vst vr0, \dst_ptr, 16
b 6f
3:
andi a5, \width, 16
beqz a5, 4f
vst vr0, \dst_ptr, 0
b 6f
4:
andi a5, \width, 8
beqz a5, 5f
fst.d f0, \dst_ptr, 0
b 6f
5:
andi a5, \width, 4
beqz a5, 6f
fst.s f0, \dst_ptr, 0
6:
.endm
// void ipred_h_c(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_h_8bpc_lsx
beqz a4, .IPRED_H_END
.IPRED_H_LOOP:
addi.d a2, a2, -1
pixel_set_8bpc a0, a2, a3
add.d a0, a0, a1
addi.d a4, a4, -1
bnez a4, .IPRED_H_LOOP
.IPRED_H_END:
endfunc
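// pixel_copy_8bpc: copies one row of width pixels (width a power of two,
// 4..64), roughly memcpy(dst_ptr, src_ptr, width); clobbers a5.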
.macro pixel_copy_8bpc dst_ptr, src_ptr, width
1:
andi a5, \width, 64
beqz a5, 2f
vld vr0, \src_ptr, 0
vld vr1, \src_ptr, 16
vld vr2, \src_ptr, 32
vld vr3, \src_ptr, 48
vst vr0, \dst_ptr, 0
vst vr1, \dst_ptr, 16
vst vr2, \dst_ptr, 32
vst vr3, \dst_ptr, 48
b 6f
2:
andi a5, \width, 32
beqz a5, 3f
vld vr0, \src_ptr, 0
vld vr1, \src_ptr, 16
vst vr0, \dst_ptr, 0
vst vr1, \dst_ptr, 16
b 6f
3:
andi a5, \width, 16
beqz a5, 4f
vld vr0, \src_ptr, 0
vst vr0, \dst_ptr, 0
b 6f
4:
andi a5, \width, 8
beqz a5, 5f
fld.d f0, \src_ptr, 0
fst.d f0, \dst_ptr, 0
b 6f
5:
andi a5, \width, 4
beqz a5, 6f
fld.s f0, \src_ptr, 0
fst.s f0, \dst_ptr, 0
6:
.endm
// void ipred_v_lsx(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_v_8bpc_lsx
beqz a4, .IPRED_V_END
addi.d a2, a2, 1
.IPRED_V_LOOP:
pixel_copy_8bpc a0, a2, a3
add.d a0, a0, a1
addi.d a4, a4, -1
bnez a4, .IPRED_V_LOOP
.IPRED_V_END:
endfunc
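// The Paeth predictor picks, per pixel, whichever of left, top and
// topleft is closest to the gradient estimate base = left + top - topleft.
// A sketch of the per-pixel logic vectorized below:
//   int tdiff  = abs(topleft - left);             // == abs(top - base)
//   int ldiff  = abs(topleft - top);              // == abs(left - base)
//   int tldiff = abs(2 * topleft - (left + top)); // == abs(topleft - base)
//   *dst = ldiff <= tdiff && ldiff <= tldiff ? left
//        : tdiff <= tldiff ? top : topleft;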
// void ipred_paeth_lsx(pixel *dst, const ptrdiff_t stride,
// const pixel *const tl_ptr,
// const int width, const int height, const int a,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_paeth_8bpc_lsx
vldrepl.b vr0, a2, 0 //topleft
vsllwil.hu.bu vr0, vr0, 0
or a6, a2, a2
addi.d a7, a2, 1
.IPRED_PAETH_H_LOOP:
addi.d a6, a6, -1
vldrepl.b vr1, a6, 0 //left
vsllwil.hu.bu vr1, vr1, 0
.IPRED_PAETH_W_LOOP64:
andi a5, a3, 64
beqz a5, .IPRED_PAETH_W_LOOP32
vld vr2, a7, 0 //top
vpermi.w vr9, vr2, 0x0e
vsllwil.hu.bu vr2, vr2, 0
vsllwil.hu.bu vr9, vr9, 0
vabsd.hu vr5, vr0, vr1 //tdiff
vabsd.hu vr4, vr0, vr2 //ldiff
vabsd.hu vr10, vr0, vr9
vadd.h vr3, vr0, vr0
vadd.h vr6, vr1, vr2
vadd.h vr11, vr1, vr9
vabsd.hu vr6, vr3, vr6 //tldiff
vabsd.hu vr11, vr3, vr11 //tldiff
vsle.hu vr3, vr5, vr6
vbitsel.v vr7, vr0, vr2, vr3
vsle.hu vr3, vr4, vr5
vsle.hu vr8, vr4, vr6
vand.v vr3, vr3, vr8
vbitsel.v vr3, vr7, vr1, vr3
vsrlni.b.h vr3, vr3, 0
vsle.hu vr12, vr5, vr11
vbitsel.v vr7, vr0, vr9, vr12
vsle.hu vr12, vr10, vr5
vsle.hu vr8, vr10, vr11
vand.v vr12, vr12, vr8
vbitsel.v vr12, vr7, vr1, vr12
vsrlni.b.h vr12, vr12, 0
vpermi.w vr12, vr3, 0x44
vst vr12, a0, 0
vld vr2, a7, 16 //top
vpermi.w vr9, vr2, 0x0e
vsllwil.hu.bu vr2, vr2, 0
vsllwil.hu.bu vr9, vr9, 0
vabsd.hu vr5, vr0, vr1 //tdiff
vabsd.hu vr4, vr0, vr2 //ldiff
vabsd.hu vr10, vr0, vr9
vadd.h vr3, vr0, vr0
vadd.h vr6, vr1, vr2
vadd.h vr11, vr1, vr9
vabsd.hu vr6, vr3, vr6 //tldiff
vabsd.hu vr11, vr3, vr11 //tldiff
vsle.hu vr3, vr5, vr6
vbitsel.v vr7, vr0, vr2, vr3
vsle.hu vr3, vr4, vr5
vsle.hu vr8, vr4, vr6
vand.v vr3, vr3, vr8
vbitsel.v vr3, vr7, vr1, vr3
vsrlni.b.h vr3, vr3, 0
vsle.hu vr12, vr5, vr11
vbitsel.v vr7, vr0, vr9, vr12
vsle.hu vr12, vr10, vr5
vsle.hu vr8, vr10, vr11
vand.v vr12, vr12, vr8
vbitsel.v vr12, vr7, vr1, vr12
vsrlni.b.h vr12, vr12, 0
vpermi.w vr12, vr3, 0x44
vst vr12, a0, 16
vld vr2, a7, 32 //top
vpermi.w vr9, vr2, 0x0e
vsllwil.hu.bu vr2, vr2, 0
vsllwil.hu.bu vr9, vr9, 0
vabsd.hu vr5, vr0, vr1 //tdiff
vabsd.hu vr4, vr0, vr2 //ldiff
vabsd.hu vr10, vr0, vr9
vadd.h vr3, vr0, vr0
vadd.h vr6, vr1, vr2
vadd.h vr11, vr1, vr9
vabsd.hu vr6, vr3, vr6 //tldiff
vabsd.hu vr11, vr3, vr11 //tldiff
vsle.hu vr3, vr5, vr6
vbitsel.v vr7, vr0, vr2, vr3
vsle.hu vr3, vr4, vr5
vsle.hu vr8, vr4, vr6
vand.v vr3, vr3, vr8
vbitsel.v vr3, vr7, vr1, vr3
vsrlni.b.h vr3, vr3, 0
vsle.hu vr12, vr5, vr11
vbitsel.v vr7, vr0, vr9, vr12
vsle.hu vr12, vr10, vr5
vsle.hu vr8, vr10, vr11
vand.v vr12, vr12, vr8
vbitsel.v vr12, vr7, vr1, vr12
vsrlni.b.h vr12, vr12, 0
vpermi.w vr12, vr3, 0x44
vst vr12, a0, 32
vld vr2, a7, 48 //top
vpermi.w vr9, vr2, 0x0e
vsllwil.hu.bu vr2, vr2, 0
vsllwil.hu.bu vr9, vr9, 0
vabsd.hu vr5, vr0, vr1 //tdiff
vabsd.hu vr4, vr0, vr2 //ldiff
vabsd.hu vr10, vr0, vr9
vadd.h vr3, vr0, vr0
vadd.h vr6, vr1, vr2
vadd.h vr11, vr1, vr9
vabsd.hu vr6, vr3, vr6 //tldiff
vabsd.hu vr11, vr3, vr11 //tldiff
vsle.hu vr3, vr5, vr6
vbitsel.v vr7, vr0, vr2, vr3
vsle.hu vr3, vr4, vr5
vsle.hu vr8, vr4, vr6
vand.v vr3, vr3, vr8
vbitsel.v vr3, vr7, vr1, vr3
vsrlni.b.h vr3, vr3, 0
vsle.hu vr12, vr5, vr11
vbitsel.v vr7, vr0, vr9, vr12
vsle.hu vr12, vr10, vr5
vsle.hu vr8, vr10, vr11
vand.v vr12, vr12, vr8
vbitsel.v vr12, vr7, vr1, vr12
vsrlni.b.h vr12, vr12, 0
vpermi.w vr12, vr3, 0x44
vst vr12, a0, 48
b .IPRED_PAETH_W_LOOPEND
.IPRED_PAETH_W_LOOP32:
andi a5, a3, 32
beqz a5, .IPRED_PAETH_W_LOOP16
vld vr2, a7, 0 //top
vpermi.w vr9, vr2, 0x0e
vsllwil.hu.bu vr2, vr2, 0
vsllwil.hu.bu vr9, vr9, 0
vabsd.hu vr5, vr0, vr1 //tdiff
vabsd.hu vr4, vr0, vr2 //ldiff
vabsd.hu vr10, vr0, vr9
vadd.h vr3, vr0, vr0
vadd.h vr6, vr1, vr2
vadd.h vr11, vr1, vr9
vabsd.hu vr6, vr3, vr6 //tldiff
vabsd.hu vr11, vr3, vr11 //tldiff
vsle.hu vr3, vr5, vr6
vbitsel.v vr7, vr0, vr2, vr3
vsle.hu vr3, vr4, vr5
vsle.hu vr8, vr4, vr6
vand.v vr3, vr3, vr8
vbitsel.v vr3, vr7, vr1, vr3
vsrlni.b.h vr3, vr3, 0
vsle.hu vr12, vr5, vr11
vbitsel.v vr7, vr0, vr9, vr12
vsle.hu vr12, vr10, vr5
vsle.hu vr8, vr10, vr11
vand.v vr12, vr12, vr8
vbitsel.v vr12, vr7, vr1, vr12
vsrlni.b.h vr12, vr12, 0
vpermi.w vr12, vr3, 0x44
vst vr12, a0, 0
vld vr2, a7, 16 //top
vpermi.w vr9, vr2, 0x0e
vsllwil.hu.bu vr2, vr2, 0
vsllwil.hu.bu vr9, vr9, 0
vabsd.hu vr5, vr0, vr1 //tdiff
vabsd.hu vr4, vr0, vr2 //ldiff
vabsd.hu vr10, vr0, vr9
vadd.h vr3, vr0, vr0
vadd.h vr6, vr1, vr2
vadd.h vr11, vr1, vr9
vabsd.hu vr6, vr3, vr6 //tldiff
vabsd.hu vr11, vr3, vr11 //tldiff
vsle.hu vr3, vr5, vr6
vbitsel.v vr7, vr0, vr2, vr3
vsle.hu vr3, vr4, vr5
vsle.hu vr8, vr4, vr6
vand.v vr3, vr3, vr8
vbitsel.v vr3, vr7, vr1, vr3
vsrlni.b.h vr3, vr3, 0
vsle.hu vr12, vr5, vr11
vbitsel.v vr7, vr0, vr9, vr12
vsle.hu vr12, vr10, vr5
vsle.hu vr8, vr10, vr11
vand.v vr12, vr12, vr8
vbitsel.v vr12, vr7, vr1, vr12
vsrlni.b.h vr12, vr12, 0
vpermi.w vr12, vr3, 0x44
vst vr12, a0, 16
b .IPRED_PAETH_W_LOOPEND
.IPRED_PAETH_W_LOOP16:
andi a5, a3, 16
beqz a5, .IPRED_PAETH_W_LOOP8
vld vr2, a7, 0 //top
vpermi.w vr9, vr2, 0x0e
vsllwil.hu.bu vr2, vr2, 0
vsllwil.hu.bu vr9, vr9, 0
vabsd.hu vr5, vr0, vr1 //tdiff
vabsd.hu vr4, vr0, vr2 //ldiff
vabsd.hu vr10, vr0, vr9
vadd.h vr3, vr0, vr0
vadd.h vr6, vr1, vr2
vadd.h vr11, vr1, vr9
vabsd.hu vr6, vr3, vr6 //tldiff
vabsd.hu vr11, vr3, vr11 //tldiff
vsle.hu vr3, vr5, vr6
vbitsel.v vr7, vr0, vr2, vr3
vsle.hu vr3, vr4, vr5
vsle.hu vr8, vr4, vr6
vand.v vr3, vr3, vr8
vbitsel.v vr3, vr7, vr1, vr3
vsrlni.b.h vr3, vr3, 0
vsle.hu vr12, vr5, vr11
vbitsel.v vr7, vr0, vr9, vr12
vsle.hu vr12, vr10, vr5
vsle.hu vr8, vr10, vr11
vand.v vr12, vr12, vr8
vbitsel.v vr12, vr7, vr1, vr12
vsrlni.b.h vr12, vr12, 0
vpermi.w vr12, vr3, 0x44
vst vr12, a0, 0
b .IPRED_PAETH_W_LOOPEND
.IPRED_PAETH_W_LOOP8:
andi a5, a3, 8
beqz a5, .IPRED_PAETH_W_LOOP4
fld.d f2, a7, 0 //top
vsllwil.hu.bu vr2, vr2, 0
vabsd.hu vr5, vr0, vr1 //tdiff
vabsd.hu vr4, vr0, vr2 //ldiff
vadd.h vr3, vr0, vr0
vadd.h vr6, vr1, vr2
vabsd.hu vr6, vr3, vr6 //tldiff
vsle.hu vr3, vr5, vr6
vbitsel.v vr7, vr0, vr2, vr3
vsle.hu vr3, vr4, vr5
vsle.hu vr8, vr4, vr6
vand.v vr3, vr3, vr8
vbitsel.v vr3, vr7, vr1, vr3
vsrlni.b.h vr3, vr3, 0
fst.d f3, a0, 0
b .IPRED_PAETH_W_LOOPEND
.IPRED_PAETH_W_LOOP4:
andi a5, a3, 4
beqz a5, .IPRED_PAETH_W_LOOPEND
fld.s f2, a7, 0 //top
vsllwil.hu.bu vr2, vr2, 0
vabsd.hu vr5, vr0, vr1 //tdiff
vabsd.hu vr4, vr0, vr2 //ldiff
vadd.h vr3, vr0, vr0
vadd.h vr6, vr1, vr2
vabsd.hu vr6, vr3, vr6 //tldiff
vsle.hu vr3, vr5, vr6
vbitsel.v vr7, vr0, vr2, vr3
vsle.hu vr3, vr4, vr5
vsle.hu vr8, vr4, vr6
vand.v vr3, vr3, vr8
vbitsel.v vr3, vr7, vr1, vr3
vsrlni.b.h vr3, vr3, 0
fst.s f3, a0, 0
b .IPRED_PAETH_W_LOOPEND
.IPRED_PAETH_W_LOOPEND:
add.d a0, a0, a1
addi.d a4, a4, -1
bnez a4, .IPRED_PAETH_H_LOOP
endfunc
const dav1d_sm_weights
.byte 0, 0
// bs = 2
.byte 255, 128
// bs = 4
.byte 255, 149, 85, 64
// bs = 8
.byte 255, 197, 146, 105, 73, 50, 37, 32
// bs = 16
.byte 255, 225, 196, 170, 145, 123, 102, 84
.byte 68, 54, 43, 33, 26, 20, 17, 16
// bs = 32
.byte 255, 240, 225, 210, 196, 182, 169, 157
.byte 145, 133, 122, 111, 101, 92, 83, 74
.byte 66, 59, 52, 45, 39, 34, 29, 25
.byte 21, 17, 14, 12, 10, 9, 8, 8
// bs = 64
.byte 255, 248, 240, 233, 225, 218, 210, 203
.byte 196, 189, 182, 176, 169, 163, 156, 150
.byte 144, 138, 133, 127, 121, 116, 111, 106
.byte 101, 96, 91, 86, 82, 77, 73, 69
.byte 65, 61, 57, 54, 50, 47, 44, 41
.byte 38, 35, 32, 29, 27, 25, 22, 20
.byte 18, 16, 15, 13, 12, 10, 9, 8
.byte 7, 6, 6, 5, 5, 4, 4, 4
endconst
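// dav1d_sm_weights: the AV1 smooth-prediction weight table, one group of
// bs entries per block size bs. ipred_smooth blends the top and left
// edges against the top-right and bottom-left corner pixels; per pixel,
// roughly:
//   pred(x, y) = (ver[y] * top[x] + (256 - ver[y]) * bottom
//              +  hor[x] * left[y] + (256 - hor[x]) * right + 256) >> 9;
// The loop below evaluates this four pixels at a time in 32-bit lanes.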
// void ipred_smooth_lsx(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_smooth_8bpc_lsx
la.local a5, dav1d_sm_weights
add.d a6, a5, a3 //hor
add.d a5, a5, a4 //ver
add.d a7, a2, a3
sub.d t0, a2, a4
vldrepl.b vr0, a7, 0 //right
vldrepl.b vr1, t0, 0 //bottom
vsllwil.hu.bu vr0, vr0, 0
vsllwil.wu.hu vr0, vr0, 0
vsllwil.hu.bu vr1, vr1, 0
vsllwil.wu.hu vr1, vr1, 0
li.w t0, 256
vreplgr2vr.w vr6, t0
addi.d t0, a2, 1 //ptr topleft[x]
addi.d t3, a2, -1 //ptr topleft[y]
.IPRED_SMOOTH_H_LOOP:
vldrepl.b vr2, a5, 0 //ver[y]
vldrepl.b vr3, t3, 0 //topleft[y]
vsllwil.hu.bu vr2, vr2, 0
vsllwil.wu.hu vr2, vr2, 0
vsllwil.hu.bu vr3, vr3, 0
vsllwil.wu.hu vr3, vr3, 0
vsub.w vr7, vr6, vr2 //256-ver[y]
or t1, zero, zero //x
srai.d t2, a3, 2 //loop max
.IPRED_SMOOTH_W_LOOP:
fldx.s f4, t0, t1 //topleft[x]
fldx.s f5, a6, t1 //hor[x]
vsllwil.hu.bu vr4, vr4, 0
vsllwil.wu.hu vr4, vr4, 0
vsllwil.hu.bu vr5, vr5, 0
vsllwil.wu.hu vr5, vr5, 0
vsub.w vr8, vr6, vr5 //256-hor[x]
vmul.w vr9, vr8, vr0
vmadd.w vr9, vr5, vr3
vmadd.w vr9, vr7, vr1
vmadd.w vr9, vr2, vr4 //pred
vadd.w vr9, vr9, vr6
vsrlni.h.w vr9, vr9, 9
vsrlni.b.h vr9, vr9, 0
fstx.s f9, a0, t1
addi.d t1, t1, 4
addi.d t2, t2, -1
bnez t2, .IPRED_SMOOTH_W_LOOP
.IPRED_SMOOTH_W_LOOP_END:
addi.d t3, t3, -1
addi.d a5, a5, 1
add.d a0, a0, a1
addi.d a4, a4, -1
bnez a4, .IPRED_SMOOTH_H_LOOP
endfunc
// void ipred_smooth_v_lsx(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_smooth_v_8bpc_lsx
la.local a5, dav1d_sm_weights
add.d a5, a5, a4 //ver
sub.d t0, a2, a4
vldrepl.b vr0, t0, 0 //bottom
vsllwil.hu.bu vr0, vr0, 0
li.w t0, 256
vreplgr2vr.h vr2, t0
li.w t0, 128
vreplgr2vr.h vr3, t0
addi.d t0, a2, 1 //ptr topleft[x]
.IPRED_SMOOTH_V_H_LOOP:
vldrepl.b vr1, a5, 0 //ver[y]
vsllwil.hu.bu vr1, vr1, 0
vsub.h vr5, vr2, vr1 //256-ver[y]
or t1, zero, zero //x
srai.d t2, a3, 3 //loop max
beqz t2, .IPRED_SMOOTH_V_W_LOOP4
.IPRED_SMOOTH_V_W_LOOP8:
fldx.d f4, t0, t1 //topleft[x]
vsllwil.hu.bu vr4, vr4, 0
vmul.h vr6, vr5, vr0
vmadd.h vr6, vr1, vr4 //pred
vadd.h vr6, vr6, vr3
vsrlni.b.h vr6, vr6, 8
fstx.d f6, a0, t1
addi.d t1, t1, 8
addi.d t2, t2, -1
bnez t2, .IPRED_SMOOTH_V_W_LOOP8
b .IPRED_SMOOTH_V_W_LOOP_END
.IPRED_SMOOTH_V_W_LOOP4:
fldx.s f4, t0, t1 //topleft[x]
vsllwil.hu.bu vr4, vr4, 0
vmul.h vr6, vr5, vr0
vmadd.h vr6, vr1, vr4 //pred
vadd.h vr6, vr6, vr3
vsrai.h vr6, vr6, 8
vsrlni.b.h vr6, vr6, 0
fstx.s f6, a0, t1
addi.d t1, t1, 4
.IPRED_SMOOTH_V_W_LOOP_END:
addi.d a5, a5, 1
add.d a0, a0, a1
addi.d a4, a4, -1
bnez a4, .IPRED_SMOOTH_V_H_LOOP
endfunc
// void ipred_smooth_h_lsx(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_smooth_h_8bpc_lsx
la.local a5, dav1d_sm_weights
add.d a6, a5, a3 //hor
add.d a7, a2, a3
vldrepl.b vr0, a7, 0 //right
vsllwil.hu.bu vr0, vr0, 0
li.w t0, 256
vreplgr2vr.h vr1, t0
li.w t0, 128
vreplgr2vr.h vr2, t0
addi.d t3, a2, -1 //ptr topleft[y]
.IPRED_SMOOTH_H_H_LOOP:
vldrepl.b vr3, t3, 0 //topleft[y]
vsllwil.hu.bu vr3, vr3, 0
or t1, zero, zero //x
srai.d t2, a3, 3 //loop max
beqz t2, .IPRED_SMOOTH_H_W_LOOP4
.IPRED_SMOOTH_H_W_LOOP8:
fldx.d f5, a6, t1 //hor[x]
vsllwil.hu.bu vr5, vr5, 0
vsub.h vr4, vr1, vr5 //256-hor[x]
vmul.h vr6, vr4, vr0
vmadd.h vr6, vr5, vr3 //pred
vadd.h vr6, vr6, vr2
vsrlni.b.h vr6, vr6, 8
fstx.d f6, a0, t1
addi.d t1, t1, 8
addi.d t2, t2, -1
bnez t2, .IPRED_SMOOTH_H_W_LOOP8
b .IPRED_SMOOTH_H_W_LOOP_END
.IPRED_SMOOTH_H_W_LOOP4:
fldx.s f5, a6, t1 //hor[x]
vsllwil.hu.bu vr5, vr5, 0
vsub.h vr4, vr1, vr5 //256-hor[x]
vmul.h vr6, vr4, vr0
vmadd.h vr6, vr5, vr3 //pred
vadd.h vr6, vr6, vr2
vsrai.h vr6, vr6, 8
vsrlni.b.h vr6, vr6, 0
fstx.s f6, a0, t1
addi.d t1, t1, 4
.IPRED_SMOOTH_H_W_LOOP_END:
addi.d t3, t3, -1
add.d a0, a0, a1
addi.d a4, a4, -1
bnez a4, .IPRED_SMOOTH_H_H_LOOP
endfunc
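// pal_pred expands packed palette indices (two 4-bit indices per byte of
// idx, low nibble first) through the 8-entry palette with vshuf.b.
// Per pixel, roughly:
//   dst[x] = pal[(idx[x >> 1] >> (4 * (x & 1))) & 7];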
// void pal_pred_lsx(pixel *dst, const ptrdiff_t stride,
// const pixel *const pal, const uint8_t *idx,
// const int w, const int h)
function pal_pred_8bpc_lsx
srai.d a7, a5, 2
.PAL_PRED_WLOOP4:
andi a6, a4, 4
beqz a6, .PAL_PRED_WLOOP8
fld.d f0, a3, 0
vsrli.b vr1, vr0, 4
vandi.b vr2, vr0, 7
vilvl.b vr0, vr1, vr2
fld.d f1, a2, 0
vshuf.b vr2, vr1, vr1, vr0
vstelm.w vr2, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr2, a0, 0, 1
add.d a0, a0, a1
vstelm.w vr2, a0, 0, 2
add.d a0, a0, a1
vstelm.w vr2, a0, 0, 3
add.d a0, a0, a1
addi.d a3, a3, 8
addi.d a7, a7, -1
bnez a7, .PAL_PRED_WLOOP4
b .PAL_PRED_END
.PAL_PRED_WLOOP8:
andi a6, a4, 8
beqz a6, .PAL_PRED_WLOOP16
vld vr0, a3, 0
vsrli.b vr1, vr0, 4
vandi.b vr2, vr0, 7
vilvl.b vr0, vr1, vr2
vilvh.b vr3, vr1, vr2
fld.d f1, a2, 0
vshuf.b vr0, vr1, vr1, vr0
vshuf.b vr3, vr1, vr1, vr3
vstelm.d vr0, a0, 0, 0
add.d a0, a0, a1
vstelm.d vr0, a0, 0, 1
add.d a0, a0, a1
vstelm.d vr3, a0, 0, 0
add.d a0, a0, a1
vstelm.d vr3, a0, 0, 1
add.d a0, a0, a1
addi.d a3, a3, 16
addi.d a7, a7, -1
bnez a7, .PAL_PRED_WLOOP8
b .PAL_PRED_END
.PAL_PRED_WLOOP16:
andi a6, a4, 16
beqz a6, .PAL_PRED_WLOOP32
vld vr0, a3, 0
vld vr1, a3, 16
fld.d f6, a2, 0
vsrli.b vr2, vr0, 4
vandi.b vr3, vr0, 7
vsrli.b vr4, vr1, 4
vandi.b vr5, vr1, 7
vilvl.b vr0, vr2, vr3
vilvh.b vr1, vr2, vr3
vilvl.b vr2, vr4, vr5
vilvh.b vr3, vr4, vr5
vshuf.b vr0, vr6, vr6, vr0
vshuf.b vr1, vr6, vr6, vr1
vshuf.b vr2, vr6, vr6, vr2
vshuf.b vr3, vr6, vr6, vr3
vst vr0, a0, 0
add.d a0, a0, a1
vst vr1, a0, 0
add.d a0, a0, a1
vst vr2, a0, 0
add.d a0, a0, a1
vst vr3, a0, 0
add.d a0, a0, a1
addi.d a3, a3, 32
addi.d a7, a7, -1
bnez a7, .PAL_PRED_WLOOP16
b .PAL_PRED_END
.PAL_PRED_WLOOP32:
andi a6, a4, 32
beqz a6, .PAL_PRED_WLOOP64
vld vr0, a3, 0
vld vr1, a3, 16
vld vr2, a3, 32
vld vr3, a3, 48
fld.d f4, a2, 0
vsrli.b vr5, vr0, 4
vandi.b vr6, vr0, 7
vsrli.b vr7, vr1, 4
vandi.b vr8, vr1, 7
vsrli.b vr9, vr2, 4
vandi.b vr10, vr2, 7
vsrli.b vr11, vr3, 4
vandi.b vr12, vr3, 7
vilvl.b vr0, vr5, vr6
vilvh.b vr1, vr5, vr6
vilvl.b vr2, vr7, vr8
vilvh.b vr3, vr7, vr8
vilvl.b vr5, vr9, vr10
vilvh.b vr6, vr9, vr10
vilvl.b vr7, vr11, vr12
vilvh.b vr8, vr11, vr12
vshuf.b vr0, vr4, vr4, vr0
vshuf.b vr1, vr4, vr4, vr1
vshuf.b vr2, vr4, vr4, vr2
vshuf.b vr3, vr4, vr4, vr3
vshuf.b vr5, vr4, vr4, vr5
vshuf.b vr6, vr4, vr4, vr6
vshuf.b vr7, vr4, vr4, vr7
vshuf.b vr8, vr4, vr4, vr8
vst vr0, a0, 0
vst vr1, a0, 16
add.d a0, a0, a1
vst vr2, a0, 0
vst vr3, a0, 16
add.d a0, a0, a1
vst vr5, a0, 0
vst vr6, a0, 16
add.d a0, a0, a1
vst vr7, a0, 0
vst vr8, a0, 16
add.d a0, a0, a1
addi.d a3, a3, 64
addi.d a7, a7, -1
bnez a7, .PAL_PRED_WLOOP32
b .PAL_PRED_END
.PAL_PRED_WLOOP64:
vld vr0, a3, 0
vld vr1, a3, 16
fld.d f2, a2, 0
vsrli.b vr3, vr0, 4
vandi.b vr4, vr0, 7
vsrli.b vr5, vr1, 4
vandi.b vr6, vr1, 7
vilvl.b vr0, vr3, vr4
vilvh.b vr1, vr3, vr4
vilvl.b vr3, vr5, vr6
vilvh.b vr4, vr5, vr6
vshuf.b vr0, vr2, vr2, vr0
vshuf.b vr1, vr2, vr2, vr1
vshuf.b vr3, vr2, vr2, vr3
vshuf.b vr4, vr2, vr2, vr4
vst vr0, a0, 0
vst vr1, a0, 16
vst vr3, a0, 32
vst vr4, a0, 48
add.d a0, a0, a1
addi.d a3, a3, 32
addi.d a5, a5, -1
bnez a5, .PAL_PRED_WLOOP64
.PAL_PRED_END:
endfunc
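// apply_sign_vrh: per 16-bit lane, out = s < 0 ? -v : v, i.e. transfers
// the sign of s onto the magnitude v using a lane mask from vslt.h.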
.macro apply_sign_vrh v, s, vrzero, vrt0, out
vslt.h \vrt0, \s, \vrzero
vandn.v \s, \vrt0, \v
vsigncov.h \v, \vrt0, \v
vor.v \out, \s, \v
.endm
.macro iclip_pixel_vrh in0, in1, in2, tmp0, tmp1, out
vmin.h \tmp0, \in2, \in0
vslt.h \in0, \in0, \in1
vand.v \tmp1, \in0, \in1
vandn.v \tmp0, \in0, \tmp0
vor.v \out, \tmp1, \tmp0
.endm
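// ipred_cfl_pred: chroma-from-luma prediction. Per pixel, roughly:
//   int diff = alpha * ac[x];
//   dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
// vadda.h computes abs(alpha * ac) + 32 in one instruction.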
.macro ipred_cfl_pred dst, stride, w, h, dc, ac, alpha
vreplgr2vr.h vr2, \alpha
vreplgr2vr.h vr7, \dc
li.w t1, 32
vreplgr2vr.h vr3, t1
vxor.v vr4, vr4, vr4
li.w t1, 255
vreplgr2vr.h vr6, t1
add.d t4, \w, \w
1:
or t1, zero, zero
or t2, zero, zero
srai.d t3, \w, 3
beqz t3, 3f
2:
vldx vr0, \ac, t1
vmul.h vr1, vr2, vr0
vadda.h vr0, vr1, vr3
vsrai.h vr0, vr0, 6
apply_sign_vrh vr0, vr1, vr4, vr5, vr0
vadd.h vr1, vr0, vr7
iclip_pixel_vrh vr1, vr4, vr6, vr5, vr8, vr0
vsrlni.b.h vr0, vr0, 0
fstx.d f0, \dst, t2
addi.d t1, t1, 16
addi.d t2, t2, 8
addi.d t3, t3, -1
bnez t3, 2b
b 4f
3:
fld.d f0, \ac, 0
vmul.h vr1, vr2, vr0
vadda.h vr0, vr1, vr3
vsrai.h vr0, vr0, 6
apply_sign_vrh vr0, vr1, vr4, vr5, vr0
vadd.h vr1, vr0, vr7
iclip_pixel_vrh vr1, vr4, vr6, vr5, vr8, vr0
vsrlni.b.h vr0, vr0, 0
fst.s f0, \dst, 0
4:
add.d \ac, \ac, t4
add.d \dst, \dst, \stride
addi.d \h, \h, -1
bnez \h, 1b
.endm
function ipred_cfl_8bpc_lsx
ipred_dc_gen a2, a3, a4
ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
function ipred_cfl_top_8bpc_lsx
ipred_dc_gen_top a2, a3
ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
function ipred_cfl_left_8bpc_lsx
ipred_dc_gen_left a2, a4
ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
function ipred_cfl_128_8bpc_lsx
li.w t0, 128
ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
const dav1d_filter_intra_taps_lsx
//arr0 8*7
.byte -6, -5, -3, -3, -4, -3, -3, -3
.byte 10, 2, 1, 1, 6, 2, 2, 1
.byte 0, 10, 1, 1, 0, 6, 2, 2
.byte 0, 0, 10, 2, 0, 0, 6, 2
.byte 0, 0, 0, 10, 0, 0, 0, 6
.byte 12, 9, 7, 5, 2, 2, 2, 3
.byte 0, 0, 0, 0, 12, 9, 7, 5
//arr1
.byte -10, -6, -4, -2, -10, -6, -4, -2
.byte 16, 0, 0, 0, 16, 0, 0, 0
.byte 0, 16, 0, 0, 0, 16, 0, 0
.byte 0, 0, 16, 0, 0, 0, 16, 0
.byte 0, 0, 0, 16, 0, 0, 0, 16
.byte 10, 6, 4, 2, 0, 0, 0, 0
.byte 0, 0, 0, 0, 10, 6, 4, 2
//arr2
.byte -8, -8, -8, -8, -4, -4, -4, -4
.byte 8, 0, 0, 0, 4, 0, 0, 0
.byte 0, 8, 0, 0, 0, 4, 0, 0
.byte 0, 0, 8, 0, 0, 0, 4, 0
.byte 0, 0, 0, 8, 0, 0, 0, 4
.byte 16, 16, 16, 16, 0, 0, 0, 0
.byte 0, 0, 0, 0, 16, 16, 16, 16
//arr3
.byte -2, -1, -1, 0, -1, -1, -1, -1
.byte 8, 3, 2, 1, 4, 3, 2, 2
.byte 0, 8, 3, 2, 0, 4, 3, 2
.byte 0, 0, 8, 3, 0, 0, 4, 3
.byte 0, 0, 0, 8, 0, 0, 0, 4
.byte 10, 6, 4, 2, 3, 4, 4, 3
.byte 0, 0, 0, 0, 10, 6, 4, 3
//arr4
.byte -12, -10, -9, -8, -10, -9, -8, -7
.byte 14, 0, 0, 0, 12, 1, 0, 0
.byte 0, 14, 0, 0, 0, 12, 0, 0
.byte 0, 0, 14, 0, 0, 0, 12, 1
.byte 0, 0, 0, 14, 0, 0, 0, 12
.byte 14, 12, 11, 10, 0, 0, 1, 1
.byte 0, 0, 0, 0, 14, 12, 11, 9
endconst
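// FILTER_PRED works on 4x2 sub-blocks. Each sub-block is a 7-tap blend of
// its neighbours p0..p6 (topleft, the four pixels above, and two pixels
// to the left); dav1d_filter_intra_taps_lsx stores the five AV1 filter
// sets with the taps interleaved for the 8 outputs of a sub-block. Per
// output, roughly:
//   acc = flt[0]*p0 + flt[1]*p1 + ... + flt[6]*p6;
//   out = iclip_pixel((acc + 8) >> 4);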
.macro ipred_filter_load_p
vldrepl.b vr0, t0, 0
vldrepl.b vr1, a7, 0
vldrepl.b vr2, a7, 1
vldrepl.b vr3, a7, 2
vldrepl.b vr4, a7, 3
vldrepl.b vr5, t1, 0
vldrepl.b vr6, t1, -1
vsllwil.hu.bu vr0, vr0, 0
vsllwil.hu.bu vr1, vr1, 0
vsllwil.hu.bu vr2, vr2, 0
vsllwil.hu.bu vr3, vr3, 0
vsllwil.hu.bu vr4, vr4, 0
vsllwil.hu.bu vr5, vr5, 0
vsllwil.hu.bu vr6, vr6, 0
.endm
.macro ipred_filter_loadx_p
vldrepl.b vr0, t0, 0
vldrepl.b vr1, a7, 0
vldrepl.b vr2, a7, 1
vldrepl.b vr3, a7, 2
vldrepl.b vr4, a7, 3
vldrepl.b vr5, t1, 0
ldx.bu t3, t1, a1
vreplgr2vr.b vr6, t3
vsllwil.hu.bu vr0, vr0, 0
vsllwil.hu.bu vr1, vr1, 0
vsllwil.hu.bu vr2, vr2, 0
vsllwil.hu.bu vr3, vr3, 0
vsllwil.hu.bu vr4, vr4, 0
vsllwil.hu.bu vr5, vr5, 0
vsllwil.hu.bu vr6, vr6, 0
.endm
.macro ipred_filter_load_fltptr
fld.d f7, a6, 0
fld.d f8, a6, 8
fld.d f9, a6, 16
fld.d f10, a6, 24
fld.d f11, a6, 32
fld.d f12, a6, 40
fld.d f13, a6, 48
vsllwil.h.b vr7, vr7, 0
vsllwil.h.b vr8, vr8, 0
vsllwil.h.b vr9, vr9, 0
vsllwil.h.b vr10, vr10, 0
vsllwil.h.b vr11, vr11, 0
vsllwil.h.b vr12, vr12, 0
vsllwil.h.b vr13, vr13, 0
.endm
.macro ipred_filter_calc_acc
vmul.h vr7, vr7, vr0
vmadd.h vr7, vr8, vr1
vmadd.h vr7, vr9, vr2
vmadd.h vr7, vr10, vr3
vmadd.h vr7, vr11, vr4
vmadd.h vr7, vr12, vr5
vmadd.h vr7, vr13, vr6
vaddi.hu vr7, vr7, 8
vsrai.h vr7, vr7, 4
iclip_pixel_vrh vr7, vr14, vr15, vr9, vr10, vr8
vsrlni.b.h vr8, vr8, 0
.endm
// void ipred_filter_lsx(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft_in,
// const int width, const int height, int filt_idx,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_filter_8bpc_lsx
andi a5, a5, 511
la.local a6, dav1d_filter_intra_taps_lsx
li.w a7, 56
mul.w a7, a7, a5
add.d a6, a6, a7 //*filter
addi.d a7, a2, 1 //*top
or a5, zero, zero //y
vxor.v vr14, vr14, vr14
li.w t0, 255
vreplgr2vr.h vr15, t0
.FILTER_LOOP_H:
sub.d t0, a2, a5 //*topleft
addi.d t1, t0, -1 //left
ctz.w t2, a3
addi.d t3, t2, -2
beqz t3, .FILTER_LOOP_W4
addi.d t3, t2, -3
beqz t3, .FILTER_LOOP_W8
addi.d t3, t2, -4
beqz t3, .FILTER_LOOP_W16
addi.d t3, t2, -5
beqz t3, .FILTER_LOOP_W32
.FILTER_LOOP_W4:
ipred_filter_load_p
or t3, a0, a0 //*ptr
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
b .FILTER_LOOP_W_END
.FILTER_LOOP_W8:
ipred_filter_load_p
or t3, a0, a0
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 3
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 4
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
b .FILTER_LOOP_W_END
.FILTER_LOOP_W16:
ipred_filter_load_p
or t3, a0, a0
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 3
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 4
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 7
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 8
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 11
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 12
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
b .FILTER_LOOP_W_END
.FILTER_LOOP_W32:
ipred_filter_load_p
or t3, a0, a0
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 3
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 4
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 7
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 8
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 11
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 12
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 15
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 16
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 19
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 20
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 23
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 24
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
addi.d t1, a0, 27
addi.d a7, a7, 4
addi.d t0, a7, -1
ipred_filter_loadx_p
addi.d t3, a0, 28
ipred_filter_load_fltptr
ipred_filter_calc_acc
fst.s f8, t3, 0
add.d t3, t3, a1
vstelm.w vr8, t3, 0, 1
add.d t3, t3, a1
.FILTER_LOOP_W_END:
add.d a7, a0, a1
add.d t2, a1, a1
add.d a0, a0, t2
addi.d a5, a5, 2
blt a5, a4, .FILTER_LOOP_H
endfunc
const dav1d_dr_intra_derivative
// Values that are 0 will never be used
.short 0 // Angles:
.short 1023, 0 // 3, 93, 183
.short 547 // 6, 96, 186
.short 372, 0, 0 // 9, 99, 189
.short 273 // 14, 104, 194
.short 215, 0 // 17, 107, 197
.short 178 // 20, 110, 200
.short 151, 0 // 23, 113, 203 (113 & 203 are base angles)
.short 132 // 26, 116, 206
.short 116, 0 // 29, 119, 209
.short 102, 0 // 32, 122, 212
.short 90 // 36, 126, 216
.short 80, 0 // 39, 129, 219
.short 71 // 42, 132, 222
.short 64, 0 // 45, 135, 225 (45 & 135 are base angles)
.short 57 // 48, 138, 228
.short 51, 0 // 51, 141, 231
.short 45, 0 // 54, 144, 234
.short 40 // 58, 148, 238
.short 35, 0 // 61, 151, 241
.short 31 // 64, 154, 244
.short 27, 0 // 67, 157, 247 (67 & 157 are base angles)
.short 23 // 70, 160, 250
.short 19, 0 // 73, 163, 253
.short 15, 0 // 76, 166, 256
.short 11, 0 // 81, 171, 261
.short 7 // 84, 174, 264
.short 3 // 87, 177, 267
endconst
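// In ipred_z1 below, the horizontal step is looked up as
//   dx = dav1d_dr_intra_derivative[angle >> 1];
// the table stores one .short per even angle, hence the (angle & ~1)
// byte offset used for the load.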
const z1_upsample_edge_kernel
.short -1, 9, 9, -1, -1, 9, 9, -1
endconst
const ipred_filter_edge_kernel1
.short 0, 4, 8, 4, 0, 4, 8, 4
.short 0, 5, 6, 5, 0, 5, 6, 5
.short 2, 4, 4, 4, 2, 4, 4, 4
endconst
const ipred_filter_edge_kernel2
.short 0, 0, 0, 0, 0, 0, 0, 0
.short 0, 0, 0, 0, 0, 0, 0, 0
.short 2, 2, 2, 2, 2, 2, 2, 2
endconst
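// upsample_edge doubles the edge resolution with the 4-tap kernel
// (-1, 9, 9, -1): even outputs copy the input pixels and odd outputs
// are, roughly,
//   out[2*i+1] = iclip_pixel((-in[i-1] + 9*in[i] + 9*in[i+1] - in[i+2]
//                             + 8) >> 4);
// (with the ends clamped to the edge). filter_edge instead smooths the
// edge in place with one of the three 5-tap kernels above, selected by
// filter strength 1..3.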
.macro z1_upsample_edge_calc_loop
vsllwil.hu.bu vr10, vr7, 0
vsllwil.hu.bu vr11, vr11, 0
vsllwil.hu.bu vr12, vr12, 0
vsllwil.hu.bu vr13, vr13, 0
vmul.h vr10, vr10, vr0
vmul.h vr11, vr11, vr0
vmul.h vr12, vr12, vr0
vmul.h vr13, vr13, vr0
vhaddw.w.h vr10, vr10, vr10
vhaddw.w.h vr11, vr11, vr11
vhaddw.w.h vr12, vr12, vr12
vhaddw.w.h vr13, vr13, vr13
vhaddw.d.w vr10, vr10, vr10
vhaddw.d.w vr11, vr11, vr11
vhaddw.d.w vr12, vr12, vr12
vhaddw.d.w vr13, vr13, vr13
vpackev.h vr10, vr11, vr10
vpackev.h vr11, vr13, vr12
vpackev.w vr12, vr11, vr10 //s:01234567
vsrari.h vr12, vr12, 4
iclip_pixel_vrh vr12, vr15, vr16, vr10, vr11, vr12
vsrlni.b.h vr12, vr12, 0 //out: 13579...
vbsrl.v vr11, vr7, 1 //out:02468...
vilvl.b vr13, vr12, vr11
.endm
.macro z1_upsample_edge_data_init1
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vbsrl.v vr13, vr7, 3
z1_upsample_edge_calc_loop
.endm
.macro z1_upsample_edge_data_init2
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vextrins.b vr12, vr12, 0x76
vbsrl.v vr13, vr7, 3
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_upsample_edge_calc_loop
.endm
.macro z1_upsample_edge_calc_other
vsllwil.hu.bu vr10, vr7, 0
vmul.h vr10, vr10, vr0
vhaddw.w.h vr10, vr10, vr10
vhaddw.d.w vr10, vr10, vr10
vreplvei.h vr12, vr10, 0 //s0-s7
vsrari.h vr12, vr12, 4
iclip_pixel_vrh vr12, vr15, vr16, vr10, vr11, vr12
vsrlni.b.h vr12, vr12, 0
vilvl.b vr13, vr12, vr7
.endm
.macro z1_filter_edge_calc_loop1
vmul.h vr10, vr10, vr1
vmul.h vr11, vr11, vr1
vmul.h vr12, vr12, vr1
vmul.h vr13, vr13, vr1
vhaddw.w.h vr10, vr10, vr10
vhaddw.w.h vr11, vr11, vr11
vhaddw.w.h vr12, vr12, vr12
vhaddw.w.h vr13, vr13, vr13
vhaddw.d.w vr10, vr10, vr10
vhaddw.d.w vr11, vr11, vr11
vhaddw.d.w vr12, vr12, vr12
vhaddw.d.w vr13, vr13, vr13
vpackev.h vr10, vr11, vr10
vpackev.h vr11, vr13, vr12
vpackev.w vr10, vr11, vr10 //s:01234567
.endm
.macro z1_filter_edge_calc_loop2
vsllwil.hu.bu vr13, vr13, 0
vmadd.h vr10, vr13, vr6
vsrari.h vr12, vr10, 4
vsrlni.b.h vr12, vr12, 0 //out: 0-7
.endm
.macro z1_filter_edge_calc_other
vsllwil.hu.bu vr10, vr10, 0
vmul.h vr11, vr10, vr1
vhaddw.w.h vr11, vr11, vr11
vhaddw.d.w vr11, vr11, vr11
vreplvei.h vr12, vr11, 4
vextrins.h vr12, vr11, 0x00
vreplvei.h vr13, vr10, 1
vmadd.h vr12, vr13, vr6
vsrari.h vr12, vr12, 4
vsrlni.b.h vr12, vr12, 0 //out: 0-7
.endm
.macro z1_filter_edge_data_init1
vbsll.v vr10, vr7, 1
vextrins.b vr10, vr10, 0x01
vbsrl.v vr12, vr7, 1
vbsrl.v vr13, vr7, 2
vsllwil.hu.bu vr10, vr10, 0
vsllwil.hu.bu vr11, vr7, 0
vsllwil.hu.bu vr12, vr12, 0
vsllwil.hu.bu vr13, vr13, 0
z1_filter_edge_calc_loop1
.endm
.macro z1_filter_edge_data_init2
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vbsrl.v vr13, vr7, 3
vsllwil.hu.bu vr10, vr7, 0
vsllwil.hu.bu vr11, vr11, 0
vsllwil.hu.bu vr12, vr12, 0
vsllwil.hu.bu vr13, vr13, 0
z1_filter_edge_calc_loop1
.endm
.macro z1_filter_edge_data_init3
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vbsrl.v vr13, vr7, 3
vextrins.b vr13, vr13, 0x76
vsllwil.hu.bu vr10, vr7, 0
vsllwil.hu.bu vr11, vr11, 0
vsllwil.hu.bu vr12, vr12, 0
vsllwil.hu.bu vr13, vr13, 0
z1_filter_edge_calc_loop1
.endm
.macro z1_filter_edge_data_init4
vbsll.v vr10, vr7, 1
vextrins.b vr10, vr10, 0x01
vbsrl.v vr12, vr7, 1
vbsrl.v vr13, vr7, 2
vextrins.b vr13, vr13, 0x76
vsllwil.hu.bu vr10, vr10, 0
vsllwil.hu.bu vr11, vr7, 0
vsllwil.hu.bu vr12, vr12, 0
vsllwil.hu.bu vr13, vr13, 0
z1_filter_edge_calc_loop1
.endm
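// pixel_set_8bpc_allw: like pixel_set_8bpc, but handles arbitrary widths;
// vector stores cover the multiples of 16/8/4 bytes and the remaining
// 1-3 pixels are written with byte stores.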
.macro pixel_set_8bpc_allw dst_ptr, src_ptr, width, tmp0, tmp1
vldrepl.b vr10, \src_ptr, 0
or \tmp1, zero, zero
srai.d \tmp0, \width, 4
beqz \tmp0, 2f
1:
vstx vr10, \dst_ptr, \tmp1
addi.d \tmp1, \tmp1, 16
addi.d \tmp0, \tmp0, -1
bnez \tmp0, 1b
2:
andi \tmp0, \width, 8
beqz \tmp0, 3f
fstx.d f10, \dst_ptr, \tmp1
addi.d \tmp1, \tmp1, 8
3:
andi \tmp0, \width, 4
beqz \tmp0, 4f
fstx.s f10, \dst_ptr, \tmp1
addi.d \tmp1, \tmp1, 4
4:
andi \tmp0, \width, 2
beqz \tmp0, 5f
ldx.bu \tmp0, \src_ptr, zero
stx.b \tmp0, \dst_ptr, \tmp1
addi.d \tmp1, \tmp1, 1
stx.b \tmp0, \dst_ptr, \tmp1
addi.d \tmp1, \tmp1, 1
5:
andi \tmp0, \width, 1
beqz \tmp0, 6f
ldx.bu \tmp0, \src_ptr, zero
stx.b \tmp0, \dst_ptr, \tmp1
6:
.endm
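// ipred_z1 first builds the top edge in a stack buffer (upsampling it
// with upsample_edge or smoothing it with filter_edge when
// enable_intra_edge_filter allows), then projects each output row along
// the prediction angle using the dx step from dav1d_dr_intra_derivative.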
// void ipred_z1_lsx(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft_in,
// const int width, const int height, int angle,
// const int max_width, const int max_height
// HIGHBD_DECL_SUFFIX)
function ipred_z1_8bpc_lsx
addi.d a2, a2, 1 //&topleft_in[1]
addi.d sp, sp, -128
or t2, sp, sp //top_out
srai.d a6, a5, 9
andi a6, a6, 1 //is_sm
srai.d a7, a5, 10 //enable_intra_edge_filter
andi a5, a5, 511
la.local t0, dav1d_dr_intra_derivative
andi t1, a5, 0xFFE
ldx.hu t1, t0, t1 //dx
beqz a7, .IPRED_Z1_NOTUA
add.d t3, a3, a4
li.w t4, 90
sub.w t4, t4, a5
// ipred_get_upsample t5:upsample_above
li.w t6, 16
sra.d t6, t6, a6
bge t6, t3, .Z1_GETUS1
addi.d t5, zero, 0
b .Z1_GETUS2
.Z1_GETUS1:
addi.d t5, zero, 1
.Z1_GETUS2:
li.w t6, 40
blt t4, t6, .Z1_GETUS3
addi.d t6, zero, 0
b .Z1_GETUS4
.Z1_GETUS3:
addi.d t6, zero, 1
.Z1_GETUS4:
and t5, t5, t6
beqz t5, .IPRED_Z1_NOTUA
la.local t0, z1_upsample_edge_kernel
vld vr0, t0, 0 //kernel
vxor.v vr15, vr15, vr15
li.w t0, 255
vreplgr2vr.h vr16, t0
.Z1_UEDGE_W4:
andi t6, a3, 4
beqz t6, .Z1_UEDGE_W8
.Z1_UEDGE_W4_H4:
andi t6, a4, 4
beqz t6, .Z1_UEDGE_W4_H8
//0-6
vld vr7, a2, -1
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vextrins.b vr12, vr12, 0x76
vbsrl.v vr13, vr7, 3
z1_upsample_edge_calc_loop
fst.d f13, t2, 0
vstelm.w vr13, t2, 8, 2
vstelm.h vr13, t2, 12, 6
ld.bu t7, a2, 7
st.b t7, t2, 14
b .Z1_UEDGE_END
.Z1_UEDGE_W4_H8:
andi t6, a4, 8
beqz t6, .Z1_UEDGE_W4_H16
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init2
vst vr13, t2, 0
//8-10
vldrepl.b vr7, a2, 7
z1_upsample_edge_calc_other
vstelm.w vr13, t2, 16, 0
vstelm.h vr13, t2, 20, 2
ld.bu t7, a2, 7
st.b t7, t2, 22
b .Z1_UEDGE_END
.Z1_UEDGE_W4_H16:
andi t6, a4, 16
beqz t6, .Z1_UEDGE_W4_H32
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init2
vst vr13, t2, 0
//8-15
vldrepl.b vr7, a2, 7
z1_upsample_edge_calc_other
vst vr13, t2, 16
//16-18
vstelm.w vr13, t2, 32, 0
vstelm.h vr13, t2, 36, 2
ld.bu t7, a2, 7
st.b t7, t2, 38
b .Z1_UEDGE_END
.Z1_UEDGE_W4_H32:
andi t6, a4, 32
beqz t6, .Z1_UEDGE_W4_H64
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init2
vst vr13, t2, 0
//8-15
vldrepl.b vr7, a2, 7
z1_upsample_edge_calc_other
vst vr13, t2, 16
vst vr13, t2, 32 //16-23
vst vr13, t2, 48 //24-31
//32-34
vstelm.w vr13, t2, 64, 0
vstelm.h vr13, t2, 68, 2
ld.bu t7, a2, 7
st.b t7, t2, 70
b .Z1_UEDGE_END
.Z1_UEDGE_W4_H64:
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init2
vst vr13, t2, 0
//8-15
vldrepl.b vr7, a2, 7
z1_upsample_edge_calc_other
vst vr13, t2, 16
vst vr13, t2, 32 //16-23
vst vr13, t2, 48 //24-31
vst vr13, t2, 64 //32-39
vst vr13, t2, 80 //40-47
vst vr13, t2, 96 //48-55
vst vr13, t2, 112 //56-63
//64-66
vstelm.w vr13, t2, 128, 0
vstelm.h vr13, t2, 132, 2
ld.bu t7, a2, 7
st.b t7, t2, 134
b .Z1_UEDGE_END
.Z1_UEDGE_W8:
andi t6, a3, 8
beqz t6, .Z1_UEDGE_W16
.Z1_UEDGE_W8_H4:
andi t6, a4, 4
beqz t6, .Z1_UEDGE_W8_H8
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vextrins.b vr12, vr12, 0x32
vbsrl.v vr13, vr7, 3
vextrins.b vr13, vr13, 0x21
vextrins.b vr13, vr13, 0x31
z1_upsample_edge_calc_loop
vstelm.w vr13, t2, 16, 0
vstelm.h vr13, t2, 20, 2
ld.bu t7, a2, 11
st.b t7, t2, 22
b .Z1_UEDGE_END
.Z1_UEDGE_W8_H8:
andi t6, a4, 8
beqz t6, .Z1_UEDGE_W8_H16
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-14
vld vr7, a2, 7
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vextrins.b vr12, vr12, 0x76
vbsrl.v vr13, vr7, 3
z1_upsample_edge_calc_loop
fst.d f13, t2, 16
vstelm.w vr13, t2, 24, 2
vstelm.h vr13, t2, 28, 6
ld.bu t7, a2, 15
st.b t7, t2, 30
b .Z1_UEDGE_END
.Z1_UEDGE_W8_H16:
andi t6, a4, 16
beqz t6, .Z1_UEDGE_W8_H32
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init2
vst vr13, t2, 16
//16-22
vldrepl.b vr7, a2, 15
z1_upsample_edge_calc_other
fst.d f13, t2, 32
vstelm.w vr13, t2, 40, 2
vstelm.h vr13, t2, 44, 6
ld.bu t7, a2, 15
st.b t7, t2, 46
b .Z1_UEDGE_END
.Z1_UEDGE_W8_H32:
andi t6, a4, 32
beqz t6, .Z1_UEDGE_W8_H64
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init2
vst vr13, t2, 16
//16-23
vldrepl.b vr7, a2, 15
z1_upsample_edge_calc_other
vst vr13, t2, 32
vst vr13, t2, 48 //24-31
//32-38
fst.d f13, t2, 64
vstelm.w vr13, t2, 72, 2
vstelm.h vr13, t2, 76, 6
ld.bu t7, a2, 15
st.b t7, t2, 78
b .Z1_UEDGE_END
.Z1_UEDGE_W8_H64:
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init2
vst vr13, t2, 16
//16-23
vldrepl.b vr7, a2, 15
z1_upsample_edge_calc_other
vst vr13, t2, 32
vst vr13, t2, 48 //24-31
vst vr13, t2, 64 //32-39
vst vr13, t2, 80 //40-47
vst vr13, t2, 96 //48-55
vst vr13, t2, 112 //56-63
//64-70
fst.d f13, t2, 128
vstelm.w vr13, t2, 136, 2
vstelm.h vr13, t2, 140, 6
ld.bu t7, a2, 15
st.b t7, t2, 142
b .Z1_UEDGE_END
.Z1_UEDGE_W16:
andi t6, a3, 16
beqz t6, .Z1_UEDGE_W32
.Z1_UEDGE_W16_H4:
andi t6, a4, 4
beqz t6, .Z1_UEDGE_W16_H8
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-18
vld vr7, a2, 15
z1_upsample_edge_data_init1
vstelm.w vr13, t2, 32, 0
vstelm.h vr13, t2, 36, 2
ld.bu t7, a2, 19
st.b t7, t2, 38
b .Z1_UEDGE_END
.Z1_UEDGE_W16_H8:
andi t6, a4, 8
beqz t6, .Z1_UEDGE_W16_H16
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-22
vld vr7, a2, 15
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vextrins.b vr12, vr12, 0x76
vbsrl.v vr13, vr7, 3
z1_upsample_edge_calc_loop
fst.d f13, t2, 32
vstelm.w vr13, t2, 40, 2
vstelm.h vr13, t2, 44, 6
ld.bu t7, a2, 23
st.b t7, t2, 46
b .Z1_UEDGE_END
.Z1_UEDGE_W16_H16:
andi t6, a4, 16
beqz t6, .Z1_UEDGE_W16_H32
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-23
vld vr7, a2, 15
z1_upsample_edge_data_init1
vst vr13, t2, 32
//24-30
vld vr7, a2, 23
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vextrins.b vr12, vr12, 0x76
vbsrl.v vr13, vr7, 3
z1_upsample_edge_calc_loop
fst.d f13, t2, 48
vstelm.w vr13, t2, 56, 2
vstelm.h vr13, t2, 60, 6
ld.bu t7, a2, 31
st.b t7, t2, 62
b .Z1_UEDGE_END
.Z1_UEDGE_W16_H32:
andi t6, a4, 32
beqz t6, .Z1_UEDGE_W16_H64
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-23
vld vr7, a2, 15
z1_upsample_edge_data_init1
vst vr13, t2, 32
//24-31
vld vr7, a2, 23
z1_upsample_edge_data_init2
vst vr13, t2, 48
//32-39
vldrepl.b vr7, a2, 31
z1_upsample_edge_calc_other
vst vr13, t2, 64
//40-46
fst.d f13, t2, 80
vstelm.w vr13, t2, 88, 2
vstelm.h vr13, t2, 92, 6
ld.bu t7, a2, 31
st.b t7, t2, 94
b .Z1_UEDGE_END
.Z1_UEDGE_W16_H64:
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-23
vld vr7, a2, 15
z1_upsample_edge_data_init1
vst vr13, t2, 32
//24-31
vld vr7, a2, 23
z1_upsample_edge_data_init2
vst vr13, t2, 48
//32-39
vldrepl.b vr7, a2, 31
z1_upsample_edge_calc_other
vst vr13, t2, 64
vst vr13, t2, 80 //40-47
vst vr13, t2, 96 //48-55
vst vr13, t2, 112 //56-63
vst vr13, t2, 128 //64-71
//72-78
fst.d f13, t2, 144
vstelm.w vr13, t2, 152, 2
vstelm.h vr13, t2, 156, 6
ld.bu t7, a2, 31
st.b t7, t2, 158
b .Z1_UEDGE_END
.Z1_UEDGE_W32:
andi t6, a3, 32
beqz t6, .Z1_UEDGE_W64
.Z1_UEDGE_W32_H8:
andi t6, a4, 8
beqz t6, .Z1_UEDGE_W32_H16
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-23
vld vr7, a2, 15
z1_upsample_edge_data_init1
vst vr13, t2, 32
//24-31
vld vr7, a2, 23
z1_upsample_edge_data_init1
vst vr13, t2, 48
//32-38
vld vr7, a2, 31
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vextrins.b vr12, vr12, 0x76
vbsrl.v vr13, vr7, 3
z1_upsample_edge_calc_loop
fst.d f13, t2, 64
vstelm.w vr13, t2, 72, 2
vstelm.h vr13, t2, 76, 6
ld.bu t7, a2, 39
st.b t7, t2, 78
b .Z1_UEDGE_END
.Z1_UEDGE_W32_H16:
andi t6, a4, 16
beqz t6, .Z1_UEDGE_W32_H32
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-23
vld vr7, a2, 15
z1_upsample_edge_data_init1
vst vr13, t2, 32
//24-31
vld vr7, a2, 23
z1_upsample_edge_data_init1
vst vr13, t2, 48
//32-39
vld vr7, a2, 31
z1_upsample_edge_data_init1
vst vr13, t2, 64
//40-46
vld vr7, a2, 39
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vextrins.b vr12, vr12, 0x76
vbsrl.v vr13, vr7, 3
z1_upsample_edge_calc_loop
fst.d f13, t2, 80
vstelm.w vr13, t2, 88, 2
vstelm.h vr13, t2, 92, 6
ld.bu t7, a2, 47
st.b t7, t2, 94
b .Z1_UEDGE_END
.Z1_UEDGE_W32_H32:
andi t6, a4, 32
beqz t6, .Z1_UEDGE_W32_H64
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-23
vld vr7, a2, 15
z1_upsample_edge_data_init1
vst vr13, t2, 32
//24-31
vld vr7, a2, 23
z1_upsample_edge_data_init1
vst vr13, t2, 48
//32-39
vld vr7, a2, 31
z1_upsample_edge_data_init1
vst vr13, t2, 64
//40-47
vld vr7, a2, 39
z1_upsample_edge_data_init1
vst vr13, t2, 80
//48-55
vld vr7, a2, 47
z1_upsample_edge_data_init1
vst vr13, t2, 96
//56-62
vld vr7, a2, 55
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vextrins.b vr12, vr12, 0x76
vbsrl.v vr13, vr7, 3
z1_upsample_edge_calc_loop
fst.d f13, t2, 112
vstelm.w vr13, t2, 120, 2
vstelm.h vr13, t2, 124, 6
ld.bu t7, a2, 63
st.b t7, t2, 126
b .Z1_UEDGE_END
.Z1_UEDGE_W32_H64:
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-23
vld vr7, a2, 15
z1_upsample_edge_data_init1
vst vr13, t2, 32
//24-31
vld vr7, a2, 23
z1_upsample_edge_data_init1
vst vr13, t2, 48
//32-39
vld vr7, a2, 31
z1_upsample_edge_data_init1
vst vr13, t2, 64
//40-47
vld vr7, a2, 39
z1_upsample_edge_data_init1
vst vr13, t2, 80
//48-55
vld vr7, a2, 47
z1_upsample_edge_data_init1
vst vr13, t2, 96
//56-63
vld vr7, a2, 55
z1_upsample_edge_data_init2
vst vr13, t2, 112
//64-71
vldrepl.b vr7, a2, 63
z1_upsample_edge_calc_other
vst vr13, t2, 128
vst vr13, t2, 144 //72-79
vst vr13, t2, 160 //80-87
//88-94
fst.d f13, t2, 176
vstelm.w vr13, t2, 184, 2
vstelm.h vr13, t2, 188, 6
ld.bu t7, a2, 63
st.b t7, t2, 190
b .Z1_UEDGE_END
.Z1_UEDGE_W64:
.Z1_UEDGE_W64_H16:
andi t6, a4, 16
beqz t6, .Z1_UEDGE_W64_H32
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-23
vld vr7, a2, 15
z1_upsample_edge_data_init1
vst vr13, t2, 32
//24-31
vld vr7, a2, 23
z1_upsample_edge_data_init1
vst vr13, t2, 48
//32-39
vld vr7, a2, 31
z1_upsample_edge_data_init1
vst vr13, t2, 64
//40-47
vld vr7, a2, 39
z1_upsample_edge_data_init1
vst vr13, t2, 80
//48-55
vld vr7, a2, 47
z1_upsample_edge_data_init1
vst vr13, t2, 96
//56-63
vld vr7, a2, 55
z1_upsample_edge_data_init1
vst vr13, t2, 112
//64-71
vld vr7, a2, 63
z1_upsample_edge_data_init1
vst vr13, t2, 128
//72-78
vld vr7, a2, 71
z1_upsample_edge_data_init2
fst.d f13, t2, 144
vstelm.w vr13, t2, 152, 2
vstelm.h vr13, t2, 156, 6
ld.bu t7, a2, 79
st.b t7, t2, 158
b .Z1_UEDGE_END
.Z1_UEDGE_W64_H32:
andi t6, a4, 32
beqz t6, .Z1_UEDGE_W64_H64
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-23
vld vr7, a2, 15
z1_upsample_edge_data_init1
vst vr13, t2, 32
//24-31
vld vr7, a2, 23
z1_upsample_edge_data_init1
vst vr13, t2, 48
//32-39
vld vr7, a2, 31
z1_upsample_edge_data_init1
vst vr13, t2, 64
//40-47
vld vr7, a2, 39
z1_upsample_edge_data_init1
vst vr13, t2, 80
//48-55
vld vr7, a2, 47
z1_upsample_edge_data_init1
vst vr13, t2, 96
//56-63
vld vr7, a2, 55
z1_upsample_edge_data_init1
vst vr13, t2, 112
//64-71
vld vr7, a2, 63
z1_upsample_edge_data_init1
vst vr13, t2, 128
//72-79
vld vr7, a2, 71
z1_upsample_edge_data_init1
vst vr13, t2, 144
//80-87
vld vr7, a2, 79
z1_upsample_edge_data_init1
vst vr13, t2, 160
//88-94
vld vr7, a2, 87
z1_upsample_edge_data_init2
fst.d f13, t2, 176
vstelm.w vr13, t2, 184, 2
vstelm.h vr13, t2, 188, 6
ld.bu t7, a2, 95
st.b t7, t2, 190
b .Z1_UEDGE_END
.Z1_UEDGE_W64_H64:
//0-7
vld vr7, a2, -1
z1_upsample_edge_data_init1
vst vr13, t2, 0
//8-15
vld vr7, a2, 7
z1_upsample_edge_data_init1
vst vr13, t2, 16
//16-23
vld vr7, a2, 15
z1_upsample_edge_data_init1
vst vr13, t2, 32
//24-31
vld vr7, a2, 23
z1_upsample_edge_data_init1
vst vr13, t2, 48
//32-39
vld vr7, a2, 31
z1_upsample_edge_data_init1
vst vr13, t2, 64
//40-47
vld vr7, a2, 39
z1_upsample_edge_data_init1
vst vr13, t2, 80
//48-55
vld vr7, a2, 47
z1_upsample_edge_data_init1
vst vr13, t2, 96
//56-63
vld vr7, a2, 55
z1_upsample_edge_data_init1
vst vr13, t2, 112
//64-71
vld vr7, a2, 63
z1_upsample_edge_data_init1
vst vr13, t2, 128
//72-79
vld vr7, a2, 71
z1_upsample_edge_data_init1
vst vr13, t2, 144
//80-87
vld vr7, a2, 79
z1_upsample_edge_data_init1
vst vr13, t2, 160
//88-95
vld vr7, a2, 87
z1_upsample_edge_data_init1
vst vr13, t2, 176
//96-103
vld vr7, a2, 95
z1_upsample_edge_data_init1
vst vr13, t2, 192
//104-111
vld vr7, a2, 103
z1_upsample_edge_data_init1
vst vr13, t2, 208
//112-119
vld vr7, a2, 111
z1_upsample_edge_data_init1
vst vr13, t2, 224
//120-126
vld vr7, a2, 119
z1_upsample_edge_data_init2
fst.d f13, t2, 240
vstelm.w vr13, t2, 248, 2
vstelm.h vr13, t2, 252, 6
ld.bu t7, a2, 127
st.b t7, t2, 254
b .Z1_UEDGE_END
.Z1_UEDGE_END:
//upsample_edge end
or a7, t2, t2 //top
add.d t0, a3, a4
slli.d t0, t0, 1
addi.d t0, t0, -2 //max_base_x
slli.d t1, t1, 1
b .IPRED_Z1_UA_END
.IPRED_Z1_NOTUA:
or t5, zero, zero //upsample_above=0
beqz a7, .IPRED_Z1_NOTFS
add.d a7, a3, a4 //w+h
li.w t4, 90
sub.d t4, t4, a5
// ipred_get_filter_strength a6:filter_strength
beqz a6, .Z1_GETFS20
.Z1_GETFS10: //wh<=8
addi.d t6, a7, -8
blt zero, t6, .Z1_GETFS11
addi.d t6, t4, -64
blt t6, zero, .Z1_GETFS101
ori a6, zero, 2
b .Z1_GETFS40
.Z1_GETFS101:
addi.d t6, t4, -40
blt t6, zero, .Z1_GETFS30
ori a6, zero, 1
b .Z1_GETFS40
.Z1_GETFS11: //wh<=16
addi.d t6, a7, -16
blt zero, t6, .Z1_GETFS12
addi.d t6, t4, -48
blt t6, zero, .Z1_GETFS111
ori a6, zero, 2
b .Z1_GETFS40
.Z1_GETFS111:
addi.d t6, t4, -20
blt t6, zero, .Z1_GETFS30
ori a6, zero, 1
b .Z1_GETFS40
.Z1_GETFS12: //wh<=24
addi.d t6, a7, -24
blt zero, t6, .Z1_GETFS13
addi.d t6, t4, -4
blt t6, zero, .Z1_GETFS30
ori a6, zero, 3
b .Z1_GETFS40
.Z1_GETFS13:
ori a6, zero, 3
b .Z1_GETFS40
.Z1_GETFS20: //wh<=8
addi.d t6, a7, -8
blt zero, t6, .Z1_GETFS21
addi.d t6, t4, -56
blt t6, zero, .Z1_GETFS30
ori a6, zero, 1
b .Z1_GETFS40
.Z1_GETFS21: //wh<=16
addi.d t6, a7, -16
blt zero, t6, .Z1_GETFS22
addi.d t6, t4, -40
blt t6, zero, .Z1_GETFS30
ori a6, zero, 1
b .Z1_GETFS40
.Z1_GETFS22: //wh<=24
addi.d t6, a7, -24
blt zero, t6, .Z1_GETFS23
addi.d t6, t4, -32
blt t6, zero, .Z1_GETFS221
ori a6, zero, 3
b .Z1_GETFS40
.Z1_GETFS221:
addi.d t6, t4, -16
blt t6, zero, .Z1_GETFS222
ori a6, zero, 2
b .Z1_GETFS40
.Z1_GETFS222:
addi.d t6, t4, -8
blt t6, zero, .Z1_GETFS30
ori a6, zero, 1
b .Z1_GETFS40
.Z1_GETFS23: //wh<=32
addi.d t6, a7, -32
blt zero, t6, .Z1_GETFS24
addi.d t6, t4, -32
blt t6, zero, .Z1_GETFS231
ori a6, zero, 3
b .Z1_GETFS40
.Z1_GETFS231:
addi.d t6, t4, -4
blt t6, zero, .Z1_GETFS232
ori a6, zero, 2
b .Z1_GETFS40
.Z1_GETFS232:
ori a6, zero, 1
b .Z1_GETFS40
.Z1_GETFS24:
ori a6, zero, 3
b .Z1_GETFS40
.Z1_GETFS30:
or a6, zero, zero
.Z1_GETFS40:
beqz a6, .IPRED_Z1_NOTFS
.IPRED_Z1_IFFS:
// filter_edge
addi.d a6, a6, -1
slli.d a6, a6, 4
la.local t0, ipred_filter_edge_kernel1
vldx vr1, t0, a6 //kernel[0-3]
la.local t0, ipred_filter_edge_kernel2
vldx vr6, t0, a6 //kernel[4]
.IPRED_Z1_FS_W4:
andi t0, a3, 4
beqz t0, .IPRED_Z1_FS_W8
.IPRED_Z1_FS_W4_H4:
andi t0, a4, 4
beqz t0, .IPRED_Z1_FS_W4_H8
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init4
vbsrl.v vr13, vr7, 3
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W4_H8:
andi t0, a4, 8
beqz t0, .IPRED_Z1_FS_W4_H16
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init4
vbsrl.v vr13, vr7, 3
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-11
vreplvei.b vr10, vr7, 8
vextrins.b vr10, vr7, 0x07
z1_filter_edge_calc_other
fst.s f12, t2, 8
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W4_H16:
andi t0, a4, 16
beqz t0, .IPRED_Z1_FS_W4_H32
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init4
vbsrl.v vr13, vr7, 3
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vreplvei.b vr10, vr7, 8
vextrins.b vr10, vr7, 0x07
z1_filter_edge_calc_other
fst.d f12, t2, 8
//16-19
vreplvei.b vr12, vr12, 1
fst.s f12, t2, 16
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W4_H32:
andi t0, a4, 32
beqz t0, .IPRED_Z1_FS_W4_H64
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init4
vbsrl.v vr13, vr7, 3
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vreplvei.b vr10, vr7, 8
vextrins.b vr10, vr7, 0x07
z1_filter_edge_calc_other
fst.d f12, t2, 8
//16-23
vreplvei.b vr12, vr12, 1
fst.d f12, t2, 16
fst.d f12, t2, 24 //24-31
fst.s f12, t2, 32 //32-35
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W4_H64:
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init4
vbsrl.v vr13, vr7, 3
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vreplvei.b vr10, vr7, 8
vextrins.b vr10, vr7, 0x07
z1_filter_edge_calc_other
fst.d f12, t2, 8
//16-23
vreplvei.b vr12, vr12, 1
fst.d f12, t2, 16
fst.d f12, t2, 24 //24-31
fst.d f12, t2, 32 //32-39
fst.d f12, t2, 40 //40-47
fst.d f12, t2, 48 //48-55
fst.d f12, t2, 56 //56-63
fst.s f12, t2, 64 //64-67
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W8:
andi t0, a3, 8
beqz t0, .IPRED_Z1_FS_W16
.IPRED_Z1_FS_W8_H4:
andi t0, a4, 4
beqz t0, .IPRED_Z1_FS_W8_H8
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-11
vld vr7, a2, 6
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vbsrl.v vr13, vr7, 3
vextrins.b vr13, vr13, 0x32
vsllwil.hu.bu vr10, vr7, 0
vsllwil.hu.bu vr11, vr11, 0
vsllwil.hu.bu vr12, vr12, 0
vsllwil.hu.bu vr13, vr13, 0
z1_filter_edge_calc_loop1
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x21
vextrins.b vr13, vr13, 0x31
z1_filter_edge_calc_loop2
fst.s f12, t2, 8
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W8_H8:
andi t0, a4, 8
beqz t0, .IPRED_Z1_FS_W8_H16
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W8_H16:
andi t0, a4, 16
beqz t0, .IPRED_Z1_FS_W8_H32
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vreplvei.b vr10, vr7, 9
vextrins.b vr10, vr7, 0x08
z1_filter_edge_calc_other
fst.d f12, t2, 16
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W8_H32:
andi t0, a4, 32
beqz t0, .IPRED_Z1_FS_W8_H64
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vreplvei.b vr10, vr7, 9
vextrins.b vr10, vr7, 0x08
z1_filter_edge_calc_other
fst.d f12, t2, 16
//24-31
vreplvei.b vr12, vr12, 1
fst.d f12, t2, 24
//32-39
fst.d f12, t2, 32
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W8_H64:
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vreplvei.b vr10, vr7, 9
vextrins.b vr10, vr7, 0x08
z1_filter_edge_calc_other
fst.d f12, t2, 16
//24-31
vreplvei.b vr12, vr12, 1
fst.d f12, t2, 24
fst.d f12, t2, 32 //32-39
fst.d f12, t2, 40 //40-47
fst.d f12, t2, 48 //48-55
fst.d f12, t2, 56 //56-63
fst.d f12, t2, 64 //64-71
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W16:
andi t0, a3, 16
beqz t0, .IPRED_Z1_FS_W32
.IPRED_Z1_FS_W16_H4:
andi t0, a4, 4
beqz t0, .IPRED_Z1_FS_W16_H8
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-19
vld vr7, a2, 14
vbsrl.v vr11, vr7, 1
vbsrl.v vr12, vr7, 2
vbsrl.v vr13, vr7, 3
vextrins.b vr13, vr13, 0x32
vsllwil.hu.bu vr10, vr7, 0
vsllwil.hu.bu vr11, vr11, 0
vsllwil.hu.bu vr12, vr12, 0
vsllwil.hu.bu vr13, vr13, 0
z1_filter_edge_calc_loop1
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x21
vextrins.b vr13, vr13, 0x31
z1_filter_edge_calc_loop2
fst.s f12, t2, 16
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W16_H8:
andi t0, a4, 8
beqz t0, .IPRED_Z1_FS_W16_H16
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W16_H16:
andi t0, a4, 16
beqz t0, .IPRED_Z1_FS_W16_H32
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
//24-31
vld vr7, a2, 22
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 24
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W16_H32:
andi t0, a4, 32
beqz t0, .IPRED_Z1_FS_W16_H64
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
//24-31
vld vr7, a2, 22
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 24
//32-39
vreplvei.b vr10, vr7, 9
vextrins.b vr10, vr7, 0x08
z1_filter_edge_calc_other
fst.d f12, t2, 32
//40-47
vreplvei.b vr12, vr12, 1
fst.d f12, t2, 40
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W16_H64:
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
//24-31
vld vr7, a2, 22
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 24
//32-39
vreplvei.b vr10, vr7, 9
vextrins.b vr10, vr7, 0x08
z1_filter_edge_calc_other
fst.d f12, t2, 32
//40-47
vreplvei.b vr12, vr12, 1
fst.d f12, t2, 40
fst.d f12, t2, 48 //48-55
fst.d f12, t2, 56 //56-63
fst.d f12, t2, 64 //64-71
fst.d f12, t2, 72 //72-79
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W32:
andi t0, a3, 32
beqz t0, .IPRED_Z1_FS_W64
.IPRED_Z1_FS_W32_H8:
andi t0, a4, 8
beqz t0, .IPRED_Z1_FS_W32_H16
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
//24-31
vld vr7, a2, 22
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 24
//32-39
vld vr7, a2, 30
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 32
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W32_H16:
andi t0, a4, 16
beqz t0, .IPRED_Z1_FS_W32_H32
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
//24-31
vld vr7, a2, 22
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 24
//32-39
vld vr7, a2, 30
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 32
//40-47
vld vr7, a2, 38
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 40
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W32_H32:
andi t0, a4, 32
beqz t0, .IPRED_Z1_FS_W32_H64
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
//24-31
vld vr7, a2, 22
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 24
//32-39
vld vr7, a2, 30
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 32
//40-47
vld vr7, a2, 38
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 40
//48-55
vld vr7, a2, 46
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 48
//56-63
vld vr7, a2, 54
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 56
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W32_H64:
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
//24-31
vld vr7, a2, 22
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 24
//32-39
vld vr7, a2, 30
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 32
//40-47
vld vr7, a2, 38
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 40
//48-55
vld vr7, a2, 46
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 48
//56-63
vld vr7, a2, 54
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 56
//64-71
vreplvei.b vr10, vr7, 9
vextrins.b vr10, vr7, 0x08
z1_filter_edge_calc_other
fst.d f12, t2, 64
//72-79
vreplvei.b vr12, vr12, 1
fst.d f12, t2, 72
fst.d f12, t2, 80 //80-87
fst.d f12, t2, 88 //88-95
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W64:
.IPRED_Z1_FS_W64_H16:
andi t0, a4, 16
beqz t0, .IPRED_Z1_FS_W64_H32
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
//24-31
vld vr7, a2, 22
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 24
//32-39
vld vr7, a2, 30
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 32
//40-47
vld vr7, a2, 38
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 40
//48-55
vld vr7, a2, 46
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 48
//56-63
vld vr7, a2, 54
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 56
//64-71
vld vr7, a2, 62
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 64
//72-79
vld vr7, a2, 70
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 72
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W64_H32:
andi t0, a4, 32
beqz t0, .IPRED_Z1_FS_W64_H64
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
//24-31
vld vr7, a2, 22
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 24
//32-39
vld vr7, a2, 30
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 32
//40-47
vld vr7, a2, 38
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 40
//48-55
vld vr7, a2, 46
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 48
//56-63
vld vr7, a2, 54
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 56
//64-71
vld vr7, a2, 62
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 64
//72-79
vld vr7, a2, 70
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 72
//80-87
vld vr7, a2, 78
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 80
//88-95
vld vr7, a2, 86
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 88
b .IPRED_Z1_FS_END
.IPRED_Z1_FS_W64_H64:
//0-7
vld vr7, a2, -1
z1_filter_edge_data_init1
vbsrl.v vr13, vr7, 3
z1_filter_edge_calc_loop2
fst.d f12, t2, 0
//8-15
vld vr7, a2, 6
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 8
//16-23
vld vr7, a2, 14
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 16
//24-31
vld vr7, a2, 22
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 24
//32-39
vld vr7, a2, 30
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 32
//40-47
vld vr7, a2, 38
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 40
//48-55
vld vr7, a2, 46
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 48
//56-63
vld vr7, a2, 54
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 56
//64-71
vld vr7, a2, 62
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 64
//72-79
vld vr7, a2, 70
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 72
//80-87
vld vr7, a2, 78
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 80
//88-95
vld vr7, a2, 86
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 88
//96-103
vld vr7, a2, 94
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 96
//104-111
vld vr7, a2, 102
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 104
//112-119
vld vr7, a2, 110
z1_filter_edge_data_init2
vbsrl.v vr13, vr7, 4
z1_filter_edge_calc_loop2
fst.d f12, t2, 112
//120-127
vld vr7, a2, 118
z1_filter_edge_data_init3
vbsrl.v vr13, vr7, 4
vextrins.b vr13, vr13, 0x65
vextrins.b vr13, vr13, 0x75
z1_filter_edge_calc_loop2
fst.d f12, t2, 120
.IPRED_Z1_FS_END:
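// all filter-strength cases converge here with the filtered edge
// stored in the stack buffer at t2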
addi.d t0, a7, -1 //max_base_x
or a7, t2, t2 //top
b .IPRED_Z1_UA_END
.IPRED_Z1_NOTFS:
or a7, a2, a2 //top
// max_base_x = width + imin(width, height) - 1
blt a3, a4, .Z1_IMIN1
or t0, a4, a4
b .Z1_IMIN2
.Z1_IMIN1:
or t0, a3, a3
.Z1_IMIN2:
add.d t0, a3, t0
addi.d t0, t0, -1 //max_base_x
.IPRED_Z1_UA_END:
//prediction loops: a0 = dst, t1 = dx; a2, a6, t6 and t7 are scratch
beqz t5, .Z1_UA0 //t5 != 0: upsampled top edge
li.w a5, 64
vreplgr2vr.h vr0, a5 //vr0 = 64
vsrai.h vr7, vr0, 1 //vr7 = 32 (rounding bias)
or t2, zero, zero //y
or t3, t1, t1 //xpos
.Z1_LOOPY:
andi t4, t3, 0x3e //frac
vreplgr2vr.h vr1, t4
vsub.h vr2, vr0, vr1
or a6, zero, zero //x
or a2, zero, zero //base_num
srai.d t6, t3, 6 //base
or t7, t6, t6
bge t7, t0, .Z1_LOOPX
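// count the output pixels whose base stays below max_base_x;
// the upsampled edge advances base by 2 per output pixel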
.Z1_BASENUM:
addi.d a2, a2, 1
addi.d t7, t7, 2
blt t7, t0, .Z1_BASENUM
.Z1_LOOPX:
blt a2, a3, .Z1_LOOPX_BASEMAX
srai.d t8, a3, 3 //loop param
beqz t8, .Z1_LOOPX_W4
.Z1_LOOPX_W8:
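// even/odd bytes of the upsampled edge are top[base] and top[base+1];
// out = (even * (64 - frac) + odd * frac + 32) >> 6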
add.d t5, a7, t6
vld vr3, t5, 0
vpickev.b vr5, vr3, vr3 //0 2 4 6...
vpickod.b vr6, vr3, vr3 //1 3 5 7...
vsllwil.hu.bu vr5, vr5, 0
vsllwil.hu.bu vr6, vr6, 0
vmul.h vr3, vr5, vr2
vmadd.h vr3, vr6, vr1
vadd.h vr3, vr3, vr7
vsrai.h vr3, vr3, 6
vsrlni.b.h vr3, vr3, 0
fstx.d f3, a0, a6
addi.d a6, a6, 8
addi.d t6, t6, 16
addi.d t8, t8, -1
bnez t8, .Z1_LOOPX_W8
b .Z1_LOOPY_END
.Z1_LOOPX_W4:
vldx vr3, a7, t6
vsllwil.hu.bu vr3, vr3, 0
vpickev.h vr5, vr3, vr3 //0 2 4 6...
vpickod.h vr6, vr3, vr3 //1 3 5 7...
vmul.h vr3, vr5, vr2
vmadd.h vr3, vr6, vr1
vadd.h vr3, vr3, vr7
vsrai.h vr3, vr3, 6
vsrlni.b.h vr3, vr3, 0
fstx.s f3, a0, a6
b .Z1_LOOPY_END
.Z1_LOOPX_BASEMAX:
srai.d t8, a2, 3 //loop param
beqz t8, .Z1_LOOPX_BASEMAX4
.Z1_LOOPX_BASEMAX8:
add.d t5, a7, t6
vld vr3, t5, 0
vpickev.b vr5, vr3, vr3 //0 2 4 6...
vpickod.b vr6, vr3, vr3 //1 3 5 7...
vsllwil.hu.bu vr5, vr5, 0
vsllwil.hu.bu vr6, vr6, 0
vmul.h vr3, vr5, vr2
vmadd.h vr3, vr6, vr1
vadd.h vr3, vr3, vr7
vsrai.h vr3, vr3, 6
vsrlni.b.h vr3, vr3, 0
fstx.d f3, a0, a6
addi.d a6, a6, 8
addi.d t6, t6, 16
addi.d t8, t8, -1
bnez t8, .Z1_LOOPX_BASEMAX8
.Z1_LOOPX_BASEMAX4:
andi t8, a2, 4
beqz t8, .Z1_LOOPX_BASEMAX2
vldx vr3, a7, t6
vsllwil.hu.bu vr3, vr3, 0
vpickev.h vr5, vr3, vr3 //0 2 4 6...
vpickod.h vr6, vr3, vr3 //1 3 5 7...
vmul.h vr3, vr5, vr2
vmadd.h vr3, vr6, vr1
vadd.h vr3, vr3, vr7
vsrai.h vr3, vr3, 6
vsrlni.b.h vr3, vr3, 0
fstx.s f3, a0, a6
addi.d a6, a6, 4
addi.d t6, t6, 8
.Z1_LOOPX_BASEMAX2:
andi t8, a2, 2
beqz t8, .Z1_LOOPX_BASEMAX1
vldx vr3, a7, t6
vsllwil.hu.bu vr3, vr3, 0
vpickev.h vr5, vr3, vr3 //0 2 4 6...
vpickod.h vr6, vr3, vr3 //1 3 5 7...
vmul.h vr3, vr5, vr2
vmadd.h vr3, vr6, vr1
vadd.h vr3, vr3, vr7
vsrai.h vr3, vr3, 6
vsrlni.b.h vr3, vr3, 0
vpickve2gr.bu t7, vr3, 0
vpickve2gr.bu t8, vr3, 1
stx.b t7, a0, a6
addi.d a6, a6, 1
stx.b t8, a0, a6
addi.d a6, a6, 1
addi.d t6, t6, 4
.Z1_LOOPX_BASEMAX1:
andi t8, a2, 1
beqz t8, .Z1_LOOPX_BASEMAX_MSET
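// scalar tail: (top[base]*(64 - frac) + top[base+1]*frac + 32) >> 6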
add.d a2, a7, t6
sub.d t7, a5, t4
ld.bu t8, a2, 0
mul.w t7, t7, t8
ld.bu t8, a2, 1
mul.w t8, t8, t4
add.d t7, t7, t8
addi.d t7, t7, 32
srai.d t7, t7, 6
stx.b t7, a0, a6
addi.d a6, a6, 1
.Z1_LOOPX_BASEMAX_MSET: //memset
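// replicate top[max_base_x] across the pixels past the edge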
add.d t6, a0, a6 //dst
add.d t7, a7, t0 //src
sub.d a2, a3, a6 //size
pixel_set_8bpc_allw t6, t7, a2, t8, t4
.Z1_LOOPY_END:
addi.d t2, t2, 1
add.d a0, a0, a1
add.d t3, t3, t1
blt t2, a4, .Z1_LOOPY
b .Z1_END
.Z1_UA0:
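// upsample_above == 0: interpolate adjacent edge bytes directly;
// base advances 1 per output pixel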
li.w a5, 64
vreplgr2vr.h vr0, a5 //vr0 = 64
vsrai.h vr7, vr0, 1 //vr7 = 32 (rounding bias)
or t2, zero, zero //y
or t3, t1, t1 //xpos
.Z1_UA0_LOOPY:
andi t4, t3, 0x3e //frac
vreplgr2vr.h vr1, t4
vsub.h vr2, vr0, vr1
or a6, zero, zero //x
srai.d t6, t3, 6 //base
sub.d a2, t0, t6 //a2:base_num
blt a2, zero, .Z1_UA0_BASENUM
b .Z1_UA0_LOOPX
.Z1_UA0_BASENUM:
or a2, zero, zero
.Z1_UA0_LOOPX:
blt a2, a3, .Z1_UA0_LOOPX_BASEMAX
srai.d t8, a3, 3 //loop param
beqz t8, .Z1_UA0_LOOPX_W4
.Z1_UA0_LOOPX_W8:
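// vr5 = top[base..base+7], vr6 = top[base+1..base+8]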
add.d t5, a7, t6
vld vr5, t5, 0
vld vr6, t5, 1
vsllwil.hu.bu vr5, vr5, 0
vsllwil.hu.bu vr6, vr6, 0
vmul.h vr3, vr5, vr2
vmadd.h vr3, vr6, vr1
vadd.h vr3, vr3, vr7
vsrai.h vr3, vr3, 6
vsrlni.b.h vr3, vr3, 0
fstx.d f3, a0, a6
addi.d a6, a6, 8
addi.d t6, t6, 8
addi.d t8, t8, -1
bnez t8, .Z1_UA0_LOOPX_W8
b .Z1_UA0_LOOPY_END
.Z1_UA0_LOOPX_W4:
vldx vr5, a7, t6
vsllwil.hu.bu vr5, vr5, 0
vbsrl.v vr6, vr5, 2
vmul.h vr3, vr5, vr2
vmadd.h vr3, vr6, vr1
vadd.h vr3, vr3, vr7
vsrai.h vr3, vr3, 6
vsrlni.b.h vr3, vr3, 0
fstx.s f3, a0, a6
b .Z1_UA0_LOOPY_END
.Z1_UA0_LOOPX_BASEMAX:
srai.d t8, a2, 3 //loop param
beqz t8, .Z1_UA0_LOOPX_BASEMAX4
.Z1_UA0_LOOPX_BASEMAX8:
add.d t5, a7, t6
vld vr5, t5, 0
vld vr6, t5, 1
vsllwil.hu.bu vr5, vr5, 0
vsllwil.hu.bu vr6, vr6, 0
vmul.h vr3, vr5, vr2
vmadd.h vr3, vr6, vr1
vadd.h vr3, vr3, vr7
vsrai.h vr3, vr3, 6
vsrlni.b.h vr3, vr3, 0
fstx.d f3, a0, a6
addi.d a6, a6, 8
addi.d t6, t6, 8
addi.d t8, t8, -1
bnez t8, .Z1_UA0_LOOPX_BASEMAX8
.Z1_UA0_LOOPX_BASEMAX4:
andi t8, a2, 4
beqz t8, .Z1_UA0_LOOPX_BASEMAX2
vldx vr5, a7, t6
vsllwil.hu.bu vr5, vr5, 0
vbsrl.v vr6, vr5, 2
vmul.h vr3, vr5, vr2
vmadd.h vr3, vr6, vr1
vadd.h vr3, vr3, vr7
vsrai.h vr3, vr3, 6
vsrlni.b.h vr3, vr3, 0
fstx.s f3, a0, a6
addi.d a6, a6, 4
addi.d t6, t6, 4
.Z1_UA0_LOOPX_BASEMAX2:
andi t8, a2, 2
beqz t8, .Z1_UA0_LOOPX_BASEMAX1
vldx vr5, a7, t6
vsllwil.hu.bu vr5, vr5, 0
vbsrl.v vr6, vr5, 2
vmul.h vr3, vr5, vr2
vmadd.h vr3, vr6, vr1
vadd.h vr3, vr3, vr7
vsrai.h vr3, vr3, 6
vsrlni.b.h vr3, vr3, 0
vpickve2gr.bu t7, vr3, 0
vpickve2gr.bu t8, vr3, 1
stx.b t7, a0, a6
addi.d a6, a6, 1
stx.b t8, a0, a6
addi.d a6, a6, 1
addi.d t6, t6, 2
.Z1_UA0_LOOPX_BASEMAX1:
andi t8, a2, 1
beqz t8, .Z1_UA0_LOOPX_BASEMAX_MSET
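// scalar tail: (top[base]*(64 - frac) + top[base+1]*frac + 32) >> 6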
add.d a2, a7, t6
sub.d t7, a5, t4
ld.bu t8, a2, 0
mul.w t7, t7, t8
ld.bu t8, a2, 1
mul.w t8, t8, t4
add.d t7, t7, t8
addi.d t7, t7, 32
srai.d t7, t7, 6
stx.b t7, a0, a6
addi.d a6, a6, 1
.Z1_UA0_LOOPX_BASEMAX_MSET: //memset
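// replicate top[max_base_x] across the pixels past the edge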
add.d t6, a0, a6 //dst
add.d t7, a7, t0 //src
sub.d a2, a3, a6 //size
pixel_set_8bpc_allw t6, t7, a2, t8, t4
.Z1_UA0_LOOPY_END:
addi.d t2, t2, 1
add.d a0, a0, a1
add.d t3, t3, t1
blt t2, a4, .Z1_UA0_LOOPY
.Z1_END:
addi.d sp, sp, 128 //pop the stack frame allocated at entry
endfunc