; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
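; the *_lshuf* tables replicate the first valid pixel into lanes that would
; otherwise read before the start of the row when LR_HAVE_LEFT is unset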
wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
wiener_hshift: dw 4, 4, 1, 1
wiener_vshift: dw 1024, 1024, 4096, 4096
wiener_round: dd 1049600, 1048832
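; per-bitdepth constants, indexed by (pixel_max >> 11): entry 0 for 10 bpc
; (pixel_max 1023), entry 1 for 12 bpc (pixel_max 4095). wiener_hshift scales
; the horizontal coefs (x4 resp. x1) so the horizontal downshift can stay
; constant, wiener_vshift is the final pmulhuw scale (1024 = >> 6,
; 4096 = >> 4), and wiener_round decodes as (1 << 20) + (1 << 10) resp.
; (1 << 20) + (1 << 8): the (1 << 20) cancels the -(1 << 13) bias carried by
; each horizontal intermediate (see pd_m262128 below) times the 128 tap sum,
; and the low term rounds the remaining >> 11 resp. >> 9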
pb_m10_m9: times 2 db -10, -9
pb_m6_m5: times 2 db -6, -5
pb_m2_m1: times 2 db -2, -1
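; the 12 sign-set bytes of pb_m10_m9..pb_m2_m1 appear to double as a
; vpblendvb mask table for the sgr .extend_right paths: indexed by the
; negative loop counter, they flip exactly the lanes at/past the end of the
; row over to a broadcast of the last valid pixel (the constants on either
; side all have clear sign bits)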
pb_2_3: times 2 db 2, 3
pb_6_7: times 2 db 6, 7
pw_1023: times 2 dw 1023
pw_164_24: dw 164, 24
pw_455_24: dw 455, 24
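; dual-purpose constants: pmaddwd uses the low word as the b multiplier
; (164 resp. 455), while paddw on dword lanes adds the same constant as
; 164 + (24 << 16) = (1 << 19) + (1 << 20) + 164, i.e. the rounding bias
; plus one for the following >> 20, producing z + 1 directly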
pd_8: dd 8
pd_25: dd 25
pd_4096: dd 4096
pd_34816: dd 34816
pd_m262128: dd -262128
pf_256: dd 256.0
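; the first two bytes of sgr_lshuf5 (0, 1) also read as dw 256: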
%define pw_256 sgr_lshuf5
cextern pb_0to63
SECTION .text
DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
INIT_YMM avx2
cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
w, h, edge, flt
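; stack layout: [rsp] caches a pointer to the row below the filtered block;
; above it live six 384*2-byte line buffers through which the t0-t6 ring
; pointers rotate one step per row (see the pointer shuffle at the end of .hv)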
%define base t4-wiener_hshift
mov fltq, r6mp
movifnidn wd, wm
movifnidn hd, hm
mov edged, r7m
mov t3d, r8m ; pixel_max
vbroadcasti128 m6, [wiener_shufA]
vpbroadcastd m12, [fltq+ 0] ; x0 x1
lea t4, [wiener_hshift]
vbroadcasti128 m7, [wiener_shufB]
add wd, wd
vpbroadcastd m13, [fltq+ 4] ; x2 x3
shr t3d, 11
vpbroadcastd m14, [fltq+16] ; y0 y1
add lpfq, wq
vpbroadcastd m15, [fltq+20] ; y2 y3
add dstq, wq
vbroadcasti128 m8, [wiener_shufC]
lea t1, [rsp+wq+16]
vbroadcasti128 m9, [wiener_shufD]
neg wq
vpbroadcastd m0, [base+wiener_hshift+t3*4]
vpbroadcastd m10, [base+wiener_round+t3*4]
vpbroadcastd m11, [base+wiener_vshift+t3*4]
pmullw m12, m0 ; upshift filter coefs to make the
pmullw m13, m0 ; horizontal downshift constant
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t6, t1
mov t5, t1
add t1, 384*2
call .h_top
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
mov t4, t1
add t1, 384*2
add r10, strideq
mov [rsp], r10 ; below
call .h
mov t3, t1
mov t2, t1
dec hd
jz .v1
add lpfq, strideq
add t1, 384*2
call .h
mov t2, t1
dec hd
jz .v2
add lpfq, strideq
add t1, 384*2
call .h
dec hd
jz .v3
.main:
lea t0, [t1+384*2]
.main_loop:
call .hv
dec hd
jnz .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .v3
mov lpfq, [rsp]
call .hv_bottom
add lpfq, strideq
call .hv_bottom
.v1:
call .v
RET
.no_top:
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov [rsp], r10
call .h
mov t6, t1
mov t5, t1
mov t4, t1
mov t3, t1
mov t2, t1
dec hd
jz .v1
add lpfq, strideq
add t1, 384*2
call .h
mov t2, t1
dec hd
jz .v2
add lpfq, strideq
add t1, 384*2
call .h
dec hd
jz .v3
lea t0, [t1+384*2]
call .hv
dec hd
jz .v3
add t0, 384*8
call .hv
dec hd
jnz .main
.v3:
call .v
.v2:
call .v
jmp .v1
.extend_right:
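; r10 = negative number of bytes remaining in the row; compute per-byte
; shuffle indices and clamp them against pb_0to63 so that lanes at/past the
; last valid pixel re-read that pixel instead of out-of-bounds data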
movd xm1, r10d
vpbroadcastd m0, [pb_6_7]
mova m2, [pb_0to63]
vpbroadcastb m1, xm1
psubb m0, m1
pminub m0, m2
pshufb m3, m0
vpbroadcastd m0, [pb_m2_m1]
psubb m0, m1
pminub m0, m2
pshufb m4, m0
vpbroadcastd m0, [pb_m10_m9]
psubb m0, m1
pminub m0, m2
pshufb m5, m0
ret
.h:
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movq xm3, [leftq]
vpblendd m3, [lpfq+r10-8], 0xfc
add leftq, 8
jmp .h_main
.h_extend_left:
vbroadcasti128 m3, [lpfq+r10] ; avoid accessing memory located
mova m4, [lpfq+r10] ; before the start of the buffer
shufpd m3, m4, 0x05
pshufb m3, [wiener_lshuf7]
jmp .h_main2
.h_top:
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu m3, [lpfq+r10-8]
.h_main:
mova m4, [lpfq+r10+0]
.h_main2:
movu m5, [lpfq+r10+8]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp r10d, -36
jl .h_have_right
call .extend_right
.h_have_right:
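; the symmetric 7-tap filter folds into two pmaddwd dot products per output:
; shufA/shufB (resp. shufC/shufD) gather mirrored pixel pairs so that
; m12 = (x0,x1) and m13 = (x2,x3) each evaluate two taps at once, with shufD
; zero-padding the odd word so the center pixel is weighted by x3 alone.
; the bias pd_m262128 = (1 << 4) - (1 << 18) rounds the total >> 5
; (psrad 4 + psraw 1) and offsets the packed intermediate by -(1 << 13),
; presumably to fit the signed-word range of packssdw; the vertical pass
; cancels it via the (1 << 20) term in wiener_round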
pshufb m0, m3, m6
pshufb m1, m4, m7
paddw m0, m1
pshufb m3, m8
pmaddwd m0, m12
pshufb m1, m4, m9
paddw m3, m1
pshufb m1, m4, m6
pmaddwd m3, m13
pshufb m2, m5, m7
paddw m1, m2
vpbroadcastd m2, [pd_m262128] ; (1 << 4) - (1 << 18)
pshufb m4, m8
pmaddwd m1, m12
pshufb m5, m9
paddw m4, m5
pmaddwd m4, m13
paddd m0, m2
paddd m1, m2
paddd m0, m3
paddd m1, m4
psrad m0, 4
psrad m1, 4
packssdw m0, m1
psraw m0, 1
mova [t1+r10], m0
add r10, 32
jl .h_loop
ret
ALIGN function_align
.hv:
add lpfq, strideq
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
movq xm3, [leftq]
vpblendd m3, [lpfq+r10-8], 0xfc
add leftq, 8
jmp .hv_main
.hv_extend_left:
movu m3, [lpfq+r10-8]
pshufb m3, [wiener_lshuf7]
jmp .hv_main
.hv_bottom:
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
.hv_loop:
movu m3, [lpfq+r10-8]
.hv_main:
mova m4, [lpfq+r10+0]
movu m5, [lpfq+r10+8]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv_have_right
cmp r10d, -36
jl .hv_have_right
call .extend_right
.hv_have_right:
pshufb m0, m3, m6
pshufb m1, m4, m7
paddw m0, m1
pshufb m3, m8
pmaddwd m0, m12
pshufb m1, m4, m9
paddw m3, m1
pshufb m1, m4, m6
pmaddwd m3, m13
pshufb m2, m5, m7
paddw m1, m2
vpbroadcastd m2, [pd_m262128]
pshufb m4, m8
pmaddwd m1, m12
pshufb m5, m9
paddw m4, m5
pmaddwd m4, m13
paddd m0, m2
paddd m1, m2
mova m2, [t4+r10]
paddw m2, [t2+r10]
mova m5, [t3+r10]
paddd m0, m3
paddd m1, m4
psrad m0, 4
psrad m1, 4
packssdw m0, m1
mova m4, [t5+r10]
paddw m4, [t1+r10]
psraw m0, 1
paddw m3, m0, [t6+r10]
mova [t0+r10], m0
punpcklwd m0, m2, m5
pmaddwd m0, m15
punpckhwd m2, m5
pmaddwd m2, m15
punpcklwd m1, m3, m4
pmaddwd m1, m14
punpckhwd m3, m4
pmaddwd m3, m14
paddd m0, m10
paddd m2, m10
paddd m0, m1
paddd m2, m3
psrad m0, 5
psrad m2, 5
packusdw m0, m2
pmulhuw m0, m11
mova [dstq+r10], m0
add r10, 32
jl .hv_loop
mov t6, t5
mov t5, t4
mov t4, t3
mov t3, t2
mov t2, t1
mov t1, t0
mov t0, t6
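; t0 and t6 now alias the same buffer; this is safe because .hv reads
; [t6+r10] before it stores [t0+r10] at each offset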
add dstq, strideq
ret
.v:
mov r10, wq
.v_loop:
mova m1, [t4+r10]
paddw m1, [t2+r10]
mova m2, [t3+r10]
mova m4, [t1+r10]
paddw m3, m4, [t6+r10]
paddw m4, [t5+r10]
punpcklwd m0, m1, m2
pmaddwd m0, m15
punpckhwd m1, m2
pmaddwd m1, m15
punpcklwd m2, m3, m4
pmaddwd m2, m14
punpckhwd m3, m4
pmaddwd m3, m14
paddd m0, m10
paddd m1, m10
paddd m0, m2
paddd m1, m3
psrad m0, 5
psrad m1, 5
packusdw m0, m1
pmulhuw m0, m11
mova [dstq+r10], m0
add r10, 32
jl .v_loop
mov t6, t5
mov t5, t4
mov t4, t3
mov t3, t2
mov t2, t1
add dstq, strideq
ret
cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
w, h, edge, flt
%define base t4-wiener_hshift
mov fltq, r6mp
movifnidn wd, wm
movifnidn hd, hm
mov edged, r7m
mov t3d, r8m ; pixel_max
vbroadcasti128 m5, [wiener_shufE]
vpbroadcastw m11, [fltq+ 2] ; x1
vbroadcasti128 m6, [wiener_shufB]
lea t4, [wiener_hshift]
vbroadcasti128 m7, [wiener_shufD]
add wd, wd
vpbroadcastd m12, [fltq+ 4] ; x2 x3
shr t3d, 11
vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18)
add lpfq, wq
vpbroadcastw m13, [fltq+18] ; y1
add dstq, wq
vpbroadcastd m14, [fltq+20] ; y2 y3
lea t1, [rsp+wq+16]
neg wq
vpbroadcastd m0, [base+wiener_hshift+t3*4]
vpbroadcastd m9, [base+wiener_round+t3*4]
vpbroadcastd m10, [base+wiener_vshift+t3*4]
mova m15, [wiener_lshuf5]
pmullw m11, m0
pmullw m12, m0
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t4, t1
add t1, 384*2
call .h_top
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
mov t3, t1
add t1, 384*2
add r10, strideq
mov [rsp], r10 ; below
call .h
mov t2, t1
dec hd
jz .v1
add lpfq, strideq
add t1, 384*2
call .h
dec hd
jz .v2
.main:
mov t0, t4
.main_loop:
call .hv
dec hd
jnz .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .v2
mov lpfq, [rsp]
call .hv_bottom
add lpfq, strideq
call .hv_bottom
.end:
RET
.no_top:
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov [rsp], r10
call .h
mov t4, t1
mov t3, t1
mov t2, t1
dec hd
jz .v1
add lpfq, strideq
add t1, 384*2
call .h
dec hd
jz .v2
lea t0, [t1+384*2]
call .hv
dec hd
jz .v2
add t0, 384*6
call .hv
dec hd
jnz .main
.v2:
call .v
mov t4, t3
mov t3, t2
mov t2, t1
add dstq, strideq
.v1:
call .v
jmp .end
.extend_right:
movd xm2, r10d
vpbroadcastd m0, [pb_2_3]
vpbroadcastd m1, [pb_m6_m5]
vpbroadcastb m2, xm2
psubb m0, m2
psubb m1, m2
mova m2, [pb_0to63]
pminub m0, m2
pminub m1, m2
pshufb m3, m0
pshufb m4, m1
ret
.h:
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movd xm3, [leftq+4]
vpblendd m3, [lpfq+r10-4], 0xfe
add leftq, 8
jmp .h_main
.h_extend_left:
vbroadcasti128 m4, [lpfq+r10] ; avoid accessing memory located
mova m3, [lpfq+r10] ; before the start of the buffer
palignr m3, m4, 12
pshufb m3, m15
jmp .h_main
.h_top:
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu m3, [lpfq+r10-4]
.h_main:
movu m4, [lpfq+r10+4]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp r10d, -34
jl .h_have_right
call .extend_right
.h_have_right:
pshufb m0, m3, m5
pmaddwd m0, m11
pshufb m1, m4, m5
pmaddwd m1, m11
pshufb m2, m3, m6
pshufb m3, m7
paddw m2, m3
pshufb m3, m4, m6
pmaddwd m2, m12
pshufb m4, m7
paddw m3, m4
pmaddwd m3, m12
paddd m0, m8
paddd m1, m8
paddd m0, m2
paddd m1, m3
psrad m0, 4
psrad m1, 4
packssdw m0, m1
psraw m0, 1
mova [t1+r10], m0
add r10, 32
jl .h_loop
ret
ALIGN function_align
.hv:
add lpfq, strideq
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
movd xm3, [leftq+4]
vpblendd m3, [lpfq+r10-4], 0xfe
add leftq, 8
jmp .hv_main
.hv_extend_left:
movu m3, [lpfq+r10-4]
pshufb m3, m15
jmp .hv_main
.hv_bottom:
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
.hv_loop:
movu m3, [lpfq+r10-4]
.hv_main:
movu m4, [lpfq+r10+4]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv_have_right
cmp r10d, -34
jl .hv_have_right
call .extend_right
.hv_have_right:
pshufb m0, m3, m5
pmaddwd m0, m11
pshufb m1, m4, m5
pmaddwd m1, m11
pshufb m2, m3, m6
pshufb m3, m7
paddw m2, m3
pshufb m3, m4, m6
pmaddwd m2, m12
pshufb m4, m7
paddw m3, m4
pmaddwd m3, m12
paddd m0, m8
paddd m1, m8
paddd m0, m2
mova m2, [t3+r10]
paddw m2, [t1+r10]
paddd m1, m3
mova m4, [t2+r10]
punpckhwd m3, m2, m4
pmaddwd m3, m14
punpcklwd m2, m4
mova m4, [t4+r10]
psrad m0, 4
psrad m1, 4
packssdw m0, m1
pmaddwd m2, m14
psraw m0, 1
mova [t0+r10], m0
punpckhwd m1, m0, m4
pmaddwd m1, m13
punpcklwd m0, m4
pmaddwd m0, m13
paddd m3, m9
paddd m2, m9
paddd m1, m3
paddd m0, m2
psrad m1, 5
psrad m0, 5
packusdw m0, m1
pmulhuw m0, m10
mova [dstq+r10], m0
add r10, 32
jl .hv_loop
mov t4, t3
mov t3, t2
mov t2, t1
mov t1, t0
mov t0, t4
add dstq, strideq
ret
.v:
mov r10, wq
.v_loop:
mova m0, [t1+r10]
paddw m2, m0, [t3+r10]
mova m1, [t2+r10]
mova m4, [t4+r10]
punpckhwd m3, m2, m1
pmaddwd m3, m14
punpcklwd m2, m1
pmaddwd m2, m14
punpckhwd m1, m0, m4
pmaddwd m1, m13
punpcklwd m0, m4
pmaddwd m0, m13
paddd m3, m9
paddd m2, m9
paddd m1, m3
paddd m0, m2
psrad m1, 5
psrad m0, 5
packusdw m0, m1
pmulhuw m0, m10
mova [dstq+r10], m0
add r10, 32
jl .v_loop
ret
cglobal sgr_filter_5x5_16bpc, 4, 14, 16, 400*24+16, dst, stride, left, lpf, \
w, h, edge, params
%define base r13-pb_m10_m9
movifnidn wd, wm
mov paramsq, r6mp
lea r13, [pb_m10_m9]
movifnidn hd, hm
mov edged, r7m
vpbroadcastw m7, [paramsq+8] ; w0
add wd, wd
vpbroadcastd m8, [base+pd_8]
add lpfq, wq
vpbroadcastd m9, [base+pd_25]
add dstq, wq
mova xm10, [base+sgr_lshuf5]
lea t3, [rsp+wq*2+400*12+16]
vpbroadcastd m11, [paramsq+0] ; s0
lea t4, [rsp+wq+400*20+16]
vpbroadcastd m12, [base+pw_164_24]
lea t1, [rsp+wq+20]
vbroadcastss m13, [base+pf_256]
neg wq
vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15)
pxor m6, m6
vpbroadcastd m15, [base+pw_1023]
psllw m7, 4
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t2, t1
call .top_fixup
add t1, 400*6
call .h_top
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
add r10, strideq
mov [rsp], r10 ; below
mov t0, t2
dec hd
jz .height1
or edged, 16
call .h
.main:
add lpfq, strideq
call .hv
call .prep_n
sub hd, 2
jl .extend_bottom
.main_loop:
add lpfq, strideq
test hd, hd
jz .odd_height
call .h
add lpfq, strideq
call .hv
call .n0
call .n1
sub hd, 2
jge .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .extend_bottom
mov lpfq, [rsp]
call .h_top
add lpfq, strideq
call .hv_bottom
.end:
call .n0
call .n1
.end2:
RET
.height1:
call .hv
call .prep_n
jmp .odd_height_end
.odd_height:
call .hv
call .n0
call .n1
.odd_height_end:
call .v
call .n0
jmp .end2
.extend_bottom:
call .v
jmp .end
.no_top:
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov [rsp], r10
call .h
lea t2, [t1+400*6]
call .top_fixup
dec hd
jz .no_top_height1
or edged, 16
mov t0, t1
mov t1, t2
jmp .main
.no_top_height1:
call .v
call .prep_n
jmp .odd_height_end
.extend_right:
vpbroadcastw m0, [lpfq-2]
movu m1, [r13+r10+ 0]
movu m2, [r13+r10+16]
vpblendvb m4, m0, m1
vpblendvb m5, m0, m2
ret
.h: ; horizontal boxsum
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
vpbroadcastq xm5, [leftq]
vinserti128 m5, [lpfq+wq], 1
mova m4, [lpfq+wq]
add leftq, 8
palignr m4, m5, 10
jmp .h_main
.h_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, xm10
vinserti128 m4, [lpfq+wq+10], 1
jmp .h_main
.h_top:
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu m4, [lpfq+r10- 2]
.h_main:
movu m5, [lpfq+r10+14]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp r10d, -36
jl .h_have_right
call .extend_right
.h_have_right:
palignr m2, m5, m4, 2
paddw m0, m4, m2
palignr m3, m5, m4, 6
paddw m0, m3
punpcklwd m1, m2, m3
pmaddwd m1, m1
punpckhwd m2, m3
pmaddwd m2, m2
shufpd m5, m4, m5, 0x05
paddw m0, m5
punpcklwd m3, m4, m5
pmaddwd m3, m3
paddd m1, m3
punpckhwd m3, m4, m5
pmaddwd m3, m3
shufps m4, m5, q2121
paddw m0, m4 ; sum
punpcklwd m5, m4, m6
pmaddwd m5, m5
punpckhwd m4, m6
pmaddwd m4, m4
paddd m2, m3
test edgeb, 16 ; y > 0
jz .h_loop_end
paddw m0, [t1+r10+400*0]
paddd m1, [t1+r10+400*2]
paddd m2, [t1+r10+400*4]
.h_loop_end:
paddd m1, m5 ; sumsq
paddd m2, m4
mova [t1+r10+400*0], m0
mova [t1+r10+400*2], m1
mova [t1+r10+400*4], m2
add r10, 32
jl .h_loop
ret
.top_fixup:
lea r10, [wq-4]
.top_fixup_loop: ; the sums of the first row need to be doubled
mova m0, [t1+r10+400*0]
mova m1, [t1+r10+400*2]
mova m2, [t1+r10+400*4]
paddw m0, m0
paddd m1, m1
paddd m2, m2
mova [t2+r10+400*0], m0
mova [t2+r10+400*2], m1
mova [t2+r10+400*4], m2
add r10, 32
jl .top_fixup_loop
ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
vpbroadcastq xm5, [leftq]
vinserti128 m5, [lpfq+wq], 1
mova m4, [lpfq+wq]
add leftq, 8
palignr m4, m5, 10
jmp .hv_main
.hv_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, xm10
vinserti128 m4, [lpfq+wq+10], 1
jmp .hv_main
.hv_bottom:
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
.hv_loop:
movu m4, [lpfq+r10- 2]
.hv_main:
movu m5, [lpfq+r10+14]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv_have_right
cmp r10d, -36
jl .hv_have_right
call .extend_right
.hv_have_right:
palignr m3, m5, m4, 2
paddw m0, m4, m3
palignr m1, m5, m4, 6
paddw m0, m1
punpcklwd m2, m3, m1
pmaddwd m2, m2
punpckhwd m3, m1
pmaddwd m3, m3
shufpd m5, m4, m5, 0x05
paddw m0, m5
punpcklwd m1, m4, m5
pmaddwd m1, m1
paddd m2, m1
punpckhwd m1, m4, m5
pmaddwd m1, m1
shufps m4, m5, q2121
paddw m0, m4 ; h sum
punpcklwd m5, m4, m6
pmaddwd m5, m5
punpckhwd m4, m6
pmaddwd m4, m4
paddd m3, m1
paddd m2, m5 ; h sumsq
paddd m3, m4
paddw m1, m0, [t1+r10+400*0]
paddd m4, m2, [t1+r10+400*2]
paddd m5, m3, [t1+r10+400*4]
test hd, hd
jz .hv_last_row
.hv_main2:
paddw m1, [t2+r10+400*0] ; hv sum
paddd m4, [t2+r10+400*2] ; hv sumsq
paddd m5, [t2+r10+400*4]
mova [t0+r10+400*0], m0
mova [t0+r10+400*2], m2
mova [t0+r10+400*4], m3
psrlw m3, m1, 1
paddd m4, m8
pavgw m3, m6 ; (b + 2) >> 2
paddd m5, m8
psrld m4, 4 ; (a + 8) >> 4
punpcklwd m2, m3, m6
psrld m5, 4
punpckhwd m3, m6
pmulld m4, m9 ; a * 25
pmulld m5, m9
pmaddwd m2, m2 ; b * b
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
pmaxud m4, m2
pmaxud m5, m3
psubd m4, m2 ; p
psubd m5, m3
pmulld m4, m11 ; p * s
pmulld m5, m11
pmaddwd m0, m12 ; b * 164
pmaddwd m1, m12
paddw m4, m12
paddw m5, m12
psrld m4, 20 ; z + 1
psrld m5, 20
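; approximate x = 256 / (z + 1) in float: rcpps gives ~1 / (z + 1) with
; ~12-bit relative precision, enough for an 8-bit quotient; the
; pcmpgtd/psrld pair builds a (z < 255 ? 255 : 0) mask, so pminsw both
; clamps x to 255 (z == 0 gives 256 / 1 = 256) and forces x = 0 once
; z >= 255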
cvtdq2ps m4, m4
cvtdq2ps m5, m5
rcpps m2, m4 ; 1 / (z + 1)
rcpps m3, m5
pcmpgtd m4, m13, m4
pcmpgtd m5, m13, m5
mulps m2, m13 ; 256 / (z + 1)
mulps m3, m13
psrld m4, 24 ; z < 255 ? 255 : 0
psrld m5, 24
cvtps2dq m2, m2
cvtps2dq m3, m3
pminsw m2, m4 ; x
pminsw m3, m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m14
mova [t4+r10+4], m2
psrld m0, 12 ; b
psrld m1, 12
mova [t3+r10*2+ 8], xm0
vextracti128 [t3+r10*2+40], m0, 1
mova [t3+r10*2+24], xm1
vextracti128 [t3+r10*2+56], m1, 1
add r10, 32
jl .hv_loop
mov t2, t1
mov t1, t0
mov t0, t2
ret
.hv_last_row: ; esoteric edge case for odd heights
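; no row below exists, so the current row is fed in twice (the bottom
; counterpart of .top_fixup doubling the first row) before rejoining
; .hv_main2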
mova [t1+r10+400*0], m1
paddw m1, m0
mova [t1+r10+400*2], m4
paddd m4, m2
mova [t1+r10+400*4], m5
paddd m5, m3
jmp .hv_main2
.v: ; vertical boxsum + ab
lea r10, [wq-4]
.v_loop:
mova m0, [t1+r10+400*0]
mova m2, [t1+r10+400*2]
mova m3, [t1+r10+400*4]
paddw m1, m0, [t2+r10+400*0]
paddd m4, m2, [t2+r10+400*2]
paddd m5, m3, [t2+r10+400*4]
paddw m0, m0
paddd m2, m2
paddd m3, m3
paddw m1, m0 ; hv sum
paddd m4, m2 ; hv sumsq
paddd m5, m3
psrlw m3, m1, 1
paddd m4, m8
pavgw m3, m6 ; (b + 2) >> 2
paddd m5, m8
psrld m4, 4 ; (a + 8) >> 4
punpcklwd m2, m3, m6
psrld m5, 4
punpckhwd m3, m6
pmulld m4, m9 ; a * 25
pmulld m5, m9
pmaddwd m2, m2 ; b * b
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
pmaxud m4, m2
pmaxud m5, m3
psubd m4, m2 ; p
psubd m5, m3
pmulld m4, m11 ; p * s
pmulld m5, m11
pmaddwd m0, m12 ; b * 164
pmaddwd m1, m12
paddw m4, m12
paddw m5, m12
psrld m4, 20 ; z + 1
psrld m5, 20
cvtdq2ps m4, m4
cvtdq2ps m5, m5
rcpps m2, m4 ; 1 / (z + 1)
rcpps m3, m5
pcmpgtd m4, m13, m4
pcmpgtd m5, m13, m5
mulps m2, m13 ; 256 / (z + 1)
mulps m3, m13
psrld m4, 24 ; z < 255 ? 255 : 0
psrld m5, 24
cvtps2dq m2, m2
cvtps2dq m3, m3
pminsw m2, m4 ; x
pminsw m3, m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m14
mova [t4+r10+4], m2
psrld m0, 12 ; b
psrld m1, 12
mova [t3+r10*2+ 8], xm0
vextracti128 [t3+r10*2+40], m0, 1
mova [t3+r10*2+24], xm1
vextracti128 [t3+r10*2+56], m1, 1
add r10, 32
jl .v_loop
ret
.prep_n: ; initial neighbor setup
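; each a/b value is weighted (5, 6, 5) horizontally:
; center + 5*(l + c + r) = 5l + 6c + 5r; .n0 later adds the previous row's
; result to complete the two-row vertical weighting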
mov r10, wq
.prep_n_loop:
movu m0, [t4+r10*1+ 2]
movu m1, [t3+r10*2+ 4]
movu m2, [t3+r10*2+36]
paddw m3, m0, [t4+r10*1+ 0]
paddd m4, m1, [t3+r10*2+ 0]
paddd m5, m2, [t3+r10*2+32]
paddw m3, [t4+r10*1+ 4]
paddd m4, [t3+r10*2+ 8]
paddd m5, [t3+r10*2+40]
paddw m0, m3
psllw m3, 2
paddd m1, m4
pslld m4, 2
paddd m2, m5
pslld m5, 2
paddw m0, m3 ; a 565
paddd m1, m4 ; b 565
paddd m2, m5
mova [t4+r10*1+400*2+ 0], m0
mova [t3+r10*2+400*4+ 0], m1
mova [t3+r10*2+400*4+32], m2
add r10, 32
jl .prep_n_loop
ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
mov r10, wq
.n0_loop:
movu m0, [t4+r10*1+ 2]
movu m1, [t3+r10*2+ 4]
movu m2, [t3+r10*2+36]
paddw m3, m0, [t4+r10*1+ 0]
paddd m4, m1, [t3+r10*2+ 0]
paddd m5, m2, [t3+r10*2+32]
paddw m3, [t4+r10*1+ 4]
paddd m4, [t3+r10*2+ 8]
paddd m5, [t3+r10*2+40]
paddw m0, m3
psllw m3, 2
paddd m1, m4
pslld m4, 2
paddd m2, m5
pslld m5, 2
paddw m0, m3 ; a 565
paddd m1, m4 ; b 565
paddd m2, m5
paddw m3, m0, [t4+r10*1+400*2+ 0]
paddd m4, m1, [t3+r10*2+400*4+ 0]
paddd m5, m2, [t3+r10*2+400*4+32]
mova [t4+r10*1+400*2+ 0], m0
mova [t3+r10*2+400*4+ 0], m1
mova [t3+r10*2+400*4+32], m2
mova m0, [dstq+r10]
punpcklwd m1, m0, m6 ; src
punpcklwd m2, m3, m6 ; a
pmaddwd m2, m1 ; a * src
punpckhwd m1, m0, m6
punpckhwd m3, m6
pmaddwd m3, m1
vinserti128 m1, m4, xm5, 1
vperm2i128 m4, m5, 0x31
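; gather the matching 128-bit halves of the two b vectors so that the final
; packssdw yields words in row order despite AVX2's per-lane packing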
psubd m1, m2 ; b - a * src + (1 << 8)
psubd m4, m3
psrad m1, 9
psrad m4, 9
packssdw m1, m4
pmulhrsw m1, m7
paddw m0, m1
pmaxsw m0, m6
pminsw m0, m15
mova [dstq+r10], m0
add r10, 32
jl .n0_loop
add dstq, strideq
ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
mov r10, wq
.n1_loop:
mova m0, [dstq+r10]
mova m3, [t4+r10*1+400*2+ 0]
mova m4, [t3+r10*2+400*4+ 0]
mova m5, [t3+r10*2+400*4+32]
punpcklwd m1, m0, m6 ; src
punpcklwd m2, m3, m6 ; a
pmaddwd m2, m1
punpckhwd m1, m0, m6
punpckhwd m3, m6
pmaddwd m3, m1
vinserti128 m1, m4, xm5, 1
vperm2i128 m4, m5, 0x31
psubd m1, m2 ; b - a * src + (1 << 7)
psubd m4, m3
psrad m1, 8
psrad m4, 8
packssdw m1, m4
pmulhrsw m1, m7
paddw m0, m1
pmaxsw m0, m6
pminsw m0, m15
mova [dstq+r10], m0
add r10, 32
jl .n1_loop
add dstq, strideq
ret
cglobal sgr_filter_3x3_16bpc, 4, 14, 15, 400*42+8, dst, stride, left, lpf, \
w, h, edge, params
movifnidn wd, wm
mov paramsq, r6mp
lea r13, [pb_m10_m9]
add wd, wd
movifnidn hd, hm
mov edged, r7m
vpbroadcastw m7, [paramsq+10] ; w1
add lpfq, wq
vpbroadcastd m8, [base+pd_8]
add dstq, wq
vpbroadcastd m9, [paramsq+ 4] ; s1
lea t3, [rsp+wq*2+400*12+8]
mova xm10, [base+sgr_lshuf3]
lea t4, [rsp+wq+400*32+8]
vpbroadcastd m11, [base+pw_455_24]
lea t1, [rsp+wq+12]
vbroadcastss m12, [base+pf_256]
neg wq
vpbroadcastd m13, [base+pd_34816]
pxor m6, m6
vpbroadcastd m14, [base+pw_1023]
psllw m7, 4
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t2, t1
add t1, 400*6
call .h_top
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
add r10, strideq
mov [rsp], r10 ; below
call .hv0
.main:
dec hd
jz .height1
add lpfq, strideq
call .hv1
call .prep_n
sub hd, 2
jl .extend_bottom
.main_loop:
add lpfq, strideq
call .hv0
test hd, hd
jz .odd_height
add lpfq, strideq
call .hv1
call .n0
call .n1
sub hd, 2
jge .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .extend_bottom
mov lpfq, [rsp]
call .hv0_bottom
add lpfq, strideq
call .hv1_bottom
.end:
call .n0
call .n1
.end2:
RET
.height1:
call .v1
call .prep_n
jmp .odd_height_end
.odd_height:
call .v1
call .n0
call .n1
.odd_height_end:
call .v0
call .v1
call .n0
jmp .end2
.extend_bottom:
call .v0
call .v1
jmp .end
.no_top:
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov [rsp], r10
call .h
lea r10, [wq-4]
lea t2, [t1+400*6]
.top_fixup_loop:
mova m0, [t1+r10+400*0]
mova m1, [t1+r10+400*2]
mova m2, [t1+r10+400*4]
mova [t2+r10+400*0], m0
mova [t2+r10+400*2], m1
mova [t2+r10+400*4], m2
add r10, 32
jl .top_fixup_loop
call .v0
jmp .main
.extend_right:
vpbroadcastw m0, [lpfq-2]
movu m1, [r13+r10+ 2]
movu m2, [r13+r10+18]
vpblendvb m4, m0, m1
vpblendvb m5, m0, m2
ret
.h: ; horizontal boxsum
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
vpbroadcastq xm5, [leftq]
vinserti128 m5, [lpfq+wq], 1
mova m4, [lpfq+wq]
add leftq, 8
palignr m4, m5, 12
jmp .h_main
.h_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, xm10
vinserti128 m4, [lpfq+wq+12], 1
jmp .h_main
.h_top:
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu m4, [lpfq+r10+ 0]
.h_main:
movu m5, [lpfq+r10+16]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp r10d, -34
jl .h_have_right
call .extend_right
.h_have_right:
palignr m0, m5, m4, 2
paddw m1, m4, m0
punpcklwd m2, m4, m0
pmaddwd m2, m2
punpckhwd m3, m4, m0
pmaddwd m3, m3
palignr m5, m4, 4
paddw m1, m5 ; sum
punpcklwd m4, m5, m6
pmaddwd m4, m4
punpckhwd m5, m6
pmaddwd m5, m5
paddd m2, m4 ; sumsq
paddd m3, m5
mova [t1+r10+400*0], m1
mova [t1+r10+400*2], m2
mova [t1+r10+400*4], m3
add r10, 32
jl .h_loop
ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
vpbroadcastq xm5, [leftq]
vinserti128 m5, [lpfq+wq], 1
mova m4, [lpfq+wq]
add leftq, 8
palignr m4, m5, 12
jmp .hv0_main
.hv0_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, xm10
vinserti128 m4, [lpfq+wq+12], 1
jmp .hv0_main
.hv0_bottom:
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
.hv0_loop:
movu m4, [lpfq+r10+ 0]
.hv0_main:
movu m5, [lpfq+r10+16]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv0_have_right
cmp r10d, -34
jl .hv0_have_right
call .extend_right
.hv0_have_right:
palignr m0, m5, m4, 2
paddw m1, m4, m0
punpcklwd m2, m4, m0
pmaddwd m2, m2
punpckhwd m3, m4, m0
pmaddwd m3, m3
palignr m5, m4, 4
paddw m1, m5 ; sum
punpcklwd m4, m5, m6
pmaddwd m4, m4
punpckhwd m5, m6
pmaddwd m5, m5
paddd m2, m4 ; sumsq
paddd m3, m5
paddw m0, m1, [t1+r10+400*0]
paddd m4, m2, [t1+r10+400*2]
paddd m5, m3, [t1+r10+400*4]
mova [t1+r10+400*0], m1
mova [t1+r10+400*2], m2
mova [t1+r10+400*4], m3
paddw m1, m0, [t2+r10+400*0]
paddd m2, m4, [t2+r10+400*2]
paddd m3, m5, [t2+r10+400*4]
mova [t2+r10+400*0], m0
mova [t2+r10+400*2], m4
mova [t2+r10+400*4], m5
paddd m2, m8
paddd m3, m8
psrld m2, 4 ; (a + 8) >> 4
psrld m3, 4
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a + 8) >> 4) * 9
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m6 ; (b + 2) >> 2
punpcklwd m2, m3, m6
pmaddwd m2, m2
punpckhwd m3, m6
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
pmaxud m4, m2
psubd m4, m2 ; p
pmaxud m5, m3
psubd m5, m3
pmulld m4, m9 ; p * s
pmulld m5, m9
pmaddwd m0, m11 ; b * 455
pmaddwd m1, m11
paddw m4, m11
paddw m5, m11
psrld m4, 20 ; z + 1
psrld m5, 20
cvtdq2ps m4, m4
cvtdq2ps m5, m5
rcpps m2, m4 ; 1 / (z + 1)
rcpps m3, m5
pcmpgtd m4, m12, m4
pcmpgtd m5, m12, m5
mulps m2, m12 ; 256 / (z + 1)
mulps m3, m12
psrld m4, 24 ; z < 255 ? 255 : 0
psrld m5, 24
cvtps2dq m2, m2
cvtps2dq m3, m3
pminsw m2, m4 ; x
pminsw m3, m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m13
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*0+ 4], m2
mova [t3+r10*2+400*0+ 8], xm0
vextracti128 [t3+r10*2+400*0+40], m0, 1
mova [t3+r10*2+400*0+24], xm1
vextracti128 [t3+r10*2+400*0+56], m1, 1
add r10, 32
jl .hv0_loop
ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
vpbroadcastq xm5, [leftq]
vinserti128 m5, [lpfq+wq], 1
mova m4, [lpfq+wq]
add leftq, 8
palignr m4, m5, 12
jmp .hv1_main
.hv1_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, xm10
vinserti128 m4, [lpfq+wq+12], 1
jmp .hv1_main
.hv1_bottom:
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
.hv1_loop:
movu m4, [lpfq+r10+ 0]
.hv1_main:
movu m5, [lpfq+r10+16]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv1_have_right
cmp r10d, -34
jl .hv1_have_right
call .extend_right
.hv1_have_right:
palignr m1, m5, m4, 2
paddw m0, m4, m1
punpcklwd m2, m4, m1
pmaddwd m2, m2
punpckhwd m3, m4, m1
pmaddwd m3, m3
palignr m5, m4, 4
paddw m0, m5 ; h sum
punpcklwd m1, m5, m6
pmaddwd m1, m1
punpckhwd m5, m6
pmaddwd m5, m5
paddd m2, m1 ; h sumsq
paddd m3, m5
paddw m1, m0, [t2+r10+400*0]
paddd m4, m2, [t2+r10+400*2]
paddd m5, m3, [t2+r10+400*4]
mova [t2+r10+400*0], m0
mova [t2+r10+400*2], m2
mova [t2+r10+400*4], m3
paddd m4, m8
paddd m5, m8
psrld m4, 4 ; (a + 8) >> 4
psrld m5, 4
pslld m2, m4, 3
pslld m3, m5, 3
paddd m4, m2 ; ((a + 8) >> 4) * 9
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m6 ; (b + 2) >> 2
punpcklwd m2, m3, m6
pmaddwd m2, m2
punpckhwd m3, m6
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
pmaxud m4, m2
psubd m4, m2 ; p
pmaxud m5, m3
psubd m5, m3
pmulld m4, m9 ; p * s
pmulld m5, m9
pmaddwd m0, m11 ; b * 455
pmaddwd m1, m11
paddw m4, m11
paddw m5, m11
psrld m4, 20 ; z + 1
psrld m5, 20
cvtdq2ps m4, m4
cvtdq2ps m5, m5
rcpps m2, m4 ; 1 / (z + 1)
rcpps m3, m5
pcmpgtd m4, m12, m4
pcmpgtd m5, m12, m5
mulps m2, m12 ; 256 / (z + 1)
mulps m3, m12
psrld m4, 24 ; z < 255 ? 255 : 0
psrld m5, 24
cvtps2dq m2, m2
cvtps2dq m3, m3
pminsw m2, m4 ; x
pminsw m3, m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m13
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*2 +4], m2
mova [t3+r10*2+400*4+ 8], xm0
vextracti128 [t3+r10*2+400*4+40], m0, 1
mova [t3+r10*2+400*4+24], xm1
vextracti128 [t3+r10*2+400*4+56], m1, 1
add r10, 32
jl .hv1_loop
mov r10, t2
mov t2, t1
mov t1, r10
ret
.v0: ; vertical boxsums + ab (even rows)
lea r10, [wq-4]
.v0_loop:
mova m0, [t1+r10+400*0]
mova m4, [t1+r10+400*2]
mova m5, [t1+r10+400*4]
paddw m0, m0
paddd m4, m4
paddd m5, m5
paddw m1, m0, [t2+r10+400*0]
paddd m2, m4, [t2+r10+400*2]
paddd m3, m5, [t2+r10+400*4]
mova [t2+r10+400*0], m0
mova [t2+r10+400*2], m4
mova [t2+r10+400*4], m5
paddd m2, m8
paddd m3, m8
psrld m2, 4 ; (a + 8) >> 4
psrld m3, 4
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a + 8) >> 4) * 9
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m6 ; (b + 2) >> 2
punpcklwd m2, m3, m6
pmaddwd m2, m2
punpckhwd m3, m6
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
pmaxud m4, m2
psubd m4, m2 ; p
pmaxud m5, m3
psubd m5, m3
pmulld m4, m9 ; p * s
pmulld m5, m9
pmaddwd m0, m11 ; b * 455
pmaddwd m1, m11
paddw m4, m11
paddw m5, m11
psrld m4, 20 ; z + 1
psrld m5, 20
cvtdq2ps m4, m4
cvtdq2ps m5, m5
rcpps m2, m4 ; 1 / (z + 1)
rcpps m3, m5
pcmpgtd m4, m12, m4
pcmpgtd m5, m12, m5
mulps m2, m12 ; 256 / (z + 1)
mulps m3, m12
psrld m4, 24 ; z < 255 ? 255 : 0
psrld m5, 24
cvtps2dq m2, m2
cvtps2dq m3, m3
pminsw m2, m4 ; x
pminsw m3, m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m13
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*0+ 4], m2
mova [t3+r10*2+400*0+ 8], xm0
vextracti128 [t3+r10*2+400*0+40], m0, 1
mova [t3+r10*2+400*0+24], xm1
vextracti128 [t3+r10*2+400*0+56], m1, 1
add r10, 32
jl .v0_loop
ret
.v1: ; vertical boxsums + ab (odd rows)
lea r10, [wq-4]
.v1_loop:
mova m0, [t1+r10+400*0]
mova m4, [t1+r10+400*2]
mova m5, [t1+r10+400*4]
paddw m1, m0, [t2+r10+400*0]
paddd m2, m4, [t2+r10+400*2]
paddd m3, m5, [t2+r10+400*4]
mova [t2+r10+400*0], m0
mova [t2+r10+400*2], m4
mova [t2+r10+400*4], m5
paddd m2, m8
paddd m3, m8
psrld m2, 4 ; (a + 8) >> 4
psrld m3, 4
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a + 8) >> 4) * 9
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m6 ; (b + 2) >> 2
punpcklwd m2, m3, m6
pmaddwd m2, m2
punpckhwd m3, m6
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
pmaxud m4, m2
psubd m4, m2 ; p
pmaxud m5, m3
psubd m5, m3
pmulld m4, m9 ; p * s
pmulld m5, m9
pmaddwd m0, m11 ; b * 455
pmaddwd m1, m11
paddw m4, m11
paddw m5, m11
psrld m4, 20 ; z + 1
psrld m5, 20
cvtdq2ps m4, m4
cvtdq2ps m5, m5
rcpps m2, m4 ; 1 / (z + 1)
rcpps m3, m5
pcmpgtd m4, m12, m4
pcmpgtd m5, m12, m5
mulps m2, m12 ; 256 / (z + 1)
mulps m3, m12
psrld m4, 24 ; z < 255 ? 255 : 0
psrld m5, 24
cvtps2dq m2, m2
cvtps2dq m3, m3
pminsw m2, m4 ; x
pminsw m3, m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m13
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*2+ 4], m2
mova [t3+r10*2+400*4+ 8], xm0
vextracti128 [t3+r10*2+400*4+40], m0, 1
mova [t3+r10*2+400*4+24], xm1
vextracti128 [t3+r10*2+400*4+56], m1, 1
add r10, 32
jl .v1_loop
mov r10, t2
mov t2, t1
mov t1, r10
ret
.prep_n: ; initial neighbor setup
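; 3x3 neighbor weighting: "444" = 4*(l + c + r), "343" = 3l + 4c + 3r; each
; output row combines the 343 of the rows above and below with its own 444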
mov r10, wq
.prep_n_loop:
mova xm0, [t4+r10*1+400*0+0]
paddw xm0, [t4+r10*1+400*0+4]
paddw xm2, xm0, [t4+r10*1+400*0+2]
mova m1, [t3+r10*2+400*0+0]
paddd m1, [t3+r10*2+400*0+8]
paddd m3, m1, [t3+r10*2+400*0+4]
psllw xm2, 2 ; a[-1] 444
pslld m3, 2 ; b[-1] 444
psubw xm2, xm0 ; a[-1] 343
psubd m3, m1 ; b[-1] 343
mova [t4+r10*1+400* 4], xm2
mova [t3+r10*2+400* 8], m3
mova xm0, [t4+r10*1+400*2+0]
paddw xm0, [t4+r10*1+400*2+4]
paddw xm2, xm0, [t4+r10*1+400*2+2]
mova m1, [t3+r10*2+400*4+0]
paddd m1, [t3+r10*2+400*4+8]
paddd m3, m1, [t3+r10*2+400*4+4]
psllw xm2, 2 ; a[ 0] 444
pslld m3, 2 ; b[ 0] 444
mova [t4+r10*1+400* 6], xm2
mova [t3+r10*2+400*12], m3
psubw xm2, xm0 ; a[ 0] 343
psubd m3, m1 ; b[ 0] 343
mova [t4+r10*1+400* 8], xm2
mova [t3+r10*2+400*16], m3
add r10, 16
jl .prep_n_loop
ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
mov r10, wq
.n0_loop:
mova m3, [t4+r10*1+400*0+0]
paddw m3, [t4+r10*1+400*0+4]
paddw m1, m3, [t4+r10*1+400*0+2]
psllw m1, 2 ; a[ 1] 444
psubw m2, m1, m3 ; a[ 1] 343
paddw m3, m2, [t4+r10*1+400*4]
paddw m3, [t4+r10*1+400*6]
mova [t4+r10*1+400*4], m2
mova [t4+r10*1+400*6], m1
mova m4, [t3+r10*2+400*0+0]
paddd m4, [t3+r10*2+400*0+8]
paddd m1, m4, [t3+r10*2+400*0+4]
pslld m1, 2 ; b[ 1] 444
psubd m2, m1, m4 ; b[ 1] 343
paddd m4, m2, [t3+r10*2+400* 8+ 0]
paddd m4, [t3+r10*2+400*12+ 0]
mova [t3+r10*2+400* 8+ 0], m2
mova [t3+r10*2+400*12+ 0], m1
mova m5, [t3+r10*2+400*0+32]
paddd m5, [t3+r10*2+400*0+40]
paddd m1, m5, [t3+r10*2+400*0+36]
pslld m1, 2
psubd m2, m1, m5
paddd m5, m2, [t3+r10*2+400* 8+32]
paddd m5, [t3+r10*2+400*12+32]
mova [t3+r10*2+400* 8+32], m2
mova [t3+r10*2+400*12+32], m1
mova m0, [dstq+r10]
punpcklwd m1, m0, m6
punpcklwd m2, m3, m6
pmaddwd m2, m1 ; a * src
punpckhwd m1, m0, m6
punpckhwd m3, m6
pmaddwd m3, m1
vinserti128 m1, m4, xm5, 1
vperm2i128 m4, m5, 0x31
psubd m1, m2 ; b - a * src + (1 << 8)
psubd m4, m3
psrad m1, 9
psrad m4, 9
packssdw m1, m4
pmulhrsw m1, m7
paddw m0, m1
pmaxsw m0, m6
pminsw m0, m14
mova [dstq+r10], m0
add r10, 32
jl .n0_loop
add dstq, strideq
ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
mov r10, wq
.n1_loop:
mova m3, [t4+r10*1+400*2+0]
paddw m3, [t4+r10*1+400*2+4]
paddw m1, m3, [t4+r10*1+400*2+2]
psllw m1, 2 ; a[ 1] 444
psubw m2, m1, m3 ; a[ 1] 343
paddw m3, m2, [t4+r10*1+400*6]
paddw m3, [t4+r10*1+400*8]
mova [t4+r10*1+400*6], m1
mova [t4+r10*1+400*8], m2
mova m4, [t3+r10*2+400*4+0]
paddd m4, [t3+r10*2+400*4+8]
paddd m1, m4, [t3+r10*2+400*4+4]
pslld m1, 2 ; b[ 1] 444
psubd m2, m1, m4 ; b[ 1] 343
paddd m4, m2, [t3+r10*2+400*12+ 0]
paddd m4, [t3+r10*2+400*16+ 0]
mova [t3+r10*2+400*12+ 0], m1
mova [t3+r10*2+400*16+ 0], m2
mova m5, [t3+r10*2+400*4+32]
paddd m5, [t3+r10*2+400*4+40]
paddd m1, m5, [t3+r10*2+400*4+36]
pslld m1, 2
psubd m2, m1, m5
paddd m5, m2, [t3+r10*2+400*12+32]
paddd m5, [t3+r10*2+400*16+32]
mova [t3+r10*2+400*12+32], m1
mova [t3+r10*2+400*16+32], m2
mova m0, [dstq+r10]
punpcklwd m1, m0, m6
punpcklwd m2, m3, m6
pmaddwd m2, m1 ; a * src
punpckhwd m1, m0, m6
punpckhwd m3, m6
pmaddwd m3, m1
vinserti128 m1, m4, xm5, 1
vperm2i128 m4, m5, 0x31
psubd m1, m2 ; b - a * src + (1 << 8)
psubd m4, m3
psrad m1, 9
psrad m4, 9
packssdw m1, m4
pmulhrsw m1, m7
paddw m0, m1
pmaxsw m0, m6
pminsw m0, m14
mova [dstq+r10], m0
add r10, 32
jl .n1_loop
add dstq, strideq
ret
cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
w, h, edge, params
movifnidn wd, wm
mov paramsq, r6mp
lea r13, [pb_m10_m9]
add wd, wd
movifnidn hd, hm
mov edged, r7m
add lpfq, wq
vpbroadcastd m15, [paramsq+8] ; w0 w1
add dstq, wq
vpbroadcastd m13, [paramsq+0] ; s0
lea t3, [rsp+wq*2+400*24+8]
vpbroadcastd m14, [paramsq+4] ; s1
lea t4, [rsp+wq+400*52+8]
vpbroadcastd m9, [base+pd_8]
lea t1, [rsp+wq+12]
vpbroadcastd m10, [base+pd_34816]
neg wq
vbroadcastss m11, [base+pf_256]
pxor m7, m7
vpbroadcastd m12, [base+pw_455_24]
psllw m15, 2
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t2, t1
call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
add t1, 400*12
call .h_top
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
add r10, strideq
mov [rsp], r10 ; below
call .hv0
.main:
dec hd
jz .height1
add lpfq, strideq
call .hv1
call .prep_n
sub hd, 2
jl .extend_bottom
.main_loop:
add lpfq, strideq
call .hv0
test hd, hd
jz .odd_height
add lpfq, strideq
call .hv1
call .n0
call .n1
sub hd, 2
jge .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .extend_bottom
mov lpfq, [rsp]
call .hv0_bottom
add lpfq, strideq
call .hv1_bottom
.end:
call .n0
call .n1
.end2:
RET
.height1:
call .v1
call .prep_n
jmp .odd_height_end
.odd_height:
call .v1
call .n0
call .n1
.odd_height_end:
call .v0
call .v1
call .n0
jmp .end2
.extend_bottom:
call .v0
call .v1
jmp .end
.no_top:
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov [rsp], r10
call .h
lea r10, [wq-4]
lea t2, [t1+400*12]
.top_fixup_loop:
mova m0, [t1+r10+400* 0]
mova m1, [t1+r10+400* 2]
mova m2, [t1+r10+400* 4]
paddw m0, m0
mova m3, [t1+r10+400* 6]
paddd m1, m1
mova m4, [t1+r10+400* 8]
paddd m2, m2
mova m5, [t1+r10+400*10]
mova [t2+r10+400* 0], m0
mova [t2+r10+400* 2], m1
mova [t2+r10+400* 4], m2
mova [t2+r10+400* 6], m3
mova [t2+r10+400* 8], m4
mova [t2+r10+400*10], m5
add r10, 32
jl .top_fixup_loop
call .v0
jmp .main
.h: ; horizontal boxsum
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
vpbroadcastq xm5, [leftq]
vinserti128 m5, [lpfq+wq], 1
mova m4, [lpfq+wq]
add leftq, 8
palignr m4, m5, 10
jmp .h_main
.h_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, [base+sgr_lshuf5]
vinserti128 m4, [lpfq+wq+10], 1
jmp .h_main
.h_top:
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu m4, [lpfq+r10- 2]
.h_main:
movu m5, [lpfq+r10+14]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp r10d, -36
jl .h_have_right
call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.h_have_right:
palignr m3, m5, m4, 2
palignr m0, m5, m4, 4
paddw m1, m3, m0
punpcklwd m2, m3, m0
pmaddwd m2, m2
punpckhwd m3, m0
pmaddwd m3, m3
palignr m0, m5, m4, 6
paddw m1, m0 ; sum3
punpcklwd m6, m0, m7
pmaddwd m6, m6
punpckhwd m0, m7
pmaddwd m0, m0
paddd m2, m6 ; sumsq3
shufpd m6, m4, m5, 0x05
punpcklwd m5, m6, m4
paddw m8, m4, m6
pmaddwd m5, m5
punpckhwd m6, m4
pmaddwd m6, m6
paddd m3, m0
mova [t1+r10+400* 6], m1
mova [t1+r10+400* 8], m2
mova [t1+r10+400*10], m3
paddw m8, m1 ; sum5
paddd m5, m2 ; sumsq5
paddd m6, m3
mova [t1+r10+400* 0], m8
mova [t1+r10+400* 2], m5
mova [t1+r10+400* 4], m6
add r10, 32
jl .h_loop
ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
vpbroadcastq xm5, [leftq]
vinserti128 m5, [lpfq+wq], 1
mova m4, [lpfq+wq]
add leftq, 8
palignr m4, m5, 10
jmp .hv0_main
.hv0_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, [base+sgr_lshuf5]
vinserti128 m4, [lpfq+wq+10], 1
jmp .hv0_main
.hv0_bottom:
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
.hv0_loop:
movu m4, [lpfq+r10- 2]
.hv0_main:
movu m5, [lpfq+r10+14]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv0_have_right
cmp r10d, -36
jl .hv0_have_right
call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv0_have_right:
palignr m3, m5, m4, 2
palignr m0, m5, m4, 4
paddw m1, m3, m0
punpcklwd m2, m3, m0
pmaddwd m2, m2
punpckhwd m3, m0
pmaddwd m3, m3
palignr m0, m5, m4, 6
paddw m1, m0 ; h sum3
punpcklwd m6, m0, m7
pmaddwd m6, m6
punpckhwd m0, m7
pmaddwd m0, m0
paddd m2, m6 ; h sumsq3
shufpd m6, m4, m5, 0x05
punpcklwd m5, m6, m4
paddw m8, m4, m6
pmaddwd m5, m5
punpckhwd m6, m4
pmaddwd m6, m6
paddd m3, m0
paddw m8, m1 ; h sum5
paddd m5, m2 ; h sumsq5
paddd m6, m3
mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row
mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd
mova [t3+r10*2+400*0+40], m6
paddw m8, [t1+r10+400* 0]
paddd m5, [t1+r10+400* 2]
paddd m6, [t1+r10+400* 4]
mova [t1+r10+400* 0], m8
mova [t1+r10+400* 2], m5
mova [t1+r10+400* 4], m6
paddw m0, m1, [t1+r10+400* 6]
paddd m4, m2, [t1+r10+400* 8]
paddd m5, m3, [t1+r10+400*10]
mova [t1+r10+400* 6], m1
mova [t1+r10+400* 8], m2
mova [t1+r10+400*10], m3
paddw m1, m0, [t2+r10+400* 6]
paddd m2, m4, [t2+r10+400* 8]
paddd m3, m5, [t2+r10+400*10]
mova [t2+r10+400* 6], m0
mova [t2+r10+400* 8], m4
mova [t2+r10+400*10], m5
paddd m2, m9
paddd m3, m9
psrld m2, 4 ; (a3 + 8) >> 4
psrld m3, 4
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a3 + 8) >> 4) * 9
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m7 ; (b3 + 2) >> 2
punpcklwd m2, m3, m7
pmaddwd m2, m2
punpckhwd m3, m7
pmaddwd m3, m3
punpcklwd m0, m1, m7 ; b3
punpckhwd m1, m7
pmaxud m4, m2
psubd m4, m2 ; p3
pmaxud m5, m3
psubd m5, m3
pmulld m4, m14 ; p3 * s1
pmulld m5, m14
pmaddwd m0, m12 ; b3 * 455
pmaddwd m1, m12
paddw m4, m12
paddw m5, m12
psrld m4, 20 ; z3 + 1
psrld m5, 20
cvtdq2ps m4, m4
cvtdq2ps m5, m5
rcpps m2, m4 ; 1 / (z3 + 1)
rcpps m3, m5
pcmpgtd m4, m11, m4
pcmpgtd m5, m11, m5
mulps m2, m11 ; 256 / (z3 + 1)
mulps m3, m11
psrld m4, 24 ; z3 < 255 ? 255 : 0
psrld m5, 24
cvtps2dq m2, m2
cvtps2dq m3, m3
pminsw m2, m4 ; x3
pminsw m3, m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*2+ 4], m2
mova [t3+r10*2+400*4+ 8], xm0
vextracti128 [t3+r10*2+400*4+40], m0, 1
mova [t3+r10*2+400*4+24], xm1
vextracti128 [t3+r10*2+400*4+56], m1, 1
add r10, 32
jl .hv0_loop
ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
vpbroadcastq xm5, [leftq]
vinserti128 m5, [lpfq+wq], 1
mova m4, [lpfq+wq]
add leftq, 8
palignr m4, m5, 10
jmp .hv1_main
.hv1_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, [base+sgr_lshuf5]
vinserti128 m4, [lpfq+wq+10], 1
jmp .hv1_main
.hv1_bottom:
lea r10, [wq-4]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
.hv1_loop:
movu m4, [lpfq+r10- 2]
.hv1_main:
movu m5, [lpfq+r10+14]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv1_have_right
cmp r10d, -36
jl .hv1_have_right
call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv1_have_right:
palignr m6, m5, m4, 2
palignr m3, m5, m4, 4
paddw m2, m6, m3
punpcklwd m0, m6, m3
pmaddwd m0, m0
punpckhwd m6, m3
pmaddwd m6, m6
palignr m3, m5, m4, 6
paddw m2, m3 ; h sum3
punpcklwd m1, m3, m7
pmaddwd m1, m1
punpckhwd m3, m7
pmaddwd m3, m3
paddd m0, m1 ; h sumsq3
shufpd m1, m4, m5, 0x05
punpckhwd m5, m4, m1
paddw m8, m4, m1
pmaddwd m5, m5
punpcklwd m4, m1
pmaddwd m4, m4
paddd m6, m3
paddw m1, m2, [t2+r10+400* 6]
mova [t2+r10+400* 6], m2
paddw m8, m2 ; h sum5
paddd m2, m0, [t2+r10+400* 8]
paddd m3, m6, [t2+r10+400*10]
mova [t2+r10+400* 8], m0
mova [t2+r10+400*10], m6
paddd m4, m0 ; h sumsq5
paddd m5, m6
paddd m2, m9
paddd m3, m9
psrld m2, 4 ; (a3 + 8) >> 4
psrld m3, 4
pslld m0, m2, 3
pslld m6, m3, 3
paddd m2, m0 ; ((a3 + 8) >> 4) * 9
paddd m3, m6
psrlw m6, m1, 1
pavgw m6, m7 ; (b3 + 2) >> 2
punpcklwd m0, m6, m7
pmaddwd m0, m0
punpckhwd m6, m7
pmaddwd m6, m6
pmaxud m2, m0
psubd m2, m0 ; p3
pmaxud m3, m6
psubd m3, m6
punpcklwd m0, m1, m7 ; b3
punpckhwd m1, m7
pmulld m2, m14 ; p3 * s1
pmulld m3, m14
pmaddwd m0, m12 ; b3 * 455
pmaddwd m1, m12
paddw m2, m12
paddw m3, m12
psrld m2, 20 ; z + 1
psrld m3, 20
cvtdq2ps m2, m2
cvtdq2ps m3, m3
rcpps m6, m2 ; 1 / (z + 1)
rcpps m7, m3
pcmpgtd m2, m11, m2
pcmpgtd m3, m11, m3
mulps m6, m11 ; 256 / (z + 1)
mulps m7, m11
psrld m2, 24 ; z < 255 ? 255 : 0
psrld m3, 24
cvtps2dq m6, m6
cvtps2dq m7, m7
pminsw m6, m2 ; x
pminsw m7, m3
pmulld m0, m6
packssdw m6, m7
pmulld m7, m1
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m7, m10
psrld m0, 12
psrld m7, 12
paddw m1, m8, [t2+r10+400*0]
paddd m2, m4, [t2+r10+400*2]
paddd m3, m5, [t2+r10+400*4]
paddw m1, [t1+r10+400*0]
paddd m2, [t1+r10+400*2]
paddd m3, [t1+r10+400*4]
mova [t2+r10+400*0], m8
mova [t2+r10+400*2], m4
mova [t2+r10+400*4], m5
mova [t4+r10*1+400*4 +4], m6
mova [t3+r10*2+400*8+ 8], xm0
vextracti128 [t3+r10*2+400*8+40], m0, 1
mova [t3+r10*2+400*8+24], xm7
vextracti128 [t3+r10*2+400*8+56], m7, 1
vpbroadcastd m4, [base+pd_25]
vpbroadcastd m6, [base+pw_164_24]
pxor m7, m7
paddd m2, m9
paddd m3, m9
psrld m2, 4 ; (a5 + 8) >> 4
psrld m3, 4
pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
pmulld m3, m4
psrlw m5, m1, 1
pavgw m5, m7 ; (b5 + 2) >> 2
punpcklwd m4, m5, m7
pmaddwd m4, m4
punpckhwd m5, m7
pmaddwd m5, m5
punpcklwd m0, m1, m7 ; b5
punpckhwd m1, m7
pmaxud m2, m4
psubd m2, m4 ; p5
pmaxud m3, m5
psubd m3, m5
pmulld m2, m13 ; p5 * s0
pmulld m3, m13
pmaddwd m0, m6 ; b5 * 164
pmaddwd m1, m6
paddw m2, m6
paddw m3, m6
psrld m2, 20 ; z5 + 1
psrld m3, 20
cvtdq2ps m2, m2
cvtdq2ps m3, m3
rcpps m4, m2 ; 1 / (z5 + 1)
rcpps m5, m3
pcmpgtd m2, m11, m2
pcmpgtd m3, m11, m3
mulps m4, m11 ; 256 / (z5 + 1)
mulps m5, m11
psrld m2, 24 ; z5 < 255 ? 255 : 0
psrld m3, 24
cvtps2dq m4, m4
cvtps2dq m5, m5
pminsw m4, m2 ; x5
pminsw m5, m3
pmulld m0, m4
pmulld m1, m5
packssdw m4, m5
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m1, m10
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*0+ 4], m4
mova [t3+r10*2+400*0+ 8], xm0
vextracti128 [t3+r10*2+400*0+40], m0, 1
mova [t3+r10*2+400*0+24], xm1
vextracti128 [t3+r10*2+400*0+56], m1, 1
add r10, 32
jl .hv1_loop
mov r10, t2
mov t2, t1
mov t1, r10
ret
.v0: ; vertical boxsums + ab3 (even rows)
lea r10, [wq-4]
.v0_loop:
mova m0, [t1+r10+400* 6]
mova m4, [t1+r10+400* 8]
mova m5, [t1+r10+400*10]
paddw m0, m0
paddd m4, m4
paddd m5, m5
paddw m1, m0, [t2+r10+400* 6]
paddd m2, m4, [t2+r10+400* 8]
paddd m3, m5, [t2+r10+400*10]
mova [t2+r10+400* 6], m0
mova [t2+r10+400* 8], m4
mova [t2+r10+400*10], m5
paddd m2, m9
paddd m3, m9
psrld m2, 4 ; (a3 + 8) >> 4
psrld m3, 4
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a3 + 8) >> 4) * 9
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m7 ; (b3 + 2) >> 2
punpcklwd m2, m3, m7
pmaddwd m2, m2
punpckhwd m3, m7
pmaddwd m3, m3
punpcklwd m0, m1, m7 ; b3
punpckhwd m1, m7
pmaxud m4, m2
psubd m4, m2 ; p3
pmaxud m5, m3
psubd m5, m3
pmulld m4, m14 ; p3 * s1
pmulld m5, m14
pmaddwd m0, m12 ; b3 * 455
pmaddwd m1, m12
paddw m4, m12
paddw m5, m12
psrld m4, 20 ; z + 1
psrld m5, 20
cvtdq2ps m4, m4
cvtdq2ps m5, m5
rcpps m2, m4 ; 1 / (z + 1)
rcpps m3, m5
pcmpgtd m4, m11, m4
pcmpgtd m5, m11, m5
mulps m2, m11 ; 256 / (z + 1)
mulps m3, m11
psrld m4, 24 ; z < 255 ? 255 : 0
psrld m5, 24
cvtps2dq m2, m2
cvtps2dq m3, m3
pminsw m2, m4 ; x
pminsw m3, m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
psrld m0, 12
psrld m1, 12
mova m3, [t1+r10+400*0]
mova m4, [t1+r10+400*2]
mova m5, [t1+r10+400*4]
mova [t3+r10*2+400*8+ 8], m3
mova [t3+r10*2+400*0+ 8], m4
mova [t3+r10*2+400*0+40], m5
paddw m3, m3 ; cc5
paddd m4, m4
paddd m5, m5
mova [t1+r10+400*0], m3
mova [t1+r10+400*2], m4
mova [t1+r10+400*4], m5
mova [t4+r10*1+400*2+ 4], m2
mova [t3+r10*2+400*4+ 8], xm0
vextracti128 [t3+r10*2+400*4+40], m0, 1
mova [t3+r10*2+400*4+24], xm1
vextracti128 [t3+r10*2+400*4+56], m1, 1
add r10, 32
jl .v0_loop
ret
.v1: ; vertical boxsums + ab (odd rows)
lea r10, [wq-4]
.v1_loop:
mova m4, [t1+r10+400* 6]
mova m5, [t1+r10+400* 8]
mova m6, [t1+r10+400*10]
paddw m1, m4, [t2+r10+400* 6]
paddd m2, m5, [t2+r10+400* 8]
paddd m3, m6, [t2+r10+400*10]
mova [t2+r10+400* 6], m4
mova [t2+r10+400* 8], m5
mova [t2+r10+400*10], m6
paddd m2, m9
paddd m3, m9
psrld m2, 4 ; (a3 + 8) >> 4
psrld m3, 4
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a3 + 8) >> 4) * 9
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m7 ; (b3 + 2) >> 2
punpcklwd m2, m3, m7
pmaddwd m2, m2
punpckhwd m3, m7
pmaddwd m3, m3
punpcklwd m0, m1, m7 ; b3
punpckhwd m1, m7
pmaxud m4, m2
psubd m4, m2 ; p3
pmaxud m5, m3
psubd m5, m3
pmulld m4, m14 ; p3 * s1
pmulld m5, m14
pmaddwd m0, m12 ; b3 * 455
pmaddwd m1, m12
paddw m4, m12
paddw m5, m12
psrld m4, 20 ; z + 1
psrld m5, 20
cvtdq2ps m4, m4
cvtdq2ps m5, m5
rcpps m2, m4 ; 1 / (z + 1)
rcpps m3, m5
pcmpgtd m4, m11, m4
pcmpgtd m5, m11, m5
mulps m2, m11 ; 256 / (z + 1)
mulps m3, m11
psrld m4, 24 ; z < 255 ? 255 : 0
psrld m5, 24
cvtps2dq m2, m2
cvtps2dq m3, m3
pminsw m2, m4 ; x
pminsw m3, m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
psrld m0, 12
psrld m8, m1, 12
mova [t4+r10*1+400*4+4], m2
mova m4, [t3+r10*2+400*8+ 8]
mova m5, [t3+r10*2+400*0+ 8]
mova m6, [t3+r10*2+400*0+40]
paddw m1, m4, [t2+r10+400*0]
paddd m2, m5, [t2+r10+400*2]
paddd m3, m6, [t2+r10+400*4]
paddw m1, [t1+r10+400*0]
paddd m2, [t1+r10+400*2]
paddd m3, [t1+r10+400*4]
mova [t2+r10+400*0], m4
mova [t2+r10+400*2], m5
mova [t2+r10+400*4], m6
mova [t3+r10*2+400*8+ 8], xm0
vextracti128 [t3+r10*2+400*8+40], m0, 1
mova [t3+r10*2+400*8+24], xm8
vextracti128 [t3+r10*2+400*8+56], m8, 1
vpbroadcastd m4, [base+pd_25]
vpbroadcastd m6, [base+pw_164_24]
paddd m2, m9
paddd m3, m9
psrld m2, 4 ; (a5 + 8) >> 4
psrld m3, 4
pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
pmulld m3, m4
psrlw m5, m1, 1
pavgw m5, m7 ; (b5 + 2) >> 2
punpcklwd m4, m5, m7
pmaddwd m4, m4
punpckhwd m5, m7
pmaddwd m5, m5
punpcklwd m0, m1, m7 ; b5
punpckhwd m1, m7
pmaxud m2, m4
psubd m2, m4 ; p5
pmaxud m3, m5
psubd m3, m5
pmulld m2, m13 ; p5 * s0
pmulld m3, m13
pmaddwd m0, m6 ; b5 * 164
pmaddwd m1, m6
paddw m2, m6
paddw m3, m6
psrld m2, 20 ; z5 + 1
psrld m3, 20
cvtdq2ps m2, m2
cvtdq2ps m3, m3
rcpps m4, m2 ; 1 / (z5 + 1)
rcpps m5, m3
pcmpgtd m2, m11, m2
pcmpgtd m3, m11, m3
mulps m4, m11 ; 256 / (z5 + 1)
mulps m5, m11
psrld m2, 24 ; z5 < 255 ? 255 : 0
psrld m3, 24
cvtps2dq m4, m4
cvtps2dq m5, m5
pminsw m4, m2 ; x5
pminsw m5, m3
pmulld m0, m4
pmulld m1, m5
packssdw m4, m5
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m1, m10
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*0+ 4], m4
mova [t3+r10*2+400*0+ 8], xm0
vextracti128 [t3+r10*2+400*0+40], m0, 1
mova [t3+r10*2+400*0+24], xm1
vextracti128 [t3+r10*2+400*0+56], m1, 1
add r10, 32
jl .v1_loop
mov r10, t2
mov t2, t1
mov t1, r10
ret
.prep_n: ; initial neighbor setup
mov r10, wq
.prep_n_loop:
movu xm0, [t4+r10*1+400*0+2]
paddw xm2, xm0, [t4+r10*1+400*0+0]
paddw xm2, [t4+r10*1+400*0+4]
movu m1, [t3+r10*2+400*0+4]
paddd m3, m1, [t3+r10*2+400*0+0]
paddd m3, [t3+r10*2+400*0+8]
paddw xm0, xm2
paddd m1, m3
psllw xm2, 2
pslld m3, 2
paddw xm0, xm2 ; a5 565
paddd m1, m3 ; b5 565
mova [t4+r10*1+400* 6], xm0
mova [t3+r10*2+400*12], m1
mova xm0, [t4+r10*1+400*2+0]
paddw xm0, [t4+r10*1+400*2+4]
paddw xm2, xm0, [t4+r10*1+400*2+2]
mova m1, [t3+r10*2+400*4+0]
paddd m1, [t3+r10*2+400*4+8]
paddd m3, m1, [t3+r10*2+400*4+4]
psllw xm2, 2 ; a3[-1] 444
pslld m3, 2 ; b3[-1] 444
psubw xm2, xm0 ; a3[-1] 343
psubd m3, m1 ; b3[-1] 343
mova [t4+r10*1+400* 8], xm2
mova [t3+r10*2+400*16], m3
mova xm0, [t4+r10*1+400*4+0]
paddw xm0, [t4+r10*1+400*4+4]
paddw xm2, xm0, [t4+r10*1+400*4+2]
mova m1, [t3+r10*2+400*8+0]
paddd m1, [t3+r10*2+400*8+8]
paddd m3, m1, [t3+r10*2+400*8+4]
psllw xm2, 2 ; a3[ 0] 444
pslld m3, 2 ; b3[ 0] 444
mova [t4+r10*1+400*10], xm2
mova [t3+r10*2+400*20], m3
psubw xm2, xm0 ; a3[ 0] 343
psubd m3, m1 ; b3[ 0] 343
mova [t4+r10*1+400*12], xm2
mova [t3+r10*2+400*24], m3
add r10, 16
jl .prep_n_loop
ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
mov r10, wq
vpbroadcastd m6, [base+pd_4096]
.n0_loop:
movu xm2, [t4+r10*1+2]
paddw xm0, xm2, [t4+r10*1+0]
paddw xm0, [t4+r10*1+4]
paddw xm2, xm0
psllw xm0, 2
paddw xm0, xm2 ; a5
movu m1, [t3+r10*2+4]
paddd m4, m1, [t3+r10*2+0]
paddd m4, [t3+r10*2+8]
paddd m1, m4
pslld m4, 2
paddd m4, m1 ; b5
paddw xm2, xm0, [t4+r10*1+400* 6]
mova [t4+r10*1+400* 6], xm0
paddd m0, m4, [t3+r10*2+400*12]
mova [t3+r10*2+400*12], m4
mova xm3, [t4+r10*1+400*2+0]
paddw xm3, [t4+r10*1+400*2+4]
paddw xm5, xm3, [t4+r10*1+400*2+2]
psllw xm5, 2 ; a3[ 1] 444
psubw xm4, xm5, xm3 ; a3[ 1] 343
paddw xm3, xm4, [t4+r10*1+400* 8]
paddw xm3, [t4+r10*1+400*10]
mova [t4+r10*1+400* 8], xm4
mova [t4+r10*1+400*10], xm5
mova m1, [t3+r10*2+400*4+0]
paddd m1, [t3+r10*2+400*4+8]
paddd m5, m1, [t3+r10*2+400*4+4]
pslld m5, 2 ; b3[ 1] 444
psubd m4, m5, m1 ; b3[ 1] 343
paddd m1, m4, [t3+r10*2+400*16]
paddd m1, [t3+r10*2+400*20]
mova [t3+r10*2+400*16], m4
mova [t3+r10*2+400*20], m5
pmovzxwd m4, [dstq+r10]
pmovzxwd m2, xm2 ; a5
pmovzxwd m3, xm3 ; a3
pmaddwd m2, m4 ; a5 * src
pmaddwd m3, m4 ; a3 * src
pslld m4, 13
psubd m0, m2 ; b5 - a5 * src + (1 << 8)
psubd m1, m3 ; b3 - a3 * src + (1 << 8)
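; align the 5x5 term into the low word and the 3x3 term into the high word
; of each dword so a single pmaddwd with m15 = (w0, w1) applies both
; self-guided weights at once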
psrld m0, 9
pslld m1, 7
pblendw m0, m1, 0xaa
pmaddwd m0, m15
paddd m4, m6
paddd m0, m4
psrad m0, 7
vextracti128 xm1, m0, 1
packusdw xm0, xm1 ; clip
psrlw xm0, 6
mova [dstq+r10], xm0
add r10, 16
jl .n0_loop
add dstq, strideq
ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
mov r10, wq
vpbroadcastd m6, [base+pd_4096]
.n1_loop:
mova xm3, [t4+r10*1+400*4+0]
paddw xm3, [t4+r10*1+400*4+4]
paddw xm5, xm3, [t4+r10*1+400*4+2]
psllw xm5, 2 ; a3[ 1] 444
psubw xm4, xm5, xm3 ; a3[ 1] 343
paddw xm3, xm4, [t4+r10*1+400*12]
paddw xm3, [t4+r10*1+400*10]
mova [t4+r10*1+400*10], xm5
mova [t4+r10*1+400*12], xm4
mova m1, [t3+r10*2+400*8+0]
paddd m1, [t3+r10*2+400*8+8]
paddd m5, m1, [t3+r10*2+400*8+4]
pslld m5, 2 ; b3[ 1] 444
psubd m4, m5, m1 ; b3[ 1] 343
paddd m1, m4, [t3+r10*2+400*24]
paddd m1, [t3+r10*2+400*20]
mova [t3+r10*2+400*20], m5
mova [t3+r10*2+400*24], m4
pmovzxwd m4, [dstq+r10]
pmovzxwd m2, [t4+r10*1+400* 6]
pmovzxwd m3, xm3
mova m0, [t3+r10*2+400*12]
pmaddwd m2, m4 ; a5 * src
pmaddwd m3, m4 ; a3 * src
pslld m4, 13
psubd m0, m2 ; b5 - a5 * src + (1 << 8)
psubd m1, m3 ; b3 - a3 * src + (1 << 8)
psrld m0, 8
pslld m1, 7
pblendw m0, m1, 0xaa
pmaddwd m0, m15
paddd m4, m6
paddd m0, m4
psrad m0, 7
vextracti128 xm1, m0, 1
packusdw xm0, xm1 ; clip
psrlw xm0, 6
mova [dstq+r10], xm0
add r10, 16
jl .n1_loop
add dstq, strideq
ret
%endif ; ARCH_X86_64