; Copyright © 2019-2021, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"
SECTION_RODATA
pw_1024: times 8 dw 1024
pb_27_17_17_27: db 27, 17, 17, 27
times 6 db 0, 32
pb_23_22_h: db 23, 22
times 7 db 0, 32
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
pb_23_22: times 8 db 23, 22
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
pw_seed_xor: times 2 dw 0xb524
times 2 dw 0x49d8
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16
round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
pw_1: dw 1
%macro JMP_TABLE 2-*
%xdefine %1_8bpc_%2_table %%table
%xdefine %%base %1_8bpc_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dd %%prefix %+ .ar%3 - %%base
%rotate 1
%endrep
%endmacro
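; For the first invocation below this expands (modulo name mangling via
; private_prefix) to a table of four 32-bit self-relative entries:
;   generate_grain_y_8bpc_ssse3_table:
;       dd generate_grain_y_8bpc_ssse3.ar0 - generate_grain_y_8bpc_ssse3_table
;       dd generate_grain_y_8bpc_ssse3.ar1 - generate_grain_y_8bpc_ssse3_table
;       dd generate_grain_y_8bpc_ssse3.ar2 - generate_grain_y_8bpc_ssse3_table
;       dd generate_grain_y_8bpc_ssse3.ar3 - generate_grain_y_8bpc_ssse3_table
; the dispatch code indexes it with FGData.ar_coeff_lag and adds the table
; address back to form an absolute jump target.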
JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3

SECTION .text
%if ARCH_X86_32
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif
%macro SCRATCH 3
%if ARCH_X86_32
mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
SWAP %1, %2
%endif
%endmacro
INIT_XMM ssse3
cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
LEA r4, $$
%define base r4-$$
movq m1, [base+rnd_next_upperbit_mask]
movq m4, [base+mul_bits]
movq m7, [base+hmul_bits]
mov r2d, [fg_dataq+FGData.grain_scale_shift]
movd m2, [base+round+r2*2]
movd m0, [fg_dataq+FGData.seed]
mova m5, [base+pb_mask]
pshuflw m2, m2, q0000
pshuflw m0, m0, q0000
mov r2, -73*82
sub bufq, r2
lea r3, [base+gaussian_sequence]
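; The loop below runs the AV1 grain PRNG for 4 seed states at once; per
; seed, a scalar sketch of one step is:
;   bit = (s >> 0) ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)  ; lfsr taps 0,1,3,12
;   s   = (s >> 1) | ((bit & 1) << 15)
;   out = gaussian_sequence[s >> 5]                    ; 11-bit index
; rnd_next_upperbit_mask holds the tap mask 0x100b shifted left by 0..3,
; pb_mask is a 4-bit parity lookup table (0x80 for odd-parity indices) used
; by pshufb to derive each feedback bit, and hmul_bits provides the 4
; successively shifted copies of the base state.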
.loop:
pand m6, m0, m1
psrlw m3, m6, 10
por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
pmullw m6, m4 ; bits 0x0f00 are set
pshufb m3, m5, m6 ; set 15th bit for next 4 seeds
psllq m6, m3, 30
por m3, m6
psllq m6, m3, 15
por m3, m6 ; aggregate each bit into next seed's high bit
pmulhuw m6, m0, m7
por m3, m6 ; 4 next output seeds
pshuflw m0, m3, q3333
psrlw m3, 5
%if ARCH_X86_64
movq r6, m3
mov r8, r6
movzx r5d, r6w
shr r6d, 16
shr r8, 32
movzx r7, r8w
shr r8, 16
movd m6, [r3+r5*2]
pinsrw m6, [r3+r6*2], 1
pinsrw m6, [r3+r7*2], 2
pinsrw m6, [r3+r8*2], 3
%else
movd r6, m3
pshuflw m3, m3, q3232
movzx r5, r6w
shr r6, 16
movd m6, [r3+r5*2]
pinsrw m6, [r3+r6*2], 1
movd r6, m3
movzx r5, r6w
shr r6, 16
pinsrw m6, [r3+r5*2], 2
pinsrw m6, [r3+r6*2], 3
%endif
pmulhrsw m6, m2
packsswb m6, m6
movd [bufq+r2], m6
add r2, 4
jl .loop
; auto-regression code
movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
jmp r2
.ar1:
%if ARCH_X86_32
DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
%elif WIN64
DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
mov bufq, r0
%else
DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
%endif
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
movd m4, [fg_dataq+FGData.ar_coeffs_y]
mov ecx, [fg_dataq+FGData.ar_coeff_shift]
%if ARCH_X86_32
mov r1m, cf3d
DEFINE_ARGS buf, shift, val3, min, max, x, val0
%define hd r0mp
%define cf3d r1mp
%elif WIN64
DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
%else
DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
%endif
pxor m6, m6
pcmpgtb m7, m6, m4
punpcklbw m4, m7
pinsrw m4, [base+pw_1], 3
pshufd m5, m4, q1111
pshufd m4, m4, q0000
movd m3, [base+round_vals+shiftq*2-12] ; rnd
pshuflw m3, m3, q0000
sub bufq, 82*73-(82*3+79)
mov hd, 70
mov mind, -128
mov maxd, 127
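; AR(1) recurrence (scalar sketch; cf0..cf3 = FGData.ar_coeffs_y[0..3],
; rnd = the rounding constant loaded into m3):
;   sum     = cf0*g[y-1][x-1] + cf1*g[y-1][x] + cf2*g[y-1][x+1] + rnd
;   g[y][x] = clamp(((sum + cf3*g[y][x-1]) >> shift) + g[y][x], -128, 127)
; the three top taps are vectorized with pmaddwd; the left tap is applied
; serially below since it depends on the previously produced pixel.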
.y_loop_ar1:
mov xq, -76
movsx val3d, byte [bufq+xq-1]
.x_loop_ar1:
movq m0, [bufq+xq-82-1] ; top/left
pcmpgtb m7, m6, m0
punpcklbw m0, m7
psrldq m2, m0, 2 ; top
psrldq m1, m0, 4 ; top/right
punpcklwd m0, m2
punpcklwd m1, m3
pmaddwd m0, m4
pmaddwd m1, m5
paddd m0, m1
.x_loop_ar1_inner:
movd val0d, m0
psrldq m0, 4
imul val3d, cf3d
add val3d, val0d
sar val3d, shiftb
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
cmovns val3d, maxd
cmp val3d, mind
cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq
jz .x_loop_ar1_end
test xq, 3
jnz .x_loop_ar1_inner
jmp .x_loop_ar1
.x_loop_ar1_end:
add bufq, 82
dec hd
jg .y_loop_ar1
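; no RET here: the ar1 path falls through to the RET shared with .ar0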
.ar0:
RET
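; AR(2): 12 coefficients covering rows y-2 and y-1 at x-2..x+2 plus the
; two pixels left of the output. The previous-row taps are independent of
; the current row, so they are summed 4 output pixels at a time; only the
; x-2/x-1 taps run in the serial inner loop.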
.ar2:
%if ARCH_X86_32
ALLOC_STACK -16*8
%endif
DEFINE_ARGS buf, fg_data, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd m6, [base+round_vals-12+shiftq*2]
movd m7, [base+byte_blend+1]
SCRATCH 7, 15, 7
movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
pxor m7, m7
pshuflw m6, m6, q0000
punpcklwd m6, m7
pcmpgtb m4, m7, m0
pcmpgtb m5, m7, m1
punpcklbw m0, m4
punpcklbw m1, m5
DEFINE_ARGS buf, fg_data, h, x
pshufd m4, m1, q0000
pshufd m5, m1, q1111
pshufd m3, m0, q3333
pshufd m2, m0, q2222
pshufd m1, m0, q1111
pshufd m0, m0, q0000
SCRATCH 0, 8, 0
SCRATCH 1, 9, 1
SCRATCH 2, 10, 2
SCRATCH 3, 11, 3
SCRATCH 4, 12, 4
SCRATCH 5, 13, 5
SCRATCH 6, 14, 6
sub bufq, 82*73-(82*3+79)
mov hd, 70
.y_loop_ar2:
mov xq, -76
.x_loop_ar2:
movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
pcmpgtb m2, m7, m0
punpckhbw m1, m0, m2
punpcklbw m0, m2
psrldq m5, m0, 2 ; y=-2,x=[-1,+5]
psrldq m3, m1, 2 ; y=-1,x=[-1,+5]
psrldq m4, m1, 4 ; y=-1,x=[+0,+5]
punpcklwd m2, m0, m5
punpcklwd m3, m4
pmaddwd m2, m8
pmaddwd m3, m11
paddd m2, m3
psrldq m4, m0, 4 ; y=-2,x=[+0,+5]
psrldq m5, m0, 6 ; y=-2,x=[+1,+5]
psrldq m6, m0, 8 ; y=-2,x=[+2,+5]
punpcklwd m4, m5
punpcklwd m6, m1
psrldq m5, m1, 6 ; y=-1,x=[+1,+5]
psrldq m1, m1, 8 ; y=-1,x=[+2,+5]
punpcklwd m5, m1
pmaddwd m4, m9
pmaddwd m6, m10
pmaddwd m5, m12
paddd m4, m6
paddd m2, m5
paddd m2, m4
paddd m2, m14
movq m0, [bufq+xq-2] ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
pcmpgtb m4, m7, m0
punpcklbw m1, m0, m4
pmaddwd m3, m1, m13
paddd m3, m2
psrldq m1, 4 ; y=0,x=0
psrldq m2, 4 ; shift top to next pixel
psrad m3, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw since we only care about one value
paddw m3, m1
packsswb m3, m3
pslldq m3, 2
pand m3, m15
pandn m1, m15, m0
por m0, m1, m3
psrldq m0, 1
; overwrite 2 pixels, but that's ok
movd [bufq+xq-1], m0
inc xq
jz .x_loop_ar2_end
test xq, 3
jnz .x_loop_ar2_inner
jmp .x_loop_ar2
.x_loop_ar2_end:
add bufq, 82
dec hd
jg .y_loop_ar2
RET
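; AR(3): 24 coefficients - rows y-3..y-1 at x-3..x+3 (21 taps, precomputed
; 4 pixels at a time as in ar2) plus 3 serial taps at x-3..x-1 on the
; current row.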
.ar3:
DEFINE_ARGS buf, fg_data, shift
%if ARCH_X86_32
ALLOC_STACK -16*14
%elif WIN64
SUB rsp, 16*6
%assign stack_size_padded (stack_size_padded+16*6)
%assign stack_size (stack_size+16*6)
%else
ALLOC_STACK -16*6
%endif
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd m6, [base+round_vals-12+shiftq*2]
movd m7, [base+byte_blend]
movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
pxor m3, m3
pcmpgtb m4, m3, m0
pcmpgtb m3, m2
pshuflw m6, m6, q0000
SCRATCH 6, 14, 12
SCRATCH 7, 15, 13
punpckhbw m1, m0, m4
punpcklbw m0, m4
punpcklbw m2, m3
pshufd m3, m0, q1111
pshufd m4, m0, q2222
pshufd m5, m0, q3333
pshufd m0, m0, q0000
mova [rsp+ 0*16], m0
mova [rsp+ 1*16], m3
mova [rsp+ 2*16], m4
mova [rsp+ 3*16], m5
pshufd m6, m1, q1111
pshufd m7, m1, q2222
pshufd m5, m1, q3333
pshufd m1, m1, q0000
pshufd m3, m2, q1111
psrldq m0, m2, 10
pinsrw m2, [base+pw_1], 5
pshufd m4, m2, q2222
pshufd m2, m2, q0000
pinsrw m0, [base+round_vals+shiftq*2-10], 3
mova [rsp+ 4*16], m1
mova [rsp+ 5*16], m6
SCRATCH 7, 8, 6
SCRATCH 5, 9, 7
SCRATCH 2, 10, 8
SCRATCH 3, 11, 9
SCRATCH 4, 12, 10
SCRATCH 0, 13, 11
DEFINE_ARGS buf, fg_data, h, x
sub bufq, 82*73-(82*3+79)
mov hd, 70
.y_loop_ar3:
mov xq, -76
.x_loop_ar3:
movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
pxor m3, m3
pcmpgtb m3, m0
punpckhbw m2, m0, m3
punpcklbw m0, m3
psrldq m5, m0, 2
psrldq m6, m0, 4
psrldq m7, m0, 6
punpcklwd m4, m0, m5
punpcklwd m6, m7
pmaddwd m4, [rsp+ 0*16]
pmaddwd m6, [rsp+ 1*16]
paddd m4, m6
movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
pxor m5, m5
pcmpgtb m5, m1
punpckhbw m3, m1, m5
punpcklbw m1, m5
palignr m6, m2, m0, 10
palignr m7, m2, m0, 12
psrldq m0, 8
punpcklwd m0, m6
punpcklwd m7, m1
pmaddwd m0, [rsp+ 2*16]
pmaddwd m7, [rsp+ 3*16]
paddd m0, m7
paddd m0, m4
psrldq m4, m1, 2
psrldq m5, m1, 4
psrldq m6, m1, 6
psrldq m7, m1, 8
punpcklwd m4, m5
punpcklwd m6, m7
pmaddwd m4, [rsp+ 4*16]
pmaddwd m6, [rsp+ 5*16]
paddd m4, m6
paddd m0, m4
movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
pxor m7, m7
pcmpgtb m7, m2
punpckhbw m5, m2, m7
punpcklbw m2, m7
palignr m7, m3, m1, 10
palignr m3, m1, 12
psrldq m1, m2, 2
punpcklwd m7, m3
punpcklwd m3, m2, m1
pmaddwd m7, m8
pmaddwd m3, m9
paddd m7, m3
paddd m0, m7
psrldq m6, m2, 4
psrldq m1, m2, 6
psrldq m3, m2, 8
palignr m4, m5, m2, 10
palignr m5, m5, m2, 12
punpcklwd m6, m1
punpcklwd m3, m4
punpcklwd m5, m14
pmaddwd m6, m10
pmaddwd m3, m11
pmaddwd m5, m12
paddd m0, m6
paddd m3, m5
paddd m0, m3
movq m1, [bufq+xq-3] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
pxor m5, m5
pcmpgtb m5, m1
punpcklbw m2, m1, m5
pmaddwd m2, m13
pshufd m3, m2, q1111
paddd m2, m3 ; left+cur
paddd m2, m0 ; add top
psrldq m0, 4
psrad m2, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw since we only care about one value
packsswb m2, m2
pslldq m2, 3
pand m2, m15
pandn m3, m15, m1
por m1, m2, m3
movd [bufq+xq-3], m1
psrldq m1, 1
inc xq
jz .x_loop_ar3_end
test xq, 3
jnz .x_loop_ar3_inner
jmp .x_loop_ar3
.x_loop_ar3_end:
add bufq, 82
dec hd
jg .y_loop_ar3
RET
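; Chroma grain generation: same PRNG as luma, but the seed is first xored
; with pw_seed_xor[uv], the buffer shrinks to 44 columns when ss_x and to
; 38 rows when ss_y, and the AR filter gains one extra tap on the
; collocated (box-averaged, if subsampled) luma grain.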
%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM ssse3
cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
movifnidn r2, r2mp
movifnidn r3, r3mp
LEA r4, $$
%define base r4-$$
movq m1, [base+rnd_next_upperbit_mask]
movq m4, [base+mul_bits]
movq m7, [base+hmul_bits]
mov r5d, [fg_dataq+FGData.grain_scale_shift]
movd m6, [base+round+r5*2]
mova m5, [base+pb_mask]
movd m0, [fg_dataq+FGData.seed]
movd m2, [base+pw_seed_xor+uvq*4]
pxor m0, m2
pshuflw m6, m6, q0000
pshuflw m0, m0, q0000
lea r6, [base+gaussian_sequence]
%if %2
%if ARCH_X86_64
mov r7d, 73-35*%3
%else
mov r3mp, 73-35*%3
%endif
add bufq, 44
.loop_y:
mov r5, -44
.loop_x:
%else
mov r5, -82*73
sub bufq, r5
.loop:
%endif
pand m2, m0, m1
psrlw m3, m2, 10
por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
pmullw m2, m4 ; bits 0x0f00 are set
pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
psllq m2, m3, 30
por m3, m2
psllq m2, m3, 15
por m3, m2 ; aggregate each bit into next seed's high bit
pmulhuw m2, m0, m7
por m2, m3 ; 4 next output seeds
pshuflw m0, m2, q3333
psrlw m2, 5
%if ARCH_X86_64
movd r9d, m2
pshuflw m2, m2, q3232
movzx r8, r9w
shr r9, 16
movd m3, [r6+r8*2]
pinsrw m3, [r6+r9*2], 1
movd r9d, m2
movzx r8, r9w
shr r9, 16
pinsrw m3, [r6+r8*2], 2
pinsrw m3, [r6+r9*2], 3
%else
movd r2, m2
pshuflw m2, m2, q3232
movzx r1, r2w
shr r2, 16
movd m3, [r6+r1*2]
pinsrw m3, [r6+r2*2], 1
movd r2, m2
movzx r1, r2w
shr r2, 16
pinsrw m3, [r6+r1*2], 2
pinsrw m3, [r6+r2*2], 3
%endif
pmulhrsw m3, m6
packsswb m3, m3
movd [bufq+r5], m3
add r5, 4
%if %2
jl .loop_x
add bufq, 82
%if ARCH_X86_64
dec r7d
%else
dec r3mp
%endif
jg .loop_y
%else
jl .loop
%endif
%if ARCH_X86_32
mov r2, r2mp
%endif
; auto-regression code
movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
jmp r5
.ar0:
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
movifnidn bufyq, bufymp
%if ARCH_X86_32
ALLOC_STACK -2*16
%endif
imul uvd, 28
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
movd m4, [base+hmul_bits+shiftq*2]
DEFINE_ARGS buf, bufy, h, x
pxor m0, m0
pcmpgtb m0, m5
punpcklbw m5, m0
movd m7, [base+pb_1]
%if %2
movd m6, [base+hmul_bits+2+%3*2]
%endif
pshuflw m5, m5, q0000
pshuflw m4, m4, q0000
pshufd m7, m7, q0000
%if %2
pshuflw m6, m6, q0000
%endif
punpcklqdq m5, m5
punpcklqdq m4, m4
%if %2
punpcklqdq m6, m6
%endif
pcmpeqw m1, m1
pslldq m1, 12>>%2
SCRATCH 1, 8, 0
SCRATCH 4, 9, 1
%if %2
sub bufq, 82*(73-35*%3)+82-(82*3+41)
%else
sub bufq, 82*70-3
%endif
add bufyq, 3+82*3
mov hd, 70-35*%3
.y_loop_ar0:
xor xd, xd
.x_loop_ar0:
; first 32 pixels
%if %2
movu m1, [bufyq+xq*2]
%if %3
movu m2, [bufyq+xq*2+82]
%endif
movu m3, [bufyq+xq*2+16]
%if %3
movu m4, [bufyq+xq*2+82+16]
%endif
pmaddubsw m0, m7, m1
%if %3
pmaddubsw m1, m7, m2
%endif
pmaddubsw m2, m7, m3
%if %3
pmaddubsw m3, m7, m4
paddw m0, m1
paddw m2, m3
%endif
pmulhrsw m0, m6
pmulhrsw m2, m6
%else
movu m0, [bufyq+xq]
pxor m6, m6
pcmpgtb m6, m0
punpckhbw m2, m0, m6
punpcklbw m0, m6
%endif
pmullw m0, m5
pmullw m2, m5
pmulhrsw m0, m9
pmulhrsw m2, m9
movu m1, [bufq+xq]
pxor m4, m4
pcmpgtb m4, m1
punpckhbw m3, m1, m4
%if %2
punpcklbw m1, m4
paddw m2, m3
paddw m0, m1
%else
punpcklbw m6, m1, m4
paddw m2, m3
paddw m0, m6
%endif
packsswb m0, m2
%if %2
movu [bufq+xq], m0
add xd, 16
cmp xd, 32
jl .x_loop_ar0
; last 6/12 pixels
movu m1, [bufyq+xq*(1+%2)]
%if %3
movu m2, [bufyq+xq*2+82]
%endif
pmaddubsw m0, m7, m1
%if %3
pmaddubsw m1, m7, m2
paddw m0, m1
%endif
pmulhrsw m0, m6
pmullw m0, m5
pmulhrsw m0, m9
movq m1, [bufq+xq]
pxor m4, m4
pcmpgtb m4, m1
punpcklbw m2, m1, m4
paddw m0, m2
packsswb m0, m0
pandn m2, m8, m0
pand m1, m8
por m2, m1
movq [bufq+xq], m2
%else
add xd, 16
cmp xd, 80
je .y_loop_final_ar0
movu [bufq+xq-16], m0
jmp .x_loop_ar0
.y_loop_final_ar0:
pandn m2, m8, m0
pand m1, m8
por m2, m1
movu [bufq+xq-16], m2
%endif
add bufq, 82
add bufyq, 82<<%3
dec hd
jg .y_loop_ar0
RET
.ar1:
%if ARCH_X86_32
RESET_STACK_STATE
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
imul uvd, 28
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
%if ARCH_X86_32
mov r3mp, cf3d
DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
%elif WIN64
DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
mov bufq, r0
%else
DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
%endif
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd m3, [base+round_vals+shiftq*2-12] ; rnd
%if %2
movd m7, [base+pb_1]
movd m6, [base+hmul_bits+2+%3*2]
%endif
psrldq m4, 1
%if ARCH_X86_32
DEFINE_ARGS buf, shift, val0, val3, min, max, x
%elif WIN64
DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
%else
DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
%endif
pxor m5, m5
punpcklwd m3, m5
%if %2
punpcklwd m6, m6
%endif
pcmpgtb m5, m4
punpcklbw m4, m5
pshufd m5, m4, q1111
pshufd m4, m4, q0000
pshufd m3, m3, q0000
%if %2
pshufd m7, m7, q0000
pshufd m6, m6, q0000
sub bufq, 82*(73-35*%3)+44-(82*3+41)
%else
sub bufq, 82*69+3
%endif
%if ARCH_X86_32
add r1mp, 79+82*3
mov r0mp, 70-35*%3
%else
add bufyq, 79+82*3
mov hd, 70-35*%3
%endif
mov mind, -128
mov maxd, 127
.y_loop_ar1:
mov xq, -(76>>%2)
movsx val3d, byte [bufq+xq-1]
.x_loop_ar1:
%if %2
%if ARCH_X86_32
mov r2, r1mp
movq m0, [r2+xq*2]
%if %3
movq m1, [r2+xq*2+82]
%endif
%else
movq m0, [bufyq+xq*2]
%if %3
movq m1, [bufyq+xq*2+82]
%endif
%endif
pmaddubsw m2, m7, m0
%if %3
pmaddubsw m0, m7, m1
paddw m2, m0
%endif
pmulhrsw m2, m6
%else
%if ARCH_X86_32
mov r2, r1mp
movd m2, [r2+xq]
%else
movd m2, [bufyq+xq]
%endif
pxor m0, m0
pcmpgtb m0, m2
punpcklbw m2, m0
%endif
movq m0, [bufq+xq-82-1] ; top/left
pxor m1, m1
pcmpgtb m1, m0
punpcklbw m0, m1
psrldq m1, m0, 4 ; top/right
punpcklwd m1, m2
psrldq m2, m0, 2 ; top
punpcklwd m0, m2
pmaddwd m0, m4
pmaddwd m1, m5
paddd m0, m1
paddd m0, m3
.x_loop_ar1_inner:
movd val0d, m0
psrldq m0, 4
%if ARCH_X86_32
imul val3d, r3mp
%else
imul val3d, cf3d
%endif
add val3d, val0d
sar val3d, shiftb
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
cmovns val3d, maxd
cmp val3d, mind
cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq
jz .x_loop_ar1_end
test xq, 3
jnz .x_loop_ar1_inner
jmp .x_loop_ar1
.x_loop_ar1_end:
add bufq, 82
%if ARCH_X86_32
add r1mp, 82<<%3
dec r0mp
%else
add bufyq, 82<<%3
dec hd
%endif
jg .y_loop_ar1
RET
.ar2:
%if ARCH_X86_32
ALLOC_STACK -8*16
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
movifnidn bufyq, bufymp
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 28
movd m7, [base+round_vals-12+shiftq*2]
movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12
pxor m2, m2
pcmpgtb m2, m0
punpckhbw m1, m0, m2
punpcklbw m0, m2
pinsrw m1, [base+pw_1], 5
punpcklwd m7, m7
pshufd m7, m7, q0000
DEFINE_ARGS buf, bufy, fg_data, h, unused, x
pshufd m4, m1, q0000
pshufd m5, m1, q1111
pshufd m6, m1, q2222
pshufd m3, m0, q3333
pshufd m2, m0, q2222
pshufd m1, m0, q1111
pshufd m0, m0, q0000
SCRATCH 0, 8, 0
SCRATCH 1, 9, 1
SCRATCH 2, 10, 2
SCRATCH 3, 11, 3
SCRATCH 4, 12, 4
SCRATCH 5, 13, 5
SCRATCH 6, 14, 6
SCRATCH 7, 15, 7
%if %2
movd m7, [base+hmul_bits+2+%3*2]
movd m6, [base+pb_1]
punpcklwd m7, m7
pshufd m6, m6, q0000
pshufd m7, m7, q0000
sub bufq, 82*(73-35*%3)+44-(82*3+41)
%else
sub bufq, 82*69+3
%endif
add bufyq, 79+82*3
mov hd, 70-35*%3
.y_loop_ar2:
mov xq, -(76>>%2)
.x_loop_ar2:
pxor m2, m2
movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
pcmpgtb m2, m0
punpckhbw m1, m0, m2
punpcklbw m0, m2
psrldq m5, m0, 2 ; y=-2,x=[-1,+5]
psrldq m3, m1, 2 ; y=-1,x=[-1,+5]
psrldq m4, m1, 4 ; y=-1,x=[+0,+5]
punpcklwd m2, m0, m5
punpcklwd m3, m4
pmaddwd m2, m8
pmaddwd m3, m11
paddd m2, m3
psrldq m4, m0, 4 ; y=-2,x=[+0,+5]
psrldq m5, m0, 6 ; y=-2,x=[+1,+5]
psrldq m0, 8 ; y=-2,x=[+2,+5]
punpcklwd m4, m5
punpcklwd m0, m1
psrldq m3, m1, 6 ; y=-1,x=[+1,+5]
psrldq m1, m1, 8 ; y=-1,x=[+2,+5]
punpcklwd m3, m1
pmaddwd m4, m9
pmaddwd m0, m10
pmaddwd m3, m12
paddd m4, m0
paddd m2, m3
paddd m2, m4
%if %2
movq m1, [bufyq+xq*2]
%if %3
movq m3, [bufyq+xq*2+82]
%endif
pmaddubsw m0, m6, m1
%if %3
pmaddubsw m1, m6, m3
paddw m0, m1
%endif
pmulhrsw m0, m7
%else
movd m0, [bufyq+xq]
pxor m1, m1
pcmpgtb m1, m0
punpcklbw m0, m1
%endif
punpcklwd m0, m15
pmaddwd m0, m14
paddd m2, m0
movq m0, [bufq+xq-2] ; y=0,x=[-2,+5]
pxor m4, m4
movd m5, [base+byte_blend+1]
punpcklbw m5, m5
.x_loop_ar2_inner:
pcmpgtb m1, m4, m0
punpcklbw m0, m1
pmaddwd m3, m0, m13
paddd m3, m2
psrldq m2, 4 ; shift top to next pixel
psrad m3, [fg_dataq+FGData.ar_coeff_shift]
pslldq m3, 4
pand m3, m5
paddw m0, m3
packsswb m0, m0
movd [bufq+xq-2], m0
psrldq m0, 1
inc xq
jz .x_loop_ar2_end
test xq, 3
jnz .x_loop_ar2_inner
jmp .x_loop_ar2
.x_loop_ar2_end:
add bufq, 82
add bufyq, 82<<%3
dec hd
jg .y_loop_ar2
RET
.ar3:
%if ARCH_X86_32
RESET_STACK_STATE
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
movifnidn bufyq, bufymp
%if ARCH_X86_32
ALLOC_STACK -15*16
%else
SUB rsp, 16*7
%assign stack_size_padded (stack_size_padded+16*7)
%assign stack_size (stack_size+16*7)
%endif
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 28
movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
pxor m3, m3
pcmpgtb m3, m0
punpckhbw m1, m0, m3
punpcklbw m0, m3
pshufd m2, m0, q1111
pshufd m3, m0, q2222
pshufd m4, m0, q3333
pshufd m0, m0, q0000
pshufd m5, m1, q1111
pshufd m6, m1, q2222
pshufd m7, m1, q3333
pshufd m1, m1, q0000
mova [rsp+ 0*16], m0
mova [rsp+ 1*16], m2
mova [rsp+ 2*16], m3
mova [rsp+ 3*16], m4
mova [rsp+ 4*16], m1
mova [rsp+ 5*16], m5
mova [rsp+ 6*16], m6
SCRATCH 7, 8, 7
movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma]
pxor m4, m4
pcmpgtb m4, m2
punpckhbw m5, m2, m4
punpcklbw m2, m4
pshufd m4, m2, q3232
punpcklwd m3, m4, m5
pshuflw m5, m4, q3321
pshufd m4, m3, q0000
pshufd m3, m2, q1111
pshufd m2, m2, q0000
pinsrw m5, [base+round_vals+shiftq*2-10], 3
SCRATCH 2, 9, 8
SCRATCH 3, 10, 9
SCRATCH 4, 11, 10
SCRATCH 5, 12, 11
movd m2, [base+round_vals-12+shiftq*2]
%if %2
movd m1, [base+pb_1]
movd m3, [base+hmul_bits+2+%3*2]
%endif
pxor m0, m0
punpcklwd m2, m0
%if %2
punpcklwd m3, m3
%endif
pshufd m2, m2, q0000
%if %2
pshufd m1, m1, q0000
pshufd m3, m3, q0000
SCRATCH 1, 13, 12
%endif
SCRATCH 2, 14, 13
%if %2
SCRATCH 3, 15, 14
%endif
DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
sub bufq, 82*(73-35*%3)+44-(82*3+41)
%else
sub bufq, 82*69+3
%endif
add bufyq, 79+82*3
mov hd, 70-35*%3
.y_loop_ar3:
mov xq, -(76>>%2)
.x_loop_ar3:
movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
pxor m4, m4
pcmpgtb m4, m0
punpckhbw m3, m0, m4
punpcklbw m0, m4
psrldq m5, m0, 2
psrldq m6, m0, 4
psrldq m7, m0, 6
punpcklwd m4, m0, m5
punpcklwd m6, m7
pmaddwd m4, [rsp+ 0*16]
pmaddwd m6, [rsp+ 1*16]
paddd m4, m6
palignr m2, m3, m0, 10
palignr m3, m0, 12
psrldq m0, 8
movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
pxor m6, m6
pcmpgtb m6, m1
punpckhbw m5, m1, m6
punpcklbw m1, m6
punpcklwd m0, m2
punpcklwd m3, m1
pmaddwd m0, [rsp+ 2*16]
pmaddwd m3, [rsp+ 3*16]
paddd m0, m3
paddd m0, m4
movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
pxor m7, m7
pcmpgtb m7, m2
punpckhbw m6, m2, m7
punpcklbw m2, m7
palignr m3, m5, m1, 10
palignr m5, m1, 12
psrldq m4, m2, 2
punpcklwd m3, m5
punpcklwd m5, m2, m4
pmaddwd m3, [rsp+ 6*16]
pmaddwd m5, m8
paddd m3, m5
paddd m0, m3
psrldq m3, m1, 2
psrldq m4, m1, 4
psrldq m5, m1, 6
psrldq m1, 8
punpcklwd m3, m4
punpcklwd m5, m1
pmaddwd m3, [rsp+ 4*16]
pmaddwd m5, [rsp+ 5*16]
paddd m3, m5
paddd m0, m3
%if %2
movq m1, [bufyq+xq*2]
%if %3
movq m3, [bufyq+xq*2+82]
%endif
pmaddubsw m7, m13, m1
%if %3
pmaddubsw m5, m13, m3
paddw m7, m5
%endif
pmulhrsw m7, m15
%else
movd m7, [bufyq+xq]
pxor m1, m1
pcmpgtb m1, m7
punpcklbw m7, m1
%endif
psrldq m1, m2, 4
psrldq m3, m2, 6
palignr m4, m6, m2, 10
palignr m6, m2, 12
psrldq m2, 8
punpcklwd m1, m3
punpcklwd m2, m4
punpcklwd m6, m7
pmaddwd m1, m9
pmaddwd m2, m10
pmaddwd m6, m11
paddd m1, m2
paddd m0, m6
paddd m0, m1
paddd m0, m14
movq m1, [bufq+xq-3] ; y=0,x=[-3,+4]
pxor m4, m4
movd m5, [base+byte_blend]
.x_loop_ar3_inner:
pcmpgtb m2, m4, m1
punpcklbw m3, m1, m2
pmaddwd m2, m3, m12
pshufd m3, m2, q1111
paddd m2, m3 ; left+cur
paddd m2, m0 ; add top
psrldq m0, 4
psrad m2, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw, we only care about one value
packsswb m2, m2
pandn m3, m5, m1
pslld m2, 24
pand m2, m5
por m1, m2, m3
movd [bufq+xq-3], m1
psrldq m1, 1
inc xq
jz .x_loop_ar3_end
test xq, 3
jnz .x_loop_ar3_inner
jmp .x_loop_ar3
.x_loop_ar3_end:
add bufq, 82
add bufyq, 82<<%3
dec hd
jg .y_loop_ar3
RET
%endmacro
generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0
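; Emulated gather (SSSE3 has no gather instruction): vpgatherdw pulls the
; 8 word indices out of %2 via GPRs and performs 8 scalar word loads from
; the table at %3, reassembling the result with pinsrw. Callers pass
; scalingq-1 as the base so that the wanted byte scaling[src] lands in the
; high half of each loaded word; the psrlw-by-8 after each use extracts it.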
%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 6
%define %%tmp %6
%endif
%rep 4
%if %%idx == 0
movd %5 %+ d, %2
pshuflw %%tmp, %2, q3232
%else
movd %5 %+ d, %%tmp
%if %%idx == 2
punpckhqdq %%tmp, %%tmp
%elif %%idx == 4
psrlq %%tmp, 32
%endif
%endif
movzx %4 %+ d, %5 %+ w
shr %5 %+ d, 16
%if %%idx == 0
movd %1, [%3+%4]
%else
pinsrw %1, [%3+%4], %%idx + 0
%endif
pinsrw %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro
INIT_XMM ssse3
; fgy_32x32xn_8bpc(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
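; Per 16-pixel-wide column block: derive a per-block seed, turn it into a
; random (offx, offy) into the 82x73 grain LUT, then for every row compute
;   noise = round2(scaling[src] * grain, scaling_shift)
;   dst   = clip(src + noise, min, max)
; blending 2 pixels/rows of grain with the neighboring block's grain where
; the overlap flags require it (the *_overlap paths below).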
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
dst, src, scaling, unused1, fg_data, picptr, unused2
; copy stack arguments to new position post-alignment, so that we
; don't have to keep the old stack location in a separate register
mov r0, r0m
mov r1, r2m
mov r2, r4m
mov r3, r6m
mov r4, r7m
mov r5, r8m
mov [rsp+5*mmsize+ 4*gprsize], r0
mov [rsp+5*mmsize+ 6*gprsize], r1
mov [rsp+5*mmsize+ 8*gprsize], r2
mov [rsp+5*mmsize+10*gprsize], r3
mov [rsp+5*mmsize+11*gprsize], r4
mov [rsp+5*mmsize+12*gprsize], r5
%else
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
mov srcq, srcm
mov fg_dataq, r3m
mov scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+5*mmsize+ 4*gprsize]
%define r1m [rsp+5*mmsize+ 5*gprsize]
%define r2m [rsp+5*mmsize+ 6*gprsize]
%define r3m [rsp+5*mmsize+ 7*gprsize]
%define r4m [rsp+5*mmsize+ 8*gprsize]
%define r5m [rsp+5*mmsize+ 9*gprsize]
%define r6m [rsp+5*mmsize+10*gprsize]
%define r7m [rsp+5*mmsize+11*gprsize]
%define r8m [rsp+5*mmsize+12*gprsize]
%endif
LEA r5, pb_mask
%define base r5-pb_mask
mov r5m, picptrq
%else
cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
lea r7, [pb_mask]
%define base r7-pb_mask
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
movd m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
movd m4, [base+max+r6*4]
movd m5, [base+min+r6*2]
punpcklwd m3, m3
punpcklwd m4, m4
punpcklwd m5, m5
pshufd m3, m3, q0000
pshufd m4, m4, q0000
pshufd m5, m5, q0000
SCRATCH 3, 11, 0
SCRATCH 4, 12, 1
SCRATCH 5, 13, 2
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%endif
mov sbyd, r8m
mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
test overlapd, overlapd
jz .no_vertical_overlap
mova m6, [base+pw_1024]
mova m7, [base+pb_27_17_17_27]
SCRATCH 6, 14, 3
SCRATCH 7, 15, 4
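; Edge blending weights: {27,17}/32 for the 2-sample overlap ({23,22}/32
; for the 1-sample chroma case). 27^2+17^2 = 1018 ~= 1024 = 32^2, so the
; blend approximately preserves grain variance rather than amplitude;
; pmulhrsw against pw_1024 implements the spec's round2(x, 5).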
test sbyd, sbyd
jnz .vertical_overlap
; fall-through
.no_vertical_overlap:
mov r8m, overlapd
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
imul seed, (173 << 24) | 37
%else
imul seed, sbyd, (173 << 24) | 37
%endif
add seed, (105 << 24) | 178
rol seed, 8
movzx seed, seew
xor seed, [fg_dataq+FGData.seed]
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
mov r3m, seed
mov wq, r4m
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
unused1, unused2, see, unused3
%endif
lea src_bakq, [srcq+wq]
neg wq
sub dstmp, srcq
%if ARCH_X86_32
mov r1m, src_bakq
mov r4m, wq
DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
%endif
.loop_x:
%if ARCH_X86_32
mov seed, r3m
%endif
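; Scalar seed update. The lfsr feedback bit s0^s1^s3^s12 is computed with
; a single TEST: once seed is OR-ed with 0xeff4, in the byte AND done by
; 'test seeb, seeh' only bits 0, 1 and 3 of the low byte and bit 12 of the
; seed (via bit 4 of the high byte) remain variable, and the four forced
; bits have even parity, so PF ends up as the inverted feedback bit. cmovp
; then selects seed>>1 (feedback 0) over (seed>>1)+0x8000 (feedback 1).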
mov r6d, seed
or seed, 0xEFF4
shr r6d, 1
test seeb, seeh
lea seed, [r6+0x8000]
cmovp seed, r6d ; updated seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
offx, offy, see, unused
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 164
lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
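; 747 = 9*82 + 9: grain reads start 9 rows and 9 columns into the
; 82-byte-stride LUT (matching the spec's grain origin), with offx/offy in
; units of 2 samples - hence offx*2 and the earlier imul offyd, 164 (2*82).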
%if ARCH_X86_32
; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
; r6m=grain_lut, r7m=h, r8m=overlap_v|h
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
h, offxy, see, unused
%endif
.loop_x_odd:
mov hd, r7m
mov grain_lutq, grain_lutmp
.loop_y:
; src
mova m0, [srcq]
pxor m2, m2
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
REPX {psrlw x, 8}, m4, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
; noise = round2(scaling[src] * grain, scaling_shift)
pmullw m2, m4
pmullw m3, m5
pmulhrsw m2, m11
pmulhrsw m3, m11
; dst = clip_pixel(src, noise)
paddw m0, m2
paddw m1, m3
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
movifnidn dstq, dstmp
mova [dstq+srcq], m0
add srcq, r2mp
add grain_lutq, 82
dec hd
jg .loop_y
%if ARCH_X86_32
add r4mp, 16
%else
add wq, 16
%endif
jge .end
%if ARCH_X86_32
mov srcq, r1mp
add srcq, r4mp
%else
lea srcq, [src_bakq+wq]
%endif
btc dword r8m, 2
jc .next_blk
add offxyd, 16
test dword r8m, 2 ; r8m & 2 = have_top_overlap
jz .loop_x_odd
%if ARCH_X86_32
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add r11d, 16 ; top_offxyd
%endif
jnz .loop_x_odd_v_overlap
.next_blk:
test dword r8m, 1
jz .loop_x
test dword r8m, 2
jnz .loop_x_hv_overlap
; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
%if ARCH_X86_32
; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
; r6m=grain_lut, r7m=h, r8m=overlap_v|h
DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
add offxyd, 16 ; left_offxyd
mov [rsp+5*mmsize+0*gprsize], offxyd
DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
mov seed, r3m
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
offx, offy, see, left_offxy
lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
%endif
mov r6d, seed
or seed, 0xEFF4
shr r6d, 1
test seeb, seeh
lea seed, [r6+0x8000]
cmovp seed, r6d ; updated seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
mov offxd, offyd
%else
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 164
lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
h, offxy, see, left_offxy
%endif
mov hd, r7m
mov grain_lutq, grain_lutmp
.loop_y_h_overlap:
; src
mova m0, [srcq]
pxor m2, m2
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
REPX {psrlw x, 8}, m4, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+0*gprsize]
movd m7, [grain_lutq+r5]
%else
movd m7, [grain_lutq+left_offxyq]
%endif
punpcklbw m7, m3
pmaddubsw m6, m15, m7
pmulhrsw m6, m14
packsswb m6, m6
shufps m6, m3, q3210
pcmpgtb m2, m6
punpcklbw m7, m6, m2
punpckhbw m6, m2
; noise = round2(scaling[src] * grain, scaling_shift)
pmullw m7, m4
pmullw m6, m5
pmulhrsw m7, m11
pmulhrsw m6, m11
; dst = clip_pixel(src, noise)
paddw m0, m7
paddw m1, m6
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
movifnidn dstq, dstmp
mova [dstq+srcq], m0
add srcq, r2mp
add grain_lutq, 82
dec hd
jg .loop_y_h_overlap
%if ARCH_X86_32
add r4mp, 16
%else
add wq, 16
%endif
jge .end
%if ARCH_X86_32
mov srcq, r1m
add srcq, r4m
%else
lea srcq, [src_bakq+wq]
%endif
xor dword r8m, 4
add offxyd, 16
; since this half-block had left-overlap, the next does not
test dword r8m, 2 ; have_top_overlap
jz .loop_x_odd
%if ARCH_X86_32
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add r11d, 16 ; top_offxyd
%endif
jmp .loop_x_odd_v_overlap
.end:
RET
.vertical_overlap:
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
%endif
or overlapd, 2 ; top_overlap: overlap & 2
mov r8m, overlapd
movzx sbyd, sbyb
%if ARCH_X86_32
imul r4, [fg_dataq+FGData.seed], 0x00010001
DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
imul seed, [fg_dataq+FGData.seed], 0x00010001
%endif
imul tmpd, sbyd, 173 * 0x00010001
imul sbyd, 37 * 0x01000100
add tmpd, (105 << 16) | 188
add sbyd, (178 << 24) | (141 << 8)
and tmpd, 0x00ff00ff
and sbyd, 0xff00ff00
xor seed, tmpd
%if ARCH_X86_32
xor sbyd, seed ; (cur_seed << 16) | top_seed
DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
mov r3m, seed
mov wq, r4m
%else
xor seed, sbyd ; (cur_seed << 16) | top_seed
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
tmp, unused2, see, unused3
%endif
lea src_bakq, [srcq+wq]
neg wq
sub dstmp, srcq
%if ARCH_X86_32
mov r1m, src_bakq
mov r4m, wq
DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%endif
.loop_x_v_overlap:
%if ARCH_X86_32
mov seed, r3m
%endif
; we assume from the block above that bits 8-15 of tmpd are zeroed,
; because of the 'and tmpd, 0x00ff00ff' above
mov r6d, seed
or seed, 0xeff4eff4
test seeb, seeh
setp tmpb ; parity of top_seed
shr seed, 16
shl tmpd, 16
test seeb, seeh
setp tmpb ; parity of cur_seed
or r6d, 0x00010001
xor tmpd, r6d
mov seed, tmpd
ror seed, 1 ; updated (cur_seed << 16) | top_seed
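; Both 16-bit seeds ((cur_seed << 16) | top_seed) have now been advanced
; at once: the two setp's leave the inverted feedback bits of cur and top
; in tmpd bits 0 and 16, xoring with (r6d | 0x00010001) corrects them and
; merges the old state back in, and 'ror seed, 1' shifts both halves right
; while rotating cur's feedback bit into bit 31 and top's into bit 15.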
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
offx, offy, see, unused, top_offxy
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 164
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq*2+0x10001*747+32*82]
%if ARCH_X86_32
DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
h, offxy, see, unused, top_offxy
%endif
movzx top_offxyd, offxyw
%if ARCH_X86_32
mov [rsp+5*mmsize+1*gprsize], top_offxyd
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
shr offxyd, 16
.loop_x_odd_v_overlap:
%if ARCH_X86_32
mov r5, r5m
lea r5, [base+pb_27_17]
mov [rsp+5*mmsize+12], r5
%else
mova m8, [pb_27_17]
%endif
mov hd, r7m
mov grain_lutq, grain_lutmp
.loop_y_v_overlap:
; src
mova m0, [srcq]
pxor m2, m2
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
REPX {psrlw x, 8}, m4, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+1*gprsize]
movu m7, [grain_lutq+r5]
%else
movu m7, [grain_lutq+top_offxyq]
%endif
punpckhbw m6, m7, m3
punpcklbw m7, m3
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+12]
pmaddubsw m3, [r5], m6
pmaddubsw m6, [r5], m7
%else
pmaddubsw m3, m8, m6
pmaddubsw m6, m8, m7
%endif
pmulhrsw m3, m14
pmulhrsw m6, m14
packsswb m6, m3
pcmpgtb m7, m2, m6
punpcklbw m2, m6, m7
punpckhbw m6, m7
; noise = round2(scaling[src] * grain, scaling_shift)
pmullw m2, m4
pmullw m6, m5
pmulhrsw m2, m11
pmulhrsw m6, m11
; dst = clip_pixel(src, noise)
paddw m0, m2
paddw m1, m6
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
movifnidn dstq, dstmp
mova [dstq+srcq], m0
%if ARCH_X86_32
add dword [rsp+5*mmsize+12], mmsize
%else
mova m8, [pb_17_27]
%endif
add srcq, r2mp
add grain_lutq, 82
dec hw
jz .end_y_v_overlap
; 2 lines get vertical overlap, then fall back to non-overlap code for
; remaining (up to) 30 lines
btc hd, 16
jnc .loop_y_v_overlap
jmp .loop_y
.end_y_v_overlap:
%if ARCH_X86_32
add r4mp, 16
%else
add wq, 16
%endif
jge .end_hv
%if ARCH_X86_32
mov srcq, r1mp
add srcq, r4mp
%else
lea srcq, [src_bakq+wq]
%endif
btc dword r8m, 2
jc .loop_x_hv_overlap
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
jmp .loop_x_odd_v_overlap
.loop_x_hv_overlap:
%if ARCH_X86_32
mov r5, r5m
lea r5, [base+pb_27_17]
mov [rsp+5*mmsize+12], r5
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
mov r5, [rsp+5*mmsize+1*gprsize]
mov r4, offxyd
add r5, 16
add r4, 16
mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy
mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy
DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
xor tmpd, tmpd
mov seed, r3m
%else
mova m8, [pb_27_17]
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
tmp, unused2, see, unused3
; we assume from the block above that bits 8-15 of tmpd are zeroed
%endif
mov r6d, seed
or seed, 0xeff4eff4
test seeb, seeh
setp tmpb ; parity of top_seed
shr seed, 16
shl tmpd, 16
test seeb, seeh
setp tmpb ; parity of cur_seed
or r6d, 0x00010001
xor tmpd, r6d
mov seed, tmpd
ror seed, 1 ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
offx, offy, see, left_offxy, top_offxy, topleft_offxy
lea topleft_offxyq, [top_offxyq+16]
lea left_offxyq, [offyq+16]
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 164
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq*2+0x10001*747+32*82]
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
movzx r5, offxyw ; top_offxy
mov [rsp+5*mmsize+1*gprsize], r5
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
h, offxy, see, left_offxy, top_offxy, topleft_offxy
movzx top_offxyd, offxyw
%endif
shr offxyd, 16
mov hd, r7m
mov grain_lutq, grain_lutmp
.loop_y_hv_overlap:
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy
mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy
movu m6, [grain_lutq+r5]
mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy
movd m4, [grain_lutq+r0]
movd m7, [grain_lutq+r5]
%else
movu m6, [grain_lutq+top_offxyq]
movd m4, [grain_lutq+left_offxyq]
movd m7, [grain_lutq+topleft_offxyq]
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklbw m4, m3
punpcklbw m7, m6
pmaddubsw m2, m15, m4
pmaddubsw m4, m15, m7
pmulhrsw m2, m14
pmulhrsw m4, m14
packsswb m2, m2
packsswb m4, m4
shufps m2, m3, q3210
shufps m4, m6, q3210
; followed by v interpolation (top | cur -> cur)
punpcklbw m3, m4, m2
punpckhbw m4, m2
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+12]
pmaddubsw m7, [r5], m4
pmaddubsw m4, [r5], m3
%else
pmaddubsw m7, m8, m4
pmaddubsw m4, m8, m3
%endif
pmulhrsw m7, m14
pmulhrsw m4, m14
packsswb m4, m7
pxor m2, m2
pcmpgtb m7, m2, m4
punpcklbw m3, m4, m7
punpckhbw m4, m7
; src
mova m0, [srcq]
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
; scaling[src]
%if ARCH_X86_32
vpgatherdw m5, m0, scalingq-1, r0, r5, m7
vpgatherdw m6, m1, scalingq-1, r0, r5, m7
%else
vpgatherdw m5, m0, scalingq-1, r13, r14, m7
vpgatherdw m6, m1, scalingq-1, r13, r14, m7
%endif
REPX {psrlw x, 8}, m5, m6
; noise = round2(scaling[src] * grain, scaling_shift)
pmullw m3, m5
pmullw m4, m6
pmulhrsw m3, m11
pmulhrsw m4, m11
; dst = clip_pixel(src, noise)
paddw m0, m3
paddw m1, m4
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
movifnidn dstq, dstmp
mova [dstq+srcq], m0
%if ARCH_X86_32
add dword [rsp+5*mmsize+12], mmsize
%else
mova m8, [pb_17_27]
%endif
add srcq, r2mp
add grain_lutq, 82
dec hw
jz .end_y_hv_overlap
; 2 lines get vertical overlap, then fall back to non-overlap code for
; remaining (up to) 30 lines
btc hd, 16
jnc .loop_y_hv_overlap
jmp .loop_y_h_overlap
.end_y_hv_overlap:
%if ARCH_X86_32
add r4mp, 16
%else
add wq, 16
%endif
jge .end_hv
%if ARCH_X86_32
mov srcq, r1m
add srcq, r4m
%else
lea srcq, [src_bakq+wq]
%endif
xor dword r8m, 4
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
jmp .loop_x_odd_v_overlap
.end_hv:
RET
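; FGUV_FN instantiates the chroma loop twice per layout: %1=1 (csfl
; disabled) computes the scaling index from a uv_luma_mult/uv_mult blend
; of luma and chroma, while %1=0 (under .csfl, chroma_scaling_from_luma)
; indexes scaling[] directly with the (sub)sampled luma.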
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
INIT_XMM ssse3
%if ARCH_X86_32
; fguv_32x32xn_i%1_8bpc(dst, src, stride, fg_data, w, scaling, grain_lut, h,
;                       sby, luma, lstride, uv_pl, is_id)
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
tmp, src, scaling, h, fg_data, picptr, unused
mov r0, r0m
mov r1, r2m
mov r2, r4m
mov r3, r6m
mov r4, r7m
mov [rsp+7*mmsize+3*gprsize], r0
mov [rsp+7*mmsize+5*gprsize], r1
mov [rsp+7*mmsize+7*gprsize], r2
mov [rsp+7*mmsize+9*gprsize], r3
mov [rsp+7*mmsize+10*gprsize], r4
mov r0, r8m
mov r1, r9m
mov r2, r10m
mov r4, r11m
mov r3, r12m
mov [rsp+7*mmsize+11*gprsize], r0
mov [rsp+7*mmsize+12*gprsize], r1
mov [rsp+7*mmsize+13*gprsize], r2
mov [rsp+7*mmsize+14*gprsize], r4
%else
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
tmp, src, scaling, h, fg_data, picptr, unused
%endif
mov srcq, srcm
mov fg_dataq, r3m
mov scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+7*mmsize+ 3*gprsize]
%define r1m [rsp+7*mmsize+ 4*gprsize]
%define r2m [rsp+7*mmsize+ 5*gprsize]
%define r3m [rsp+7*mmsize+ 6*gprsize]
%define r4m [rsp+7*mmsize+ 7*gprsize]
%define r5m [rsp+7*mmsize+ 8*gprsize]
%define r6m [rsp+7*mmsize+ 9*gprsize]
%define r7m [rsp+7*mmsize+10*gprsize]
%define r8m [rsp+7*mmsize+11*gprsize]
%define r9m [rsp+7*mmsize+12*gprsize]
%define r10m [rsp+7*mmsize+13*gprsize]
%define r11m [rsp+7*mmsize+14*gprsize]
%define r12m [rsp+7*mmsize+15*gprsize]
%endif
LEA r5, pb_mask
%define base r5-pb_mask
mov r5m, r5
%else
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
lea r8, [pb_mask]
%define base r8-pb_mask
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
movd m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
lea tmpd, [r6d*2]
%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
test r3, r3
%else
cmp dword r12m, 0 ; is_idm
%endif
movd m5, [base+min+r6*2]
cmovne r6d, tmpd
movd m4, [base+max+r6*2]
punpcklwd m3, m3
punpcklwd m5, m5
punpcklwd m4, m4
pshufd m3, m3, q0000
pshufd m5, m5, q0000
pshufd m4, m4, q0000
SCRATCH 3, 11, 0
SCRATCH 4, 12, 1
SCRATCH 5, 13, 2
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%endif
%if %1
mov r6d, dword r11m
movd m0, [fg_dataq+FGData.uv_mult+r6*4]
movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
punpcklbw m6, m1, m0
movd m7, [fg_dataq+FGData.uv_offset+r6*4]
punpcklwd m6, m6
punpcklwd m7, m7
pshufd m6, m6, q0000
pshufd m7, m7, q0000
SCRATCH 6, 14, 3
SCRATCH 7, 15, 4
%endif
mov sbyd, r8m
mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
test overlapd, overlapd
jz %%no_vertical_overlap
%if ARCH_X86_32
%if %2
mova m1, [base+pb_23_22_h]
%else
mova m1, [base+pb_27_17_17_27]
%endif
mova m0, [base+pw_1024]
%else
%if %2
mova m1, [pb_23_22_h]
%else
mova m1, [pb_27_17_17_27]
%endif
mova m0, [pw_1024]
%endif
SCRATCH 0, 8, 5
SCRATCH 1, 9, 6
test sbyd, sbyd
jnz %%vertical_overlap
; fall-through
%%no_vertical_overlap:
mov r8m, overlapd
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
imul seed, (173 << 24) | 37
%else
imul seed, sbyd, (173 << 24) | 37
%endif
add seed, (105 << 24) | 178
rol seed, 8
movzx seed, seew
xor seed, [fg_dataq+FGData.seed]
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
%define luma_bakq lumaq
mov wq, r4m
%if %3
shl r10mp, 1
%endif
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
mov lstrideq, r10mp
%endif
mov lumaq, r9mp
lea src_bakq, [srcq+wq]
lea luma_bakq, [lumaq+wq*(1+%2)]
neg wq
sub r0mp, srcq
%if ARCH_X86_32
mov r1m, src_bakq
mov r11m, luma_bakq
mov r4m, wq
DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
mov r11mp, src_bakq
mov r12mp, strideq
%endif
%%loop_x:
%if ARCH_X86_32
mov seed, r3m
%endif
mov r6d, seed
or seed, 0xEFF4
shr r6d, 1
test seeb, seeh
lea seed, [r6+0x8000]
cmovp seed, r6d ; updated seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, overlap, unused1, unused2, lstride
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 164>>%3
lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
%endif
%%loop_x_odd:
mov hd, r7m
mov grain_lutq, grain_lutmp
%%loop_y:
; src
%if ARCH_X86_32
mov lumaq, r9mp
%endif
%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
mov r5, r5m
movd m7, [base+pb_1]
%else
movd m7, [pb_1]
%endif
pshufd m7, m7, q0000
pxor m2, m2
pmaddubsw m4, m7
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
%endif
pxor m2, m2
%endif
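; %if %1 (csfl disabled): scaling index = clip((luma*uv_luma_mult +
; chroma*uv_mult) >> 6 + uv_offset, 0, 255), done with pmaddubsw on
; interleaved {luma, chroma} byte pairs (m14 = interleaved multipliers,
; m15 = offset); the packuswb/punpck pair performs the 0..255 clip. When
; horizontally subsampled (%2), luma was first averaged in pairs via the
; pb_1 pmaddubsw + pavgw sequence above.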
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
pmaddubsw m4, m14
psraw m6, 6
psraw m4, 6
paddw m6, m15
paddw m4, m15
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
REPX {psrlw x, 8}, m7, m5
; unpack chroma_source
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq+ 0]
pcmpgtb m6, m2, m3
punpcklbw m2, m3, m6
punpckhbw m3, m6
; noise = round2(scaling[luma_src] * grain, scaling_shift)
pmullw m2, m7
pmullw m3, m5
pmulhrsw m2, m11
pmulhrsw m3, m11
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
; dst = clip_pixel(src, noise)
paddw m0, m2
paddw m1, m3
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
movifnidn dstq, dstmp
mova [dstq+srcq], m0
%if ARCH_X86_32
add srcq, r2mp
; we already incremented lumaq above
%else
add srcq, r12mp
%if %3
lea lumaq, [lumaq+lstrideq*2]
%else
add lumaq, lstrideq
%endif
%endif
add grain_lutq, 82
dec hw
jg %%loop_y
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
mov wq, r4m
%endif
add wq, 16
jge %%end
%if ARCH_X86_32
mov srcq, r1mp
mov lumaq, r11mp
%else
mov srcq, r11mp
%endif
lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
%if %2 == 0
; adjust top_offxy
%if ARCH_X86_32
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
add offxyd, 16
btc dword r8m, 2
jc %%loop_x_even
test dword r8m, 2
jz %%loop_x_odd
jmp %%loop_x_odd_v_overlap
%%loop_x_even:
%endif
test dword r8m, 1
jz %%loop_x
; r8m = sbym
test dword r8m, 2
jne %%loop_x_hv_overlap
; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
%if ARCH_X86_32
%if %2
lea r6, [offxyd+16]
mov [rsp+7*mmsize+0*gprsize], r6
%else
mov [rsp+7*mmsize+0*gprsize], offxyd
%endif
DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
mov seed, r3m
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, left_offxy, unused1, unused2, lstride
%if %2
lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
%else
mov left_offxyd, offyd
%endif
%endif
mov r6d, seed
or seed, 0xEFF4
shr r6d, 1
test seeb, seeh
lea seed, [r6+0x8000]
cmovp seed, r6d ; updated seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, left_offxy, unused1, unused2, lstride
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 164>>%3
lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
%endif
mov hd, r7m
mov grain_lutq, grain_lutmp
%%loop_y_h_overlap:
; src
%if ARCH_X86_32
mov lumaq, r9mp
%endif
%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
mov r5, r5m
movd m7, [base+pb_1]
%else
movd m7, [pb_1]
%endif
pshufd m7, m7, q0000
pxor m2, m2
pmaddubsw m4, m7
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
%endif
pxor m2, m2
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
pmaddubsw m4, m14
psraw m6, 6
psraw m4, 6
paddw m6, m15
paddw m4, m15
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
REPX {psrlw x, 8}, m7, m5
; unpack chroma_source
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
; grain = grain_lut[offy+y][offx+x]
movu m4, [grain_lutq+offxyq+ 0]
%if ARCH_X86_32
mov r0, [rsp+7*mmsize+0*gprsize]
movd m2, [grain_lutq+r0+ 0]
%else
movd m2, [grain_lutq+left_offxyq+ 0]
%endif
punpcklbw m2, m4
pmaddubsw m3, m9, m2
pmulhrsw m3, m8
packsswb m3, m3
shufps m3, m4, q3210
pxor m4, m4
pcmpgtb m4, m3
punpcklbw m2, m3, m4
punpckhbw m3, m4
; noise = round2(scaling[luma_src] * grain, scaling_shift)
pmullw m2, m7
pmullw m3, m5
pmulhrsw m2, m11
pmulhrsw m3, m11
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
; dst = clip_pixel(src, noise)
paddw m0, m2
paddw m1, m3
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
movifnidn dstq, dstmp
mova [dstq+srcq], m0
%if ARCH_X86_32
add srcq, r2mp
; lumaq has already been incremented above
%else
add srcq, r12mp
%if %3
lea lumaq, [lumaq+lstrideq*2]
%else
add lumaq, lstrideq
%endif
%endif
add grain_lutq, 82
dec hw
jg %%loop_y_h_overlap
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
mov wq, r4m
%endif
add wq, 16
jge %%end
%if ARCH_X86_32
mov srcq, r1mp
mov lumaq, r11mp
%else
mov srcq, r11mp
%endif
lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
%if %2 == 0
xor dword r8m, 4
; adjust top_offxyd
%if ARCH_X86_32
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
add offxyd, 16
%endif
; r8m = sbym
test dword r8m, 2
%if %2
jne %%loop_x_hv_overlap
jmp %%loop_x_h_overlap
%else
jne %%loop_x_odd_v_overlap
jmp %%loop_x_odd
%endif
%%end:
RET
%%vertical_overlap:
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
%endif
or overlapd, 2 ; top_overlap: overlap & 2
mov r8m, overlapd
movzx sbyd, sbyb
%if ARCH_X86_32
imul r4, [fg_dataq+FGData.seed], 0x00010001
DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
imul seed, [fg_dataq+FGData.seed], 0x00010001
%endif
imul tmpd, sbyd, 173 * 0x00010001
imul sbyd, 37 * 0x01000100
add tmpd, (105 << 16) | 188
add sbyd, (178 << 24) | (141 << 8)
and tmpd, 0x00ff00ff
and sbyd, 0xff00ff00
xor seed, tmpd
%if ARCH_X86_32
xor sbyd, seed ; (cur_seed << 16) | top_seed
DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
mov r3m, seed
mov wq, r4m
%if %3
shl r10mp, 1
%endif
%else
xor seed, sbyd ; (cur_seed << 16) | top_seed
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
mov lstrideq, r10mp
%endif
mov lumaq, r9mp
lea src_bakq, [srcq+wq]
lea luma_bakq, [lumaq+wq*(1+%2)]
neg wq
sub r0mp, srcq
%if ARCH_X86_32
mov r1m, src_bakq
mov r11m, luma_bakq
mov r4m, wq
DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
mov r11mp, src_bakq
mov r12mp, strideq
%endif
%%loop_x_v_overlap:
%if ARCH_X86_32
mov seed, r3m
xor tmpd, tmpd
%endif
; we assume from the block above that bits 8-15 of tmpd are zeroed
mov r6d, seed
or seed, 0xeff4eff4
test seeb, seeh
setp tmpb ; parity of top_seed
shr seed, 16
shl tmpd, 16
test seeb, seeh
setp tmpb ; parity of cur_seed
or r6d, 0x00010001
xor tmpd, r6d
mov seed, tmpd
ror seed, 1 ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, overlap, top_offxy, unused, lstride
mov offxd, seed
mov offyd, seed
%endif
ror offyd, 8
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
%if ARCH_X86_32
DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
%endif
movzx top_offxyd, offxyw
shr offxyd, 16
%if ARCH_X86_32
mov [rsp+7*mmsize+1*gprsize], top_offxyd
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif
%%loop_x_odd_v_overlap:
mov hd, r7m
mov grain_lutq, grain_lutmp
%if ARCH_X86_32
mov r5, r5m
%endif
%if %3
mova m1, [PIC_ptr(pb_23_22)]
%else
mova m1, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_v_overlap:
%if ARCH_X86_32
mov lumaq, r9mp
%endif
%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
mov r5, r5m
movd m7, [base+pb_1]
%else
movd m7, [pb_1]
%endif
pshufd m7, m7, q0000
pxor m2, m2
pmaddubsw m4, m7
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
%endif
pxor m2, m2
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
pmaddubsw m4, m14
psraw m6, 6
psraw m4, 6
paddw m6, m15
paddw m4, m15
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
REPX {psrlw x, 8}, m7, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r0, [rsp+7*mmsize+1*gprsize]
movu m4, [grain_lutq+r0]
%else
movu m4, [grain_lutq+top_offxyq]
%endif
punpckhbw m6, m4, m3
punpcklbw m4, m3
pmaddubsw m2, m1, m6
pmaddubsw m3, m1, m4
pmulhrsw m2, m8
pmulhrsw m3, m8
packsswb m3, m2
pxor m6, m6
pcmpgtb m6, m3
punpcklbw m2, m3, m6
punpckhbw m3, m6
; noise = round2(scaling[luma_src] * grain, scaling_shift)
pmullw m2, m7
pmullw m3, m5
pmulhrsw m2, m11
pmulhrsw m3, m11
; unpack chroma_source
pxor m4, m4
punpckhbw m6, m0, m4
punpcklbw m0, m4 ; m0-1: src as word
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
; dst = clip_pixel(src, noise)
paddw m0, m2
paddw m6, m3
pmaxsw m0, m13
pmaxsw m6, m13
pminsw m0, m12
pminsw m6, m12
packuswb m0, m6
movifnidn dstq, dstmp
mova [dstq+srcq], m0
dec hw
je %%end_y_v_overlap
%if ARCH_X86_32
add srcq, r2mp
; lumaq has already been incremented above
%else
add srcq, r12mp
%if %3
lea lumaq, [lumaq+lstrideq*2]
%else
add lumaq, lstrideq
%endif
%endif
add grain_lutq, 82
%if %3 == 0
btc hd, 16
%if ARCH_X86_32
mov r5, r5m
%endif
mova m1, [PIC_ptr(pb_17_27)]
jnc %%loop_y_v_overlap
%endif
jmp %%loop_y
%%end_y_v_overlap:
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
mov wq, r4m
%endif
add wq, 16
jge %%end_hv
%if ARCH_X86_32
mov srcq, r1mp
mov lumaq, r11mp
%else
mov srcq, r11mp
%endif
lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
%if %2
; since fg_dataq.overlap is guaranteed to be set, we never jump
; back to .loop_x_v_overlap, and instead always fall-through to
; h+v overlap
%else
%if ARCH_X86_32
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
add offxyd, 16
btc dword r8m, 2
jnc %%loop_x_odd_v_overlap
%endif
%%loop_x_hv_overlap:
%if ARCH_X86_32
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
mov r6, [rsp+7*mmsize+1*gprsize]
%if %2
lea r0, [r3d+16]
add r6, 16
mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy
%else
mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy
%endif
mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy
DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
mov seed, r3m
xor tmpd, tmpd
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
%if %2
lea topleft_offxyq, [top_offxyq+16]
lea left_offxyq, [offxyq+16]
%else
mov topleft_offxyq, top_offxyq
mov left_offxyq, offxyq
%endif
; we assume from the block above that bits 8-15 of tmpd are zeroed
%endif
mov r6d, seed
or seed, 0xeff4eff4
test seeb, seeh
setp tmpb ; parity of top_seed
shr seed, 16
shl tmpd, 16
test seeb, seeh
setp tmpb ; parity of cur_seed
or r6d, 0x00010001
xor tmpd, r6d
mov seed, tmpd
ror seed, 1 ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
mov offxd, seed
mov offyd, seed
%endif
ror offyd, 8
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
%if ARCH_X86_32
DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
%endif
movzx top_offxyd, offxyw
shr offxyd, 16
%if ARCH_X86_32
mov [rsp+7*mmsize+1*gprsize], top_offxyd
%endif
mov hd, r7m
mov grain_lutq, grain_lutmp
%if ARCH_X86_32
mov r5, r5m
%endif
%if %3
mova m3, [PIC_ptr(pb_23_22)]
%else
mova m3, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_hv_overlap:
; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy
mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy
movd m1, [grain_lutq+r0]
mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy
%else
movd m1, [grain_lutq+topleft_offxyq]
%endif
movu m2, [grain_lutq+offxyq]
%if ARCH_X86_32
movu m6, [grain_lutq+r5]
movd m4, [grain_lutq+r0]
%else
movu m6, [grain_lutq+top_offxyq]
movd m4, [grain_lutq+left_offxyq]
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklbw m1, m6
punpcklbw m4, m2
pmaddubsw m0, m9, m1
pmaddubsw m1, m9, m4
REPX {pmulhrsw x, m8}, m0, m1
packsswb m0, m1
shufps m4, m0, m2, q3232
shufps m0, m6, q3210
; followed by v interpolation (top | cur -> cur)
punpcklbw m2, m0, m4
punpckhbw m0, m4
pmaddubsw m4, m3, m0
pmaddubsw m1, m3, m2
pmulhrsw m4, m8
pmulhrsw m1, m8
packsswb m1, m4
; src
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
mov lumaq, r9mp
%endif
%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
mov r5, r5m
movd m7, [base+pb_1]
%else
movd m7, [pb_1]
%endif
pshufd m7, m7, q0000
pxor m2, m2
pmaddubsw m4, m7
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
%endif
pxor m2, m2
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
pmaddubsw m4, m14
psraw m6, 6
psraw m4, 6
paddw m6, m15
paddw m4, m15
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
%if %3
vpgatherdw m7, m4, scalingq-1, r2, r12
vpgatherdw m5, m6, scalingq-1, r2, r12
%else
vpgatherdw m7, m4, scalingq-1, r2, r13
vpgatherdw m5, m6, scalingq-1, r2, r13
%endif
%endif
REPX {psrlw x, 8}, m7, m5
; unpack grain
pxor m4, m4
pcmpgtb m4, m1
punpcklbw m2, m1, m4
punpckhbw m1, m4
; noise = round2(scaling[src] * grain, scaling_shift)
pmullw m2, m7
pmullw m1, m5
pmulhrsw m2, m11
pmulhrsw m1, m11
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
; unpack chroma source
pxor m4, m4
punpckhbw m5, m0, m4
punpcklbw m0, m4 ; m0-1: src as word
; dst = clip_pixel(src, noise)
paddw m0, m2
paddw m5, m1
pmaxsw m0, m13
pmaxsw m5, m13
pminsw m0, m12
pminsw m5, m12
packuswb m0, m5
movifnidn dstq, dstmp
mova [dstq+srcq], m0
%if ARCH_X86_32
add srcq, r2mp
; lumaq has been adjusted above already
%else
add srcq, r12mp
%if %3
lea lumaq, [lumaq+lstrideq*(1+%2)]
%else
add lumaq, r10mp
%endif
%endif
add grain_lutq, 82
dec hw
%if %3
jg %%loop_y_h_overlap
%else
jle %%end_y_hv_overlap
%if ARCH_X86_32
mov r5, r5m
%endif
mova m3, [PIC_ptr(pb_17_27)]
btc hd, 16
jnc %%loop_y_hv_overlap
%if ARCH_X86_64
mov lstrideq, r10mp
%endif
jmp %%loop_y_h_overlap
%%end_y_hv_overlap:
%if ARCH_X86_64
mov lstrideq, r10mp
%endif
%endif
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
mov wq, r4m
%endif
add wq, 16
jge %%end_hv
%if ARCH_X86_32
mov srcq, r1mp
mov lumaq, r11mp
%else
mov srcq, r11mp
%endif
lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
%if %2
jmp %%loop_x_hv_overlap
%else
%if ARCH_X86_32
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
add offxyd, 16
xor dword r8m, 4
jmp %%loop_x_odd_v_overlap
%endif
%%end_hv:
RET
%endmacro
%%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
%%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro
FGUV_FN 420, 1, 1
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif
FGUV_FN 422, 1, 0
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif
FGUV_FN 444, 0, 0