
vp9: Add 8bpc intra prediction AVX2 asm

Commit:       a66a260ae9 (parent 9a34ddc345)
Author:       Henrik Gramner
Committed by: Henrik Gramner
Date:         2025-09-01 02:03:00 +02:00

2 changed files with 309 additions and 171 deletions
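
For context, the 32x32 modes this patch vectorizes reduce to simple per-pixel rules. A minimal scalar sketch (illustrative only, not FFmpeg's actual code; FFmpeg's vp9 DSP stores the left edge bottom-to-top, which is ignored here for readability):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    static uint8_t clip8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* DC: average of the 64 edge pixels, broadcast over the block. */
    static void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                         const uint8_t *left, const uint8_t *above)
    {
        int sum = 32; /* rounding term: result is (sum + 32) >> 6 */
        for (int i = 0; i < 32; i++)
            sum += left[i] + above[i];
        for (int y = 0; y < 32; y++, dst += stride)
            memset(dst, sum >> 6, 32);
    }

    /* TM (TrueMotion): dst[y][x] = clip(left[y] + above[x] - topleft) */
    static void tm_32x32(uint8_t *dst, ptrdiff_t stride,
                         const uint8_t *left, const uint8_t *above)
    {
        const int tl = above[-1];
        for (int y = 0; y < 32; y++, dst += stride)
            for (int x = 0; x < 32; x++)
                dst[x] = clip8(left[y] + above[x] - tl);
    }

h and v simply replicate one edge pixel per row or column, and the directional modes (dl/dr/hd/hu/vl/vr) first run the edges through the 3-tap lowpass filter implemented by the LOWPASS macro before projecting them diagonally across the block.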

libavcodec/x86/vp9dsp_init.c

@@ -207,11 +207,8 @@ ipred_dir_tm_h_funcs(8, avx);
ipred_dir_tm_h_funcs(16, avx);
ipred_dir_tm_h_funcs(32, avx);
ipred_func(32, v, avx);
ipred_dc_funcs(32, avx2);
ipred_func(32, h, avx2);
ipred_func(32, tm, avx2);
ipred_all_funcs(32, avx2);
ipred_func(32, v, avx2);
#undef ipred_func
#undef ipred_dir_tm_h_funcs
@@ -388,7 +385,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
if (EXTERNAL_AVX_FAST(cpu_flags)) {
init_fpel_func(1, 0, 32, put, , avx);
init_fpel_func(0, 0, 64, put, , avx);
init_ipred(32, avx, v, VERT);
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
@@ -408,9 +404,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
init_subpel3_32_64(1, avg, 8, avx2);
#endif
}
init_dc_ipred(32, avx2);
init_ipred(32, avx2, h, HOR);
init_ipred(32, avx2, tm, TM_VP8);
init_all_ipred(32, avx2);
init_ipred(32, avx2, v, VERT);
}
#if ARCH_X86_64
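
The per-mode avx2 init lines are folded into a single init_all_ipred(32, avx2). Assuming the macro mirrors the existing init_ipred/init_dc_ipred helpers (its definition is outside this hunk), the expansion is roughly:

    /* Hypothetical expansion, for illustration only: one
     * function-pointer assignment per prediction mode, e.g. */
    dsp->intra_pred[TX_32X32][DC_PRED]  = ff_vp9_ipred_dc_32x32_avx2;
    dsp->intra_pred[TX_32X32][HOR_PRED] = ff_vp9_ipred_h_32x32_avx2;
    /* ...likewise tm and the newly added dl/dr/hd/hu/vl/vr modes. */

The vertical predictor keeps its own init_ipred(32, avx2, v, VERT) line, so it is evidently not covered by the grouped macro.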

libavcodec/x86/vp9intrapred.asm

@@ -2,6 +2,7 @@
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;* Copyright (c) 2025 Two Orioles, LLC
;*
;* Parts based on:
;* H.264 intra prediction asm optimizations
@@ -230,40 +231,6 @@ DC_16to32_FUNCS
INIT_XMM ssse3
DC_16to32_FUNCS
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
mova m0, [lq]
mova m1, [aq]
DEFINE_ARGS dst, stride, stride3, cnt
lea stride3q, [strideq*3]
pxor m2, m2
psadbw m0, m2
psadbw m1, m2
paddw m0, m1
vextracti128 xm1, m0, 1
paddw xm0, xm1
movhlps xm1, xm0
paddw xm0, xm1
pmulhrsw xm0, [pw_512]
vpbroadcastb m0, xm0
mov cntd, 4
.loop:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
dec cntd
jg .loop
RET
%endif
; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
@@ -395,44 +362,6 @@ INIT_XMM ssse3
DC_1D_16to32_FUNCS top, a
DC_1D_16to32_FUNCS left, l
%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
%if HAVE_AVX2_EXTERNAL
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
mova m0, [%2q]
DEFINE_ARGS dst, stride, stride3, cnt
lea stride3q, [strideq*3]
pxor m2, m2
psadbw m0, m2
vextracti128 xm1, m0, 1
paddw xm0, xm1
movhlps xm1, xm0
paddw xm0, xm1
pmulhrsw xm0, [pw_1024]
vpbroadcastb m0, xm0
mov cntd, 4
.loop:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
dec cntd
jg .loop
RET
%endif
%endmacro
INIT_YMM avx2
DC_1D_AVX2_FUNCS top, a
DC_1D_AVX2_FUNCS left, l
; v
INIT_MMX mmx
cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
movq m0, [aq]
@@ -486,29 +415,6 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
jg .loop
RET
INIT_YMM avx
cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
mova m0, [aq]
DEFINE_ARGS dst, stride, stride3, cnt
lea stride3q, [strideq*3]
mov cntd, 4
.loop:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
dec cntd
jg .loop
RET
; h
%macro H_XMM_FUNCS 2
%if notcpuflag(avx)
cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
@@ -642,34 +548,6 @@ H_XMM_FUNCS 4, 8
INIT_XMM avx
H_XMM_FUNCS 4, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
mova m5, [pb_1]
mova m6, [pb_2]
mova m7, [pb_3]
pxor m4, m4
lea stride3q, [strideq*3]
mov cntq, 7
.loop:
movd xm3, [lq+cntq*4]
vinserti128 m3, m3, xm3, 1
pshufb m0, m3, m7
pshufb m1, m3, m6
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
pshufb m2, m3, m5
pshufb m3, m4
mova [dstq+strideq*2], m2
mova [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
dec cntq
jge .loop
RET
%endif
; tm
%macro TM_MMX_FUNCS 0
cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
pxor m1, m1
@@ -898,46 +776,9 @@ TM_XMM_FUNCS
INIT_XMM avx
TM_XMM_FUNCS
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
pxor m3, m3
pinsrw xm2, [aq-1], 0
vinserti128 m2, m2, xm2, 1
mova m0, [aq]
DEFINE_ARGS dst, stride, l, cnt
mova m4, [pw_m256]
mova m5, [pw_m255]
pshufb m2, m4
punpckhbw m1, m0, m3
punpcklbw m0, m3
psubw m1, m2
psubw m0, m2
mov cntq, 15
.loop:
pinsrw xm7, [lq+cntq*2], 0
vinserti128 m7, m7, xm7, 1
pshufb m3, m7, m5
pshufb m7, m4
paddw m2, m3, m0
paddw m3, m1
paddw m6, m7, m0
paddw m7, m1
packuswb m2, m3
packuswb m6, m7
mova [dstq+strideq*0], m2
mova [dstq+strideq*1], m6
lea dstq, [dstq+strideq*2]
dec cntq
jge .loop
RET
%endif
; dl
%macro LOWPASS 4 ; left [dst], center, right, tmp
%macro LOWPASS 4-5 [pb_1] ; left [dst], center, right, tmp, pb_1
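; Computes %1 = (%1 + 2*%2 + %3 + 2) >> 2 without widening to words:
; pavgb(%1, %3) rounds up, so the stray carry bit (%1 ^ %3) & 1 is
; subtracted to get floor((%1 + %3) / 2) before averaging with the
; center pixel %2. The new optional argument lets callers keep pb_1 in
; a register instead of reloading it from memory on every invocation.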
pxor m%4, m%1, m%3
pand m%4, [pb_1]
pand m%4, %5
pavgb m%1, m%3
psubusb m%1, m%4
pavgb m%1, m%2
@@ -2041,4 +1882,306 @@ HU_XMM_FUNCS 7
INIT_XMM avx
HU_XMM_FUNCS 7
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
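; dc: psadbw sums the 64 edge pixels into four qword partials, which
; are reduced to a single word; pmulhrsw with pw_512 then divides with
; rounding, since (x * 512 + 16384) >> 15 == (x + 32) >> 6. The store
; loop at .main2 is shared by the dc_top/dc_left/v entry points below.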
cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
pxor m1, m1
psadbw m0, m1, [lq]
psadbw m1, [aq]
movd xm2, [pw_512]
paddw m0, m1
vextracti128 xm1, m0, 1
.main:
paddw xm0, xm1
punpckhqdq xm1, xm0, xm0
paddw xm0, xm1
pmulhrsw xm0, xm2
vpbroadcastb m0, xm0
.main2:
lea r2, [strideq*3]
mov r3d, 8
.loop:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+r2 ], m0
lea dstq, [dstq+strideq*4]
dec r3d
jg .loop
RET
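
; dc_top aliases the above pointer into lq, then either jumps to
; dc_left's .main (x86-32) or falls straight through into dc_left
; (x86-64, with function_align 1 removing the inter-function padding).
; Only 32 pixels are summed here, so the rounding multiplier becomes
; pw_1024: (x * 1024 + 16384) >> 15 == (x + 16) >> 5.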
cglobal vp9_ipred_dc_top_32x32, 0, 4, 3, dst, stride, l, a
mov lq, amp
%if ARCH_X86_32
jmp mangle(private_prefix %+ _vp9_ipred_dc_left_32x32 %+ SUFFIX).main
%endif
%assign function_align 1
cglobal vp9_ipred_dc_left_32x32, 0, 4, 3, dst, stride, l, a
movifnidn lq, lmp
.main:
movifnidn dstq, dstmp
movifnidn strideq, stridemp
pxor xm1, xm1
psadbw xm0, xm1, [lq]
psadbw xm1, [lq+16]
movd xm2, [pw_1024]
jmp mangle(private_prefix %+ _vp9_ipred_dc_32x32 %+ SUFFIX).main
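
; v broadcasts the 32 above pixels to every row by reusing the dc
; store loop at .main2.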
cglobal vp9_ipred_v_32x32, 2, 4, 3, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq]
jmp mangle(private_prefix %+ _vp9_ipred_dc_32x32 %+ SUFFIX).main2
%assign function_align 16
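
; h: each iteration broadcasts a dword of left pixels and splats its
; bytes across four rows with pshufb (selectors 3, 2, 1, 0). The left
; edge is stored bottom-to-top, hence the countdown from 7.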
cglobal vp9_ipred_h_32x32, 3, 5, 6, dst, stride, l
vpbroadcastd m2, [pb_3]
mov r3d, 7
vpbroadcastd m3, [pb_2]
pxor m5, m5
vpbroadcastd m4, [pb_1]
lea r4, [strideq*3]
.loop:
vpbroadcastd m1, [lq+r3*4]
pshufb m0, m1, m2
mova [dstq+strideq*0], m0
pshufb m0, m1, m3
mova [dstq+strideq*1], m0
pshufb m0, m1, m4
mova [dstq+strideq*2], m0
pshufb m1, m5
mova [dstq+r4 ], m1
lea dstq, [dstq+strideq*4]
dec r3d
jge .loop
RET
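
; tm (TrueMotion): dst[y][x] = clip(left[y] + above[x] - topleft).
; above - topleft is precomputed in words (m6/m7); each iteration
; broadcasts two left pixels, zero-extends them via the pshufb masks
; pw_m255/pw_m256, and emits two packed rows.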
cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
vpbroadcastd m0, [aq-1]
mova m7, [aq]
pxor m1, m1
vpbroadcastd m4, [pw_m255]
mov r3d, 15
vpbroadcastd m5, [pw_m256]
pshufb m0, m5
punpcklbw m6, m7, m1
punpckhbw m7, m1
psubw m6, m0
psubw m7, m0
.loop:
vpbroadcastd m3, [lq+r3*2]
pshufb m2, m3, m4
pshufb m3, m5
paddw m0, m2, m6
paddw m2, m7
paddw m1, m3, m6
paddw m3, m7
packuswb m0, m2
packuswb m1, m3
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
dec r3d
jge .loop
RET
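
; dl (diagonal down-left): rows come from the lowpass-filtered above
; edge, each shifted one pixel further left than the row before, with
; the final pixel replicated once the edge runs out. Rows y, y+8,
; y+16 and y+24 are written per iteration from shifted registers.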
cglobal vp9_ipred_dl_32x32, 2, 5, 6, dst, stride, l, a
movifnidn aq, amp
vpbroadcastb m2, [aq+31]
vinserti128 m3, m2, [aq+16], 0
mova m0, [aq+ 0]
vpbroadcastd m5, [pb_1]
palignr m4, m3, m0, 2
lea r3, [strideq*2]
palignr m3, m0, 1
LOWPASS 0, 3, 4, 1, m5
lea r4, [strideq*3]
vperm2i128 m1, m0, m2, 0x31
mov r2d, 8
.loop:
shufpd m3, m0, m1, 0x05
mova [dstq+r3*0], m0
punpckhqdq m4, m1, m2
mova [dstq+r3*4], m3
palignr m0, m1, m0, 1
mova [dstq+r3*8], m1
palignr m1, m2, m1, 1
mova [dstq+r4*8], m4
add dstq, strideq
dec r2d
jg .loop
RET
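
; dr (diagonal down-right): the left edge, top-left pixel and above
; edge are filtered into one 63-pixel diagonal; each row is the
; previous one shifted right, so the registers shift the other way
; (palignr by 15 / pslldq) while four rows are stored per pass.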
cglobal vp9_ipred_dr_32x32, 4, 5, 7, dst, stride, l, a
mova m3, [lq+ 0]
movu m1, [aq- 1]
mova m0, [aq+ 0]
vpbroadcastd m6, [pb_1]
vperm2i128 m2, m3, m1, 0x21
lea r3, [strideq*2]
palignr m4, m1, m2, 15
LOWPASS 0, 1, 4, 5, m6
pslldq xm4, xm3, 1
palignr m2, m3, 1
vinserti128 m4, [lq+15], 1
LOWPASS 2, 3, 4, 5, m6
lea r4, [strideq*3]
vperm2i128 m1, m2, m0, 0x21
mov r2d, 8
.loop:
shufpd m3, m1, m0, 0x05
mova [dstq+r3*0], m0
shufpd m4, m2, m1, 0x05
mova [dstq+r3*4], m3
palignr m0, m1, 15
mova [dstq+r3*8], m1
palignr m1, m2, 15
mova [dstq+r4*8], m4
add dstq, strideq
pslldq m2, 1
dec r2d
jg .loop
RET
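
; hd (horizontal-down): interleaves rounded averages and lowpass
; values of the left edge (punpck{l,h}bw), appends the filtered above
; edge, and steps two bytes through that source per output row.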
cglobal vp9_ipred_hd_32x32, 4, 6, 7, dst, stride, l, a
movu m1, [aq-1]
mova m0, [lq]
vpbroadcastd m6, [pb_1]
vperm2i128 m4, m0, m1, 0x21
palignr m3, m4, m0, 1
palignr m4, m0, 2
LOWPASS 4, 3, 0, 2, m6
pavgb m3, m0
movu xm0, [aq+15]
punpcklbw m2, m3, m4
punpckhbw m3, m4
palignr m4, m0, m1, 2
palignr m0, m1, 1
LOWPASS 4, 0, 1, 5, m6
lea r2, [strideq*8]
vinserti128 m0, m2, xm3, 1
lea r3, [dstq+r2*1]
vpblendd m1, m2, m3, 0x0f
lea r4, [dstq+r2*2]
vperm2i128 m2, m3, 0x31
lea r5, [r3 +r2*2]
vperm2i128 m3, m4, 0x21
.loop:
sub r2, strideq
mova [r5 +r2], m0
palignr m0, m1, m0, 2
mova [r4 +r2], m1
palignr m1, m2, m1, 2
mova [r3 +r2], m2
palignr m2, m3, m2, 2
mova [dstq+r2], m3
palignr m3, m4, m3, 2
psrldq m4, 2
jg .loop
RET
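
; hu (horizontal-up): same avg/lowpass interleave as hd, but walking
; up the left edge; pb_2toE_3xF shifts the tail while replicating the
; final pixel once the edge is exhausted.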
cglobal vp9_ipred_hu_32x32, 3, 5, 6, dst, stride, l, a
mova m0, [lq]
vpbroadcastb xm3, [lq+31]
vpbroadcastd m1, [pb_1]
vbroadcasti128 m4, [pb_2toE_3xF]
vperm2i128 m3, m0, 0x03
palignr m5, m3, m0, 2
palignr m3, m0, 1
LOWPASS 5, 3, 0, 2, m1
vpbroadcastd m1, [pb_15]
pavgb m3, m0
punpcklbw m2, m3, m5
punpckhbw m3, m5
vinserti128 m0, m2, xm3, 1
pshufb m5, m1
vperm2i128 m1, m2, m3, 0x12
lea r3, [strideq*2]
vperm2i128 m2, m3, 0x31
lea r4, [strideq*3]
vperm2i128 m3, m5, 0x31
mov r2d, 8
.loop:
mova [dstq+r3*0], m0
palignr m0, m1, m0, 2
mova [dstq+r3*4], m1
palignr m1, m2, m1, 2
mova [dstq+r3*8], m2
palignr m2, m3, m2, 2
mova [dstq+r4*8], m3
pshufb m3, m4
add dstq, strideq
dec r2d
jg .loop
RET
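
; vl (vertical-left): even rows take the rounded average of adjacent
; above pixels, odd rows the 3-tap lowpass; each row pair shifts one
; byte further into the filtered edge.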
cglobal vp9_ipred_vl_32x32, 2, 5, 6, dst, stride, l, a
movifnidn aq, amp
vpbroadcastb m4, [aq+31]
vinserti128 m0, m4, [aq+16], 0
mova m1, [aq+ 0]
vpbroadcastd m5, [pb_1]
palignr m2, m0, m1, 2
palignr m0, m1, 1
LOWPASS 2, 0, 1, 3, m5
pavgb m0, m1
lea r3, [strideq*2]
vperm2i128 m1, m0, m4, 0x31
lea r4, [strideq+r3*8]
vperm2i128 m3, m2, m4, 0x31
mov r2d, 8
.loop:
shufpd m4, m0, m1, 0x05
mova [dstq+strideq*0], m0
shufpd m5, m2, m3, 0x05
mova [dstq+strideq*1], m2
palignr m0, m1, m0, 1
mova [dstq+r3*8 ], m4
psrldq m1, 1
mova [dstq+r4 ], m5
palignr m2, m3, m2, 1
add dstq, r3
psrldq m3, 1
dec r2d
jg .loop
RET
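
; vr (vertical-right): as vl, but anchored at the top-left corner; the
; filtered left edge is reordered with the even/odd deinterleaving
; pb_02468ACE_13579BDF shuffle so pixels can be shifted in from the
; left (pslldq) as the rows step down.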
cglobal vp9_ipred_vr_32x32, 4, 5, 7, dst, stride, l, a
mova m4, [lq+ 0]
movu m0, [aq- 1]
vpbroadcastd m6, [pb_1]
vperm2i128 m2, m4, m0, 0x21
pslldq xm5, xm4, 1
palignr m3, m2, m4, 1
vinserti128 m5, [lq+15], 1
LOWPASS 3, 4, 5, 1, m6
mova m1, [aq+ 0]
vbroadcasti128 m4, [pb_02468ACE_13579BDF]
palignr m2, m0, m2, 15
LOWPASS 2, 0, 1, 5, m6
pshufb m3, m4
lea r3, [strideq*2]
vpermq m3, m3, q2031
pavgb m0, m1
vinserti128 m1, m3, xm0, 1
lea r4, [strideq+r3*8]
vperm2i128 m3, m2, 0x21
mov r2d, 8
.loop:
shufpd m4, m1, m0, 0x05
mova [dstq+strideq*0], m0
shufpd m5, m3, m2, 0x05
mova [dstq+strideq*1], m2
palignr m0, m1, 15
mova [dstq+r3*8 ], m4
pslldq m1, 1
mova [dstq+r4 ], m5
palignr m2, m3, 15
add dstq, r3
pslldq m3, 1
dec r2d
jg .loop
RET
%endif
; FIXME 127, 128, 129 ?