mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-03-28 12:32:17 +02:00
avcodec/vp9: avx2 implementation of ipred_dl_16x16_16
vp9_diag_downleft_16x16_10bpp_c: 263.0
vp9_diag_downleft_16x16_10bpp_sse2: 44.7
vp9_diag_downleft_16x16_10bpp_ssse3: 32.5
vp9_diag_downleft_16x16_10bpp_avx: 31.9
vp9_diag_downleft_16x16_10bpp_avx2: 25.7
vp9_diag_downleft_16x16_12bpp_c: 264.7
vp9_diag_downleft_16x16_12bpp_sse2: 44.4
vp9_diag_downleft_16x16_12bpp_ssse3: 32.0
vp9_diag_downleft_16x16_12bpp_avx: 32.4
vp9_diag_downleft_16x16_12bpp_avx2: 25.5

Benchmarked with 10000 runs

Signed-off-by: Ilia <zakne0ne@gmail.com>
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
parent
5eb4f95bef
commit
2f3d10a01a
libavcodec/x86
@ -51,6 +51,7 @@ decl_ipred_fns(h, 16, mmxext, sse2);
|
|||||||
decl_ipred_fns(dc, 16, mmxext, sse2);
|
decl_ipred_fns(dc, 16, mmxext, sse2);
|
||||||
decl_ipred_fns(dc_top, 16, mmxext, sse2);
|
decl_ipred_fns(dc_top, 16, mmxext, sse2);
|
||||||
decl_ipred_fns(dc_left, 16, mmxext, sse2);
|
decl_ipred_fns(dc_left, 16, mmxext, sse2);
|
||||||
|
decl_ipred_fn(dl, 16, 16, avx2);
|
||||||
|
|
||||||
#define decl_ipred_dir_funcs(type) \
|
#define decl_ipred_dir_funcs(type) \
|
||||||
decl_ipred_fns(type, 16, sse2, sse2); \
|
decl_ipred_fns(type, 16, sse2, sse2); \
|
||||||
@ -133,6 +134,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
|
|||||||
init_fpel_func(2, 1, 32, avg, _16, avx2);
|
init_fpel_func(2, 1, 32, avg, _16, avx2);
|
||||||
init_fpel_func(1, 1, 64, avg, _16, avx2);
|
init_fpel_func(1, 1, 64, avg, _16, avx2);
|
||||||
init_fpel_func(0, 1, 128, avg, _16, avx2);
|
init_fpel_func(0, 1, 128, avg, _16, avx2);
|
||||||
|
init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* HAVE_YASM */
|
#endif /* HAVE_YASM */
|
||||||
|
@ -847,6 +847,45 @@ DL_FUNCS
|
|||||||
INIT_XMM avx
|
INIT_XMM avx
|
||||||
DL_FUNCS
|
DL_FUNCS
|
||||||
|
|
||||||
|
; AVX2 diagonal down-left ("dl") intra predictor for a 16x16 block of 16-bit
; samples. Reads 16 samples from the "a" argument, filters them once with
; LOWPASS, then writes 16 rows to dst; every row is the previous row shifted
; left by one sample with the last sample ('p') shifted in on the right.
; The whole function is assembled only when HAVE_AVX2_EXTERNAL is non-zero.
%if HAVE_AVX2_EXTERNAL
|
||||||
|
INIT_YMM avx2
|
||||||
|
; NOTE(review): per the x86inc cglobal convention the numbers are presumably
; (args auto-loaded, GPRs used, vector regs used) = (2, 4, 5) - confirm
; against x86inc.asm. The "l" argument is named but never read in this body.
cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
|
||||||
|
; "a" is not among the auto-loaded args, so fetch it into a register here
; (movifnidn presumably emits nothing when it already lives in one - confirm).
movifnidn aq, amp
|
||||||
|
mova m0, [aq] ; abcdefghijklmnop
|
||||||
|
; byte offset 30 = the 16th 16-bit sample ('p'), replicated across xm1
vpbroadcastw xm1, [aq+30] ; pppppppp
|
||||||
|
; m2 = "the 16 samples that follow m0": high half of m0, then the 'p' padding.
; vpalignr shifts each 128-bit lane separately, so pairing m2:m0 per lane is
; what lets the shifts below behave like one shift across the full row.
vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp
|
||||||
|
vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp
|
||||||
|
vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp
|
||||||
|
; LOWPASS is a macro defined outside this view; presumably a 3-tap smoothing
; of m0/m3/m4 with the result left in m0 - confirm. Capital letters in the
; lane comments denote filtered samples.
LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp
|
||||||
|
; rebuild the "next 16 samples" register from the filtered row
vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp
|
||||||
|
; "l" and "a" are no longer needed: their register slots are renamed to
; stride3 and cnt.
DEFINE_ARGS dst, stride, stride3, cnt
|
||||||
|
; two passes of 8 rows each = 16 rows
mov cntd, 2
|
||||||
|
lea stride3q, [strideq*3]
|
||||||
|
; Loop invariant: m0 holds the row to store first in this pass, m2 holds the
; 16 samples that follow it. Each pass stores that row plus seven more, each
; shifted by a further 2 bytes (one sample).
.loop:
|
||||||
|
mova [dstq+strideq*0], m0
|
||||||
|
vpalignr m3, m2, m0, 2
|
||||||
|
vpalignr m4, m2, m0, 4
|
||||||
|
mova [dstq+strideq*1], m3
|
||||||
|
mova [dstq+strideq*2], m4
|
||||||
|
vpalignr m3, m2, m0, 6
|
||||||
|
vpalignr m4, m2, m0, 8
|
||||||
|
mova [dstq+stride3q ], m3
|
||||||
|
; advance dst by four rows; m4 (shift by 8 bytes) becomes row 0 of the next
; group of four
lea dstq, [dstq+strideq*4]
|
||||||
|
mova [dstq+strideq*0], m4
|
||||||
|
vpalignr m3, m2, m0, 10
|
||||||
|
vpalignr m4, m2, m0, 12
|
||||||
|
mova [dstq+strideq*1], m3
|
||||||
|
mova [dstq+strideq*2], m4
|
||||||
|
vpalignr m3, m2, m0, 14
|
||||||
|
mova [dstq+stride3q ], m3
|
||||||
|
lea dstq, [dstq+strideq*4]
|
||||||
|
; a shift by 16 bytes would simply be m2, so m2 becomes the first row of the
; next pass and the new "following samples" register is all 'p'
mova m0, m2
|
||||||
|
vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp
|
||||||
|
dec cntd
|
||||||
|
; repeat while the decremented counter is still > 0
jg .loop
|
||||||
|
RET
|
||||||
|
%endif
|
||||||
|
|
||||||
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
|
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
|
||||||
cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
|
cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
|
||||||
movh m0, [lq] ; wxyz....
|
movh m0, [lq] ; wxyz....
|
||||||
|
Loading…
x
Reference in New Issue
Block a user