diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 3bba80bd87..7608bb32e1 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -1,5 +1,5 @@
 /*
- * MMX/SSE constants used across x86 dsp optimizations.
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
  *
  * This file is part of FFmpeg.
  *
@@ -47,7 +47,9 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x020
 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
 
 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0)   = { 0x0000000000000000ULL, 0x0000000000000000ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1)   = { 0x0101010101010101ULL, 0x0101010101010101ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3)   = { 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1)   = { 0x0101010101010101ULL, 0x0101010101010101ULL,
+                                                  0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3)   = { 0x0303030303030303ULL, 0x0303030303030303ULL,
+                                                  0x0303030303030303ULL, 0x0303030303030303ULL };
 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80)  = { 0x8080808080808080ULL, 0x8080808080808080ULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC) =   0xFCFCFCFCFCFCFCFCULL;
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 4bf74ff76c..3ebf171adc 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -44,8 +44,8 @@ extern const uint64_t ff_pw_96;
 extern const uint64_t ff_pw_128;
 extern const uint64_t ff_pw_255;
 
-extern const xmm_reg  ff_pb_1;
-extern const xmm_reg  ff_pb_3;
+extern const ymm_reg  ff_pb_1;
+extern const ymm_reg  ff_pb_3;
 extern const xmm_reg  ff_pb_80;
 extern const xmm_reg  ff_pb_F8;
 extern const uint64_t ff_pb_FC;
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 3fd274d17f..b04e678118 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -241,6 +241,13 @@ ipred_funcs(hd, ssse3, avx);
 ipred_funcs(vl, ssse3, avx);
 ipred_funcs(vr, ssse3, avx);
 
+ipred_func(32, dc, avx2);
+ipred_func(32, dc_left, avx2);
+ipred_func(32, dc_top, avx2);
+ipred_func(32, v, avx2);
+ipred_func(32, h, avx2);
+ipred_func(32, tm, avx2);
+
 #undef ipred_funcs
 #undef ipred_func_set
 #undef ipred_func
@@ -388,6 +395,15 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
         init_ipred(TX_32X32, 32, avx);
     }
 
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        dsp->intra_pred[TX_32X32][DC_PRED]      = ff_vp9_ipred_dc_32x32_avx2;
+        dsp->intra_pred[TX_32X32][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_32x32_avx2;
+        dsp->intra_pred[TX_32X32][TOP_DC_PRED]  = ff_vp9_ipred_dc_top_32x32_avx2;
+        dsp->intra_pred[TX_32X32][VERT_PRED]    = ff_vp9_ipred_v_32x32_avx2;
+        dsp->intra_pred[TX_32X32][HOR_PRED]     = ff_vp9_ipred_h_32x32_avx2;
+        dsp->intra_pred[TX_32X32][TM_VP8_PRED]  = ff_vp9_ipred_tm_32x32_avx2;
+    }
+
 #undef init_fpel
 #undef init_subpel1
 #undef init_subpel2
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
index 3faf1c564d..2cab05ab2f 100644
--- a/libavcodec/x86/vp9intrapred.asm
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -29,10 +29,10 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
-pw_m256: times 8 dw -256
-pw_m255: times 8 dw -255
+pw_m256: times 16 dw -256
+pw_m255: times 16 dw -255
 pw_512: times 8 dw 512
 pw_1024: times 8 dw 1024
 pw_2048: times 8 dw 2048
@@ -72,7 +72,7 @@ pb_3to1_5x0: db 3, 2, 1
              times 9 db 0
 pb_Fto0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
-pb_2: times 16 db 2
+pb_2: times 32 db 2
 pb_15: times 16 db 15
 
 cextern pb_1
@@ -180,6 +180,40 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
     jg .loop
     RET
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
+    mova                   m0, [lq]
+    mova                   m1, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea              stride3q, [strideq*3]
+    pxor                   m2, m2
+    psadbw                 m0, m2
+    psadbw                 m1, m2
+    paddw                  m0, m1
+    vextracti128          xm1, m0, 1
+    paddw                 xm0, xm1
+    movhlps               xm1, xm0
+    paddw                 xm0, xm1
+    pmulhrsw              xm0, [pw_512]
+    vpbroadcastb           m0, xm0
+    mov                  cntd, 4
+.loop:
+    mova    [dstq+strideq*0], m0
+    mova    [dstq+strideq*1], m0
+    mova    [dstq+strideq*2], m0
+    mova    [dstq+stride3q ], m0
+    lea                  dstq, [dstq+strideq*4]
+    mova    [dstq+strideq*0], m0
+    mova    [dstq+strideq*1], m0
+    mova    [dstq+strideq*2], m0
+    mova    [dstq+stride3q ], m0
+    lea                  dstq, [dstq+strideq*4]
+    dec                  cntd
+    jg .loop
+    RET
+%endif
+
 ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
 
 %macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
@@ -267,6 +301,37 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
     dec                  cntd
     jg .loop
     RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+    mova                   m0, [%2q]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea              stride3q, [strideq*3]
+    pxor                   m2, m2
+    psadbw                 m0, m2
+    vextracti128          xm1, m0, 1
+    paddw                 xm0, xm1
+    movhlps               xm1, xm0
+    paddw                 xm0, xm1
+    pmulhrsw              xm0, [pw_1024]
+    vpbroadcastb           m0, xm0
+    mov                  cntd, 4
+.loop:
+    mova    [dstq+strideq*0], m0
+    mova    [dstq+strideq*1], m0
+    mova    [dstq+strideq*2], m0
+    mova    [dstq+stride3q ], m0
+    lea                  dstq, [dstq+strideq*4]
+    mova    [dstq+strideq*0], m0
+    mova    [dstq+strideq*1], m0
+    mova    [dstq+strideq*2], m0
+    mova    [dstq+stride3q ], m0
+    lea                  dstq, [dstq+strideq*4]
+    dec                  cntd
+    jg .loop
+    RET
+%endif
 %endmacro
 
 DC_1D_FUNCS top, a
@@ -327,6 +392,29 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
     jg .loop
     RET
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
+    mova                   m0, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea              stride3q, [strideq*3]
+    mov                  cntd, 4
+.loop:
+    mova    [dstq+strideq*0], m0
+    mova    [dstq+strideq*1], m0
+    mova    [dstq+strideq*2], m0
+    mova    [dstq+stride3q ], m0
+    lea                  dstq, [dstq+strideq*4]
+    mova    [dstq+strideq*0], m0
+    mova    [dstq+strideq*1], m0
+    mova    [dstq+strideq*2], m0
+    mova    [dstq+stride3q ], m0
+    lea                  dstq, [dstq+strideq*4]
+    dec                  cntd
+    jg .loop
+    RET
+%endif
+
 ; h
 
 INIT_XMM ssse3
@@ -417,6 +505,32 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
 H_XMM_FUNCS ssse3
 H_XMM_FUNCS avx
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+    mova                   m5, [pb_1]
+    mova                   m6, [pb_2]
+    mova                   m7, [pb_3]
+    pxor                   m4, m4
+    lea              stride3q, [strideq*3]
+    mov                  cntq, 7
+.loop:
+    movd                  xm3, [lq+cntq*4]
+    vinserti128            m3, m3, xm3, 1
+    pshufb                 m0, m3, m7
+    pshufb                 m1, m3, m6
+    mova    [dstq+strideq*0], m0
+    mova    [dstq+strideq*1], m1
+    pshufb                 m2, m3, m5
+    pshufb                 m3, m4
+    mova    [dstq+strideq*2], m2
+    mova    [dstq+stride3q ], m3
+    lea                  dstq, [dstq+strideq*4]
+    dec                  cntq
+    jge .loop
+    RET
+%endif
+
 ; tm
 
 INIT_MMX ssse3
@@ -554,6 +668,41 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
 TM_XMM_FUNCS ssse3
 TM_XMM_FUNCS avx
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
+    pxor                   m3, m3
+    pinsrw                xm2, [aq-1], 0
+    vinserti128            m2, m2, xm2, 1
+    mova                   m0, [aq]
+    DEFINE_ARGS dst, stride, l, cnt
+    mova                   m4, [pw_m256]
+    mova                   m5, [pw_m255]
+    pshufb                 m2, m4
+    punpckhbw              m1, m0, m3
+    punpcklbw              m0, m3
+    psubw                  m1, m2
+    psubw                  m0, m2
+    mov                  cntq, 15
+.loop:
+    pinsrw                xm7, [lq+cntq*2], 0
+    vinserti128            m7, m7, xm7, 1
+    pshufb                 m3, m7, m5
+    pshufb                 m7, m4
+    paddw                  m2, m3, m0
+    paddw                  m3, m1
+    paddw                  m6, m7, m0
+    paddw                  m7, m1
+    packuswb               m2, m3
+    packuswb               m6, m7
+    mova    [dstq+strideq*0], m2
+    mova    [dstq+strideq*1], m6
+    lea                  dstq, [dstq+strideq*2]
+    dec                  cntq
+    jge .loop
+    RET
+%endif
+
 ; dl
 
 %macro LOWPASS 4 ; left [dst], center, right, tmp
diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h
index 2cecc980a5..616ad6c96f 100644
--- a/libavutil/x86/asm.h
+++ b/libavutil/x86/asm.h
@@ -25,6 +25,7 @@
 #include "config.h"
 
 typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
+typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
 
 #if ARCH_X86_64
 #    define OPSIZE "q"
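Note (not part of the patch): the rounding constants in the new dc kernels can be sanity-checked against the scalar definition of the VP9 32x32 DC predictors. psadbw against zero yields per-64-bit byte sums, and pmulhrsw by pw_512 computes (sum * 512 + (1 << 14)) >> 15 == (sum + 32) >> 6, the rounded average of the 64 neighbours in the full dc case; pw_1024 likewise gives (sum + 16) >> 5 for the 32 neighbours used by dc_left/dc_top. A minimal C sketch of those three cases follows; the *_ref names are made up for illustration and do not exist in FFmpeg.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* dc_32x32: rounded average of the 32 left and 32 top neighbours.
 * In the asm, pmulhrsw xm0, [pw_512] computes (sum * 512 + (1 << 14)) >> 15,
 * which equals (sum + 32) >> 6 for the 16-bit sums produced by psadbw. */
static void dc_32x32_ref(uint8_t *dst, ptrdiff_t stride,
                         const uint8_t *left, const uint8_t *top)
{
    unsigned sum = 0;
    int i;

    for (i = 0; i < 32; i++)
        sum += left[i] + top[i];
    for (i = 0; i < 32; i++)
        memset(dst + i * stride, (sum + 32) >> 6, 32);
}

/* dc_left_32x32 / dc_top_32x32: only one edge contributes, so the asm
 * switches the constant to pw_1024, i.e.
 * (sum * 1024 + (1 << 14)) >> 15 == (sum + 16) >> 5. */
static void dc_1d_32x32_ref(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *edge)
{
    unsigned sum = 0;
    int i;

    for (i = 0; i < 32; i++)
        sum += edge[i];
    for (i = 0; i < 32; i++)
        memset(dst + i * stride, (sum + 16) >> 5, 32);
}

The 16-bit intermediates in the asm cannot overflow: the largest possible sum is 64 * 255 = 16320, well within pmulhrsw's signed 16-bit input range.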