From 6b579cf547a75a0cbda5cb7f10eab9ca07522b0a Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 6 Oct 2015 11:03:45 -0400 Subject: [PATCH] vp9: add 10bpp simd (mmxext/ssse3) for idct_idct_4x4. --- libavcodec/x86/constants.c | 2 + libavcodec/x86/constants.h | 1 + libavcodec/x86/vp9dsp_init_16bpp_template.c | 12 +++ libavcodec/x86/vp9itxfm.asm | 50 +---------- libavcodec/x86/vp9itxfm_16bpp.asm | 96 +++++++++++++++++++++ libavcodec/x86/vp9itxfm_template.asm | 47 ++++++++++ 6 files changed, 159 insertions(+), 49 deletions(-) diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index 9592fa73e1..7e3d490ec1 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -85,6 +85,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x000 0x0000001000000010ULL, 0x0000001000000010ULL }; DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL, 0x0000002000000020ULL, 0x0000002000000020ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL, + 0x0000200000002000ULL, 0x0000200000002000ULL }; DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL }; diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index 3605b63504..496933fa86 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -65,6 +65,7 @@ extern const xmm_reg ff_ps_neg; extern const ymm_reg ff_pd_1; extern const ymm_reg ff_pd_16; extern const ymm_reg ff_pd_32; +extern const ymm_reg ff_pd_8192; extern const ymm_reg ff_pd_65535; # if ARCH_X86_64 diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c index 7a56c3bc32..6e12af3d3d 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp_template.c +++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c @@ -125,6 +125,10 @@ lpf_mix2_wrappers_set(BPC, avx); decl_ipred_fns(tm, 
BPC, mmxext, sse2); decl_itxfm_func(iwht, iwht, 4, BPC, mmxext); +#if BPC == 10 +decl_itxfm_func(idct, idct, 4, BPC, mmxext); +decl_itxfm_func(idct, idct, 4, BPC, ssse3); +#endif #endif /* HAVE_YASM */ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact) @@ -170,6 +174,9 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact) init_ipred_func(tm, TM_VP8, 4, BPC, mmxext); if (!bitexact) { init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext); +#if BPC == 10 + init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext); +#endif } } @@ -182,6 +189,11 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact) if (EXTERNAL_SSSE3(cpu_flags)) { init_lpf_funcs(BPC, ssse3); +#if BPC == 10 + if (!bitexact) { + init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, ssse3); + } +#endif } if (EXTERNAL_AVX(cpu_flags)) { diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm index c564f276cf..200f15e790 100644 --- a/libavcodec/x86/vp9itxfm.asm +++ b/libavcodec/x86/vp9itxfm.asm @@ -71,8 +71,6 @@ pw_13377x2: times 8 dw 13377*2 pw_m13377_13377: times 4 dw -13377, 13377 pw_13377_0: times 4 dw 13377, 0 -pd_8192: times 4 dd 8192 - cextern pw_8 cextern pw_16 cextern pw_32 @@ -80,38 +78,10 @@ cextern pw_512 cextern pw_1024 cextern pw_2048 cextern pw_m1 +cextern pd_8192 SECTION .text -; (a*x + b*y + round) >> shift -%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2 - pmaddwd m%1, m%2, %4 - pmaddwd m%2, %5 - paddd m%1, %3 - paddd m%2, %3 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2 - VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3] - VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2 -%if %0 == 7 - punpckhwd m%6, m%2, m%1 - punpcklwd m%2, m%1 - VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7 -%else - punpckhwd m%8, m%4, m%3 - 
punpcklwd m%2, m%4, m%3 - VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9 -%endif -%endmacro - %macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2 punpckhwd m%4, m%2, m%1 punpcklwd m%2, m%1 @@ -191,24 +161,6 @@ cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob ; void vp9_idct_idct_4x4_add_(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); ;------------------------------------------------------------------------------------------- -%macro VP9_IDCT4_1D_FINALIZE 0 - SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0 - SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1 - SWAP 0, 3, 2 ; 3102 -> 0123 -%endmacro - -%macro VP9_IDCT4_1D 0 -%if cpuflag(ssse3) - SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2) - pmulhrsw m2, m6 ; m2=t0 - pmulhrsw m0, m6 ; m0=t1 -%else ; <= sse2 - VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m1=t0 -%endif - VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3 - VP9_IDCT4_1D_FINALIZE -%endmacro - ; 2x2 top left corner %macro VP9_IDCT4_2x2_1D 0 pmulhrsw m0, m5 ; m0=t1 diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm index 58869e692b..e067438530 100644 --- a/libavcodec/x86/vp9itxfm_16bpp.asm +++ b/libavcodec/x86/vp9itxfm_16bpp.asm @@ -25,8 +25,18 @@ SECTION_RODATA +cextern pw_8 cextern pw_1023 +cextern pw_2048 cextern pw_4095 +cextern pd_8192 + +; FIXME these should probably be shared between 8bpp and 10/12bpp +pw_m11585_11585: times 4 dw -11585, 11585 +pw_11585_11585: times 8 dw 11585 +pw_m15137_6270: times 4 dw -15137, 6270 +pw_6270_15137: times 4 dw 6270, 15137 +pw_11585x2: times 8 dw 11585*2 SECTION .text @@ -118,3 +128,89 @@ INIT_MMX mmxext IWHT4_FN 10, 1023 INIT_MMX mmxext IWHT4_FN 12, 4095 + +; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits +; in 15+1 words without additional effort, since the coefficients are 15bpp. 
+ +%macro IDCT4_10_FN 0 +cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob + cmp eobd, 1 + jg .idctfull + + ; dc-only +%if cpuflag(ssse3) + movd m0, [blockq] + mova m5, [pw_11585x2] + pmulhrsw m0, m5 + pmulhrsw m0, m5 +%else + DEFINE_ARGS dst, stride, block, coef + mov coefd, dword [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (8 << 14) + 8192 + sar coefd, 14 + 4 + movd m0, coefd +%endif + pshufw m0, m0, 0 + pxor m4, m4 + mova m5, [pw_1023] + movh [blockq], m4 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 +%endif + VP9_STORE_2X 0, 0, 6, 7, 4, 5 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 0, 0, 6, 7, 4, 5 + RET + +.idctfull: + mova m0, [blockq+0*16+0] + mova m1, [blockq+1*16+0] + packssdw m0, [blockq+0*16+8] + packssdw m1, [blockq+1*16+8] + mova m2, [blockq+2*16+0] + mova m3, [blockq+3*16+0] + packssdw m2, [blockq+2*16+8] + packssdw m3, [blockq+3*16+8] + +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif + mova m7, [pd_8192] ; rounding + VP9_IDCT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IDCT4_1D + + pxor m4, m4 + ZERO_BLOCK blockq, 16, 4, m4 +%if cpuflag(ssse3) + mova m5, [pw_2048] + pmulhrsw m0, m5 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + pmulhrsw m3, m5 +%else + mova m5, [pw_8] + paddw m0, m5 + paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + psraw m0, 4 + psraw m1, 4 + psraw m2, 4 + psraw m3, 4 +%endif + mova m5, [pw_1023] + VP9_STORE_2X 0, 1, 6, 7, 4, 5 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 2, 3, 6, 7, 4, 5 + RET +%endmacro + +INIT_MMX mmxext +IDCT4_10_FN +INIT_MMX ssse3 +IDCT4_10_FN diff --git a/libavcodec/x86/vp9itxfm_template.asm b/libavcodec/x86/vp9itxfm_template.asm index 43cf3aa015..f1a05a5926 100644 --- a/libavcodec/x86/vp9itxfm_template.asm +++ b/libavcodec/x86/vp9itxfm_template.asm @@ -35,3 +35,50 @@ paddw m3, m2 SWAP 3, 2, 1 %endmacro + +; (a*x + b*y + round) >> shift +%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2 + pmaddwd m%1, 
m%2, %4 + pmaddwd m%2, %5 + paddd m%1, %3 + paddd m%2, %3 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2 + VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3] + VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2 +%if %0 == 7 + punpckhwd m%6, m%2, m%1 + punpcklwd m%2, m%1 + VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7 +%else + punpckhwd m%8, m%4, m%3 + punpcklwd m%2, m%4, m%3 + VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9 +%endif +%endmacro + +%macro VP9_IDCT4_1D_FINALIZE 0 + SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0 + SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1 + SWAP 0, 3, 2 ; 3102 -> 0123 +%endmacro + +%macro VP9_IDCT4_1D 0 +%if cpuflag(ssse3) + SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2) + pmulhrsw m2, m6 ; m2=t0 + pmulhrsw m0, m6 ; m0=t1 +%else ; <= sse2 + VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m2=t0 +%endif + VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3 + VP9_IDCT4_1D_FINALIZE +%endmacro