diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index aa3df00633..d78b89626c 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -40,7 +40,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL }; /* 16-byte aligned like the other xmm_reg constants */ DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL, 0x0100010001000100ULL, 0x0100010001000100ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL }; diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index e75fff9b9a..1c24dda3a7 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -42,7 +42,7 @@ extern const uint64_t ff_pw_53; extern const xmm_reg ff_pw_64; extern const uint64_t ff_pw_96; extern const uint64_t ff_pw_128; -extern const uint64_t ff_pw_255; +extern const xmm_reg ff_pw_255; extern const xmm_reg ff_pw_512; extern const xmm_reg ff_pw_1024; extern const xmm_reg ff_pw_2048; diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index 37173fbec8..7acf4f7480 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -243,40 +243,58 @@ lpf_funcs(88, 16, avx); void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \ const uint8_t *l, const uint8_t *a) -#define ipred_funcs(type, opt) \ -ipred_func(4, type, opt); \ -ipred_func(8, type, opt); \ -ipred_func(16, type, opt); \ -ipred_func(32, type, opt) - -ipred_funcs(dc, ssse3); -ipred_funcs(dc_left, ssse3); -ipred_funcs(dc_top, ssse3); - -#undef ipred_funcs - 
ipred_func(8, v, mmx); -ipred_func(16, v, sse2); -ipred_func(32, v, sse2); -#define ipred_func_set(size, type, opt1, opt2) \ -ipred_func(size, type, opt1); \ -ipred_func(size, type, opt2) +#define ipred_dc_funcs(size, opt) \ +ipred_func(size, dc, opt); \ +ipred_func(size, dc_left, opt); \ +ipred_func(size, dc_top, opt) -#define ipred_funcs(type, opt1, opt2) \ -ipred_func(4, type, opt1); \ -ipred_func_set(8, type, opt1, opt2); \ -ipred_func_set(16, type, opt1, opt2); \ -ipred_func_set(32, type, opt1, opt2) +ipred_dc_funcs(4, mmxext); +ipred_dc_funcs(8, mmxext); -ipred_funcs(h, ssse3, avx); -ipred_funcs(tm, ssse3, avx); -ipred_funcs(dl, ssse3, avx); -ipred_funcs(dr, ssse3, avx); -ipred_funcs(hu, ssse3, avx); -ipred_funcs(hd, ssse3, avx); -ipred_funcs(vl, ssse3, avx); -ipred_funcs(vr, ssse3, avx); +#define ipred_dir_tm_funcs(size, opt) \ +ipred_func(size, tm, opt); \ +ipred_func(size, dl, opt); \ +ipred_func(size, dr, opt); \ +ipred_func(size, hd, opt); \ +ipred_func(size, hu, opt); \ +ipred_func(size, vl, opt); \ +ipred_func(size, vr, opt) + +ipred_dir_tm_funcs(4, mmxext); + +ipred_func(16, v, sse); +ipred_func(32, v, sse); + +ipred_dc_funcs(16, sse2); +ipred_dc_funcs(32, sse2); + +#define ipred_dir_tm_h_funcs(size, opt) \ +ipred_dir_tm_funcs(size, opt); \ +ipred_func(size, h, opt) + +ipred_dir_tm_h_funcs(8, sse2); +ipred_dir_tm_h_funcs(16, sse2); +ipred_dir_tm_h_funcs(32, sse2); + +ipred_func(4, h, sse2); + +#define ipred_all_funcs(size, opt) \ +ipred_dc_funcs(size, opt); \ +ipred_dir_tm_h_funcs(size, opt) + +// FIXME hd/vl_4x4_ssse3 does not exist +ipred_all_funcs(4, ssse3); +ipred_all_funcs(8, ssse3); +ipred_all_funcs(16, ssse3); +ipred_all_funcs(32, ssse3); + +ipred_dir_tm_h_funcs(8, avx); +ipred_dir_tm_h_funcs(16, avx); +ipred_dir_tm_h_funcs(32, avx); + +ipred_func(32, v, avx); ipred_func(32, dc, avx2); ipred_func(32, dc_left, avx2); @@ -285,9 +303,14 @@ ipred_func(32, v, avx2); ipred_func(32, h, avx2); ipred_func(32, tm, avx2); -#undef ipred_funcs -#undef 
ipred_func_set +ipred_dc_funcs(32, avx2); +ipred_func(32, h, avx2); +ipred_func(32, tm, avx2); + #undef ipred_func +#undef ipred_dir_tm_h_funcs +#undef ipred_dir_tm_funcs +#undef ipred_dc_funcs #endif /* HAVE_YASM */ @@ -340,23 +363,32 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) } \ } while (0) -#define init_ipred(tx, sz, opt) do { \ - dsp->intra_pred[tx][HOR_PRED] = ff_vp9_ipred_h_##sz##x##sz##_##opt; \ - dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED] = ff_vp9_ipred_dl_##sz##x##sz##_##opt; \ - dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = ff_vp9_ipred_dr_##sz##x##sz##_##opt; \ - dsp->intra_pred[tx][HOR_DOWN_PRED] = ff_vp9_ipred_hd_##sz##x##sz##_##opt; \ - dsp->intra_pred[tx][VERT_LEFT_PRED] = ff_vp9_ipred_vl_##sz##x##sz##_##opt; \ - dsp->intra_pred[tx][HOR_UP_PRED] = ff_vp9_ipred_hu_##sz##x##sz##_##opt; \ - if (ARCH_X86_64 || tx != TX_32X32) { \ - dsp->intra_pred[tx][VERT_RIGHT_PRED] = ff_vp9_ipred_vr_##sz##x##sz##_##opt; \ - dsp->intra_pred[tx][TM_VP8_PRED] = ff_vp9_ipred_tm_##sz##x##sz##_##opt; \ - } \ +#define init_ipred(sz, opt, t, e) \ + dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt + +#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext +#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext +#define init_dir_tm_ipred(sz, opt) do { \ + init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \ + init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \ + init_ipred(sz, opt, hd, HOR_DOWN); \ + init_ipred(sz, opt, vl, VERT_LEFT); \ + init_ipred(sz, opt, hu, HOR_UP); \ + init_ipred(sz, opt, tm, TM_VP8); \ + init_ipred(sz, opt, vr, VERT_RIGHT); \ } while (0) -#define init_dc_ipred(tx, sz, opt) do { \ - init_ipred(tx, sz, opt); \ - dsp->intra_pred[tx][DC_PRED] = ff_vp9_ipred_dc_##sz##x##sz##_##opt; \ - dsp->intra_pred[tx][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_##sz##x##sz##_##opt; \ - dsp->intra_pred[tx][TOP_DC_PRED] = ff_vp9_ipred_dc_top_##sz##x##sz##_##opt; \ +#define init_dir_tm_h_ipred(sz, opt) do { \ + init_dir_tm_ipred(sz, opt); \ + 
init_ipred(sz, opt, h, HOR); \ +} while (0) +#define init_dc_ipred(sz, opt) do { \ + init_ipred(sz, opt, dc, DC); \ + init_ipred(sz, opt, dc_left, LEFT_DC); \ + init_ipred(sz, opt, dc_top, TOP_DC); \ +} while (0) +#define init_all_ipred(sz, opt) do { \ + init_dc_ipred(sz, opt); \ + init_dir_tm_h_ipred(sz, opt); \ } while (0) if (EXTERNAL_MMX(cpu_flags)) { @@ -366,7 +398,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) dsp->itxfm_add[4 /* lossless */][ADST_DCT] = dsp->itxfm_add[4 /* lossless */][DCT_ADST] = dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx; - dsp->intra_pred[TX_8X8][VERT_PRED] = ff_vp9_ipred_v_8x8_mmx; + init_ipred(8, mmx, v, VERT); } if (EXTERNAL_MMXEXT(cpu_flags)) { @@ -375,12 +407,17 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) init_fpel(4, 1, 4, avg, mmxext); init_fpel(3, 1, 8, avg, mmxext); dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; + init_dc_ipred(4, mmxext); + init_dc_ipred(8, mmxext); + init_dir_tm_ipred(4, mmxext); } if (EXTERNAL_SSE(cpu_flags)) { init_fpel(2, 0, 16, put, sse); init_fpel(1, 0, 32, put, sse); init_fpel(0, 0, 64, put, sse); + init_ipred(16, sse, v, VERT); + init_ipred(32, sse, v, VERT); } if (EXTERNAL_SSE2(cpu_flags)) { @@ -405,8 +442,12 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) dsp->itxfm_add[TX_32X32][ADST_DCT] = dsp->itxfm_add[TX_32X32][DCT_ADST] = dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2; - dsp->intra_pred[TX_16X16][VERT_PRED] = ff_vp9_ipred_v_16x16_sse2; - dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_sse2; + init_dc_ipred(16, sse2); + init_dc_ipred(32, sse2); + init_dir_tm_h_ipred(8, sse2); + init_dir_tm_h_ipred(16, sse2); + init_dir_tm_h_ipred(32, sse2); + init_ipred(4, sse2, h, HOR); } if (EXTERNAL_SSSE3(cpu_flags)) { @@ -429,10 +470,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) dsp->itxfm_add[TX_32X32][DCT_ADST] = dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; 
init_lpf(ssse3); - init_dc_ipred(TX_4X4, 4, ssse3); - init_dc_ipred(TX_8X8, 8, ssse3); - init_dc_ipred(TX_16X16, 16, ssse3); - init_dc_ipred(TX_32X32, 32, ssse3); + init_all_ipred(4, ssse3); + init_all_ipred(8, ssse3); + init_all_ipred(16, ssse3); + init_all_ipred(32, ssse3); } if (EXTERNAL_AVX(cpu_flags)) { @@ -451,9 +492,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) init_fpel(1, 0, 32, put, avx); init_fpel(0, 0, 64, put, avx); init_lpf(avx); - init_ipred(TX_8X8, 8, avx); - init_ipred(TX_16X16, 16, avx); - init_ipred(TX_32X32, 32, avx); + init_dir_tm_h_ipred(8, avx); + init_dir_tm_h_ipred(16, avx); + init_dir_tm_h_ipred(32, avx); + init_ipred(32, avx, v, VERT); } if (EXTERNAL_AVX2(cpu_flags)) { @@ -465,12 +507,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) init_subpel3_32_64(1, avg, avx2); #endif } - dsp->intra_pred[TX_32X32][DC_PRED] = ff_vp9_ipred_dc_32x32_avx2; - dsp->intra_pred[TX_32X32][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_32x32_avx2; - dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_vp9_ipred_dc_top_32x32_avx2; - dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_avx2; - dsp->intra_pred[TX_32X32][HOR_PRED] = ff_vp9_ipred_h_32x32_avx2; - dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_vp9_ipred_tm_32x32_avx2; + init_dc_ipred(32, avx2); + init_ipred(32, avx2, h, HOR); + init_ipred(32, avx2, tm, TM_VP8); } #undef init_fpel diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm index 66212269a5..169676f088 100644 --- a/libavcodec/x86/vp9intrapred.asm +++ b/libavcodec/x86/vp9intrapred.asm @@ -66,11 +66,23 @@ pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 pb_2: times 32 db 2 pb_15: times 16 db 15 +pb_15x0_1xm1: times 15 db 0 + db -1 pb_0to2_5x3: db 0, 1, 2 times 5 db 3 +pb_6xm1_2x0: times 6 db -1 + times 2 db 0 +pb_6x0_2xm1: times 6 db 0 + times 2 db -1 cextern pb_1 cextern pb_3 +cextern pw_2 +cextern pw_4 +cextern pw_8 +cextern pw_16 +cextern pw_32 +cextern pw_255 cextern pw_512 cextern 
pw_1024 cextern pw_2048 @@ -80,14 +92,21 @@ SECTION .text ; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) -INIT_MMX ssse3 +%macro DC_4to8_FUNCS 0 cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a movd m0, [lq] punpckldq m0, [aq] pxor m1, m1 psadbw m0, m1 +%if cpuflag(ssse3) pmulhrsw m0, [pw_4096] pshufb m0, m1 +%else + paddw m0, [pw_4] + psraw m0, 3 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif movd [dstq+strideq*0], m0 movd [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] @@ -95,7 +114,6 @@ cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a movd [dstq+strideq*1], m0 RET -INIT_MMX ssse3 cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a movq m0, [lq] movq m1, [aq] @@ -105,8 +123,15 @@ cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a psadbw m0, m2 psadbw m1, m2 paddw m0, m1 +%if cpuflag(ssse3) pmulhrsw m0, [pw_2048] pshufb m0, m2 +%else + paddw m0, [pw_8] + psraw m0, 4 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 @@ -117,8 +142,14 @@ cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 RET +%endmacro -INIT_XMM ssse3 +INIT_MMX mmxext +DC_4to8_FUNCS +INIT_MMX ssse3 +DC_4to8_FUNCS + +%macro DC_16to32_FUNCS 0 cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a mova m0, [lq] mova m1, [aq] @@ -130,8 +161,16 @@ cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a paddw m0, m1 movhlps m1, m0 paddw m0, m1 +%if cpuflag(ssse3) pmulhrsw m0, [pw_1024] pshufb m0, m2 +%else + paddw m0, [pw_16] + psraw m0, 5 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif mov cntd, 4 .loop: mova [dstq+strideq*0], m0 @@ -143,7 +182,6 @@ cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a jg .loop RET -INIT_XMM ssse3 cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a mova m0, [lq] mova m1, [lq+16] @@ -161,8 +199,16 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a paddw m0, m2 
movhlps m1, m0 paddw m0, m1 +%if cpuflag(ssse3) pmulhrsw m0, [pw_512] pshufb m0, m4 +%else + paddw m0, [pw_32] + psraw m0, 6 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif mov cntd, 8 .loop: mova [dstq+strideq*0+ 0], m0 @@ -177,6 +223,12 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a dec cntd jg .loop RET +%endmacro + +INIT_XMM sse2 +DC_16to32_FUNCS +INIT_XMM ssse3 +DC_16to32_FUNCS %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 @@ -214,14 +266,20 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) -%macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l) -INIT_MMX ssse3 +%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l) cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a movd m0, [%2q] pxor m1, m1 psadbw m0, m1 +%if cpuflag(ssse3) pmulhrsw m0, [pw_8192] pshufb m0, m1 +%else + paddw m0, [pw_2] + psraw m0, 2 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif movd [dstq+strideq*0], m0 movd [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] @@ -229,15 +287,21 @@ cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a movd [dstq+strideq*1], m0 RET -INIT_MMX ssse3 cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a movq m0, [%2q] DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] pxor m1, m1 psadbw m0, m1 +%if cpuflag(ssse3) pmulhrsw m0, [pw_4096] pshufb m0, m1 +%else + paddw m0, [pw_4] + psraw m0, 3 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 @@ -248,8 +312,16 @@ cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 RET +%endmacro -INIT_XMM ssse3 +INIT_MMX mmxext +DC_1D_4to8_FUNCS top, a +DC_1D_4to8_FUNCS left, l +INIT_MMX ssse3 +DC_1D_4to8_FUNCS top, a +DC_1D_4to8_FUNCS left, l + +%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l) cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a 
mova m0, [%2q] DEFINE_ARGS dst, stride, stride3, cnt @@ -258,8 +330,16 @@ cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a psadbw m0, m2 movhlps m1, m0 paddw m0, m1 +%if cpuflag(ssse3) pmulhrsw m0, [pw_2048] pshufb m0, m2 +%else + paddw m0, [pw_8] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif mov cntd, 4 .loop: mova [dstq+strideq*0], m0 @@ -271,7 +351,6 @@ cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a jg .loop RET -INIT_XMM ssse3 cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a mova m0, [%2q] mova m1, [%2q+16] @@ -283,8 +362,16 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a paddw m0, m1 movhlps m1, m0 paddw m0, m1 +%if cpuflag(ssse3) pmulhrsw m0, [pw_1024] pshufb m0, m2 +%else + paddw m0, [pw_16] + psraw m0, 5 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif mov cntd, 8 .loop: mova [dstq+strideq*0+ 0], m0 @@ -299,9 +386,17 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a dec cntd jg .loop RET +%endmacro +INIT_XMM sse2 +DC_1D_16to32_FUNCS top, a +DC_1D_16to32_FUNCS left, l +INIT_XMM ssse3 +DC_1D_16to32_FUNCS top, a +DC_1D_16to32_FUNCS left, l + +%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l) %if HAVE_AVX2_EXTERNAL -INIT_YMM avx2 cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a mova m0, [%2q] DEFINE_ARGS dst, stride, stride3, cnt @@ -332,8 +427,9 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a %endif %endmacro -DC_1D_FUNCS top, a -DC_1D_FUNCS left, l +INIT_YMM avx2 +DC_1D_AVX2_FUNCS top, a +DC_1D_AVX2_FUNCS left, l ; v @@ -353,7 +449,7 @@ cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a movq [dstq+stride3q ], m0 RET -INIT_XMM sse2 +INIT_XMM sse cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a mova m0, [aq] DEFINE_ARGS dst, stride, stride3, cnt @@ -369,7 +465,7 @@ cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a jg .loop RET -INIT_XMM sse2 +INIT_XMM sse cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a 
mova m0, [aq] mova m1, [aq+16] @@ -390,8 +486,7 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a jg .loop RET -%if HAVE_AVX2_EXTERNAL -INIT_YMM avx2 +INIT_YMM avx cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a mova m0, [aq] DEFINE_ARGS dst, stride, stride3, cnt @@ -411,14 +506,20 @@ cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a dec cntd jg .loop RET -%endif ; h -INIT_XMM ssse3 +%macro H_XMM_FUNCS 2 +%if notcpuflag(avx) cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3 movd m0, [lq] +%if cpuflag(ssse3) pshufb m0, [pb_4x3_4x2_4x1_4x0] +%else + punpcklbw m0, m0 + pshuflw m0, m0, q0123 + punpcklwd m0, m0 +%endif lea stride3q, [strideq*3] movd [dstq+strideq*0], m0 psrldq m0, 4 @@ -428,18 +529,26 @@ cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3 psrldq m0, 4 movd [dstq+stride3q ], m0 RET +%endif -%macro H_XMM_FUNCS 1 -INIT_XMM %1 -cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt +cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) mova m2, [pb_8x1_8x0] mova m3, [pb_8x3_8x2] +%endif lea stride3q, [strideq*3] mov cntq, 1 .loop: movd m0, [lq+cntq*4] +%if cpuflag(ssse3) pshufb m1, m0, m3 pshufb m0, m2 +%else + punpcklbw m0, m0 + punpcklwd m0, m0 + pshufd m1, m0, q2233 + pshufd m0, m0, q0011 +%endif movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 movq [dstq+strideq*2], m0 @@ -449,22 +558,35 @@ cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt jge .loop RET -INIT_XMM %1 -cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt +cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) mova m5, [pb_1] mova m6, [pb_2] mova m7, [pb_3] pxor m4, m4 +%endif lea stride3q, [strideq*3] mov cntq, 3 .loop: movd m3, [lq+cntq*4] +%if cpuflag(ssse3) pshufb m0, m3, m7 pshufb m1, m3, m6 +%else + punpcklbw m3, m3 + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 +%endif mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 +%if 
cpuflag(ssse3) pshufb m2, m3, m5 pshufb m3, m4 +%else + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 +%endif mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] @@ -472,24 +594,37 @@ cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt jge .loop RET -INIT_XMM %1 -cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt +cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) mova m5, [pb_1] mova m6, [pb_2] mova m7, [pb_3] pxor m4, m4 +%endif lea stride3q, [strideq*3] mov cntq, 7 .loop: movd m3, [lq+cntq*4] +%if cpuflag(ssse3) pshufb m0, m3, m7 pshufb m1, m3, m6 +%else + punpcklbw m3, m3 + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 +%endif mova [dstq+strideq*0+ 0], m0 mova [dstq+strideq*0+16], m0 mova [dstq+strideq*1+ 0], m1 mova [dstq+strideq*1+16], m1 +%if cpuflag(ssse3) pshufb m2, m3, m5 pshufb m3, m4 +%else + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 +%endif mova [dstq+strideq*2+ 0], m2 mova [dstq+strideq*2+16], m2 mova [dstq+stride3q + 0], m3 @@ -500,8 +635,12 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt RET %endmacro -H_XMM_FUNCS ssse3 -H_XMM_FUNCS avx +INIT_XMM sse2 +H_XMM_FUNCS 2, 4 +INIT_XMM ssse3 +H_XMM_FUNCS 4, 8 +INIT_XMM avx +H_XMM_FUNCS 4, 8 %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 @@ -531,83 +670,124 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt ; tm -INIT_MMX ssse3 +%macro TM_MMX_FUNCS 0 cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a pxor m1, m1 - pinsrw m2, [aq-1], 0 movd m0, [aq] - DEFINE_ARGS dst, stride, l, cnt - mova m3, [pw_m256] - mova m4, [pw_m255] - pshufb m2, m3 + pinsrw m2, [aq-1], 0 punpcklbw m0, m1 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m3, [pw_m256] + mova m1, [pw_m255] + pshufb m2, m3 +%else + punpcklbw m2, m1 + pshufw m2, m2, q0000 +%endif psubw m0, m2 mov cntq, 1 .loop: pinsrw m2, [lq+cntq*2], 0 - pshufb m1, m2, m4 +%if cpuflag(ssse3) + pshufb m4, m2, m1 pshufb m2, m3 - 
paddw m1, m0 +%else + punpcklbw m2, m1 + pshufw m4, m2, q1111 + pshufw m2, m2, q0000 +%endif + paddw m4, m0 paddw m2, m0 - packuswb m1, m1 + packuswb m4, m4 packuswb m2, m2 - movd [dstq+strideq*0], m1 + movd [dstq+strideq*0], m4 movd [dstq+strideq*1], m2 lea dstq, [dstq+strideq*2] dec cntq jge .loop RET +%endmacro -%macro TM_XMM_FUNCS 1 -INIT_XMM %1 +INIT_MMX mmxext +TM_MMX_FUNCS +INIT_MMX ssse3 +TM_MMX_FUNCS + +%macro TM_XMM_FUNCS 0 cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a pxor m1, m1 - pinsrw m2, [aq-1], 0 movh m0, [aq] - DEFINE_ARGS dst, stride, l, cnt - mova m3, [pw_m256] - mova m4, [pw_m255] - pshufb m2, m3 + pinsrw m2, [aq-1], 0 punpcklbw m0, m1 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m3, [pw_m256] + mova m1, [pw_m255] + pshufb m2, m3 +%else + punpcklbw m2, m1 + punpcklwd m2, m2 + pshufd m2, m2, q0000 +%endif psubw m0, m2 mov cntq, 3 .loop: pinsrw m2, [lq+cntq*2], 0 - pshufb m1, m2, m4 +%if cpuflag(ssse3) + pshufb m4, m2, m1 pshufb m2, m3 - paddw m1, m0 +%else + punpcklbw m2, m1 + punpcklwd m2, m2 + pshufd m4, m2, q1111 + pshufd m2, m2, q0000 +%endif + paddw m4, m0 paddw m2, m0 - packuswb m1, m2 - movh [dstq+strideq*0], m1 - movhps [dstq+strideq*1], m1 + packuswb m4, m2 + movh [dstq+strideq*0], m4 + movhps [dstq+strideq*1], m4 lea dstq, [dstq+strideq*2] dec cntq jge .loop RET -INIT_XMM %1 cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a pxor m3, m3 - pinsrw m2, [aq-1], 0 mova m0, [aq] - DEFINE_ARGS dst, stride, l, cnt - mova m4, [pw_m256] - mova m5, [pw_m255] - pshufb m2, m4 + pinsrw m2, [aq-1], 0 punpckhbw m1, m0, m3 punpcklbw m0, m3 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m4, [pw_m256] + mova m3, [pw_m255] + pshufb m2, m4 +%else + punpcklbw m2, m3 + punpcklwd m2, m2 + pshufd m2, m2, q0000 +%endif psubw m1, m2 psubw m0, m2 mov cntq, 7 .loop: pinsrw m7, [lq+cntq*2], 0 - pshufb m3, m7, m5 +%if cpuflag(ssse3) + pshufb m5, m7, m3 pshufb m7, m4 - paddw m2, m3, m0 - paddw m3, m1 +%else + punpcklbw m7, m3 
+ punpcklwd m7, m7 + pshufd m5, m7, q1111 + pshufd m7, m7, q0000 +%endif + paddw m2, m5, m0 + paddw m5, m1 paddw m6, m7, m0 paddw m7, m1 - packuswb m2, m3 + packuswb m2, m5 packuswb m6, m7 mova [dstq+strideq*0], m2 mova [dstq+strideq*1], m6 @@ -617,16 +797,32 @@ cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a RET %if ARCH_X86_64 -INIT_XMM %1 -cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a +%define mem 0 +%else +%define mem 64 +%endif +cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a pxor m5, m5 pinsrw m4, [aq-1], 0 mova m0, [aq] mova m2, [aq+16] DEFINE_ARGS dst, stride, l, cnt - mova m8, [pw_m256] - mova m9, [pw_m255] - pshufb m4, m8 +%if cpuflag(ssse3) +%if ARCH_X86_64 + mova m12, [pw_m256] + mova m13, [pw_m255] +%define pw_m256_reg m12 +%define pw_m255_reg m13 +%else +%define pw_m256_reg [pw_m256] +%define pw_m255_reg [pw_m255] +%endif + pshufb m4, pw_m256_reg +%else + punpcklbw m4, m5 + punpcklwd m4, m4 + pshufd m4, m4, q0000 +%endif punpckhbw m1, m0, m5 punpckhbw m3, m2, m5 punpcklbw m0, m5 @@ -635,36 +831,72 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a psubw m0, m4 psubw m3, m4 psubw m2, m4 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 +%else + mova [rsp+0*16], m0 + mova [rsp+1*16], m1 + mova [rsp+2*16], m2 + mova [rsp+3*16], m3 +%endif mov cntq, 15 .loop: - pinsrw m13, [lq+cntq*2], 0 - pshufb m7, m13, m9 - pshufb m13, m8 - paddw m4, m7, m0 - paddw m5, m7, m1 - paddw m6, m7, m2 - paddw m7, m3 - paddw m10, m13, m0 - paddw m11, m13, m1 - paddw m12, m13, m2 - paddw m13, m3 + pinsrw m3, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m7, m3, pw_m255_reg + pshufb m3, pw_m256_reg +%else + pxor m7, m7 + punpcklbw m3, m7 + punpcklwd m3, m3 + pshufd m7, m3, q1111 + pshufd m3, m3, q0000 +%endif +%if ARCH_X86_64 + paddw m4, m7, m8 + paddw m5, m7, m9 + paddw m6, m7, m10 + paddw m7, m11 + paddw m0, m3, m8 + paddw m1, m3, m9 + paddw m2, m3, m10 + paddw m3, m11 +%else + paddw m4, m7, [rsp+0*16] + paddw m5, m7, 
[rsp+1*16] + paddw m6, m7, [rsp+2*16] + paddw m7, [rsp+3*16] + paddw m0, m3, [rsp+0*16] + paddw m1, m3, [rsp+1*16] + paddw m2, m3, [rsp+2*16] + paddw m3, [rsp+3*16] +%endif packuswb m4, m5 packuswb m6, m7 - packuswb m10, m11 - packuswb m12, m13 + packuswb m0, m1 + packuswb m2, m3 mova [dstq+strideq*0+ 0], m4 mova [dstq+strideq*0+16], m6 - mova [dstq+strideq*1+ 0], m10 - mova [dstq+strideq*1+16], m12 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m2 lea dstq, [dstq+strideq*2] dec cntq jge .loop RET -%endif +%undef pw_m256_reg +%undef pw_m255_reg +%undef mem %endmacro -TM_XMM_FUNCS ssse3 -TM_XMM_FUNCS avx +INIT_XMM sse2 +TM_XMM_FUNCS +INIT_XMM ssse3 +TM_XMM_FUNCS +INIT_XMM avx +TM_XMM_FUNCS %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 @@ -711,11 +943,20 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a pavgb m%1, m%2 %endmacro -INIT_MMX ssse3 +%macro DL_MMX_FUNCS 0 cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a movq m1, [aq] +%if cpuflag(ssse3) pshufb m0, m1, [pb_0to5_2x7] pshufb m2, m1, [pb_2to6_3x7] +%else + punpckhbw m3, m1, m1 ; 44556677 + pand m0, m1, [pb_6xm1_2x0] ; 012345__ + pand m3, [pb_6x0_2xm1] ; ______77 + psrlq m2, m1, 16 ; 234567__ + por m0, m3 ; 01234577 + por m2, m3 ; 23456777 +%endif psrlq m1, 8 LOWPASS 0, 1, 2, 3 @@ -728,15 +969,29 @@ cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a movd [dstq+strideq*0], m0 movd [dstq+strideq*2], m1 RET +%endmacro -%macro DL_XMM_FUNCS 1 -INIT_XMM %1 +INIT_MMX mmxext +DL_MMX_FUNCS +INIT_MMX ssse3 +DL_MMX_FUNCS + +%macro DL_XMM_FUNCS 0 cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a movq m0, [aq] lea stride5q, [strideq*5] +%if cpuflag(ssse3) pshufb m1, m0, [pb_1to6_10x7] +%else + punpcklbw m1, m0, m0 ; 0011223344556677 + punpckhwd m1, m1 ; 4x4,4x5,4x6,4x7 +%endif + shufps m0, m1, q3310 +%if notcpuflag(ssse3) + psrldq m1, m0, 1 + shufps m1, m0, q3210 +%endif psrldq m2, m1, 1 - shufps m0, m1, q3210 LOWPASS 0, 1, 2, 3 pshufd m1, m0, q3321 @@ -757,46 +1012,72 @@ cglobal vp9_ipred_dl_8x8, 
4, 4, 4, dst, stride, stride5, a movq [dstq+stride5q ], m1 RET -INIT_XMM %1 cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a - mova m5, [pb_1toE_2xF] mova m0, [aq] +%if cpuflag(ssse3) + mova m5, [pb_1toE_2xF] pshufb m1, m0, m5 pshufb m2, m1, m5 pshufb m4, m0, [pb_15] +%else + pand m5, m0, [pb_15x0_1xm1] ; _______________F + psrldq m1, m0, 1 ; 123456789ABCDEF_ + por m1, m5 ; 123456789ABCDEFF + psrldq m2, m1, 1 ; 23456789ABCDEFF_ + por m2, m5 ; 23456789ABCDEFFF + pshufhw m4, m1, q3333 ; xxxxxxxxFFFFFFFF +%endif LOWPASS 0, 1, 2, 3 DEFINE_ARGS dst, stride, cnt, stride9 - lea stride9q, [strideq*3] + lea stride9q, [strideq+strideq*8] mov cntd, 4 - lea stride9q, [stride9q*3] .loop: movhlps m4, m0 mova [dstq+strideq*0], m0 +%if cpuflag(ssse3) pshufb m0, m5 +%else + psrldq m0, 1 + por m0, m5 +%endif mova [dstq+strideq*8], m4 movhlps m4, m0 mova [dstq+strideq*1], m0 +%if cpuflag(ssse3) pshufb m0, m5 +%else + psrldq m0, 1 + por m0, m5 +%endif mova [dstq+stride9q ], m4 lea dstq, [dstq+strideq*2] dec cntd jg .loop RET -INIT_XMM %1 cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16 - mova m5, [pb_1toE_2xF] mova m0, [aq] mova m1, [aq+16] - palignr m2, m1, m0, 1 - palignr m3, m1, m0, 2 + PALIGNR m2, m1, m0, 1, m4 + PALIGNR m3, m1, m0, 2, m4 LOWPASS 0, 2, 3, 4 +%if cpuflag(ssse3) + mova m5, [pb_1toE_2xF] pshufb m2, m1, m5 pshufb m3, m2, m5 pshufb m6, m1, [pb_15] - LOWPASS 1, 2, 3, 4 mova m7, m6 +%else + pand m5, m1, [pb_15x0_1xm1] ; _______________F + psrldq m2, m1, 1 ; 123456789ABCDEF_ + por m2, m5 ; 123456789ABCDEFF + psrldq m3, m2, 1 ; 23456789ABCDEFF_ + por m3, m5 ; 23456789ABCDEFFF + pshufhw m7, m2, q3333 ; xxxxxxxxFFFFFFFF + pshufd m6, m7, q3333 +%endif + LOWPASS 1, 2, 3, 4 lea dst16q, [dstq +strideq*8] mov cntd, 8 lea dst16q, [dst16q+strideq*8] @@ -814,10 +1095,17 @@ cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16 %if cpuflag(avx) vpalignr m0, m1, m0, 1 pshufb m1, m5 -%else +%elif cpuflag(ssse3) palignr m2, m1, m0, 1 pshufb m1, m5 mova m0, 
m2 +%else + mova m4, m1 + psrldq m0, 1 + pslldq m4, 15 + psrldq m1, 1 + por m0, m4 + por m1, m5 %endif add dstq, strideq add dst16q, strideq @@ -826,19 +1114,23 @@ cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16 RET %endmacro -DL_XMM_FUNCS ssse3 -DL_XMM_FUNCS avx +INIT_XMM sse2 +DL_XMM_FUNCS +INIT_XMM ssse3 +DL_XMM_FUNCS +INIT_XMM avx +DL_XMM_FUNCS ; dr -INIT_MMX ssse3 +%macro DR_MMX_FUNCS 0 cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a movd m0, [lq] punpckldq m0, [aq-1] movd m1, [aq+3] DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] - palignr m1, m0, 1 + PALIGNR m1, m0, 1, m3 psrlq m2, m1, 8 LOWPASS 0, 1, 2, 3 @@ -850,9 +1142,14 @@ cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a psrlq m0, 8 movd [dstq+strideq*0], m0 RET +%endmacro -%macro DR_XMM_FUNCS 1 -INIT_XMM %1 +INIT_MMX mmxext +DR_MMX_FUNCS +INIT_MMX ssse3 +DR_MMX_FUNCS + +%macro DR_XMM_FUNCS 0 cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a movq m1, [lq] movhps m1, [aq-1] @@ -860,7 +1157,7 @@ cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] pslldq m0, m1, 1 - palignr m2, m1, 1 + PALIGNR m2, m1, 1, m3 LOWPASS 0, 1, 2, 3 movhps [dstq+strideq*0], m0 @@ -881,7 +1178,6 @@ cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a movhps [dstq+stride3q ], m0 RET -INIT_XMM %1 cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a mova m1, [lq] movu m2, [aq-1] @@ -890,30 +1186,29 @@ cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a lea stride9q, [strideq *3] mov cntd, 4 lea stride9q, [stride9q*3] - palignr m4, m2, 1 - palignr m3, m2, m1, 15 + PALIGNR m4, m2, 1, m5 + PALIGNR m3, m2, m1, 15, m5 LOWPASS 3, 2, 4, 5 pslldq m0, m1, 1 - palignr m2, m1, 1 + PALIGNR m2, m1, 1, m4 LOWPASS 0, 1, 2, 4 .loop: mova [dstq+strideq*0 ], m3 movhps [dstq+strideq*8+0], m0 movq [dstq+strideq*8+8], m3 - palignr m3, m0, 15 + PALIGNR m3, m0, 15, m1 pslldq m0, 1 mova [dstq+strideq*1 ], m3 movhps [dstq+stride9q +0], m0 movq 
[dstq+stride9q +8], m3 - palignr m3, m0, 15 + PALIGNR m3, m0, 15, m1 pslldq m0, 1 lea dstq, [dstq+strideq*2] dec cntd jg .loop RET -INIT_XMM %1 cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a mova m1, [lq] mova m2, [lq+16] @@ -922,16 +1217,16 @@ cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a movd m5, [aq+31] DEFINE_ARGS dst, stride, stride8, cnt lea stride8q, [strideq*8] - palignr m5, m4, 1 - palignr m6, m4, m3, 15 + PALIGNR m5, m4, 1, m7 + PALIGNR m6, m4, m3, 15, m7 LOWPASS 5, 4, 6, 7 - palignr m4, m3, 1 - palignr m6, m3, m2, 15 + PALIGNR m4, m3, 1, m7 + PALIGNR m6, m3, m2, 15, m7 LOWPASS 4, 3, 6, 7 - palignr m3, m2, 1 - palignr m6, m2, m1, 15 + PALIGNR m3, m2, 1, m7 + PALIGNR m6, m2, m1, 15, m7 LOWPASS 3, 2, 6, 7 - palignr m2, m1, 1 + PALIGNR m2, m1, 1, m6 pslldq m0, m1, 1 LOWPASS 2, 1, 0, 6 mov cntd, 16 @@ -942,9 +1237,9 @@ cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a mova [dstq+stride8q*0+16], m5 mova [dstq+stride8q*2+ 0], m3 mova [dstq+stride8q*2+16], m4 - palignr m5, m4, 15 - palignr m4, m3, 15 - palignr m3, m2, 15 + PALIGNR m5, m4, 15, m6 + PALIGNR m4, m3, 15, m6 + PALIGNR m3, m2, 15, m6 pslldq m2, 1 add dstq, strideq dec cntd @@ -952,12 +1247,16 @@ cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a RET %endmacro -DR_XMM_FUNCS ssse3 -DR_XMM_FUNCS avx +INIT_XMM sse2 +DR_XMM_FUNCS +INIT_XMM ssse3 +DR_XMM_FUNCS +INIT_XMM avx +DR_XMM_FUNCS ; vl -INIT_MMX ssse3 +INIT_MMX mmxext cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a movq m0, [aq] psrlq m1, m0, 8 @@ -973,11 +1272,16 @@ cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a movd [dstq+strideq*1], m2 RET -%macro VL_XMM_FUNCS 1 -INIT_XMM %1 +%macro VL_XMM_FUNCS 0 cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a movq m0, [aq] +%if cpuflag(ssse3) pshufb m0, [pb_0to6_9x7] +%else + punpcklbw m1, m0, m0 + punpckhwd m1, m1 + shufps m0, m1, q3310 +%endif DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] psrldq m1, m0, 1 @@ -1002,48 +1306,82 @@ cglobal vp9_ipred_vl_8x8, 
4, 4, 4, dst, stride, l, a movq [dstq+stride3q ], m2 RET -INIT_XMM %1 cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a mova m0, [aq] - mova m4, [pb_1toE_2xF] DEFINE_ARGS dst, stride, stride3, cnt lea stride3q, [strideq*3] +%if cpuflag(ssse3) + mova m4, [pb_1toE_2xF] pshufb m1, m0, m4 pshufb m2, m1, m4 +%else + pand m4, m0, [pb_15x0_1xm1] ; _______________F + psrldq m1, m0, 1 ; 123456789ABCDEF_ + por m1, m4 ; 123456789ABCDEFF + psrldq m2, m1, 1 ; 23456789ABCDEFF_ + por m2, m4 ; 23456789ABCDEFFF +%endif LOWPASS 2, 1, 0, 3 pavgb m1, m0 mov cntd, 4 .loop: mova [dstq+strideq*0], m1 mova [dstq+strideq*1], m2 +%if cpuflag(ssse3) pshufb m1, m4 pshufb m2, m4 +%else + psrldq m1, 1 + psrldq m2, 1 + por m1, m4 + por m2, m4 +%endif mova [dstq+strideq*2], m1 mova [dstq+stride3q ], m2 +%if cpuflag(ssse3) pshufb m1, m4 pshufb m2, m4 +%else + psrldq m1, 1 + psrldq m2, 1 + por m1, m4 + por m2, m4 +%endif lea dstq, [dstq+strideq*4] dec cntd jg .loop RET -INIT_XMM %1 cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a mova m0, [aq] mova m5, [aq+16] - mova m4, [pb_1toE_2xF] DEFINE_ARGS dst, stride, dst16, cnt - palignr m2, m5, m0, 1 - palignr m3, m5, m0, 2 + PALIGNR m2, m5, m0, 1, m4 + PALIGNR m3, m5, m0, 2, m4 lea dst16q, [dstq +strideq*8] LOWPASS 3, 2, 0, 6 pavgb m2, m0 +%if cpuflag(ssse3) + mova m4, [pb_1toE_2xF] pshufb m0, m5, m4 pshufb m1, m0, m4 +%else + pand m4, m5, [pb_15x0_1xm1] ; _______________F + psrldq m0, m5, 1 ; 123456789ABCDEF_ + por m0, m4 ; 123456789ABCDEFF + psrldq m1, m0, 1 ; 23456789ABCDEFF_ + por m1, m4 ; 23456789ABCDEFFF +%endif lea dst16q, [dst16q+strideq*8] LOWPASS 1, 0, 5, 6 pavgb m0, m5 +%if cpuflag(ssse3) pshufb m5, [pb_15] +%else + punpckhbw m5, m4, m4 + pshufhw m5, m5, q3333 + punpckhqdq m5, m5 +%endif mov cntd, 8 .loop: @@ -1056,10 +1394,16 @@ cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a %if cpuflag(avx) palignr %2, %3, %2, 1 pshufb %3, m4 -%else +%elif cpuflag(ssse3) palignr m6, %3, %2, 1 pshufb %3, m4 mova %2, m6 +%else + pslldq m6, 
%3, 15 + psrldq %3, 1 + psrldq %2, 1 + por %3, m4 + por %2, m6 %endif %endmacro @@ -1072,12 +1416,16 @@ cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a RET %endmacro -VL_XMM_FUNCS ssse3 -VL_XMM_FUNCS avx +INIT_XMM sse2 +VL_XMM_FUNCS +INIT_XMM ssse3 +VL_XMM_FUNCS +INIT_XMM avx +VL_XMM_FUNCS ; vr -INIT_MMX ssse3 +%macro VR_MMX_FUNCS 0 cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a movq m1, [aq-1] punpckldq m2, [lq] @@ -1085,7 +1433,7 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] pavgb m0, m1 - palignr m1, m2, 5 + PALIGNR m1, m2, 5, m3 psrlq m2, m1, 8 psllq m3, m1, 8 LOWPASS 2, 1, 3, 4 @@ -1095,6 +1443,7 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a ; IABC | m0 contains ABCDxxxx ; JEFG | m2 contains xJIEFGHx +%if cpuflag(ssse3) punpckldq m0, m2 pshufb m2, [pb_13456_3xm1] movd [dstq+strideq*0], m0 @@ -1103,10 +1452,26 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a psrlq m2, 8 movd [dstq+strideq*2], m0 movd [dstq+strideq*1], m2 +%else + psllq m1, m2, 40 + psrlq m2, 24 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m2 + PALIGNR m0, m1, 7, m3 + psllq m1, 8 + PALIGNR m2, m1, 7, m3 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m2 +%endif RET +%endmacro -%macro VR_XMM_FUNCS 1 -INIT_XMM %1 +INIT_MMX mmxext +VR_MMX_FUNCS +INIT_MMX ssse3 +VR_MMX_FUNCS + +%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16 cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a movu m1, [aq-1] movhps m2, [lq] @@ -1114,7 +1479,7 @@ cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] pavgb m0, m1 - palignr m1, m2, 9 + PALIGNR m1, m2, 9, m3 pslldq m2, m1, 1 pslldq m3, m1, 2 LOWPASS 1, 2, 3, 4 @@ -1128,83 +1493,118 @@ cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a ; USQABCDE ; VTRIJKLM +%if cpuflag(ssse3) punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ +%endif movq [dstq+strideq*0], m0 - pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG movhps 
[dstq+strideq*1], m1 - pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO +%if cpuflag(ssse3) + pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG + pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO +%else + psrlw m2, m1, 8 ; x_U_S_Q_xxxxxxxx + pand m3, m1, [pw_255] ; x_V_T_R_xxxxxxxx + packuswb m3, m2 ; xVTRxxxxxUSQxxxx + pslldq m3, 4 ; xxxxxVTRxxxxxUSQ + PALIGNR m0, m3, 7, m4 ; xxxxxxUSQABCDEFG + psrldq m1, 8 + pslldq m3, 8 + PALIGNR m1, m3, 7, m4 ; xxxxxxVTRIJKLMNO +%endif movhps [dstq+strideq*2], m0 - pslldq m0, 1 movhps [dstq+stride3q ], m1 lea dstq, [dstq+strideq*4] + pslldq m0, 1 pslldq m1, 1 movhps [dstq+strideq*0], m0 - pslldq m0, 1 movhps [dstq+strideq*1], m1 + pslldq m0, 1 pslldq m1, 1 movhps [dstq+strideq*2], m0 movhps [dstq+stride3q ], m1 RET -INIT_XMM %1 -cglobal vp9_ipred_vr_16x16, 4, 4, 6, dst, stride, l, a +cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a mova m0, [aq] movu m1, [aq-1] mova m2, [lq] DEFINE_ARGS dst, stride, stride3, cnt lea stride3q, [strideq*3] - palignr m3, m1, m2, 15 + PALIGNR m3, m1, m2, 15, m6 LOWPASS 3, 1, 0, 4 pavgb m0, m1 - palignr m1, m2, 1 + PALIGNR m1, m2, 1, m6 pslldq m4, m2, 1 LOWPASS 1, 2, 4, 5 +%if cpuflag(ssse3) pshufb m1, [pb_02468ACE_13579BDF] +%else + psrlw m5, m1, 8 + pand m1, [pw_255] + packuswb m1, m5 +%endif mov cntd, 4 .loop: movlhps m2, m1 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m3 - palignr m4, m0, m1, 15 - palignr m5, m3, m2, 15 + PALIGNR m4, m0, m1, 15, m6 + PALIGNR m5, m3, m2, 15, m6 mova [dstq+strideq*2], m4 mova [dstq+stride3q ], m5 lea dstq, [dstq+strideq*4] - palignr m0, m1, 14 - palignr m3, m2, 14 + PALIGNR m0, m1, 14, m6 + PALIGNR m3, m2, 14, m6 pslldq m1, 2 dec cntd jg .loop RET -%if ARCH_X86_64 -INIT_XMM %1 cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a mova m0, [aq] mova m2, [aq+16] movu m1, [aq-1] - palignr m3, m2, m0, 15 - palignr m4, m2, m0, 14 + PALIGNR m3, m2, m0, 15, m6 + PALIGNR m4, m2, m0, 14, m6 LOWPASS 4, 3, 2, 5 pavgb m3, m2 mova m2, [lq+16] - palignr m5, m1, m2, 15 
+ PALIGNR m5, m1, m2, 15, m6 LOWPASS 5, 1, 0, 6 pavgb m0, m1 mova m6, [lq] - palignr m1, m2, 1 - palignr m7, m2, m6, 15 - LOWPASS 1, 2, 7, 8 - palignr m2, m6, 1 +%if ARCH_X86_64 + SWAP 0, 8 +%else + mova [dstq], m0 +%endif + PALIGNR m1, m2, 1, m0 + PALIGNR m7, m2, m6, 15, m0 + LOWPASS 1, 2, 7, 0 + PALIGNR m2, m6, 1, m0 pslldq m7, m6, 1 - LOWPASS 2, 6, 7, 8 + LOWPASS 2, 6, 7, 0 +%if cpuflag(ssse3) pshufb m1, [pb_02468ACE_13579BDF] pshufb m2, [pb_02468ACE_13579BDF] +%else + psrlw m0, m1, 8 + psrlw m6, m2, 8 + pand m1, [pw_255] + pand m2, [pw_255] + packuswb m1, m0 + packuswb m2, m6 +%endif DEFINE_ARGS dst, stride, dst16, cnt lea dst16q, [dstq +strideq*8] lea dst16q, [dst16q+strideq*8] SBUTTERFLY qdq, 2, 1, 6 +%if ARCH_X86_64 + SWAP 0, 8 +%else + mova m0, [dstq] +%endif mov cntd, 8 .loop: @@ -1216,8 +1616,8 @@ cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a movhps [dst16q+stride%1 ], %2 movu [dst16q+stride%1+ 8], %3 movq [dst16q+stride%1+24], %4 - palignr %4, %3, 15 - palignr %3, %2, 15 + PALIGNR %4, %3, 15, m6 + PALIGNR %3, %2, 15, m6 pslldq %2, 1 %endmacro @@ -1228,15 +1628,18 @@ cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a dec cntd jg .loop RET -%endif %endmacro -VR_XMM_FUNCS ssse3 -VR_XMM_FUNCS avx +INIT_XMM sse2 +VR_XMM_FUNCS 7 +INIT_XMM ssse3 +VR_XMM_FUNCS 6 +INIT_XMM avx +VR_XMM_FUNCS 6 ; hd -INIT_MMX ssse3 +INIT_MMX mmxext cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a movd m0, [lq] punpckldq m0, [aq-1] @@ -1266,9 +1669,8 @@ cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a movd [dstq+strideq*0], m0 RET -%macro HD_XMM_FUNCS 1 -INIT_XMM %1 -cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a +%macro HD_XMM_FUNCS 0 +cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a movq m0, [lq] movhps m0, [aq-1] DEFINE_ARGS dst, stride, stride3, dst4 @@ -1296,18 +1698,17 @@ cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a movhps [dstq +stride3q ], m1 movq [dst4q+stride3q ], m1 - palignr m3, m2, m1, 2 + PALIGNR m3, m2, m1, 2, m4 movhps [dstq 
+strideq*2], m3 movq [dst4q+strideq*2], m3 - palignr m3, m2, m1, 4 + PALIGNR m3, m2, m1, 4, m4 movhps [dstq +strideq*1], m3 movq [dst4q+strideq*1], m3 - palignr m2, m1, 6 + PALIGNR m2, m1, 6, m4 movhps [dstq +strideq*0], m2 movq [dst4q+strideq*0], m2 RET -INIT_XMM %1 cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a mova m0, [lq] movu m3, [aq-1] @@ -1319,8 +1720,8 @@ cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a psrldq m4, m3, 1 psrldq m5, m3, 2 LOWPASS 5, 4, 3, 6 - palignr m1, m3, m0, 1 - palignr m2, m3, m0, 2 + PALIGNR m1, m3, m0, 1, m6 + PALIGNR m2, m3, m0, 2, m6 LOWPASS 2, 1, 0, 6 pavgb m1, m0 SBUTTERFLY bw, 1, 2, 6 @@ -1338,17 +1739,26 @@ cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a %if cpuflag(avx) palignr m1, m2, m1, 2 palignr m2, m5, m2, 2 -%else +%elif cpuflag(ssse3) palignr m3, m2, m1, 2 palignr m0, m5, m2, 2 mova m1, m3 mova m2, m0 +%else + ; slightly modified version of PALIGNR + mova m6, m2 + mova m4, m5 + pslldq m6, 14 + pslldq m4, 14 + psrldq m1, 2 + psrldq m2, 2 + por m1, m6 + por m2, m4 %endif psrldq m5, 2 jg .loop RET -INIT_XMM %1 cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a mova m0, [lq] mova m1, [lq+16] @@ -1362,15 +1772,15 @@ cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a psrldq m4, m3, 1 psrldq m5, m3, 2 LOWPASS 5, 4, 3, 6 - palignr m4, m3, m2, 2 - palignr m3, m2, 1 + PALIGNR m4, m3, m2, 2, m6 + PALIGNR m3, m2, 1, m6 LOWPASS 4, 3, 2, 6 - palignr m3, m2, m1, 2 - palignr m2, m1, 1 + PALIGNR m3, m2, m1, 2, m6 + PALIGNR m2, m1, 1, m6 LOWPASS 3, 2, 1, 6 pavgb m2, m1 - palignr m6, m1, m0, 1 - palignr m1, m0, 2 + PALIGNR m6, m1, m0, 1, m7 + PALIGNR m1, m0, 2, m7 LOWPASS 1, 6, 0, 7 pavgb m0, m6 SBUTTERFLY bw, 2, 3, 6 @@ -1394,7 +1804,7 @@ cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a palignr m3, m4, m3, 2 palignr m4, m5, m4, 2 psrldq m5, 2 -%else +%elif cpuflag(ssse3) psrldq m6, m5, 2 palignr m5, m4, 2 palignr m4, m3, 2 @@ -1407,18 +1817,46 @@ cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a 
mova m3, m4 mova m4, m5 mova m5, m6 +%else + ; sort of a half-integrated version of PALIGNR + pslldq m7, m4, 14 + pslldq m6, m5, 14 + psrldq m4, 2 + psrldq m5, 2 + por m4, m6 + pslldq m6, m3, 14 + psrldq m3, 2 + por m3, m7 + pslldq m7, m2, 14 + psrldq m2, 2 + por m2, m6 + pslldq m6, m1, 14 + psrldq m1, 2 + por m1, m7 + psrldq m0, 2 + por m0, m6 %endif jg .loop RET %endmacro -HD_XMM_FUNCS ssse3 -HD_XMM_FUNCS avx +INIT_XMM sse2 +HD_XMM_FUNCS +INIT_XMM ssse3 +HD_XMM_FUNCS +INIT_XMM avx +HD_XMM_FUNCS -INIT_MMX ssse3 +%macro HU_MMX_FUNCS 0 cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l movd m0, [lq] +%if cpuflag(ssse3) pshufb m0, [pb_0to2_5x3] +%else + punpcklbw m1, m0, m0 ; 00112233 + pshufw m1, m1, q3333 ; 33333333 + punpckldq m0, m1 ; 01233333 +%endif psrlq m1, m0, 8 psrlq m2, m1, 8 LOWPASS 2, 1, 0, 3 @@ -1426,7 +1864,7 @@ cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] SBUTTERFLY bw, 1, 2, 0 - palignr m2, m1, 2 + PALIGNR m2, m1, 2, m0 movd [dstq+strideq*0], m1 movd [dstq+strideq*1], m2 punpckhdq m1, m1 @@ -1434,12 +1872,23 @@ cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l movd [dstq+strideq*2], m1 movd [dstq+stride3q ], m2 RET +%endmacro -%macro HU_XMM_FUNCS 1 -INIT_XMM %1 +INIT_MMX mmxext +HU_MMX_FUNCS +INIT_MMX ssse3 +HU_MMX_FUNCS + +%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32 cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l movq m0, [lq] +%if cpuflag(ssse3) pshufb m0, [pb_0to6_9x7] +%else + punpcklbw m1, m0, m0 ; 0011223344556677 + punpckhwd m1, m1 ; 4444555566667777 + shufps m0, m1, q3310 ; 0123456777777777 +%endif psrldq m1, m0, 1 psrldq m2, m1, 1 LOWPASS 2, 1, 0, 3 @@ -1450,56 +1899,81 @@ cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l SBUTTERFLY bw, 1, 2, 0 movq [dstq +strideq*0], m1 movhps [dst4q+strideq*0], m1 - palignr m0, m2, m1, 2 + PALIGNR m0, m2, m1, 2, m3 movq [dstq +strideq*1], m0 movhps [dst4q+strideq*1], m0 - palignr m0, m2, m1, 4 + PALIGNR m0, m2, m1, 4, m3 movq [dstq 
+strideq*2], m0 movhps [dst4q+strideq*2], m0 - palignr m2, m1, 6 + PALIGNR m2, m1, 6, m3 movq [dstq +stride3q ], m2 movhps [dst4q+stride3q ], m2 RET -INIT_XMM %1 cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l mova m0, [lq] +%if cpuflag(ssse3) mova m3, [pb_2toE_3xF] pshufb m1, m0, [pb_1toE_2xF] pshufb m2, m0, m3 +%else + pand m3, m0, [pb_15x0_1xm1] + psrldq m1, m0, 1 + por m1, m3 + punpckhbw m3, m3 + psrldq m2, m0, 2 + por m2, m3 +%endif LOWPASS 2, 1, 0, 4 pavgb m1, m0 DEFINE_ARGS dst, stride, stride9, cnt - lea stride9q, [strideq *3] + lea stride9q, [strideq*8+strideq] mov cntd, 4 - lea stride9q, [stride9q*3] SBUTTERFLY bw, 1, 2, 0 .loop: mova [dstq+strideq*0], m1 mova [dstq+strideq*8], m2 - palignr m0, m2, m1, 2 + PALIGNR m0, m2, m1, 2, m4 +%if cpuflag(ssse3) pshufb m2, m3 +%else + psrldq m2, 2 + por m2, m3 +%endif mova [dstq+strideq*1], m0 mova [dstq+stride9q ], m2 - palignr m1, m2, m0, 2 + PALIGNR m1, m2, m0, 2, m4 +%if cpuflag(ssse3) pshufb m2, m3 +%else + psrldq m2, 2 + por m2, m3 +%endif lea dstq, [dstq+strideq*2] dec cntd jg .loop RET -INIT_XMM %1 -cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l +cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l mova m1, [lq] mova m0, [lq+16] - mova m4, [pb_2toE_3xF] - palignr m2, m0, m1, 1 - palignr m3, m0, m1, 2 + PALIGNR m2, m0, m1, 1, m5 + PALIGNR m3, m0, m1, 2, m5 LOWPASS 3, 2, 1, 5 pavgb m2, m1 - pshufb m1, m0, m4 +%if cpuflag(ssse3) + mova m4, [pb_2toE_3xF] pshufb m5, m0, [pb_1toE_2xF] + pshufb m1, m0, m4 +%else + pand m4, m0, [pb_15x0_1xm1] + psrldq m5, m0, 1 + por m5, m4 + punpckhbw m4, m4 + psrldq m1, m0, 2 + por m1, m4 +%endif LOWPASS 1, 5, 0, 6 pavgb m0, m5 DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24 @@ -1510,7 +1984,12 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l lea dst24q, [dst16q+strideq*8] SBUTTERFLY bw, 0, 1, 5 SBUTTERFLY bw, 2, 3, 5 +%if cpuflag(ssse3) pshufb m6, m1, [pb_15] +%else + pshufhw m6, m4, q3333 + punpckhqdq m6, m6 +%endif .loop: mova [dstq +stride0q+ 0], m2 @@ 
-1526,7 +2005,7 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l palignr m3, m0, m3, 2 palignr m0, m1, m0, 2 pshufb m1, m4 -%else +%elif cpuflag(ssse3) pshufb m5, m1, m4 palignr m1, m0, 2 palignr m0, m3, 2 @@ -1535,6 +2014,19 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l mova m3, m0 mova m0, m1 mova m1, m5 +%else + ; half-integrated version of PALIGNR + pslldq m5, m1, 14 + pslldq m7, m0, 14 + psrldq m1, 2 + psrldq m0, 2 + por m1, m4 + por m0, m5 + pslldq m5, m3, 14 + psrldq m3, 2 + por m3, m7 + psrldq m2, 2 + por m2, m5 %endif add stride0q, strideq dec cntd @@ -1542,7 +2034,11 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l RET %endmacro -HU_XMM_FUNCS ssse3 -HU_XMM_FUNCS avx +INIT_XMM sse2 +HU_XMM_FUNCS 8 +INIT_XMM ssse3 +HU_XMM_FUNCS 7 +INIT_XMM avx +HU_XMM_FUNCS 7 ; FIXME 127, 128, 129 ?