# Patch summary (annotation only: this text sits ahead of the first
# "diff --git" header and is not part of any hunk).
#
# libavcodec/x86/vp8dsp-init.c
#   * Declares three external routines that share one 7-argument prototype
#     (dst, dststride, src, srcstride, height, mx, my):
#     ff_put_vp8_pixels8_mmx, ff_put_vp8_pixels16_mmx, ff_put_vp8_pixels16_sse.
#   * In ff_vp8dsp_init_x86(), when mm_flags has FF_MM_MMX set, entry [0][0][0]
#     of both put_vp8_epel_pixels_tab and put_vp8_bilinear_pixels_tab is set
#     to ff_put_vp8_pixels16_mmx, and entry [1][0][0] of both tables is set to
#     ff_put_vp8_pixels8_mmx.
#   * A new FF_MM_SSE check, placed later in the same function, assigns
#     ff_put_vp8_pixels16_sse to entry [0][0][0] of both tables, so it
#     replaces the MMX pointer whenever both flags are set.
#
# libavcodec/x86/vp8dsp.asm
#   * Adds put_vp8_pixels8_mmx, put_vp8_pixels16_mmx and put_vp8_pixels16_sse.
#     Each loop iteration reads two rows at [r2] and [r2+r3], writes them to
#     [r0] and [r0+r1], advances r2 by 2*r3 and r0 by 2*r1, then subtracts 2
#     from r4d and repeats while the result is still > 0 (jg).
#     Bytes copied per row: 8 (one movq), 16 (two movq), 16 (one xmm move).
#   * Presumably r0..r4 carry dst, dststride, src, srcstride and height in
#     prototype order, with mx/my left unread -- confirm against the cglobal
#     macro definition.
#   * Rows are handled in pairs, so an odd count in r4d copies one row more
#     than requested; callers presumably pass even heights -- TODO confirm.
#   * The SSE routine loads with movups but stores with movaps, so every
#     destination row address must be 16-byte aligned.
#   * The MMX routines do not execute emms themselves.
#
# NOTE(review): column alignment inside the hunks is approximate. If a context
# line fails to match, apply with whitespace tolerance
# (e.g. git apply --ignore-whitespace).
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 471a6b7012..308651d80a 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -87,6 +87,16 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, int dststride,
                                            uint8_t *src, int srcstride,
                                            int height, int mx, int my);
 
+extern void ff_put_vp8_pixels8_mmx (uint8_t *dst, int dststride,
+                                    uint8_t *src, int srcstride,
+                                    int height, int mx, int my);
+extern void ff_put_vp8_pixels16_mmx(uint8_t *dst, int dststride,
+                                    uint8_t *src, int srcstride,
+                                    int height, int mx, int my);
+extern void ff_put_vp8_pixels16_sse(uint8_t *dst, int dststride,
+                                    uint8_t *src, int srcstride,
+                                    int height, int mx, int my);
+
 #define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
 static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
     uint8_t *dst,  int dststride, uint8_t *src, \
@@ -218,6 +228,10 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 #if HAVE_YASM
     if (mm_flags & FF_MM_MMX) {
         c->vp8_idct_dc_add                      = ff_vp8_idct_dc_add_mmx;
+        c->put_vp8_epel_pixels_tab[0][0][0]     =
+        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
+        c->put_vp8_epel_pixels_tab[1][0][0]     =
+        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
     }
 
     /* note that 4-tap width=16 functions are missing because w=16
@@ -231,6 +245,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         VP8_BILINEAR_MC_FUNC(1, 4, mmxext);
     }
 
+    if (mm_flags & FF_MM_SSE) {
+        c->put_vp8_epel_pixels_tab[0][0][0]     =
+        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
+    }
+
     if (mm_flags & FF_MM_SSE2) {
         VP8_LUMA_MC_FUNC(0, 16, sse2);
         VP8_MC_FUNC(1, 8, sse2);
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index dbaac86ad3..f70d0117ef 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -810,6 +810,46 @@ cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
     jg .nextrow
     REP_RET
 
+cglobal put_vp8_pixels8_mmx, 5,5
+.nextrow:
+    movq  mm0, [r2+r3*0]
+    movq  mm1, [r2+r3*1]
+    lea    r2, [r2+r3*2]
+    movq [r0+r1*0], mm0
+    movq [r0+r1*1], mm1
+    lea    r0, [r0+r1*2]
+    sub   r4d, 2
+    jg .nextrow
+    REP_RET
+
+cglobal put_vp8_pixels16_mmx, 5,5
+.nextrow:
+    movq  mm0, [r2+r3*0+0]
+    movq  mm1, [r2+r3*0+8]
+    movq  mm2, [r2+r3*1+0]
+    movq  mm3, [r2+r3*1+8]
+    lea    r2, [r2+r3*2]
+    movq [r0+r1*0+0], mm0
+    movq [r0+r1*0+8], mm1
+    movq [r0+r1*1+0], mm2
+    movq [r0+r1*1+8], mm3
+    lea    r0, [r0+r1*2]
+    sub   r4d, 2
+    jg .nextrow
+    REP_RET
+
+cglobal put_vp8_pixels16_sse, 5,5,2
+.nextrow:
+    movups xmm0, [r2+r3*0]
+    movups xmm1, [r2+r3*1]
+    lea     r2, [r2+r3*2]
+    movaps [r0+r1*0], xmm0
+    movaps [r0+r1*1], xmm1
+    lea     r0, [r0+r1*2]
+    sub    r4d, 2
+    jg .nextrow
+    REP_RET
+
 ;-----------------------------------------------------------------------------
 ; IDCT functions:
 ;