diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm index ad15af9a27..1ac02574d6 100644 --- a/libavcodec/x86/videodsp.asm +++ b/libavcodec/x86/videodsp.asm @@ -92,13 +92,21 @@ INIT_XMM sse vvar_fn %macro hvar_fn 0 -cglobal emu_edge_hvar, 5, 6, 2, dst, dst_stride, start_x, n_words, h, w +cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w lea dstq, [dstq+n_wordsq*2] neg n_wordsq lea start_xq, [start_xq+n_wordsq*2] .y_loop: ; do { + ; FIXME also write a ssse3 version using pshufb + movzx wd, byte [dstq+start_xq] ; w = read(1) + imul wd, 0x01010101 ; w *= 0x01010101 + movd m0, wd mov wq, n_wordsq ; initialize w - SPLATB_LOAD m0, dstq+start_xq, m1 ; read(1); splat +%if cpuflag(sse2) + pshufd m0, m0, q0000 ; splat +%else ; mmx + punpckldq m0, m0 ; splat +%endif ; mmx/sse .x_loop: ; do { movu [dstq+wq*2], m0 ; write($reg, $mmsize) add wq, mmsize/2 ; w -= $mmsize/2 @@ -114,8 +122,6 @@ cglobal emu_edge_hvar, 5, 6, 2, dst, dst_stride, start_x, n_words, h, w %if ARCH_X86_32 INIT_MMX mmx hvar_fn -INIT_MMX mmxext -hvar_fn %endif INIT_XMM sse2 @@ -338,12 +344,16 @@ VERTICAL_EXTEND 16, 22 ; obviously not the same on both sides. 
%macro READ_V_PIXEL 2 -%if notcpuflag(mmxext) && %1 < 8 - movzx vald, byte [%2] + movzx vald, byte %2 imul vald, 0x01010101 +%if %1 >= 8 + movd m0, vald +%if mmsize == 16 + pshufd m0, m0, q0000 %else - SPLATB_LOAD m0, %2, m1 -%endif ; %1 < 8 + punpckldq m0, m0 +%endif ; mmsize == 16 +%endif ; %1 >= 8 %endmacro ; READ_V_PIXEL %macro WRITE_V_PIXEL 2 @@ -378,42 +388,26 @@ VERTICAL_EXTEND 16, 22 %endif %endif ; %1-%%off >= 4 -%if %1-%%off == 2 - movd [%2+%%off-2], m0 -%endif ; (%1-%%off)/2 - %else ; %1 < 8 -%if cpuflag(mmxext) - movd [%2+%%off], m0 -%if %1 == 6 - movd [%2+%%off+2], m0 -%endif ; (%1-%%off)/2 - -%else ; notcpuflag(mmxext) - %rep %1/4 mov [%2+%%off], vald %assign %%off %%off+4 %endrep ; %1/4 +%endif ; %1 >=/< 8 + %if %1-%%off == 2 mov [%2+%%off], valw %endif ; (%1-%%off)/2 -%endif ; cpuflag -%endif ; %1 >=/< 8 %endmacro ; WRITE_V_PIXEL %macro H_EXTEND 2 %assign %%n %1 %rep 1+(%2-%1)/2 -%if %%n < 8 && notcpuflag(mmxext) -cglobal emu_edge_hfix %+ %%n, 4, 5, 2, dst, dst_stride, start_x, bh, val -%else -cglobal emu_edge_hfix %+ %%n, 4, 4, 2, dst, dst_stride, start_x, bh -%endif +cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val .loop_y: ; do { - READ_V_PIXEL %%n, dstq+start_xq ; $variable_regs = read($n) + READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) add dstq, dst_strideq ; dst += dst_stride dec bhq ; } while (--bh) @@ -424,16 +418,11 @@ cglobal emu_edge_hfix %+ %%n, 4, 4, 2, dst, dst_stride, start_x, bh %endmacro ; H_EXTEND INIT_MMX mmx -H_EXTEND 2, 2 -%if ARCH_X86_32 -H_EXTEND 4, 22 -%endif - -INIT_MMX mmxext -H_EXTEND 4, 14 +H_EXTEND 2, 14 %if ARCH_X86_32 H_EXTEND 16, 22 %endif + INIT_XMM sse2 H_EXTEND 16, 22 diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c index bd61ab461f..3218abdd88 100644 --- a/libavcodec/x86/videodsp_init.c +++ b/libavcodec/x86/videodsp_init.c @@ -117,34 +117,15 @@ static emu_edge_hfix_func *hfixtbl_mmx[11] = { }; 
#endif extern emu_edge_hvar_func ff_emu_edge_hvar_mmx; -extern emu_edge_hfix_func ff_emu_edge_hfix4_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix6_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix8_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix10_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix12_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix14_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix16_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix18_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix20_mmxext; -extern emu_edge_hfix_func ff_emu_edge_hfix22_mmxext; -#if ARCH_X86_32 -static emu_edge_hfix_func *hfixtbl_mmxext[11] = { - ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmxext, ff_emu_edge_hfix6_mmxext, - ff_emu_edge_hfix8_mmxext, ff_emu_edge_hfix10_mmxext, ff_emu_edge_hfix12_mmxext, - ff_emu_edge_hfix14_mmxext, ff_emu_edge_hfix16_mmxext, ff_emu_edge_hfix18_mmxext, - ff_emu_edge_hfix20_mmxext, ff_emu_edge_hfix22_mmxext -}; -#endif -extern emu_edge_hvar_func ff_emu_edge_hvar_mmxext; extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; static emu_edge_hfix_func *hfixtbl_sse2[11] = { - ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmxext, ff_emu_edge_hfix6_mmxext, - ff_emu_edge_hfix8_mmxext, ff_emu_edge_hfix10_mmxext, ff_emu_edge_hfix12_mmxext, - ff_emu_edge_hfix14_mmxext, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, - ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 + ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, + ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, + ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, + ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 }; extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; @@ -234,17 +215,6 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, hfixtbl_mmx, 
&ff_emu_edge_hvar_mmx); } -static av_noinline void emulated_edge_mc_mmxext(uint8_t *buf, const uint8_t *src, - ptrdiff_t buf_stride, - ptrdiff_t src_stride, - int block_w, int block_h, - int src_x, int src_y, int w, int h) -{ - emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, - src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx, - hfixtbl_mmxext, &ff_emu_edge_hvar_mmxext); -} - static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, ptrdiff_t buf_stride, ptrdiff_t src_stride, @@ -253,7 +223,7 @@ static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, { emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, - hfixtbl_mmxext, &ff_emu_edge_hvar_mmxext); + hfixtbl_mmx, &ff_emu_edge_hvar_mmx); } #endif @@ -288,10 +258,6 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) #endif /* ARCH_X86_32 */ if (EXTERNAL_MMXEXT(cpu_flags)) { ctx->prefetch = ff_prefetch_mmxext; -#if ARCH_X86_32 - if (bpc <= 8) - ctx->emulated_edge_mc = emulated_edge_mc_mmxext; -#endif /* ARCH_X86_32 */ } #if ARCH_X86_32 if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {