From 99a319c4e7538670847ac4633ef8b0f0629deb22 Mon Sep 17 00:00:00 2001 From: Christophe Gisquet Date: Wed, 28 May 2014 15:52:24 +0200 Subject: [PATCH] x86: huffyuvdsp: port add_bytes to yasm C MMX SSE2 Cycles: 2972 587 302 Signed-off-by: Michael Niedermayer --- libavcodec/huffyuvdsp.c | 2 +- libavcodec/huffyuvdsp.h | 2 +- libavcodec/ppc/huffyuvdsp_altivec.c | 2 +- libavcodec/x86/huffyuvdsp.asm | 37 +++++++++++++++++++++++++++++ libavcodec/x86/huffyuvdsp_init.c | 9 +++++-- libavcodec/x86/huffyuvdsp_mmx.c | 32 +------------------------ 6 files changed, 48 insertions(+), 36 deletions(-) diff --git a/libavcodec/huffyuvdsp.c b/libavcodec/huffyuvdsp.c index cbc09cf124..3d51552fc3 100644 --- a/libavcodec/huffyuvdsp.c +++ b/libavcodec/huffyuvdsp.c @@ -27,7 +27,7 @@ #define pb_7f (~0UL / 255 * 0x7f) #define pb_80 (~0UL / 255 * 0x80) -static void add_bytes_c(uint8_t *dst, uint8_t *src, int w) +static void add_bytes_c(uint8_t *dst, uint8_t *src, intptr_t w) { long i; diff --git a/libavcodec/huffyuvdsp.h b/libavcodec/huffyuvdsp.h index fd66f0a56e..c52dd69405 100644 --- a/libavcodec/huffyuvdsp.h +++ b/libavcodec/huffyuvdsp.h @@ -35,7 +35,7 @@ typedef struct HuffYUVDSPContext { void (*add_bytes)(uint8_t *dst /* align 16 */, uint8_t *src /* align 16 */, - int w); + intptr_t w); void (*add_hfyu_median_pred)(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top); diff --git a/libavcodec/ppc/huffyuvdsp_altivec.c b/libavcodec/ppc/huffyuvdsp_altivec.c index ff2bd87eeb..0052daeb64 100644 --- a/libavcodec/ppc/huffyuvdsp_altivec.c +++ b/libavcodec/ppc/huffyuvdsp_altivec.c @@ -31,7 +31,7 @@ #include "libavcodec/huffyuvdsp.h" #if HAVE_ALTIVEC -static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) +static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w) { register int i; register vector unsigned char vdst, vsrc; diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm index f183ebee54..a923e70e1e 100644 --- 
a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -163,3 +163,40 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
     ADD_HFYU_LEFT_LOOP 0, 1
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP 0, 0
+
+%macro ADD_BYTES 0 ; add_bytes(dst, src, w): dst[i] += src[i], byte-wise, for w bytes
+cglobal add_bytes, 3,4,2, dst, src, w, size
+    mov  sizeq, wq
+    and  sizeq, -2*mmsize           ; part of w handled 2*mmsize bytes per iteration
+    jz  .2                          ; nothing for the vector loop: scalar tail only
+    add   dstq, sizeq
+    add   srcq, sizeq
+    neg  sizeq                      ; index runs from -size up to 0 off the advanced pointers
+.1:
+    mova    m0, [srcq + sizeq]
+    mova    m1, [srcq + sizeq + mmsize]
+    paddb   m0, [dstq + sizeq]
+    paddb   m1, [dstq + sizeq + mmsize]
+    mova   [dstq + sizeq], m0
+    mova   [dstq + sizeq + mmsize], m1
+    add  sizeq, 2*mmsize
+    jl .1
+.2:
+    and     wq, 2*mmsize-1          ; bytes left over for the scalar tail
+    jz    .end
+    add   dstq, wq
+    add   srcq, wq
+    neg     wq
+.3:
+    mov  sizeb, [srcq + wq]         ; size is reused as a byte temporary here
+    add [dstq + wq], sizeb
+    inc     wq
+    jl .3
+.end:
+    REP_RET
+%endmacro
+
+INIT_MMX mmx
+ADD_BYTES
+INIT_XMM sse2
+ADD_BYTES
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 1efb34dbbe..8a755e65b0 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -23,7 +23,8 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/huffyuvdsp.h"
 
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
 
 void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                   const uint8_t *diff, int w,
@@ -46,7 +47,7 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
         c->add_hfyu_median_pred = ff_add_hfyu_median_pred_cmov;
 #endif
 
-    if (INLINE_MMX(cpu_flags))
+    if (EXTERNAL_MMX(cpu_flags))
         c->add_bytes = ff_add_bytes_mmx;
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
@@ -55,6 +56,10 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
             c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
     }
 
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_bytes = ff_add_bytes_sse2;
+    }
+
     if (EXTERNAL_SSSE3(cpu_flags)) {
         c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
         if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4,
just slow on Conroe diff --git a/libavcodec/x86/huffyuvdsp_mmx.c b/libavcodec/x86/huffyuvdsp_mmx.c index 59422107d3..ee6ec91287 100644 --- a/libavcodec/x86/huffyuvdsp_mmx.c +++ b/libavcodec/x86/huffyuvdsp_mmx.c @@ -22,9 +22,7 @@ #include "libavutil/x86/asm.h" #include "huffyuvdsp.h" -#if HAVE_INLINE_ASM - -#if HAVE_7REGS +#if HAVE_INLINE_ASM && HAVE_7REGS void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) @@ -61,31 +59,3 @@ void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top, *left_top = tl; } #endif - -void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) -{ - x86_reg i = 0; - - __asm__ volatile ( - "jmp 2f \n\t" - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq (%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, (%2, %0) \n\t" - "movq 8(%1, %0), %%mm0 \n\t" - "movq 8(%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "2: \n\t" - "cmp %3, %0 \n\t" - "js 1b \n\t" - : "+r" (i) - : "r" (src), "r" (dst), "r" ((x86_reg) w - 15)); - - for (; i < w; i++) - dst[i + 0] += src[i + 0]; -} - -#endif /* HAVE_INLINE_ASM */