Port the MMXEXT shuffle_bytes_2103 from inline asm in rgb2rgb_template.c to
external asm in rgb_2_rgb.asm and select it in rgb2rgb_init_x86().

Review changes to the new asm function: return early for src_size <= 0
(a size of 0 used to reach the SIMD loop and copy 16 bytes), issue emms
before returning, fix the "mmext" typo in the header comment and indent the
stray cmp/jge pair. The index line of rgb_2_rgb.asm is kept from the
original patch, so its post-image blob id is stale.

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 1191081440..2d6fc2ad26 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -144,6 +144,7 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
 
 #endif /* HAVE_INLINE_ASM */
 
+void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
@@ -176,6 +177,9 @@ av_cold void rgb2rgb_init_x86(void)
         rgb2rgb_init_avx();
 #endif /* HAVE_INLINE_ASM */
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
+    }
     if (EXTERNAL_SSE2(cpu_flags)) {
 #if ARCH_X86_64
         uyvytoyuv422 = ff_uyvytoyuv422_sse2;
diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c
index 287e1d3501..ae2469e663 100644
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@@ -1034,51 +1034,6 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
     }
 }
 
-#if COMPILE_TEMPLATE_MMXEXT
-static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
-{
-    x86_reg idx = 15 - src_size;
-    const uint8_t *s = src-idx;
-    uint8_t *d = dst-idx;
-    __asm__ volatile(
-        "test %0, %0 \n\t"
-        "jns 2f \n\t"
-        PREFETCH" (%1, %0) \n\t"
-        "movq %3, %%mm7 \n\t"
-        "pxor %4, %%mm7 \n\t"
-        "movq %%mm7, %%mm6 \n\t"
-        "pxor %5, %%mm7 \n\t"
-        ".p2align 4 \n\t"
-        "1: \n\t"
-        PREFETCH" 32(%1, %0) \n\t"
-        "movq (%1, %0), %%mm0 \n\t"
-        "movq 8(%1, %0), %%mm1 \n\t"
-        "pshufw $177, %%mm0, %%mm3 \n\t"
-        "pshufw $177, %%mm1, %%mm5 \n\t"
-        "pand %%mm7, %%mm0 \n\t"
-        "pand %%mm6, %%mm3 \n\t"
-        "pand %%mm7, %%mm1 \n\t"
-        "pand %%mm6, %%mm5 \n\t"
-        "por %%mm3, %%mm0 \n\t"
-        "por %%mm5, %%mm1 \n\t"
-        MOVNTQ" %%mm0, (%2, %0) \n\t"
-        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
-        "add $16, %0 \n\t"
-        "js 1b \n\t"
-        SFENCE" \n\t"
-        EMMS" \n\t"
-        "2: \n\t"
-        : "+&r"(idx)
-        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
-        : "memory");
-    for (; idx<15; idx+=4) {
-        register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
-        v &= 0xff00ff;
-        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
-    }
-}
-#endif
-
 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
 {
     unsigned i;
@@ -2555,9 +2510,6 @@ static av_cold void RENAME(rgb2rgb_init)(void)
     rgb24to15 = RENAME(rgb24to15);
     rgb24to16 = RENAME(rgb24to16);
     rgb24tobgr24 = RENAME(rgb24tobgr24);
-#if COMPILE_TEMPLATE_MMXEXT
-    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
-#endif
     rgb32tobgr16 = RENAME(rgb32tobgr16);
     rgb32tobgr15 = RENAME(rgb32tobgr15);
     yv12toyuy2 = RENAME(yv12toyuy2);
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 156b4d2c74..5fb5d2ee61 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -24,6 +24,7 @@
 
 SECTION_RODATA
 
+pb_mask_shuffle2103_mmx: times 8 dw 255
 pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
 pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
 pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
@@ -42,6 +43,81 @@ SECTION .text
 %endif
 %endmacro
 
+;------------------------------------------------------------------------------
+; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
+;
+; Writes src to dst with bytes 0 and 2 of every 4-byte group swapped.
+; The leading (src_size & 12) bytes go through the scalar loop, the rest is
+; processed 16 bytes per iteration with MMX registers.
+; NOTE(review): assumes src_size is a multiple of 4 - confirm against callers.
+;------------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
+    movsxdifnidn wq, wd
+    ; Nothing to do for src_size <= 0. Without this check a size of 0 would
+    ; take the "je .loop_simd" branch below and copy 16 bytes.
+    test         wq, wq
+    jle .end
+
+    mova   m6, [pb_mask_shuffle2103_mmx] ; 0x00ff per word: selects bytes 0 and 2
+    mova   m7, m6
+    psllq  m7, 8                         ; 0xff00 per word: selects bytes 1 and 3
+
+    mov xq, wq
+
+    ; point past the end of both buffers and count wq up from -src_size to 0
+    add        srcq, wq
+    add        dstq, wq
+    neg          wq
+
+;calc scalar loop
+    and xq, mmsize*2 -4
+    je .loop_simd
+
+.loop_scalar:
+    mov          tmpb, [srcq + wq + 2]
+    mov [dstq+wq + 0], tmpb
+    mov          tmpb, [srcq + wq + 1]
+    mov [dstq+wq + 1], tmpb
+    mov          tmpb, [srcq + wq + 0]
+    mov [dstq+wq + 2], tmpb
+    mov          tmpb, [srcq + wq + 3]
+    mov [dstq+wq + 3], tmpb
+    add            wq, 4
+    sub            xq, 4
+    jg .loop_scalar
+
+;check if src_size < mmsize * 2
+    cmp wq, 0
+    jge .end
+
+.loop_simd:
+    movu     m0, [srcq+wq]
+    movu     m1, [srcq+wq+8]
+
+    pshufw   m3, m0, 177                 ; swap the two words of each dword
+    pshufw   m5, m1, 177
+
+    pand     m0, m7
+    pand     m3, m6
+
+    pand     m1, m7
+    pand     m5, m6
+
+    por      m0, m3
+    por      m1, m5
+
+    movu     [dstq+wq], m0
+    movu     [dstq+wq + 8], m1
+
+    add      wq, mmsize*2
+    jl .loop_simd
+
+.end:
+    ; the removed inline-asm version issued EMMS after its MMX loop; keep that
+    ; NOTE(review): drop if every caller is known to clean up MMX state itself
+    emms
+    RET
+
 ;------------------------------------------------------------------------------
 ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
 ;------------------------------------------------------------------------------