mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-08 13:22:53 +02:00
swscale/x86/rgb2rgb : port shuffle 2103 mmxext to external asm and remove inline asm version
This commit is contained in:
parent
04afdbb560
commit
296609f859
@ -144,6 +144,7 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
|
||||
|
||||
#endif /* HAVE_INLINE_ASM */
|
||||
|
||||
void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
|
||||
void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
|
||||
void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
|
||||
void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
|
||||
@ -176,6 +177,9 @@ av_cold void rgb2rgb_init_x86(void)
|
||||
rgb2rgb_init_avx();
|
||||
#endif /* HAVE_INLINE_ASM */
|
||||
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
|
||||
}
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
#if ARCH_X86_64
|
||||
uyvytoyuv422 = ff_uyvytoyuv422_sse2;
|
||||
|
@ -1034,51 +1034,6 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
|
||||
}
|
||||
}
|
||||
|
||||
#if COMPILE_TEMPLATE_MMXEXT
|
||||
/*
 * Copy src_size bytes from src to dst, exchanging two bytes inside every
 * 32-bit group (see the scalar tail loop at the bottom for the exact mapping).
 *
 * idx starts at 15 - src_size and s/d are biased by -idx, so s[idx] is src[0]
 * and idx is negative for as long as at least 16 bytes remain.
 * PREFETCH, MOVNTQ, SFENCE and EMMS are macros defined outside this view.
 */
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
|
||||
{
|
||||
x86_reg idx = 15 - src_size;
|
||||
const uint8_t *s = src-idx;
|
||||
uint8_t *d = dst-idx;
|
||||
__asm__ volatile(
|
||||
/* skip the MMX loop entirely when idx >= 0, i.e. src_size <= 15 */
"test %0, %0 \n\t"
|
||||
"jns 2f \n\t"
|
||||
PREFETCH" (%1, %0) \n\t"
|
||||
/* build the two AND masks in mm6/mm7 from mask32b, mask32r and mmx_one;
 * NOTE(review): those constants are not visible here - presumably the
 * result is a pair of complementary byte masks, confirm against their
 * definitions */
"movq %3, %%mm7 \n\t"
|
||||
"pxor %4, %%mm7 \n\t"
|
||||
"movq %%mm7, %%mm6 \n\t"
|
||||
"pxor %5, %%mm7 \n\t"
|
||||
".p2align 4 \n\t"
|
||||
/* main loop: 16 bytes (two quadwords) per iteration */
"1: \n\t"
|
||||
PREFETCH" 32(%1, %0) \n\t"
|
||||
"movq (%1, %0), %%mm0 \n\t"
|
||||
"movq 8(%1, %0), %%mm1 \n\t"
|
||||
/* pshufw $177 swaps the two 16-bit words inside each 32-bit half */
"pshufw $177, %%mm0, %%mm3 \n\t"
|
||||
"pshufw $177, %%mm1, %%mm5 \n\t"
|
||||
/* select bytes from the original (mm7) and the word-swapped copy (mm6),
 * then merge them */
"pand %%mm7, %%mm0 \n\t"
|
||||
"pand %%mm6, %%mm3 \n\t"
|
||||
"pand %%mm7, %%mm1 \n\t"
|
||||
"pand %%mm6, %%mm5 \n\t"
|
||||
"por %%mm3, %%mm0 \n\t"
|
||||
"por %%mm5, %%mm1 \n\t"
|
||||
MOVNTQ" %%mm0, (%2, %0) \n\t"
|
||||
MOVNTQ" %%mm1, 8(%2, %0) \n\t"
|
||||
/* advance by 16 bytes and keep looping while idx is still negative */
"add $16, %0 \n\t"
|
||||
"js 1b \n\t"
|
||||
SFENCE" \n\t"
|
||||
EMMS" \n\t"
|
||||
"2: \n\t"
|
||||
/* idx is read and written by the asm; s, d and the three constants are inputs */
: "+&r"(idx)
|
||||
: "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
|
||||
: "memory");
|
||||
/* scalar tail, one 32-bit word per step: g keeps bits 8-15 and 24-31,
 * while bits 0-7 and bits 16-23 of the word trade places */
for (; idx<15; idx+=4) {
|
||||
register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
|
||||
v &= 0xff00ff;
|
||||
*(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
|
||||
{
|
||||
unsigned i;
|
||||
@ -2555,9 +2510,6 @@ static av_cold void RENAME(rgb2rgb_init)(void)
|
||||
rgb24to15 = RENAME(rgb24to15);
|
||||
rgb24to16 = RENAME(rgb24to16);
|
||||
rgb24tobgr24 = RENAME(rgb24tobgr24);
|
||||
#if COMPILE_TEMPLATE_MMXEXT
|
||||
shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
|
||||
#endif
|
||||
rgb32tobgr16 = RENAME(rgb32tobgr16);
|
||||
rgb32tobgr15 = RENAME(rgb32tobgr15);
|
||||
yv12toyuy2 = RENAME(yv12toyuy2);
|
||||
|
@ -24,6 +24,7 @@
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
pb_mask_shuffle2103_mmx times 8 dw 255
|
||||
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
|
||||
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
|
||||
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
|
||||
@ -42,6 +43,68 @@ SECTION .text
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
|
||||
;------------------------------------------------------------------------------
|
||||
INIT_MMX mmxext
; void shuffle_bytes_2103(const uint8_t *src, uint8_t *dst, int src_size)
;
; For every 4-byte group: dst[0] = src[2], dst[1] = src[1],
;                         dst[2] = src[0], dst[3] = src[3].
;
; Register roles once the setup below has run:
;   srcq / dstq : point one past the end of the source / destination data
;   wq          : negative byte offset from that end, counts up towards zero
;   xq          : number of bytes left for the scalar loop (src_size & 12)
;   tmpb        : byte scratch used by the scalar loop
;   m6          : 0x00ff in every 16-bit word (selects the even bytes)
;   m7          : m6 shifted left by 8 bits, 0xff00 per word (odd bytes)
;
; NOTE(review): a src_size that is not a multiple of 4 still lets the SIMD
; loop run past the end of the buffers; callers are assumed to pass whole
; 4-byte pixels - confirm.
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
    movsxdifnidn wq, wd

    ; Nothing to do for an empty (or negative) size. Without this guard a
    ; size of 0 makes xq zero below and jumps straight into .loop_simd,
    ; which then reads and writes 16 bytes.
    test       wq, wq
    jle .end

    mova       m6, [pb_mask_shuffle2103_mmx]
    mova       m7, m6
    psllq      m7, 8

    mov        xq, wq

    add        srcq, wq
    add        dstq, wq
    neg        wq

    ; bytes that do not fill a whole 16-byte SIMD iteration are done first,
    ; four at a time, by the scalar loop
    and        xq, mmsize*2 - 4
    je .loop_simd

.loop_scalar:
    mov        tmpb, [srcq + wq + 2]
    mov        [dstq + wq + 0], tmpb
    mov        tmpb, [srcq + wq + 1]
    mov        [dstq + wq + 1], tmpb
    mov        tmpb, [srcq + wq + 0]
    mov        [dstq + wq + 2], tmpb
    mov        tmpb, [srcq + wq + 3]
    mov        [dstq + wq + 3], tmpb
    add        wq, 4
    sub        xq, 4
    jg .loop_scalar

    ; done if the scalar loop already consumed everything (src_size < mmsize*2)
    test       wq, wq
    jge .end

.loop_simd:
    movu       m0, [srcq + wq]
    movu       m1, [srcq + wq + 8]

    ; 177 = 10_11_00_01b: swap the two 16-bit words of each 32-bit group
    pshufw     m3, m0, 177
    pshufw     m5, m1, 177

    ; odd bytes come from the original, even bytes from the swapped copy
    pand       m0, m7
    pand       m3, m6

    pand       m1, m7
    pand       m5, m6

    por        m0, m3
    por        m1, m5

    movu       [dstq + wq], m0
    movu       [dstq + wq + 8], m1

    add        wq, mmsize*2
    jl .loop_simd

.end:
    ; leave the x87/MMX state clean for the caller, as the inline asm
    ; version this replaces did
    emms
    RET
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
|
||||
;------------------------------------------------------------------------------
|
||||
|
Loading…
Reference in New Issue
Block a user