mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
swscale: write yuv2plane1 MMX/SSE2/SSE4/AVX functions.
This commit is contained in:
parent
2f7f2e4b41
commit
c435653627
@ -34,6 +34,12 @@ yuv2yuvX_10_start: times 4 dd 0x10000
|
||||
yuv2yuvX_9_start: times 4 dd 0x20000
|
||||
yuv2yuvX_10_upper: times 8 dw 0x3ff
|
||||
yuv2yuvX_9_upper: times 8 dw 0x1ff
|
||||
pd_4: times 4 dd 4
|
||||
pd_4min0x40000:times 4 dd 4 - (0x40000)
|
||||
pw_16: times 8 dw 16
|
||||
pw_32: times 8 dw 32
|
||||
pw_512: times 8 dw 512
|
||||
pw_1024: times 8 dw 1024
|
||||
|
||||
SECTION .text
|
||||
|
||||
@ -663,3 +669,139 @@ INIT_AVX
|
||||
yuv2planeX_fn avx, 8, 10, 7
|
||||
yuv2planeX_fn avx, 9, 7, 5
|
||||
yuv2planeX_fn avx, 10, 7, 5
|
||||
|
||||
; %1=output-bpc, %2=alignment (u/a)
|
||||
; Store loop shared by all yuv2plane1 variants.
;   %1 = output bit depth; 8 and 16 have dedicated paths, any other value
;        takes the generic path (instantiated in this file for 9 and 10)
;   %2 = suffix appended to "mov" for the stores (a or u)
; Reads from r0 and writes to r1, both indexed by r2. r2 is stepped by mmsize
; per iteration and the loop repeats while the result of that add is negative.
%macro yuv2plane1_mainloop 2
|
||||
.loop_%2:
|
||||
%if %1 == 8
|
||||
; 8-bit path: add m2/m3 to two registers of source words with signed
; saturation, shift right by 7, pack both to unsigned bytes, store mmsize bytes.
paddsw m0, m2, [r0+r2*2+mmsize*0]
|
||||
paddsw m1, m3, [r0+r2*2+mmsize*1]
|
||||
psraw m0, 7
|
||||
psraw m1, 7
|
||||
packuswb m0, m1
|
||||
mov%2 [r1+r2], m0
|
||||
%elif %1 == 16
|
||||
; 16-bit path: the source is addressed as dwords (r2*4). Add m4 to four
; registers of input, shift right by 3, then pack pairs down to words.
paddd m0, m4, [r0+r2*4+mmsize*0]
|
||||
paddd m1, m4, [r0+r2*4+mmsize*1]
|
||||
paddd m2, m4, [r0+r2*4+mmsize*2]
|
||||
paddd m3, m4, [r0+r2*4+mmsize*3]
|
||||
psrad m0, 3
|
||||
psrad m1, 3
|
||||
psrad m2, 3
|
||||
psrad m3, 3
|
||||
%if cpuflag(sse4) ; avx/sse4
|
||||
packusdw m0, m1
|
||||
packusdw m2, m3
|
||||
%else ; mmx/sse2
|
||||
; No packusdw here: pack with signed saturation and then add m5 to the
; packed words.
; NOTE(review): presumably the bias folded into m4 plus this m5 add emulate
; the unsigned pack - confirm against the constants loaded by the caller.
packssdw m0, m1
|
||||
packssdw m2, m3
|
||||
paddw m0, m5
|
||||
paddw m2, m5
|
||||
%endif ; mmx/sse2/sse4/avx
|
||||
mov%2 [r1+r2*2], m0
|
||||
mov%2 [r1+r2*2+mmsize], m2
|
||||
%else
|
||||
; Generic path: add m2 with signed saturation, shift right by (15 - %1),
; then clamp each word to the range [m4, m3] before storing two registers.
paddsw m0, m2, [r0+r2*2+mmsize*0]
|
||||
paddsw m1, m2, [r0+r2*2+mmsize*1]
|
||||
psraw m0, 15 - %1
|
||||
psraw m1, 15 - %1
|
||||
pmaxsw m0, m4
|
||||
pmaxsw m1, m4
|
||||
pminsw m0, m3
|
||||
pminsw m1, m3
|
||||
mov%2 [r1+r2*2], m0
|
||||
mov%2 [r1+r2*2+mmsize], m1
|
||||
%endif
|
||||
; Every path consumes mmsize elements per iteration.
add r2, mmsize
|
||||
jl .loop_%2
|
||||
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
; yuv2plane1_fn: emit one yuv2plane1_<bpc> function for the instruction set
; selected by the preceding INIT_MMX / INIT_XMM.
;
; C-side prototype:
;   void ff_yuv2plane1_<bpc>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                  const uint8_t *dither, int offset)
;
; Macro parameters:
;   %1 = output bit depth (8, 9, 10 or 16)
;   %2 = last cglobal parameter (vector register count)
;   %3 = passed as both the 2nd and 3rd cglobal parameter
;
; Register roles:
;   r0  = src; for 16-bit output it is addressed as 32-bit elements
;   r1  = dst
;   r2  = dstW, rounded up to a multiple of mmsize and negated so the main
;         loop counts up towards zero
;   r3  = dither   (read for 8-bit output only)
;   r4d = offset   (8-bit output only; non-zero rotates the dither by 3 bytes)
;
; The main loop always handles whole groups of mmsize elements, so up to
; mmsize-1 elements past dstW are read and written.
;-----------------------------------------------------------------------------
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2
    ; dstW is a C int: on x86-64 the upper half of the argument register is
    ; not guaranteed to be defined, so widen it before using r2 as an index.
%ifdef ARCH_X86_64
    movsxd          r2, r2d
%endif
    ; Round the width up to a whole number of loop iterations. The loop
    ; already ran ceil(dstW/mmsize) times, so the bytes touched do not change,
    ; but the end pointers below now differ from src/dst by a multiple of
    ; mmsize. That makes the "test r1, 15" check further down examine the
    ; same alignment as the addresses that are actually stored to.
    add             r2, mmsize - 1
    and             r2, ~(mmsize - 1)

    ; point r0/r1 at the end of the processed span; r2 becomes a negative index
%if %1 == 8
    add             r1, r2
%else ; %1 != 8
    lea             r1, [r1+r2*2]
%endif ; %1 == 8
%if %1 == 16
    lea             r0, [r0+r2*4]
%else ; %1 != 16
    lea             r0, [r0+r2*2]
%endif ; %1 == 16
    neg             r2

%if %1 == 8
    pxor            m4, m4               ; zero

    ; create registers holding dither
    movq            m3, [r3]             ; dither
    test           r4d, r4d
    jz              .no_rot
%if mmsize == 16
    punpcklqdq      m3, m3               ; duplicate so the rotate wraps around
%endif ; mmsize == 16
    PALIGNR_MMX     m3, m3, 3, m2        ; rotate the 8 dither bytes by 3
.no_rot:
%if mmsize == 8
    mova            m2, m3
    punpckhbw       m3, m4               ; byte->word
    punpcklbw       m2, m4               ; byte->word
%else
    punpcklbw       m3, m4               ; byte->word, 8 words fill the register
    mova            m2, m3
%endif
%elif %1 == 9
    pxor            m4, m4               ; lower clamp
    mova            m3, [pw_512]         ; upper clamp operand for pminsw
    mova            m2, [pw_32]          ; added before the shift by 6
%elif %1 == 10
    pxor            m4, m4               ; lower clamp
    mova            m3, [pw_1024]        ; upper clamp operand for pminsw
    mova            m2, [pw_16]          ; added before the shift by 5
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]
%else ; mmx/sse2
    mova            m4, [pd_4min0x40000]
    mova            m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
%if mmsize == 8
    yuv2plane1_mainloop %1, a
%else ; mmsize == 16
    test            r1, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
    REP_RET
%endmacro
|
||||
|
||||
; Instantiate yuv2plane1_fn once per instruction set and output bit depth.
; Arguments are: bit depth, value forwarded as the last cglobal parameter,
; value forwarded as the 2nd/3rd cglobal parameters.
; The MMX versions are only assembled when ARCH_X86_32 is defined.
%ifdef ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
yuv2plane1_fn 8, 0, 5
|
||||
yuv2plane1_fn 16, 0, 3
|
||||
|
||||
; the 9- and 10-bit MMX variants are built under mmx2
INIT_MMX mmx2
|
||||
yuv2plane1_fn 9, 0, 3
|
||||
yuv2plane1_fn 10, 0, 3
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
yuv2plane1_fn 8, 5, 5
|
||||
yuv2plane1_fn 9, 5, 3
|
||||
yuv2plane1_fn 10, 5, 3
|
||||
; the non-sse4 16-bit path additionally uses m5, hence 6 instead of 5
yuv2plane1_fn 16, 6, 3
|
||||
|
||||
; only the 16-bit code has an sse4-specific branch (packusdw)
INIT_XMM sse4
|
||||
yuv2plane1_fn 16, 5, 3
|
||||
|
||||
INIT_XMM avx
|
||||
yuv2plane1_fn 8, 5, 5
|
||||
yuv2plane1_fn 9, 5, 3
|
||||
yuv2plane1_fn 10, 5, 3
|
||||
yuv2plane1_fn 16, 5, 3
|
||||
|
@ -228,6 +228,22 @@ VSCALEX_FUNCS(sse4, sse4);
|
||||
VSCALEX_FUNC(16, sse4);
|
||||
VSCALEX_FUNCS(avx, avx);
|
||||
|
||||
/* Declare the prototype of one ff_yuv2plane1_<size>_<opt> routine. */
#define VSCALE_FUNC(size, opt) \
|
||||
extern void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
|
||||
const uint8_t *dither, int offset)
|
||||
/* Declare all four bit depths for one instruction-set pair: the 8- and
 * 16-bit routines use opt1, the 9- and 10-bit routines use opt2. */
#define VSCALE_FUNCS(opt1, opt2) \
|
||||
VSCALE_FUNC(8, opt1); \
|
||||
VSCALE_FUNC(9, opt2); \
|
||||
VSCALE_FUNC(10, opt2); \
|
||||
VSCALE_FUNC(16, opt1)
|
||||
|
||||
/* The MMX/MMX2 routines are only declared for 32-bit x86 builds. */
#if ARCH_X86_32
|
||||
VSCALE_FUNCS(mmx, mmx2);
|
||||
#endif
|
||||
VSCALE_FUNCS(sse2, sse2);
|
||||
/* for sse4 only the 16-bit routine is declared */
VSCALE_FUNC(16, sse4);
|
||||
VSCALE_FUNCS(avx, avx);
|
||||
|
||||
void ff_sws_init_swScale_mmx(SwsContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
@ -268,11 +284,19 @@ switch(c->dstBpc){ \
|
||||
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2planeX_9_ ## opt2; break; \
|
||||
default: vscalefn = ff_yuv2planeX_8_ ## opt1; break; \
|
||||
}
|
||||
#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
|
||||
switch(c->dstBpc){ \
|
||||
case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
|
||||
case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
|
||||
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
|
||||
default: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
|
||||
}
|
||||
#if ARCH_X86_32
|
||||
if (cpu_flags & AV_CPU_FLAG_MMX) {
|
||||
ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
|
||||
ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
|
||||
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2,);
|
||||
ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2);
|
||||
}
|
||||
#endif
|
||||
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
|
||||
@ -287,6 +311,7 @@ switch(c->dstBpc){ \
|
||||
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
|
||||
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
|
||||
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, sse2, 1,);
|
||||
ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
|
||||
}
|
||||
if (cpu_flags & AV_CPU_FLAG_SSSE3) {
|
||||
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
|
||||
@ -298,10 +323,13 @@ switch(c->dstBpc){ \
|
||||
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
|
||||
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4, sse4, 1,
|
||||
if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4);
|
||||
if (c->dstBpc == 16 && !isBE(c->dstFormat))
|
||||
c->yuv2plane1 = ff_yuv2plane1_16_sse4;
|
||||
}
|
||||
|
||||
if (cpu_flags & AV_CPU_FLAG_AVX) {
|
||||
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, avx, 1,);
|
||||
ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -35,116 +35,6 @@
|
||||
#endif
|
||||
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
|
||||
|
||||
#if !COMPILE_TEMPLATE_MMX2
|
||||
static av_always_inline void
|
||||
dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
|
||||
{
|
||||
if (rot) {
|
||||
__asm__ volatile("pxor %%mm0, %%mm0\n\t"
|
||||
"movq (%0), %%mm3\n\t"
|
||||
"movq %%mm3, %%mm4\n\t"
|
||||
"psrlq $24, %%mm3\n\t"
|
||||
"psllq $40, %%mm4\n\t"
|
||||
"por %%mm4, %%mm3\n\t"
|
||||
"movq %%mm3, %%mm4\n\t"
|
||||
"punpcklbw %%mm0, %%mm3\n\t"
|
||||
"punpckhbw %%mm0, %%mm4\n\t"
|
||||
"psraw $4, %%mm3\n\t"
|
||||
"psraw $4, %%mm4\n\t"
|
||||
"movq %%mm3, "DITHER16"+0(%1)\n\t"
|
||||
"movq %%mm4, "DITHER16"+8(%1)\n\t"
|
||||
:: "r"(srcDither), "r"(&c->redDither)
|
||||
);
|
||||
} else {
|
||||
__asm__ volatile("pxor %%mm0, %%mm0\n\t"
|
||||
"movq (%0), %%mm3\n\t"
|
||||
"movq %%mm3, %%mm4\n\t"
|
||||
"punpcklbw %%mm0, %%mm3\n\t"
|
||||
"punpckhbw %%mm0, %%mm4\n\t"
|
||||
"psraw $4, %%mm3\n\t"
|
||||
"psraw $4, %%mm4\n\t"
|
||||
"movq %%mm3, "DITHER16"+0(%1)\n\t"
|
||||
"movq %%mm4, "DITHER16"+8(%1)\n\t"
|
||||
:: "r"(srcDither), "r"(&c->redDither)
|
||||
);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
|
||||
const int16_t *chrUSrc, const int16_t *chrVSrc,
|
||||
const int16_t *alpSrc,
|
||||
uint8_t *dst[4], int dstW, int chrDstW)
|
||||
{
|
||||
int p= 4;
|
||||
const int16_t *src[4]= {
|
||||
lumSrc + dstW, chrUSrc + chrDstW,
|
||||
chrVSrc + chrDstW, alpSrc + dstW
|
||||
};
|
||||
x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
|
||||
|
||||
while (p--) {
|
||||
if (dst[p]) {
|
||||
__asm__ volatile(
|
||||
"mov %2, %%"REG_a" \n\t"
|
||||
".p2align 4 \n\t" /* FIXME Unroll? */
|
||||
"1: \n\t"
|
||||
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"
|
||||
"movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
|
||||
"psraw $7, %%mm0 \n\t"
|
||||
"psraw $7, %%mm1 \n\t"
|
||||
"packuswb %%mm1, %%mm0 \n\t"
|
||||
MOVNTQ(%%mm0, (%1, %%REGa))
|
||||
"add $8, %%"REG_a" \n\t"
|
||||
"jnc 1b \n\t"
|
||||
:: "r" (src[p]), "r" (dst[p] + counter[p]),
|
||||
"g" (-counter[p])
|
||||
: "%"REG_a
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
|
||||
const int16_t *chrUSrc, const int16_t *chrVSrc,
|
||||
const int16_t *alpSrc,
|
||||
uint8_t *dst[4], int dstW, int chrDstW)
|
||||
{
|
||||
int p= 4;
|
||||
const int16_t *src[4]= {
|
||||
lumSrc + dstW, chrUSrc + chrDstW,
|
||||
chrVSrc + chrDstW, alpSrc + dstW
|
||||
};
|
||||
x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
|
||||
const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
|
||||
|
||||
while (p--) {
|
||||
if (dst[p]) {
|
||||
dither_8to16(c, (p == 2 || p == 3) ? chrDither : lumDither, p == 2);
|
||||
__asm__ volatile(
|
||||
"mov %2, %%"REG_a" \n\t"
|
||||
"movq "DITHER16"+0(%3), %%mm6 \n\t"
|
||||
"movq "DITHER16"+8(%3), %%mm7 \n\t"
|
||||
".p2align 4 \n\t" /* FIXME Unroll? */
|
||||
"1: \n\t"
|
||||
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"
|
||||
"movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
|
||||
"paddsw %%mm6, %%mm0 \n\t"
|
||||
"paddsw %%mm7, %%mm1 \n\t"
|
||||
"psraw $7, %%mm0 \n\t"
|
||||
"psraw $7, %%mm1 \n\t"
|
||||
"packuswb %%mm1, %%mm0 \n\t"
|
||||
MOVNTQ(%%mm0, (%1, %%REGa))
|
||||
"add $8, %%"REG_a" \n\t"
|
||||
"jnc 1b \n\t"
|
||||
:: "r" (src[p]), "r" (dst[p] + counter[p]),
|
||||
"g" (-counter[p]), "r"(&c->redDither)
|
||||
: "%"REG_a
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define YSCALEYUV2PACKEDX_UV \
|
||||
__asm__ volatile(\
|
||||
"xor %%"REG_a", %%"REG_a" \n\t"\
|
||||
@ -1899,7 +1789,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
|
||||
dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
|
||||
if (!(c->flags & SWS_BITEXACT)) {
|
||||
if (c->flags & SWS_ACCURATE_RND) {
|
||||
//c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
|
||||
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
|
||||
switch (c->dstFormat) {
|
||||
case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
|
||||
@ -1911,7 +1800,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//c->yuv2yuv1 = RENAME(yuv2yuv1 );
|
||||
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
|
||||
switch (c->dstFormat) {
|
||||
case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
|
||||
|
Loading…
Reference in New Issue
Block a user