swscale/x86/swscale: Remove obsolete and harmful MMX(EXT) functions
x64 always has MMX, MMXEXT, SSE and SSE2, which means that some of the
MMX, MMXEXT, SSE and 3dnow functions are always overridden by other
functions (unless one explicitly disables SSE2, for example). So, given
that the only systems that would benefit from these functions are truly
ancient 32-bit x86s, they are removed.
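
A minimal sketch of the override mechanism in question (illustrative
names only, not FFmpeg's actual init code, which keys off
av_get_cpu_flags() and the INLINE_*/EXTERNAL_* macros):

    /* Hypothetical flags and implementations, for illustration only. */
    #define CPU_FLAG_MMX  (1 << 0)
    #define CPU_FLAG_SSE2 (1 << 1)

    static void scale_c(void)    { /* plain C fallback */ }
    static void scale_mmx(void)  { /* MMX version */ }
    static void scale_sse2(void) { /* SSE2 version */ }
    static void (*scale)(void);

    static void init_scale_dispatch(int cpu_flags)
    {
        scale = scale_c;
        /* Assignments run from oldest to newest instruction set, so a
         * newer supported flag always overwrites the older choice; on
         * x86-64, where SSE2 is architecturally guaranteed, an MMX
         * version can never remain the final selection. */
        if (cpu_flags & CPU_FLAG_MMX)
            scale = scale_mmx;
        if (cpu_flags & CPU_FLAG_SSE2)
            scale = scale_sse2;
    }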
Moreover, some of the removed code was buggy/not bitexact and led to
failures involving the f32le and f32be versions of gray, gbrp and gbrap
on x86-32 when SSE2 was disabled.
See e.g.
https://fate.ffmpeg.org/report.cgi?time=20220609221253&slot=x86_32-debian-kfreebsd-gcc-4.4-cpuflags-mmx
Notice that yuv2yuvX_mmx is not removed, because it is used by the SSE3
and AVX2 versions as a fallback for unaligned data and for tail
processing. I don't know why yuv2yuvX_mmxext is not used for this; an
earlier version [1] of
554c2bc708
used it, but the version that was eventually applied does not.
[1]: https://ffmpeg.org/pipermail/ffmpeg-devel/2020-November/272124.html
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
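
For reference, the retained fallback has roughly the following shape (a
hedged sketch with simplified parameters and stub bodies, not the exact
code; the real yuv2yuvX_sse3 wrapper in libswscale/x86/swscale.c also
takes the filter, source, dither and offset arguments):

    #include <stdint.h>

    /* Stand-ins for the real loops, for illustration only. */
    static void yuv2yuvX_mmx_range(uint8_t *dest, int start, int end)  { /* MMX loop  */ }
    static void yuv2yuvX_sse3_range(uint8_t *dest, int start, int end) { /* SSE3 loop */ }

    static void yuv2yuvX_sse3(uint8_t *dest, int dstW)
    {
        if ((uintptr_t)dest & 15) {
            /* unaligned destination: let the MMX loop handle everything */
            yuv2yuvX_mmx_range(dest, 0, dstW);
            return;
        }
        int main_end = dstW & ~31;              /* whole SSE3 iterations */
        yuv2yuvX_sse3_range(dest, 0, main_end); /* aligned bulk */
        if (main_end < dstW)                    /* leftover tail pixels */
            yuv2yuvX_mmx_range(dest, main_end, dstW);
    }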
@@ -133,23 +133,18 @@ SECTION .text
 ; %2 = rgb or bgr
 %macro RGB24_TO_Y_FN 2-3
 cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
-%if mmsize == 8
-    mova m5, [%2_Ycoeff_12x4]
-    mova m6, [%2_Ycoeff_3x56]
-%define coeff1 m5
-%define coeff2 m6
-%elif ARCH_X86_64
+%if ARCH_X86_64
     mova m8, [%2_Ycoeff_12x4]
     mova m9, [%2_Ycoeff_3x56]
 %define coeff1 m8
 %define coeff2 m9
-%else ; x86-32 && mmsize == 16
+%else ; x86-32
 %define coeff1 [%2_Ycoeff_12x4]
 %define coeff2 [%2_Ycoeff_3x56]
-%endif ; x86-32/64 && mmsize == 8/16
+%endif ; x86-32/64
-%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
+%if ARCH_X86_64 && %0 == 3
    jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
-%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+%else ; ARCH_X86_64 && %0 == 3
 .body:
 %if cpuflag(ssse3)
    mova m7, [shuf_rgb_12x4]
@@ -184,7 +179,6 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
     movd m1, [srcq+2]  ; (byte) { R0, B1, G1, R1 }
     movd m2, [srcq+6]  ; (byte) { B2, G2, R2, B3 }
     movd m3, [srcq+8]  ; (byte) { R2, B3, G3, R3 }
-%if mmsize == 16 ; i.e. sse2
     punpckldq m0, m2   ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
     punpckldq m1, m3   ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
     movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
@@ -193,7 +187,6 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
     movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
     punpckldq m2, m5   ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
     punpckldq m3, m6   ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; mmsize == 16
     punpcklbw m0, m7   ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
     punpcklbw m1, m7   ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
     punpcklbw m2, m7   ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
@@ -215,7 +208,7 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
     add wq, mmsize
     jl .loop
     REP_RET
-%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+%endif ; ARCH_X86_64 && %0 == 3
 %endmacro

 ; %1 = nr. of XMM registers
@@ -275,12 +268,10 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     movd m1, [srcq+2]  ; (byte) { R0, B1, G1, R1 }
     movd m4, [srcq+6]  ; (byte) { B2, G2, R2, B3 }
     movd m5, [srcq+8]  ; (byte) { R2, B3, G3, R3 }
-%if mmsize == 16
     punpckldq m0, m4   ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
     punpckldq m1, m5   ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
     movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
     movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
-%endif ; mmsize == 16
     punpcklbw m0, m7   ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
     punpcklbw m1, m7   ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
 %endif ; cpuflag(ssse3)
@@ -294,12 +285,10 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
     pshufb m4, shuf_rgb1     ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
 %else ; !cpuflag(ssse3)
-%if mmsize == 16
     movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
     movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
     punpckldq m4, m1   ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
     punpckldq m5, m3   ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; mmsize == 16 && !cpuflag(ssse3)
     punpcklbw m4, m7   ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
     punpcklbw m5, m7   ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
 %endif ; cpuflag(ssse3)
@@ -320,13 +309,8 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     psrad m4, 9
     packssdw m0, m1 ; (word) { U[0-7] }
     packssdw m2, m4 ; (word) { V[0-7] }
-%if mmsize == 8
     mova [dstUq+wq], m0
     mova [dstVq+wq], m2
-%else ; mmsize == 16
-    mova [dstUq+wq], m0
-    mova [dstVq+wq], m2
-%endif ; mmsize == 8/16
     add wq, mmsize
     jl .loop
     REP_RET
@@ -342,11 +326,6 @@ RGB24_TO_UV_FN %2, rgb
 RGB24_TO_UV_FN %2, bgr, rgb
 %endmacro

-%if ARCH_X86_32
-INIT_MMX mmx
-RGB24_FUNCS 0, 0
-%endif
-
 INIT_XMM sse2
 RGB24_FUNCS 10, 12

@@ -483,13 +462,8 @@ cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     psrad m1, 9
     packssdw m0, m4 ; (word) { U[0-7] }
     packssdw m2, m1 ; (word) { V[0-7] }
-%if mmsize == 8
     mova [dstUq+wq], m0
     mova [dstVq+wq], m2
-%else ; mmsize == 16
-    mova [dstUq+wq], m0
-    mova [dstVq+wq], m2
-%endif ; mmsize == 8/16
     add wq, mmsize
     jl .loop
     sub wq, mmsize - 1
@@ -535,11 +509,6 @@ RGB32_TO_UV_FN %2, a, r, g, b, rgba
 RGB32_TO_UV_FN %2, a, b, g, r, rgba
 %endmacro

-%if ARCH_X86_32
-INIT_MMX mmx
-RGB32_FUNCS 0, 0
-%endif
-
 INIT_XMM sse2
 RGB32_FUNCS 8, 12

@@ -588,25 +557,18 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
     movsxd wq, wd
 %endif
     add dstq, wq
-%if mmsize == 16
     test srcq, 15
-%endif
     lea srcq, [srcq+wq*2]
 %ifidn %2, yuyv
     pcmpeqb m2, m2 ; (byte) { 0xff } x 16
     psrlw m2, 8    ; (word) { 0x00ff } x 8
 %endif ; yuyv
-%if mmsize == 16
     jnz .loop_u_start
     neg wq
     LOOP_YUYV_TO_Y a, %2
 .loop_u_start:
     neg wq
     LOOP_YUYV_TO_Y u, %2
-%else ; mmsize == 8
-    neg wq
-    LOOP_YUYV_TO_Y a, %2
-%endif ; mmsize == 8/16
 %endmacro

 ; %1 = a (aligned) or u (unaligned)
@@ -632,16 +594,9 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
     packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
     pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
     psrlw m0, 8     ; (word) { V0, V1, ..., V7 }
-%if mmsize == 16
     packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 }
     movh [dstUq+wq], m1
     movhps [dstVq+wq], m1
-%else ; mmsize == 8
-    packuswb m1, m1 ; (byte) { U0, ... U3 }
-    packuswb m0, m0 ; (byte) { V0, ... V3 }
-    movh [dstUq+wq], m1
-    movh [dstVq+wq], m0
-%endif ; mmsize == 8/16
     add wq, mmsize / 2
     jl .loop_%1
     REP_RET
@@ -661,24 +616,24 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %endif
     add dstUq, wq
     add dstVq, wq
-%if mmsize == 16 && %0 == 2
+%if %0 == 2
     test srcq, 15
 %endif
     lea srcq, [srcq+wq*4]
     pcmpeqb m2, m2 ; (byte) { 0xff } x 16
     psrlw m2, 8    ; (word) { 0x00ff } x 8
 ; NOTE: if uyvy+avx, u/a are identical
-%if mmsize == 16 && %0 == 2
+%if %0 == 2
     jnz .loop_u_start
     neg wq
     LOOP_YUYV_TO_UV a, %2
 .loop_u_start:
     neg wq
     LOOP_YUYV_TO_UV u, %2
-%else ; mmsize == 8
+%else
     neg wq
     LOOP_YUYV_TO_UV a, %2
-%endif ; mmsize == 8/16
+%endif
 %endmacro

 ; %1 = a (aligned) or u (unaligned)
@@ -716,35 +671,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %endif
     add dstUq, wq
     add dstVq, wq
-%if mmsize == 16
     test srcq, 15
-%endif
     lea srcq, [srcq+wq*2]
     pcmpeqb m5, m5 ; (byte) { 0xff } x 16
     psrlw m5, 8    ; (word) { 0x00ff } x 8
-%if mmsize == 16
     jnz .loop_u_start
     neg wq
     LOOP_NVXX_TO_UV a, %2
 .loop_u_start:
     neg wq
     LOOP_NVXX_TO_UV u, %2
-%else ; mmsize == 8
-    neg wq
-    LOOP_NVXX_TO_UV a, %2
-%endif ; mmsize == 8/16
 %endmacro

-%if ARCH_X86_32
-INIT_MMX mmx
-YUYV_TO_Y_FN 0, yuyv
-YUYV_TO_Y_FN 0, uyvy
-YUYV_TO_UV_FN 0, yuyv
-YUYV_TO_UV_FN 0, uyvy
-NVXX_TO_UV_FN 0, nv12
-NVXX_TO_UV_FN 0, nv21
-%endif
-
 INIT_XMM sse2
 YUYV_TO_Y_FN 3, yuyv
 YUYV_TO_Y_FN 2, uyvy
@@ -312,11 +312,9 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
 %endif ; %1 == 8/9/10/16
 %endmacro

-%if ARCH_X86_32
+%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
 INIT_MMX mmxext
 yuv2planeX_fn 8, 0, 7
-yuv2planeX_fn 9, 0, 5
-yuv2planeX_fn 10, 0, 5
 %endif

 INIT_XMM sse2
@@ -407,19 +405,11 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
     movq m3, [ditherq] ; dither
     test offsetd, offsetd
     jz .no_rot
-%if mmsize == 16
     punpcklqdq m3, m3
-%endif ; mmsize == 16
     PALIGNR m3, m3, 3, m2
 .no_rot:
-%if mmsize == 8
-    mova m2, m3
-    punpckhbw m3, m4 ; byte->word
-    punpcklbw m2, m4 ; byte->word
-%else
     punpcklbw m3, m4
     mova m2, m3
-%endif
 %elif %1 == 9
     pxor m4, m4
     mova m3, [pw_512]
@@ -431,36 +421,22 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
 %else ; %1 == 16
 %if cpuflag(sse4) ; sse4/avx
     mova m4, [pd_4]
-%else ; mmx/sse2
+%else ; sse2
     mova m4, [pd_4min0x40000]
     mova m5, [minshort]
-%endif ; mmx/sse2/sse4/avx
+%endif ; sse2/sse4/avx
 %endif ; %1 == ..

     ; actual pixel scaling
-%if mmsize == 8
-    yuv2plane1_mainloop %1, a
-%else ; mmsize == 16
     test dstq, 15
     jnz .unaligned
     yuv2plane1_mainloop %1, a
     REP_RET
 .unaligned:
     yuv2plane1_mainloop %1, u
-%endif ; mmsize == 8/16
     REP_RET
 %endmacro

-%if ARCH_X86_32
-INIT_MMX mmx
-yuv2plane1_fn 8, 0, 5
-yuv2plane1_fn 16, 0, 3
-
-INIT_MMX mmxext
-yuv2plane1_fn 9, 0, 3
-yuv2plane1_fn 10, 0, 3
-%endif
-
 INIT_XMM sse2
 yuv2plane1_fn 8, 5, 5
 yuv2plane1_fn 9, 5, 3
@@ -61,13 +61,11 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
 %define mov32 mov
 %endif ; x86-64
 %if %2 == 19
-%if mmsize == 8 ; mmx
-    mova m2, [max_19bit_int]
-%elif cpuflag(sse4)
+%if cpuflag(sse4)
     mova m2, [max_19bit_int]
 %else ; ssse3/sse2
     mova m2, [max_19bit_flt]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
 %endif ; %2 == 19
 %if %1 == 16
     mova m6, [minshort]
@@ -144,12 +142,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
     pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]

     ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
-%if mmsize == 8 ; mmx
-    movq m4, m0
-    punpckldq m0, m1
-    punpckhdq m4, m1
-    paddd m0, m4
-%elif notcpuflag(ssse3) ; sse2
+%if notcpuflag(ssse3) ; sse2
     mova m4, m0
     shufps m0, m1, 10001000b
     shufps m4, m1, 11011101b
@@ -159,7 +152,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
     ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
     ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
     ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
 %else ; %3 == 8, i.e. filterSize == 8 scaling
     ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
     mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
@@ -197,14 +190,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
     pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]

     ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
-%if mmsize == 8
-    paddd m0, m1
-    paddd m4, m5
-    movq m1, m0
-    punpckldq m0, m4
-    punpckhdq m1, m4
-    paddd m0, m1
-%elif notcpuflag(ssse3) ; sse2
+%if notcpuflag(ssse3) ; sse2
 %if %1 == 8
 %define mex m6
 %else
@@ -233,7 +219,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
     ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
     ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
     ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
 %endif ; %3 == 4/8

 %else ; %3 == X, i.e. any filterSize scaling
@@ -274,7 +260,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
     mov srcq, srcmemmp

 .innerloop:
-    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
+    ; load 2x8 (sse) source pixels into m0/m1 -> m4/m5
     movbh m0, [srcq+ pos0q *srcmul]     ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
     movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
 %if %1 == 8
@@ -319,12 +305,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize

     lea filterq, [filterq+(fltsizeq+dlt)*2]

-%if mmsize == 8 ; mmx
-    movq m0, m4
-    punpckldq m4, m5
-    punpckhdq m0, m5
-    paddd m0, m4
-%else ; mmsize == 16
 %if notcpuflag(ssse3) ; sse2
     mova m1, m4
     punpcklqdq m4, m5
@@ -344,7 +324,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
     phaddd m4, m4
     SWAP 0, 4
 %endif ; sse2/ssse3/sse4
-%endif ; mmsize == 8/16
 %endif ; %3 ==/!= X

 %if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
@@ -372,7 +351,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
 %endif ; %3 ==/!= X
 %endif ; %2 == 15/19
 %ifnidn %3, X
-    add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
+    add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels
     ; per iteration. see "shl wq,1" above as for why we do this
 %else ; %3 == X
     add wq, 2
@@ -385,12 +364,8 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
 %macro SCALE_FUNCS 3
 SCALE_FUNC %1, %2, 4, 4, 6, %3
 SCALE_FUNC %1, %2, 8, 8, 6, %3
-%if mmsize == 8
-SCALE_FUNC %1, %2, X, X, 7, %3
-%else
 SCALE_FUNC %1, %2, X, X4, 7, %3
 SCALE_FUNC %1, %2, X, X8, 7, %3
-%endif
 %endmacro

 ; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
@@ -411,10 +386,6 @@ SCALE_FUNCS 14, 19, %2
 SCALE_FUNCS 16, 19, %3
 %endmacro

-%if ARCH_X86_32
-INIT_MMX mmx
-SCALE_FUNCS2 0, 0, 0
-%endif
 INIT_XMM sse2
 SCALE_FUNCS2 7, 6, 8
 INIT_XMM ssse3
@@ -54,14 +54,6 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
 DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;


-//MMX versions
-#if HAVE_MMX_INLINE
-#undef RENAME
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define RENAME(a) a ## _mmx
-#include "swscale_template.c"
-#endif
-
 // MMXEXT versions
 #if HAVE_MMXEXT_INLINE
 #undef RENAME
@@ -269,9 +261,6 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
 SCALE_FUNCS(X4, opt); \
 SCALE_FUNCS(X8, opt)

-#if ARCH_X86_32
-SCALE_FUNCS_MMX(mmx);
-#endif
 SCALE_FUNCS_SSE(sse2);
 SCALE_FUNCS_SSE(ssse3);
 SCALE_FUNCS_SSE(sse4);
@@ -288,9 +277,7 @@ void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
 VSCALEX_FUNC(9, opt); \
 VSCALEX_FUNC(10, opt)

-#if ARCH_X86_32
-VSCALEX_FUNCS(mmxext);
-#endif
+VSCALEX_FUNC(8, mmxext);
 VSCALEX_FUNCS(sse2);
 VSCALEX_FUNCS(sse4);
 VSCALEX_FUNC(16, sse4);
@@ -305,9 +292,6 @@ void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
 VSCALE_FUNC(10, opt2); \
 VSCALE_FUNC(16, opt1)

-#if ARCH_X86_32
-VSCALE_FUNCS(mmx, mmxext);
-#endif
 VSCALE_FUNCS(sse2, sse2);
 VSCALE_FUNC(16, sse4);
 VSCALE_FUNCS(avx, avx);
@@ -337,9 +321,6 @@ void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
 INPUT_FUNC(rgb24, opt); \
 INPUT_FUNC(bgr24, opt)

-#if ARCH_X86_32
-INPUT_FUNCS(mmx);
-#endif
 INPUT_FUNCS(sse2);
 INPUT_FUNCS(ssse3);
 INPUT_FUNCS(avx);
@@ -470,19 +451,11 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();

-#if HAVE_MMX_INLINE
-    if (INLINE_MMX(cpu_flags))
-        sws_init_swscale_mmx(c);
-#endif
 #if HAVE_MMXEXT_INLINE
     if (INLINE_MMXEXT(cpu_flags))
         sws_init_swscale_mmxext(c);
 #endif
     if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
-#if HAVE_MMX_EXTERNAL
-        if (EXTERNAL_MMX(cpu_flags))
-            c->yuv2planeX = yuv2yuvX_mmx;
-#endif
 #if HAVE_MMXEXT_EXTERNAL
         if (EXTERNAL_MMXEXT(cpu_flags))
             c->yuv2planeX = yuv2yuvX_mmxext;
@@ -496,6 +469,14 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
             c->yuv2planeX = yuv2yuvX_avx2;
 #endif
     }
+#if ARCH_X86_32 && !HAVE_ALIGNED_STACK
+    // The better yuv2planeX_8 functions need aligned stack on x86-32,
+    // so we use MMXEXT in this case if they are not available.
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        if (c->dstBpc == 8 && !c->use_mmx_vfilter)
+            c->yuv2planeX = ff_yuv2planeX_8_mmxext;
+    }
+#endif /* ARCH_X86_32 && !HAVE_ALIGNED_STACK */

 #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
     if (c->srcBpc == 8) { \
@@ -519,12 +500,6 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
                 ff_hscale16to19_ ## filtersize ## _ ## opt1; \
         } \
     } while (0)
-#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
-    switch (filtersize) { \
-    case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
-    case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
-    default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
-    }
 #define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
 switch(c->dstBpc){ \
     case 16: do_16_case; break; \
@@ -546,46 +521,6 @@ switch(c->dstBpc){ \
         if (!c->chrSrcHSubSample) \
             c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
         break
-#if ARCH_X86_32
-    if (EXTERNAL_MMX(cpu_flags)) {
-        ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
-        ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
-        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
-
-        switch (c->srcFormat) {
-        case AV_PIX_FMT_YA8:
-            c->lumToYV12 = ff_yuyvToY_mmx;
-            if (c->needAlpha)
-                c->alpToYV12 = ff_uyvyToY_mmx;
-            break;
-        case AV_PIX_FMT_YUYV422:
-            c->lumToYV12 = ff_yuyvToY_mmx;
-            c->chrToYV12 = ff_yuyvToUV_mmx;
-            break;
-        case AV_PIX_FMT_UYVY422:
-            c->lumToYV12 = ff_uyvyToY_mmx;
-            c->chrToYV12 = ff_uyvyToUV_mmx;
-            break;
-        case AV_PIX_FMT_NV12:
-            c->chrToYV12 = ff_nv12ToUV_mmx;
-            break;
-        case AV_PIX_FMT_NV21:
-            c->chrToYV12 = ff_nv21ToUV_mmx;
-            break;
-        case_rgb(rgb24, RGB24, mmx);
-        case_rgb(bgr24, BGR24, mmx);
-        case_rgb(bgra, BGRA, mmx);
-        case_rgb(rgba, RGBA, mmx);
-        case_rgb(abgr, ABGR, mmx);
-        case_rgb(argb, ARGB, mmx);
-        default:
-            break;
-        }
-    }
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
-    }
-#endif /* ARCH_X86_32 */
 #define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
     switch (filtersize) { \
     case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
@@ -29,13 +29,8 @@
 #undef PREFETCH


-#if COMPILE_TEMPLATE_MMXEXT
 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
 #define MOVNTQ2 "movntq "
-#else
-#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
-#define MOVNTQ2 "movq "
-#endif
 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

 #define YSCALEYUV2PACKEDX_UV \
@@ -600,13 +595,8 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
     "cmp "dstw", "#index" \n\t"\
     " jb 1b \n\t"

-#if COMPILE_TEMPLATE_MMXEXT
 #undef WRITEBGR24
 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
-#else
-#undef WRITEBGR24
-#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
-#endif

 #if HAVE_6REGS
 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
@@ -1478,17 +1468,13 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
     }

     if (c->srcBpc == 8 && c->dstBpc <= 14) {
         // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
-#if COMPILE_TEMPLATE_MMXEXT
         if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
             c->hyscale_fast = ff_hyscale_fast_mmxext;
             c->hcscale_fast = ff_hcscale_fast_mmxext;
         } else {
-#endif /* COMPILE_TEMPLATE_MMXEXT */
             c->hyscale_fast = NULL;
             c->hcscale_fast = NULL;
-#if COMPILE_TEMPLATE_MMXEXT
         }
-#endif /* COMPILE_TEMPLATE_MMXEXT */
     }
 }