mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
vp9lpf/x86: add ff_vp9_loop_filter_h_{48,84}_16_{sse2,ssse3,avx}().
Signed-off-by: Anton Khirnov <anton@khirnov.net>
This commit is contained in:
parent
92d47550ea
commit
f2e3d706a1
@ -226,6 +226,12 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
|
||||
lpf_funcs(16, 16, sse2);
|
||||
lpf_funcs(16, 16, ssse3);
|
||||
lpf_funcs(16, 16, avx);
|
||||
lpf_funcs(84, 16, sse2);
|
||||
lpf_funcs(84, 16, ssse3);
|
||||
lpf_funcs(84, 16, avx);
|
||||
lpf_funcs(48, 16, sse2);
|
||||
lpf_funcs(48, 16, ssse3);
|
||||
lpf_funcs(48, 16, avx);
|
||||
lpf_funcs(88, 16, sse2);
|
||||
lpf_funcs(88, 16, ssse3);
|
||||
lpf_funcs(88, 16, avx);
|
||||
@ -269,6 +275,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
|
||||
init_subpel3_8to64(idx, type, opt); \
|
||||
init_subpel2(4, idx, 4, type, opt)
|
||||
|
||||
#define init_lpf(opt) do { \
|
||||
if (ARCH_X86_64) { \
|
||||
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
|
||||
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
|
||||
dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
|
||||
dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
|
||||
dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
|
||||
dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
|
||||
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
|
||||
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
if (EXTERNAL_MMX(cpu_flags)) {
|
||||
init_fpel(4, 0, 4, put, mmx);
|
||||
init_fpel(3, 0, 8, put, mmx);
|
||||
@ -293,36 +312,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
|
||||
init_fpel(2, 1, 16, avg, sse2);
|
||||
init_fpel(1, 1, 32, avg, sse2);
|
||||
init_fpel(0, 1, 64, avg, sse2);
|
||||
if (ARCH_X86_64) {
|
||||
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_sse2;
|
||||
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_sse2;
|
||||
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
|
||||
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
|
||||
}
|
||||
init_lpf(sse2);
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSSE3(cpu_flags)) {
|
||||
init_subpel3(0, put, ssse3);
|
||||
init_subpel3(1, avg, ssse3);
|
||||
|
||||
if (ARCH_X86_64) {
|
||||
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_ssse3;
|
||||
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_ssse3;
|
||||
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_ssse3;
|
||||
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_ssse3;
|
||||
}
|
||||
init_lpf(ssse3);
|
||||
}
|
||||
|
||||
if (EXTERNAL_AVX(cpu_flags)) {
|
||||
init_fpel(1, 0, 32, put, avx);
|
||||
init_fpel(0, 0, 64, put, avx);
|
||||
|
||||
if (ARCH_X86_64) {
|
||||
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_avx;
|
||||
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_avx;
|
||||
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_avx;
|
||||
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_avx;
|
||||
}
|
||||
init_lpf(avx);
|
||||
}
|
||||
|
||||
if (EXTERNAL_AVX2(cpu_flags)) {
|
||||
|
@ -45,6 +45,11 @@ pw_8: times 8 dw 8
|
||||
mask_mix: times 8 db 0
|
||||
times 8 db 1
|
||||
|
||||
mask_mix84: times 8 db 0xff
|
||||
times 8 db 0x00
|
||||
mask_mix48: times 8 db 0x00
|
||||
times 8 db 0xff
|
||||
|
||||
SECTION .text
|
||||
|
||||
; %1 = abs(%2-%3)
|
||||
@ -312,7 +317,7 @@ SECTION .text
|
||||
neg mstride3q
|
||||
|
||||
%ifidn %1, h
|
||||
%if %2 == 88
|
||||
%if %2 > 16
|
||||
%define movx movh
|
||||
lea dstq, [dstq + 8*strideq - 4]
|
||||
%else
|
||||
@ -360,7 +365,7 @@ SECTION .text
|
||||
%define Q6 rsp + 224
|
||||
%define Q7 rsp + 240
|
||||
|
||||
%if %2 != 88
|
||||
%if %2 == 16
|
||||
TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
|
||||
mova [P7], m0
|
||||
mova [P6], m1
|
||||
@ -377,7 +382,7 @@ SECTION .text
|
||||
mova [Q1], m9
|
||||
mova [Q2], m10
|
||||
mova [Q3], m11
|
||||
%if %2 != 88
|
||||
%if %2 == 16
|
||||
mova [Q4], m12
|
||||
mova [Q5], m13
|
||||
mova [Q6], m14
|
||||
@ -392,7 +397,7 @@ SECTION .text
|
||||
%endif
|
||||
SPLATB_REG m2, I, m0 ; I I I I ...
|
||||
SPLATB_REG m3, E, m0 ; E E E E ...
|
||||
%elif %2 == 88
|
||||
%else
|
||||
%if cpuflag(ssse3)
|
||||
mova m0, [mask_mix]
|
||||
%endif
|
||||
@ -452,7 +457,7 @@ SECTION .text
|
||||
ABSSUB_CMP m1, m9, m11, m6, m4, m5, m8 ; abs(p2 - p0) <= 1
|
||||
pand m2, m1
|
||||
ABSSUB m4, m10, m11, m5 ; abs(p1 - p0)
|
||||
%if %2 != 88
|
||||
%if %2 == 16
|
||||
%if cpuflag(ssse3)
|
||||
pxor m0, m0
|
||||
%endif
|
||||
@ -476,8 +481,11 @@ SECTION .text
|
||||
pand m2, m1
|
||||
ABSSUB_CMP m1, m15, m12, m6, m4, m5, m8 ; abs(q3 - q0) <= 1
|
||||
pand m2, m1 ; flat8in final value
|
||||
%if %2 == 84 || %2 == 48
|
||||
pand m2, [mask_mix%2]
|
||||
%endif
|
||||
|
||||
%if %2 != 88
|
||||
%if %2 == 16
|
||||
; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
|
||||
; calc flat8out mask
|
||||
mova m8, [P7]
|
||||
@ -570,7 +578,7 @@ SECTION .text
|
||||
; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
|
||||
; filter6()
|
||||
pxor m0, m0
|
||||
%if %2 == 88
|
||||
%if %2 > 16
|
||||
pand m3, m2
|
||||
%else
|
||||
pand m2, m3 ; mask(fm) & mask(in)
|
||||
@ -608,7 +616,7 @@ SECTION .text
|
||||
; q5 +5 -p2 -q4 +q5 +q7 . q5 . .
|
||||
; q6 +6 -p1 -q5 +q6 +q7 . q6 . .
|
||||
|
||||
%if %2 != 88
|
||||
%if %2 == 16
|
||||
pand m1, m2 ; mask(out) & (mask(fm) & mask(in))
|
||||
mova m2, [P7]
|
||||
mova m3, [P6]
|
||||
@ -631,7 +639,7 @@ SECTION .text
|
||||
%endif
|
||||
|
||||
%ifidn %1, h
|
||||
%if %2 != 88
|
||||
%if %2 == 16
|
||||
mova m0, [P7]
|
||||
mova m1, [P6]
|
||||
mova m2, [P5]
|
||||
@ -720,28 +728,23 @@ SECTION .text
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%macro LPF_16_16_VH 1
|
||||
INIT_XMM %1
|
||||
cglobal vp9_loop_filter_v_16_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
|
||||
LOOPFILTER v, 16
|
||||
cglobal vp9_loop_filter_h_16_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
|
||||
LOOPFILTER h, 16
|
||||
%macro LPF_16_VH 2
|
||||
INIT_XMM %2
|
||||
cglobal vp9_loop_filter_v_%1_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
|
||||
LOOPFILTER v, %1
|
||||
cglobal vp9_loop_filter_h_%1_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
|
||||
LOOPFILTER h, %1
|
||||
%endmacro
|
||||
|
||||
%macro LPF_88_16_VH 1
|
||||
INIT_XMM %1
|
||||
cglobal vp9_loop_filter_v_88_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
|
||||
LOOPFILTER v, 88
|
||||
cglobal vp9_loop_filter_h_88_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
|
||||
LOOPFILTER h, 88
|
||||
%macro LPF_16_VH_ALL_OPTS 1
|
||||
LPF_16_VH %1, sse2
|
||||
LPF_16_VH %1, ssse3
|
||||
LPF_16_VH %1, avx
|
||||
%endmacro
|
||||
|
||||
LPF_16_16_VH sse2
|
||||
LPF_16_16_VH ssse3
|
||||
LPF_16_16_VH avx
|
||||
|
||||
LPF_88_16_VH sse2
|
||||
LPF_88_16_VH ssse3
|
||||
LPF_88_16_VH avx
|
||||
LPF_16_VH_ALL_OPTS 16
|
||||
LPF_16_VH_ALL_OPTS 48
|
||||
LPF_16_VH_ALL_OPTS 84
|
||||
LPF_16_VH_ALL_OPTS 88
|
||||
|
||||
%endif ; x86-64
|
||||
|
Loading…
Reference in New Issue
Block a user