mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-08 13:22:53 +02:00
x86/vp9: initial AVX2 intra_pred
tos3k-vp9-b10000.webm on a Core i5-4200U @1.6GHz:
1219 decicycles in ff_vp9_ipred_dc_32x32_ssse3, 131070 runs, 2 skips
439 decicycles in ff_vp9_ipred_dc_32x32_avx2, 131070 runs, 2 skips
3570 decicycles in ff_vp9_ipred_dc_top_32x32_ssse3, 4096 runs, 0 skips
2494 decicycles in ff_vp9_ipred_dc_top_32x32_avx2, 4096 runs, 0 skips
1419 decicycles in ff_vp9_ipred_dc_left_32x32_ssse3, 16384 runs, 0 skips
717 decicycles in ff_vp9_ipred_dc_left_32x32_avx2, 16384 runs, 0 skips
2737 decicycles in ff_vp9_ipred_tm_32x32_avx, 1024 runs, 0 skips
2088 decicycles in ff_vp9_ipred_tm_32x32_avx2, 1024 runs, 0 skips
3090 decicycles in ff_vp9_ipred_v_32x32_avx, 512 runs, 0 skips
2226 decicycles in ff_vp9_ipred_v_32x32_avx2, 512 runs, 0 skips
1565 decicycles in ff_vp9_ipred_h_32x32_avx, 1024 runs, 0 skips
922 decicycles in ff_vp9_ipred_h_32x32_avx2, 1024 runs, 0 skips

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
5183fac92f
commit
fc8db12a73
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* MMX/SSE constants used across x86 dsp optimizations.
|
* MMX/SSE/AVX constants used across x86 dsp optimizations.
|
||||||
*
|
*
|
||||||
* This file is part of FFmpeg.
|
* This file is part of FFmpeg.
|
||||||
*
|
*
|
||||||
@ -47,7 +47,9 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x020
|
|||||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
|
||||||
|
|
||||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
|
||||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL,
|
||||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
|
0x0101010101010101ULL, 0x0101010101010101ULL };
|
||||||
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL,
|
||||||
|
0x0303030303030303ULL, 0x0303030303030303ULL };
|
||||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
|
||||||
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
|
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
|
||||||
|
@ -44,8 +44,8 @@ extern const uint64_t ff_pw_96;
|
|||||||
extern const uint64_t ff_pw_128;
|
extern const uint64_t ff_pw_128;
|
||||||
extern const uint64_t ff_pw_255;
|
extern const uint64_t ff_pw_255;
|
||||||
|
|
||||||
extern const xmm_reg ff_pb_1;
|
extern const ymm_reg ff_pb_1;
|
||||||
extern const xmm_reg ff_pb_3;
|
extern const ymm_reg ff_pb_3;
|
||||||
extern const xmm_reg ff_pb_80;
|
extern const xmm_reg ff_pb_80;
|
||||||
extern const xmm_reg ff_pb_F8;
|
extern const xmm_reg ff_pb_F8;
|
||||||
extern const uint64_t ff_pb_FC;
|
extern const uint64_t ff_pb_FC;
|
||||||
|
@ -241,6 +241,13 @@ ipred_funcs(hd, ssse3, avx);
|
|||||||
ipred_funcs(vl, ssse3, avx);
|
ipred_funcs(vl, ssse3, avx);
|
||||||
ipred_funcs(vr, ssse3, avx);
|
ipred_funcs(vr, ssse3, avx);
|
||||||
|
|
||||||
|
ipred_func(32, dc, avx2);
|
||||||
|
ipred_func(32, dc_left, avx2);
|
||||||
|
ipred_func(32, dc_top, avx2);
|
||||||
|
ipred_func(32, v, avx2);
|
||||||
|
ipred_func(32, h, avx2);
|
||||||
|
ipred_func(32, tm, avx2);
|
||||||
|
|
||||||
#undef ipred_funcs
|
#undef ipred_funcs
|
||||||
#undef ipred_func_set
|
#undef ipred_func_set
|
||||||
#undef ipred_func
|
#undef ipred_func
|
||||||
@ -388,6 +395,15 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
|
|||||||
init_ipred(TX_32X32, 32, avx);
|
init_ipred(TX_32X32, 32, avx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (EXTERNAL_AVX2(cpu_flags)) {
|
||||||
|
dsp->intra_pred[TX_32X32][DC_PRED] = ff_vp9_ipred_dc_32x32_avx2;
|
||||||
|
dsp->intra_pred[TX_32X32][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_32x32_avx2;
|
||||||
|
dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_vp9_ipred_dc_top_32x32_avx2;
|
||||||
|
dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_avx2;
|
||||||
|
dsp->intra_pred[TX_32X32][HOR_PRED] = ff_vp9_ipred_h_32x32_avx2;
|
||||||
|
dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_vp9_ipred_tm_32x32_avx2;
|
||||||
|
}
|
||||||
|
|
||||||
#undef init_fpel
|
#undef init_fpel
|
||||||
#undef init_subpel1
|
#undef init_subpel1
|
||||||
#undef init_subpel2
|
#undef init_subpel2
|
||||||
|
@ -29,10 +29,10 @@
|
|||||||
|
|
||||||
%include "libavutil/x86/x86util.asm"
|
%include "libavutil/x86/x86util.asm"
|
||||||
|
|
||||||
SECTION_RODATA
|
SECTION_RODATA 32
|
||||||
|
|
||||||
pw_m256: times 8 dw -256
|
pw_m256: times 16 dw -256
|
||||||
pw_m255: times 8 dw -255
|
pw_m255: times 16 dw -255
|
||||||
pw_512: times 8 dw 512
|
pw_512: times 8 dw 512
|
||||||
pw_1024: times 8 dw 1024
|
pw_1024: times 8 dw 1024
|
||||||
pw_2048: times 8 dw 2048
|
pw_2048: times 8 dw 2048
|
||||||
@ -72,7 +72,7 @@ pb_3to1_5x0: db 3, 2, 1
|
|||||||
times 9 db 0
|
times 9 db 0
|
||||||
pb_Fto0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
pb_Fto0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||||
|
|
||||||
pb_2: times 16 db 2
|
pb_2: times 32 db 2
|
||||||
pb_15: times 16 db 15
|
pb_15: times 16 db 15
|
||||||
|
|
||||||
cextern pb_1
|
cextern pb_1
|
||||||
@ -180,6 +180,40 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
|
|||||||
jg .loop
|
jg .loop
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
%if HAVE_AVX2_EXTERNAL
|
||||||
|
INIT_YMM avx2
|
||||||
|
cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
|
||||||
|
mova m0, [lq]
|
||||||
|
mova m1, [aq]
|
||||||
|
DEFINE_ARGS dst, stride, stride3, cnt
|
||||||
|
lea stride3q, [strideq*3]
|
||||||
|
pxor m2, m2
|
||||||
|
psadbw m0, m2
|
||||||
|
psadbw m1, m2
|
||||||
|
paddw m0, m1
|
||||||
|
vextracti128 xm1, m0, 1
|
||||||
|
paddw xm0, xm1
|
||||||
|
movhlps xm1, xm0
|
||||||
|
paddw xm0, xm1
|
||||||
|
pmulhrsw xm0, [pw_512]
|
||||||
|
vpbroadcastb m0, xm0
|
||||||
|
mov cntd, 4
|
||||||
|
.loop:
|
||||||
|
mova [dstq+strideq*0], m0
|
||||||
|
mova [dstq+strideq*1], m0
|
||||||
|
mova [dstq+strideq*2], m0
|
||||||
|
mova [dstq+stride3q ], m0
|
||||||
|
lea dstq, [dstq+strideq*4]
|
||||||
|
mova [dstq+strideq*0], m0
|
||||||
|
mova [dstq+strideq*1], m0
|
||||||
|
mova [dstq+strideq*2], m0
|
||||||
|
mova [dstq+stride3q ], m0
|
||||||
|
lea dstq, [dstq+strideq*4]
|
||||||
|
dec cntd
|
||||||
|
jg .loop
|
||||||
|
RET
|
||||||
|
%endif
|
||||||
|
|
||||||
; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
|
; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
|
||||||
|
|
||||||
%macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
|
%macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
|
||||||
@ -267,6 +301,37 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
|
|||||||
dec cntd
|
dec cntd
|
||||||
jg .loop
|
jg .loop
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
%if HAVE_AVX2_EXTERNAL
|
||||||
|
INIT_YMM avx2
|
||||||
|
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
|
||||||
|
mova m0, [%2q]
|
||||||
|
DEFINE_ARGS dst, stride, stride3, cnt
|
||||||
|
lea stride3q, [strideq*3]
|
||||||
|
pxor m2, m2
|
||||||
|
psadbw m0, m2
|
||||||
|
vextracti128 xm1, m0, 1
|
||||||
|
paddw xm0, xm1
|
||||||
|
movhlps xm1, xm0
|
||||||
|
paddw xm0, xm1
|
||||||
|
pmulhrsw xm0, [pw_1024]
|
||||||
|
vpbroadcastb m0, xm0
|
||||||
|
mov cntd, 4
|
||||||
|
.loop:
|
||||||
|
mova [dstq+strideq*0], m0
|
||||||
|
mova [dstq+strideq*1], m0
|
||||||
|
mova [dstq+strideq*2], m0
|
||||||
|
mova [dstq+stride3q ], m0
|
||||||
|
lea dstq, [dstq+strideq*4]
|
||||||
|
mova [dstq+strideq*0], m0
|
||||||
|
mova [dstq+strideq*1], m0
|
||||||
|
mova [dstq+strideq*2], m0
|
||||||
|
mova [dstq+stride3q ], m0
|
||||||
|
lea dstq, [dstq+strideq*4]
|
||||||
|
dec cntd
|
||||||
|
jg .loop
|
||||||
|
RET
|
||||||
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
DC_1D_FUNCS top, a
|
DC_1D_FUNCS top, a
|
||||||
@ -327,6 +392,29 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
|
|||||||
jg .loop
|
jg .loop
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
%if HAVE_AVX2_EXTERNAL
|
||||||
|
INIT_YMM avx2
|
||||||
|
cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
|
||||||
|
mova m0, [aq]
|
||||||
|
DEFINE_ARGS dst, stride, stride3, cnt
|
||||||
|
lea stride3q, [strideq*3]
|
||||||
|
mov cntd, 4
|
||||||
|
.loop:
|
||||||
|
mova [dstq+strideq*0], m0
|
||||||
|
mova [dstq+strideq*1], m0
|
||||||
|
mova [dstq+strideq*2], m0
|
||||||
|
mova [dstq+stride3q ], m0
|
||||||
|
lea dstq, [dstq+strideq*4]
|
||||||
|
mova [dstq+strideq*0], m0
|
||||||
|
mova [dstq+strideq*1], m0
|
||||||
|
mova [dstq+strideq*2], m0
|
||||||
|
mova [dstq+stride3q ], m0
|
||||||
|
lea dstq, [dstq+strideq*4]
|
||||||
|
dec cntd
|
||||||
|
jg .loop
|
||||||
|
RET
|
||||||
|
%endif
|
||||||
|
|
||||||
; h
|
; h
|
||||||
|
|
||||||
INIT_XMM ssse3
|
INIT_XMM ssse3
|
||||||
@ -417,6 +505,32 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
|
|||||||
H_XMM_FUNCS ssse3
|
H_XMM_FUNCS ssse3
|
||||||
H_XMM_FUNCS avx
|
H_XMM_FUNCS avx
|
||||||
|
|
||||||
|
%if HAVE_AVX2_EXTERNAL
|
||||||
|
INIT_YMM avx2
|
||||||
|
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
|
||||||
|
mova m5, [pb_1]
|
||||||
|
mova m6, [pb_2]
|
||||||
|
mova m7, [pb_3]
|
||||||
|
pxor m4, m4
|
||||||
|
lea stride3q, [strideq*3]
|
||||||
|
mov cntq, 7
|
||||||
|
.loop:
|
||||||
|
movd xm3, [lq+cntq*4]
|
||||||
|
vinserti128 m3, m3, xm3, 1
|
||||||
|
pshufb m0, m3, m7
|
||||||
|
pshufb m1, m3, m6
|
||||||
|
mova [dstq+strideq*0], m0
|
||||||
|
mova [dstq+strideq*1], m1
|
||||||
|
pshufb m2, m3, m5
|
||||||
|
pshufb m3, m4
|
||||||
|
mova [dstq+strideq*2], m2
|
||||||
|
mova [dstq+stride3q ], m3
|
||||||
|
lea dstq, [dstq+strideq*4]
|
||||||
|
dec cntq
|
||||||
|
jge .loop
|
||||||
|
RET
|
||||||
|
%endif
|
||||||
|
|
||||||
; tm
|
; tm
|
||||||
|
|
||||||
INIT_MMX ssse3
|
INIT_MMX ssse3
|
||||||
@ -554,6 +668,41 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
|
|||||||
TM_XMM_FUNCS ssse3
|
TM_XMM_FUNCS ssse3
|
||||||
TM_XMM_FUNCS avx
|
TM_XMM_FUNCS avx
|
||||||
|
|
||||||
|
%if HAVE_AVX2_EXTERNAL
|
||||||
|
INIT_YMM avx2
|
||||||
|
cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
|
||||||
|
pxor m3, m3
|
||||||
|
pinsrw xm2, [aq-1], 0
|
||||||
|
vinserti128 m2, m2, xm2, 1
|
||||||
|
mova m0, [aq]
|
||||||
|
DEFINE_ARGS dst, stride, l, cnt
|
||||||
|
mova m4, [pw_m256]
|
||||||
|
mova m5, [pw_m255]
|
||||||
|
pshufb m2, m4
|
||||||
|
punpckhbw m1, m0, m3
|
||||||
|
punpcklbw m0, m3
|
||||||
|
psubw m1, m2
|
||||||
|
psubw m0, m2
|
||||||
|
mov cntq, 15
|
||||||
|
.loop:
|
||||||
|
pinsrw xm7, [lq+cntq*2], 0
|
||||||
|
vinserti128 m7, m7, xm7, 1
|
||||||
|
pshufb m3, m7, m5
|
||||||
|
pshufb m7, m4
|
||||||
|
paddw m2, m3, m0
|
||||||
|
paddw m3, m1
|
||||||
|
paddw m6, m7, m0
|
||||||
|
paddw m7, m1
|
||||||
|
packuswb m2, m3
|
||||||
|
packuswb m6, m7
|
||||||
|
mova [dstq+strideq*0], m2
|
||||||
|
mova [dstq+strideq*1], m6
|
||||||
|
lea dstq, [dstq+strideq*2]
|
||||||
|
dec cntq
|
||||||
|
jge .loop
|
||||||
|
RET
|
||||||
|
%endif
|
||||||
|
|
||||||
; dl
|
; dl
|
||||||
|
|
||||||
%macro LOWPASS 4 ; left [dst], center, right, tmp
|
%macro LOWPASS 4 ; left [dst], center, right, tmp
|
||||||
|
@ -25,6 +25,7 @@
|
|||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
|
||||||
typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
|
typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
|
||||||
|
typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
|
||||||
|
|
||||||
#if ARCH_X86_64
|
#if ARCH_X86_64
|
||||||
# define OPSIZE "q"
|
# define OPSIZE "q"
|
||||||
|
Loading…
Reference in New Issue
Block a user