mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
avcodec/v210enc: add new 10-bit function for avx512 and avx512icl
avx512 on Skylake-X (Xeon D-2123IT): 1.19x faster (970±91.2 vs. 817±104.4 decicycles) compared with avx2.
avx512icl on Ice Lake (Xeon Silver 4316): 2.52x faster (1350±5.3 vs. 535±9.5 decicycles) compared with avx2.
This commit is contained in:
parent
bda53d2dde
commit
651cb867b1
@ -56,6 +56,36 @@ v210enc_8_permd: dd 0,1,4,5, 1,2,5,6
|
||||
v210enc_8_mult: db 4, 0, 64, 0
|
||||
v210enc_8_mask: dd 255<<12
|
||||
|
||||
; ---------------------------------------------------------------------------
; Constant tables for v210_planar_pack_10_new.
; ---------------------------------------------------------------------------
; icl_perm_y: vpermb byte indices for the luma vector (avx512icl path).
; 8 repetitions x 8 bytes = 64 indices. Each group of 8 output bytes draws on
; 6 consecutive source bytes (i .. i+5); output positions 0 and 3 of every
; group hold -1 placeholders, which icl_perm_y_kmask zeroes.
icl_perm_y: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb
|
||||
%assign i 0
|
||||
%rep 8
|
||||
db -1,i+0,i+1,-1 , i+2,i+3,i+4,i+5
|
||||
%assign i i+6
|
||||
%endrep
|
||||
|
||||
; icl_perm_uv: vpermb byte indices for the combined chroma vector (avx512icl path).
; 4 repetitions x 16 bytes = 64 indices. Indices i.. select from the low 32 bytes
; and i+32.. from the high 32 bytes of the source register; the macro loads u into
; the low half and inserts v into the high half. Each repetition consumes 6 bytes
; from each half. The -1 placeholders (positions 4, 7, 12, 15 of every 16) are
; zeroed by icl_perm_uv_kmask.
icl_perm_uv: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb
|
||||
%assign i 0
|
||||
%rep 4
|
||||
db i+0,i+1,i+32,i+33 , -1,i+2,i+3,-1 , i+34,i+35,i+4,i+5 , -1,i+36,i+37,-1
|
||||
%assign i i+6
|
||||
%endrep
|
||||
|
||||
; 64-bit write masks (8 identical bytes each, loaded with kmovq) used with {z}
; zero-masking on vpermb. A clear bit zeroes the corresponding output byte:
;   y mask  0b1111_0110 -> bytes 0 and 3 of every 8 are zeroed
;   uv mask 0b0110_1111 -> bytes 4 and 7 of every 8 are zeroed
; These are exactly the positions holding -1 in the permute tables.
icl_perm_y_kmask: times 8 db 0b1111_0110
|
||||
icl_perm_uv_kmask: times 8 db 0b0110_1111
|
||||
|
||||
; Per-word left-shift counts consumed by vpsllvw before the byte permute.
; icl_shift_y: 30 words repeating 2,0,4 plus 4 padding bytes = 64 bytes.
icl_shift_y: times 10 dw 2,0,4
|
||||
times 4 db 0 ; padding to 64 bytes
|
||||
; icl_shift_uv: two 32-byte halves. The first repeats 0,2,4 (15 words + 1 pad word),
; the second repeats 4,0,2 (15 words + 1 pad word). A ZMM load picks up both halves
; (low half pairs with u, high half with v). A YMM load reads only the first
; 32 bytes, so its upper 8 words are words 8..15 of the 0,2,4 sequence,
; i.e. 4,0,2,4,0,2,4,pad.
icl_shift_uv: times 5 dw 0,2,4
|
||||
times 2 db 0 ; padding to 32 bytes
|
||||
times 5 dw 4,0,2
|
||||
|
||||
times 2 db 0 ; padding to 32 bytes
|
||||
|
||||
v210enc_10_permd_y: dd 0,1,2,-1 , 3,4,5,-1
|
||||
v210enc_10_shufb_y: db -1,0,1,-1 , 2,3,4,5 , -1,6,7,-1 , 8,9,10,11
|
||||
v210enc_10_permd_uv: dd 0,1,4,5 , 1,2,5,6
|
||||
v210enc_10_shufb_uv: db 0,1, 8, 9 , -1,2,3,-1 , 10,11,4,5 , -1,12,13,-1
|
||||
db 2,3,10,11 , -1,4,5,-1 , 12,13,6,7 , -1,14,15,-1
|
||||
|
||||
SECTION .text
|
||||
|
||||
%macro v210_planar_pack_10 0
|
||||
@ -113,6 +143,75 @@ INIT_YMM avx2
|
||||
v210_planar_pack_10
|
||||
%endif
|
||||
|
||||
;-----------------------------------------------------------------------------
; v210_planar_pack_10_new: emits v210_planar_pack_10 for the current INIT_* cpu.
;
; Arguments (named by cglobal): y, u, v, dst, width.
; Each loop iteration loads one full vector of y plus half a vector each of u
; and v, shifts every 16-bit word left by a per-word count, rearranges bytes so
; the y and chroma vectors occupy complementary positions, ORs them and stores
; mmsize bytes to dst. widthq advances by (mmsize*3)/8 per iteration.
;
; Vector registers: 8 on avx512icl, 10 otherwise (the non-ICL path also uses
; m8 and m9 for the chroma permute/shuffle controls).
;   m2/m3 = shift counts (y / uv)      m4/m5 = clip min / max
;   m6/m7 = y permute controls (ICL: m6 = y vpermb, m7 = uv vpermb)
;
; NOTE(review): every iteration loads a full mmsize-byte vector of y (and mmsize/2
; bytes each of u and v) while the pointers effectively advance by only 3/4 of
; that, so the final iteration reads beyond the samples it consumes - confirm
; callers provide readable padding. A non-positive width still executes the loop
; body once, as the condition is checked at the bottom.
;-----------------------------------------------------------------------------
%macro v210_planar_pack_10_new 0
|
||||
|
||||
cglobal v210_planar_pack_10, 5, 5, 8+2*notcpuflag(avx512icl), y, u, v, dst, width
|
||||
; Point y/u/v at the end of their rows (y moves 2*width bytes, u and v move
; width bytes) and negate width, so the loop indexes with a negative offset
; that counts up towards zero.
lea yq, [yq+2*widthq]
|
||||
add uq, widthq
|
||||
add vq, widthq
|
||||
neg widthq
|
||||
|
||||
; Loop-invariant permute controls. ICL: vpermb indices plus zeroing k-masks.
; Otherwise: dword permute (vpermd) followed by in-lane byte shuffle (pshufb).
%if cpuflag(avx512icl)
|
||||
movu m6, [icl_perm_y]
|
||||
movu m7, [icl_perm_uv]
|
||||
kmovq k1, [icl_perm_y_kmask]
|
||||
kmovq k2, [icl_perm_uv_kmask]
|
||||
%else
|
||||
movu m6, [v210enc_10_permd_y]
|
||||
VBROADCASTI128 m7, [v210enc_10_shufb_y]
|
||||
movu m8, [v210enc_10_permd_uv]
|
||||
movu m9, [v210enc_10_shufb_uv]
|
||||
%endif
|
||||
; Shift tables are shared by both paths; the YMM build simply loads the first
; 32 bytes of each table.
movu m2, [icl_shift_y]
|
||||
movu m3, [icl_shift_uv]
|
||||
; Clip bounds are broadcast from 128-bit constants to the full register width.
VBROADCASTI128 m4, [v210_enc_min_10] ; only ymm sized
|
||||
VBROADCASTI128 m5, [v210_enc_max_10] ; only ymm sized
|
||||
|
||||
.loop:
|
||||
; m0 = one full vector of y words.
movu m0, [yq + widthq*2]
|
||||
; m1 = u in the low half, v in the high half.
%if cpuflag(avx512icl)
|
||||
movu ym1, [uq + widthq*1]
|
||||
vinserti32x8 zm1, [vq + widthq*1], 1
|
||||
%else
|
||||
movu xm1, [uq + widthq*1]
|
||||
vinserti128 ym1, [vq + widthq*1], 1
|
||||
%endif
|
||||
; CLIPW is defined outside this view; presumably clamps each word to [m4, m5].
CLIPW m0, m4, m5
|
||||
CLIPW m1, m4, m5
|
||||
|
||||
; Shift each word left by its per-word count (0, 2 or 4 bits) ahead of the
; byte rearrangement.
vpsllvw m0, m2
|
||||
vpsllvw m1, m3
|
||||
%if cpuflag(avx512icl)
|
||||
vpermb m0{k1}{z}, m6, m0 ; make space for uv where the k-mask sets to zero
|
||||
vpermb m1{k2}{z}, m7, m1 ; interleave uv and make space for y where the k-mask sets to zero
|
||||
%else
|
||||
; Two-step equivalent: move dwords across lanes, then shuffle bytes in-lane.
vpermd m0, m6, m0
|
||||
pshufb m0, m7
|
||||
vpermd m1, m8, m1
|
||||
pshufb m1, m9
|
||||
%endif
|
||||
; Merge the y vector and the chroma vector.
por m0, m1
|
||||
|
||||
; Store one vector of output, advance, and loop while widthq is still negative.
movu [dstq], m0
|
||||
add dstq, mmsize
|
||||
add widthq, (mmsize*3)/8
|
||||
jl .loop
|
||||
RET
|
||||
|
||||
%endmacro
|
||||
|
||||
; Instantiate v210_planar_pack_10_new for each supported instruction set.
; The avx512 build uses YMM registers and takes the non-ICL path of the macro,
; which needs m8/m9; it is only assembled on x86-64.
%if ARCH_X86_64
|
||||
%if HAVE_AVX512_EXTERNAL
|
||||
INIT_YMM avx512
|
||||
v210_planar_pack_10_new
|
||||
%endif
|
||||
%endif
|
||||
|
||||
; The avx512icl build uses ZMM registers and the vpermb path (m0-m7 only);
; it is not gated on ARCH_X86_64.
%if HAVE_AVX512ICL_EXTERNAL
|
||||
INIT_ZMM avx512icl
|
||||
v210_planar_pack_10_new
|
||||
%endif
|
||||
|
||||
%macro v210_planar_pack_8 0
|
||||
|
||||
; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width)
|
||||
|
@ -37,6 +37,12 @@ void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
|
||||
/* 10-bit planar-to-v210 packers: y/u/v are the 16-bit source planes, dst is the
 * packed output, width is the row width. */
void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u,
|
||||
const uint16_t *v, uint8_t *dst,
|
||||
ptrdiff_t width);
|
||||
/* AVX-512 variants added by this commit, with the same signature as the AVX2
 * version. Presumably these resolve to the v210_planar_pack_10 symbols produced
 * by the assembly macro under INIT_YMM avx512 / INIT_ZMM avx512icl - confirm
 * against the x86inc name mangling. */
void ff_v210_planar_pack_10_avx512(const uint16_t *y, const uint16_t *u,
|
||||
const uint16_t *v, uint8_t *dst,
|
||||
ptrdiff_t width);
|
||||
void ff_v210_planar_pack_10_avx512icl(const uint16_t *y, const uint16_t *u,
|
||||
const uint16_t *v, uint8_t *dst,
|
||||
ptrdiff_t width);
|
||||
|
||||
av_cold void ff_v210enc_init_x86(V210EncContext *s)
|
||||
{
|
||||
@ -60,10 +66,16 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
|
||||
if (EXTERNAL_AVX512(cpu_flags)) {
|
||||
s->sample_factor_8 = 2;
|
||||
s->pack_line_8 = ff_v210_planar_pack_8_avx512;
|
||||
#if ARCH_X86_64
|
||||
s->sample_factor_10 = 2;
|
||||
s->pack_line_10 = ff_v210_planar_pack_10_avx512;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (EXTERNAL_AVX512ICL(cpu_flags)) {
|
||||
s->sample_factor_8 = 4;
|
||||
s->pack_line_8 = ff_v210_planar_pack_8_avx512icl;
|
||||
s->sample_factor_10 = 4;
|
||||
s->pack_line_10 = ff_v210_planar_pack_10_avx512icl;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user