mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
avcodec/x86: add avx512icl function for v210dec
Ice Lake (Xeon Silver 4316): 2.01x faster than avx2 (avx2: 1147±36.8 decicycles vs. avx512icl: 571±38.2 decicycles)
This commit is contained in:
parent
f30b4c2f47
commit
6af453ca38
@ -17,7 +17,7 @@
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/v210dec.h"
|
||||
|
||||
extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
|
||||
@ -28,6 +28,8 @@ extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y
|
||||
extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
|
||||
extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
|
||||
|
||||
extern void ff_v210_planar_unpack_avx512icl(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
|
||||
|
||||
av_cold void ff_v210_x86_init(V210DecContext *s)
|
||||
{
|
||||
#if HAVE_X86ASM
|
||||
@ -42,6 +44,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
|
||||
|
||||
if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
|
||||
s->unpack_frame = ff_v210_planar_unpack_aligned_avx2;
|
||||
|
||||
if (EXTERNAL_AVX512ICL(cpu_flags))
|
||||
s->unpack_frame = ff_v210_planar_unpack_avx512icl;
|
||||
}
|
||||
else {
|
||||
if (cpu_flags & AV_CPU_FLAG_SSSE3)
|
||||
@ -52,6 +57,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
|
||||
|
||||
if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
|
||||
s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2;
|
||||
|
||||
if (EXTERNAL_AVX512ICL(cpu_flags))
|
||||
s->unpack_frame = ff_v210_planar_unpack_avx512icl;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -22,7 +22,21 @@
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA 32
|
||||
SECTION_RODATA 64
|
||||
|
||||
; perm_y: 64 byte indices used as the vpermb control for the luma output of
; the avx512icl v210_planar_unpack function (loaded into m1 there).
; Out of every 16-byte group of the blended source it picks the 16-bit words
; at byte offsets 0, 4, 6, 8, 12 and 14 and packs them back to back:
; 4 groups x 6 words = 24 words = 48 index bytes.
perm_y:
|
||||
db 0,1, 4,5, 6,7, 8,9, 12,13, 14,15, 16,17, 20,21
|
||||
db 22,23, 24,25, 28,29, 30,31, 32,33, 36,37, 38,39, 40,41
|
||||
db 44,45, 46,47, 48,49, 52,53, 54,55, 56,57, 60,61, 62,63
|
||||
times 16 db 0xff ; filler indices: pad the 48-byte table to 64 bytes (one full zmm load)
|
||||
|
||||
; perm_uv: 64 byte indices used as the vpermb control for the chroma output
; of the avx512icl v210_planar_unpack function (loaded into m2 there).
; The table is two 32-byte halves, each 24 index bytes (12 words) plus
; 8 filler bytes: the function stores the low 256 bits of the permuted
; result to u and the high 256 bits to v.
perm_uv:
|
||||
; low half -> u plane: words at byte offsets 0, 4, 10 of each 16-byte group
db 0,1, 4,5, 10,11, 16,17, 20,21, 26,27, 32,33, 36,37
|
||||
db 42,43, 48,49, 52,53, 58,59
|
||||
times 8 db 0xff ; filler indices: pad the 24-byte half to 32 bytes
|
||||
; high half -> v plane: words at byte offsets 2, 8, 12 of each 16-byte group
db 2,3, 8,9, 12,13, 18,19, 24,25, 28,29, 34,35, 40,41
|
||||
db 44,45, 50,51, 56,57, 60,61
|
||||
times 8 db 0xff ; filler indices: pad the 24-byte half to 32 bytes
|
||||
|
||||
; for AVX2 version only
|
||||
v210_luma_permute: dd 0,1,2,4,5,6,7,7 ; 32-byte alignment required
|
||||
@ -34,6 +48,9 @@ v210_mult: dw 64,4,64,4,64,4,64,4
|
||||
v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
|
||||
v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
|
||||
|
||||
; shift: per-word left-shift counts for vpsllvw in the avx512icl function
; (8 words = 16 bytes, loaded there with VBROADCASTI128): 6 for the low word
; of each dword, 2 for the high word.
shift: times 4 dw 6, 2
|
||||
; kmask: two 16-bit dword-select masks loaded with kmovw.
; 0x5555 has bits 0,2,4,... set; 0xaaaa has bits 1,3,5,... set.
kmask: dw 0x5555, 0xaaaa
|
||||
|
||||
SECTION .text
|
||||
|
||||
%macro v210_planar_unpack 1
|
||||
@ -127,3 +144,44 @@ v210_planar_unpack aligned
|
||||
INIT_YMM avx2
|
||||
v210_planar_unpack aligned
|
||||
%endif
|
||||
|
||||
%if HAVE_AVX512ICL_EXTERNAL
|
||||
|
||||
INIT_ZMM avx512icl
|
||||
|
||||
;-----------------------------------------------------------------------
; AVX-512 (Ice Lake) v210 unpack; C prototype as declared in the init file:
;   void ff_v210_planar_unpack_avx512icl(const uint32_t *src, uint16_t *y,
;                                        uint16_t *u, uint16_t *v, int width)
;
; Per loop iteration: loads 64 bytes (16 dwords) from src, splits each dword
; into three 10-bit fields (bits 0-9, 10-19, 20-29), and stores 64 bytes to
; y, 32 bytes to u and 32 bytes to v. Only the first 48 / 24 / 24 bytes of
; those stores are selected samples (24 y, 12 u, 12 v words); the remaining
; bytes come from the 0xff filler indices in perm_y / perm_uv.
;
; Registers: m0 = shift counts, m1 = perm_y, m2 = perm_uv, m3-m5 = scratch,
;            k1/k2 = dword blend masks, wq = negative offset counting up to 0.
;
; NOTE(review): every store is a full 64/32-byte vector, so the final
; iteration writes 16 bytes (y) and 8 bytes (u, v) past the last selected
; sample, and the load always reads 64 source bytes. Presumably callers
; must provide suitably padded buffers - confirm against the call sites.
; NOTE(review): the loop is do-while shaped (runs at least once) and steps
; by 24; behaviour for widths that are not a positive multiple of 24 should
; be confirmed with the caller.
;-----------------------------------------------------------------------
cglobal v210_planar_unpack, 5, 5, 6, src, y, u, v, w
|
||||
movsxdifnidn wq, wd
|
||||
; point y/u/v at the end of their output and index backwards with -w:
; y advances 2 bytes per unit of w, u and v advance 1 byte per unit of w
lea yq, [yq+2*wq]
|
||||
add uq, wq
|
||||
add vq, wq
|
||||
neg wq
|
||||
|
||||
kmovw k1, [kmask] ; k1 = 0x5555: bits 0,2,4,... -> dwords 0,2,4,... take the 2nd blend source (luma blend)
|
||||
kmovw k2, [kmask+2] ; k2 = 0xaaaa: bits 1,3,5,... -> dwords 1,3,5,... take the 2nd blend source (chroma blend)
|
||||
|
||||
VBROADCASTI128 m0, [shift]
|
||||
mova m1, [perm_y]
|
||||
mova m2, [perm_uv]
|
||||
|
||||
.loop:
|
||||
movu m3, [srcq]
|
||||
; m4: per dword, low word = bits 0-9, high word = bits 20-29
; (words shifted left by 6 / 2 from m0, then all words shifted right by 6)
vpsllvw m4, m3, m0
|
||||
; m5: per dword, bits 10-19 moved down to bits 0-9, upper bits zero
pslld m5, m3, 12
|
||||
psrlw m4, 6
|
||||
psrld m5, 22
|
||||
|
||||
; luma: dwords selected by k1 come from m5 (middle field), the rest from m4
vpblendmd m3{k1}, m4, m5
|
||||
vpermb m3, m1, m3 ; pack the selected words contiguously via perm_y; could use vpcompressw
|
||||
movu [yq+2*wq], m3
|
||||
|
||||
; chroma: dwords selected by k2 come from m5 (middle field), the rest from m4
vpblendmd m5{k2}, m4, m5
|
||||
vpermb m5, m2, m5
|
||||
movu [uq+wq], ym5
|
||||
vextracti32x8 [vq+wq], zm5, 1
|
||||
|
||||
add srcq, mmsize
|
||||
add wq, (mmsize*3)/8
|
||||
; flags come from the add above: keep looping while wq is still negative
jl .loop
|
||||
RET
|
||||
|
||||
%endif
|
||||
|
@ -54,12 +54,12 @@ void checkasm_check_v210dec(void)
|
||||
if (check_func(h.unpack_frame, "v210_unpack")) {
|
||||
uint32_t src0[NUM_SAMPLES/3];
|
||||
uint32_t src1[NUM_SAMPLES/3];
|
||||
uint16_t y0[NUM_SAMPLES/2];
|
||||
uint16_t y1[NUM_SAMPLES/2];
|
||||
uint16_t u0[NUM_SAMPLES/4];
|
||||
uint16_t u1[NUM_SAMPLES/4];
|
||||
uint16_t v0[NUM_SAMPLES/4];
|
||||
uint16_t v1[NUM_SAMPLES/4];
|
||||
uint16_t y0[NUM_SAMPLES/2 + 15];
|
||||
uint16_t y1[NUM_SAMPLES/2 + 15];
|
||||
uint16_t u0[NUM_SAMPLES/4 + 7];
|
||||
uint16_t u1[NUM_SAMPLES/4 + 7];
|
||||
uint16_t v0[NUM_SAMPLES/4 + 7];
|
||||
uint16_t v1[NUM_SAMPLES/4 + 7];
|
||||
declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
|
||||
const int pixels = NUM_SAMPLES / 2 / 6 * 6;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user