1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-04 22:03:09 +02:00

avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 8bpc inverse transforms

This commit is contained in:
Henrik Gramner
2025-05-16 15:18:14 +02:00
committed by Henrik Gramner
parent b6803bf104
commit fd18ae88ae
5 changed files with 1654 additions and 7 deletions

View File

@@ -184,6 +184,7 @@ X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
x86/vp9intrapred_16bpp.o \
x86/vp9itxfm.o \
x86/vp9itxfm_avx512.o \
x86/vp9itxfm_16bpp.o \
x86/vp9lpf.o \
x86/vp9lpf_16bpp.o \

View File

@@ -114,7 +114,9 @@ itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
itxfm_func(iwht, iwht, 4, mmx);
itxfm_funcs(16, avx2);
itxfm_funcs(16, avx512icl);
itxfm_func(idct, idct, 32, avx2);
itxfm_func(idct, idct, 32, avx512icl);
#undef itxfm_func
#undef itxfm_funcs
@@ -406,6 +408,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
init_ipred(32, avx2, tm, TM_VP8);
}
#if ARCH_X86_64
if (EXTERNAL_AVX512ICL(cpu_flags)) {
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx512icl;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx512icl;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx512icl;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx512icl;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx512icl;
}
#endif
#undef init_fpel
#undef init_subpel1
#undef init_subpel2

File diff suppressed because it is too large. (Load Diff)

View File

@@ -131,4 +131,6 @@
#define LOCAL_ALIGNED_32(t, v, ...) E1(LOCAL_ALIGNED_D(32, t, v, __VA_ARGS__,,))
#define LOCAL_ALIGNED_64(t, v, ...) E1(LOCAL_ALIGNED_D(64, t, v, __VA_ARGS__,,))
#endif /* AVUTIL_MEM_INTERNAL_H */

View File

@@ -310,13 +310,13 @@ static int is_zero(const int16_t *c, int sz)
static void check_itxfm(void)
{
LOCAL_ALIGNED_32(uint8_t, src, [32 * 32 * 2]);
LOCAL_ALIGNED_32(uint8_t, dst, [32 * 32 * 2]);
LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);
LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);
LOCAL_ALIGNED_32(int16_t, coef, [32 * 32 * 2]);
LOCAL_ALIGNED_32(int16_t, subcoef0, [32 * 32 * 2]);
LOCAL_ALIGNED_32(int16_t, subcoef1, [32 * 32 * 2]);
LOCAL_ALIGNED_64(uint8_t, src, [32 * 32 * 2]);
LOCAL_ALIGNED_64(uint8_t, dst, [32 * 32 * 2]);
LOCAL_ALIGNED_64(uint8_t, dst0, [32 * 32 * 2]);
LOCAL_ALIGNED_64(uint8_t, dst1, [32 * 32 * 2]);
LOCAL_ALIGNED_64(int16_t, coef, [32 * 32 * 2]);
LOCAL_ALIGNED_64(int16_t, subcoef0, [32 * 32 * 2]);
LOCAL_ALIGNED_64(int16_t, subcoef1, [32 * 32 * 2]);
declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
VP9DSPContext dsp;
int y, x, tx, txtp, bit_depth, sub;