avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 8bpc inverse transforms

2025-08-04 22:03:09 +02:00 · 2025-05-16 15:18:14 +02:00
parent b6803bf104
commit fd18ae88ae
5 changed files with 1654 additions and 7 deletions
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -184,6 +184,7 @@ X86ASM-OBJS-$(CONFIG_VP6_DECODER)      += x86/vp6dsp.o
 X86ASM-OBJS-$(CONFIG_VP9_DECODER)      += x86/vp9intrapred.o            \
                                          x86/vp9intrapred_16bpp.o      \
                                          x86/vp9itxfm.o                \
+                                          x86/vp9itxfm_avx512.o         \
                                          x86/vp9itxfm_16bpp.o          \
                                          x86/vp9lpf.o                  \
                                          x86/vp9lpf_16bpp.o            \
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -114,7 +114,9 @@ itxfm_func(idct, idct, 32, ssse3);
 itxfm_func(idct, idct, 32, avx);
 itxfm_func(iwht, iwht, 4, mmx);
 itxfm_funcs(16, avx2);
+itxfm_funcs(16, avx512icl);
 itxfm_func(idct, idct, 32, avx2);
+itxfm_func(idct, idct, 32, avx512icl);

 #undef itxfm_func
 #undef itxfm_funcs
@@ -406,6 +408,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
        init_ipred(32, avx2, tm, TM_VP8);
    }

+#if ARCH_X86_64
+    if (EXTERNAL_AVX512ICL(cpu_flags)) {
+        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_avx512icl;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx512icl;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx512icl;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx512icl;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT]  =
+        dsp->itxfm_add[TX_32X32][DCT_ADST]  =
+        dsp->itxfm_add[TX_32X32][DCT_DCT]   = ff_vp9_idct_idct_32x32_add_avx512icl;
+    }
+#endif
+
 #undef init_fpel
 #undef init_subpel1
 #undef init_subpel2
--- a/libavcodec/x86/vp9itxfm_avx512.asm
+++ b/libavcodec/x86/vp9itxfm_avx512.asm
--- a/libavutil/mem_internal.h
+++ b/libavutil/mem_internal.h
@@ -131,4 +131,6 @@

 #define LOCAL_ALIGNED_32(t, v, ...) E1(LOCAL_ALIGNED_D(32, t, v, __VA_ARGS__,,))

+#define LOCAL_ALIGNED_64(t, v, ...) E1(LOCAL_ALIGNED_D(64, t, v, __VA_ARGS__,,))
+
 #endif /* AVUTIL_MEM_INTERNAL_H */
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -310,13 +310,13 @@ static int is_zero(const int16_t *c, int sz)

 static void check_itxfm(void)
 {
-    LOCAL_ALIGNED_32(uint8_t, src, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(uint8_t, dst, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(int16_t, coef, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(int16_t, subcoef0, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(int16_t, subcoef1, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, src, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, dst, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, dst0, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, dst1, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(int16_t, coef, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(int16_t, subcoef0, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(int16_t, subcoef1, [32 * 32 * 2]);
    declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
    VP9DSPContext dsp;
    int y, x, tx, txtp, bit_depth, sub;