From aeb138679a8f97f6c4716ccd91fac3adbe7bb4d1 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Sat, 15 Nov 2025 19:44:02 +0100 Subject: [PATCH] avcodec/x86/mpegvideoencdsp: Port add_8x8basis_ssse3() to ASM Both GCC and Clang completely unroll the unlikely loop at -O3, leading to codesize bloat; their code is also suboptimal, as they don't make use of pmulhrsw (even with -mssse3). This commit therefore ports the whole function to external assembly. The new function occupies 176B here vs 1406B for GCC. Benchmarks for a testcase with huge qscale (notice that the C version is unrolled just like the unlikely loop in the SSSE3 version): add_8x8basis_c: 43.4 ( 1.00x) add_8x8basis_ssse3 (old): 43.6 ( 1.00x) add_8x8basis_ssse3 (new): 11.9 ( 3.63x) Reviewed-by: Kieran Kunhya Reviewed-by: Lynne Signed-off-by: Andreas Rheinhardt --- libavcodec/x86/mpegvideoencdsp.asm | 52 +++++++++++++++++++++++++++ libavcodec/x86/mpegvideoencdsp_init.c | 46 ++++-------------------- 2 files changed, 59 insertions(+), 39 deletions(-) diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm index 0e86a5304c..300f98b438 100644 --- a/libavcodec/x86/mpegvideoencdsp.asm +++ b/libavcodec/x86/mpegvideoencdsp.asm @@ -25,6 +25,58 @@ SECTION .text +; void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale) +INIT_XMM ssse3 +cglobal add_8x8basis, 3, 3+ARCH_X86_64, 4, rem, basis, scale + movd m0, scaled + add scaled, 1024 + add basisq, 128 + add remq, 128 +%if ARCH_X86_64 +%define OFF r3q + mov r3q, -128 + cmp scaled, 2047 +%else +%define OFF r2q + cmp scaled, 2047 + mov r2q, -128 +%endif + ja .huge_scale + + punpcklwd m0, m0 + pshufd m0, m0, 0x0 + psllw m0, 5 +.loop1: + mova m1, [basisq+OFF] + mova m2, [basisq+OFF+16] + pmulhrsw m1, m0 + pmulhrsw m2, m0 + paddw m1, [remq+OFF] + paddw m2, [remq+OFF+16] + mova [remq+OFF], m1 + mova [remq+OFF+16], m2 + add OFF, 32 + js .loop1 + RET + +.huge_scale: + pslld m0, 6 + punpcklwd m0, m0 + pshufd m1, m0, 0x55 + psrlw m0, 1 + pshufd m0, m0, 0x0 +.loop2: + mova m2, [basisq+OFF] + pmulhrsw m3, m2, m0 + pmullw m2, m1 + paddw m2, m3 + paddw m2, [remq+OFF] + mova [remq+OFF], m2 + add OFF, 16 + js .loop2 + RET + + INIT_XMM sse2 cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset pxor m6, m6 diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c index f6169b5399..220c75785a 100644 --- a/libavcodec/x86/mpegvideoencdsp_init.c +++ b/libavcodec/x86/mpegvideoencdsp_init.c @@ -32,6 +32,7 @@ void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64], int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size); int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size); int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size); +void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale); #if HAVE_INLINE_ASM #if HAVE_SSSE3_INLINE @@ -83,41 +84,6 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c ); return i; } - -static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale) -{ - x86_reg i=0; - - if (FFABS(scale) < 1024) { - scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT); - __asm__ volatile( - "movd %3, %%xmm2 \n\t" - "punpcklwd %%xmm2, %%xmm2 \n\t" - "pshufd $0, %%xmm2, %%xmm2 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movdqa (%1, %0), %%xmm0 \n\t" - "movdqa 16(%1, %0), %%xmm1 \n\t" - "pmulhrsw %%xmm2, %%xmm0 \n\t" - "pmulhrsw %%xmm2, %%xmm1 \n\t" - "paddw (%2, %0), %%xmm0 \n\t" - "paddw 16(%2, %0), %%xmm1 \n\t" - "movdqa %%xmm0, (%2, %0) \n\t" - "movdqa %%xmm1, 16(%2, %0) \n\t" - "add $32, %0 \n\t" - "cmp $128, %0 \n\t" // FIXME optimize & bench - " jb 1b \n\t" - : "+r" (i) - : "r"(basis), "r"(rem), "g"(scale) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2") - ); - } else { - for (i=0; i<8*8; i++) { - rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); - } - } -} - #endif /* HAVE_SSSE3_INLINE */ /* Draw the edges of width 'w' of an image of size width, height */ @@ -227,15 +193,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, c->draw_edges = draw_edges_mmx; } } +#endif /* HAVE_INLINE_ASM */ + if (X86_SSSE3(cpu_flags)) { #if HAVE_SSSE3_INLINE - if (INLINE_SSSE3(cpu_flags)) { if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { c->try_8x8basis = try_8x8basis_ssse3; } - c->add_8x8basis = add_8x8basis_ssse3; - } #endif /* HAVE_SSSE3_INLINE */ +#if HAVE_SSSE3_EXTERNAL + c->add_8x8basis = ff_add_8x8basis_ssse3; +#endif + } -#endif /* HAVE_INLINE_ASM */ }