avcodec/x86/cavsdsp: Add SSE2 mc20 horizontal motion compensation

Basically a direct port of the MMXEXT one. The main difference is of course that one can process eight pixels (unpacked to words) at a time, leading to speedups.

avg_cavs_qpel_pixels_tab[0][2]_c:      700.1 ( 1.00x)
avg_cavs_qpel_pixels_tab[0][2]_mmxext: 158.1 ( 4.43x)
avg_cavs_qpel_pixels_tab[0][2]_sse2:    86.0 ( 8.14x)
avg_cavs_qpel_pixels_tab[1][2]_c:      171.9 ( 1.00x)
avg_cavs_qpel_pixels_tab[1][2]_mmxext:  39.4 ( 4.36x)
avg_cavs_qpel_pixels_tab[1][2]_sse2:    21.7 ( 7.92x)
put_cavs_qpel_pixels_tab[0][2]_c:      525.7 ( 1.00x)
put_cavs_qpel_pixels_tab[0][2]_mmxext: 148.5 ( 3.54x)
put_cavs_qpel_pixels_tab[0][2]_sse2:    75.2 ( 6.99x)
put_cavs_qpel_pixels_tab[1][2]_c:      129.5 ( 1.00x)
put_cavs_qpel_pixels_tab[1][2]_mmxext:  36.7 ( 3.53x)
put_cavs_qpel_pixels_tab[1][2]_sse2:    19.0 ( 6.81x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-29 05:57:37 +02:00 · 2025-10-05 15:06:20 +02:00
parent 54598238e4
commit 74a88c0c11
3 changed files with 107 additions and 3 deletions
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -152,7 +152,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
 X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
 X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_APV_DECODER)      += x86/apv_dsp.o
-X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
+X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o x86/cavs_qpel.o
 X86ASM-OBJS-$(CONFIG_CFHD_ENCODER)     += x86/cfhdencdsp.o
 X86ASM-OBJS-$(CONFIG_CFHD_DECODER)     += x86/cfhddsp.o
 X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
--- a/libavcodec/x86/cavs_qpel.asm
+++ b/libavcodec/x86/cavs_qpel.asm
@@ -0,0 +1,80 @@
+;*****************************************************************************
+;* SSE2-optimized CAVS QPEL code
+;*****************************************************************************
+;* Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
+;* based on H.264 optimizations by Michael Niedermayer and Loren Merritt
+;* Copyright (c) 2025 Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_4
+cextern pw_5
+
+SECTION .text
+
+%macro op_avgh 3 ; averaging store; args: new pixels (vector reg), destination (memory), scratch (vector reg)
+    movh   %3, %2 ; load the pixels currently stored at the destination
+    pavgb  %1, %3 ; rounded unsigned byte average of new and existing pixels
+    movh   %2, %1 ; write the averaged pixels back to the destination
+%endmacro
+
+%macro op_puth 2-3 ; plain store; args: new pixels (vector reg), destination (memory), optional third arg is ignored
+    movh   %2, %1 ; write the new pixels to the destination unchanged
+%endmacro
+
+%macro CAVS_QPEL_H 1 ; argument: put or avg; it selects the store macro and the symbol prefix
+; ff_put_cavs_qpel8_mc20(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+cglobal %1_cavs_qpel8_mc20, 3,4,6 ; 3 args, 4 GPRs, 6 vector regs
+    mov         r3d, 8 ; this entry point always filters 8 rows
+    jmp         %1_cavs_qpel8_h_after_prologue ; continue in the filter code shared with the _h entry point
+
+; ff_put_cavs_qpel8_h(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h)
+cglobal %1_cavs_qpel8_h, 4,4,6 ; same GPR and vector register counts as mc20; row count h arrives in r3d
+%1_cavs_qpel8_h_after_prologue: ; r0 = dst, r1 = src, r2 = stride, r3d = rows left
+    mova         m3, [pw_4] ; rounding term added before the shift by 3 (presumably words of 4; defined elsewhere)
+    mova         m4, [pw_5] ; multiplier for the two centre taps (presumably words of 5; defined elsewhere)
+    pxor         m5, m5 ; all-zero register used for unpacking and packing
+.loop: ; one output row per iteration
+    movh         m0, [r1] ; pixels at src[x]
+    movh         m1, [r1+1] ; pixels at src[x+1]
+    punpcklbw    m0, m5 ; zero-extend bytes to words
+    punpcklbw    m1, m5
+    paddw        m0, m1 ; m0 = src[x] + src[x+1]
+    movh         m1, [r1-1] ; pixels at src[x-1]
+    movh         m2, [r1+2] ; pixels at src[x+2]
+    pmullw       m0, m4 ; m0 = centre sum times the pw_5 constant
+    punpcklbw    m1, m5 ; zero-extend the outer taps to words
+    punpcklbw    m2, m5
+    paddw        m0, m3 ; add the rounding term
+    add          r1, r2 ; advance src to the next row (all loads for this row are done)
+    paddw        m1, m2 ; m1 = src[x-1] + src[x+2]
+    psubw        m0, m1 ; subtract the outer taps
+    psraw        m0, 3 ; arithmetic shift keeps the sign of negative intermediates
+    packuswb     m0, m5 ; saturate words to unsigned bytes
+    op_%1h       m0, [r0], m1 ; put: store; avg: average with dst, then store (m1 is scratch)
+    add          r0, r2 ; advance dst to the next row
+    dec         r3d ; one row done
+    jne       .loop ; repeat until the row counter reaches zero
+    RET
+%endmacro
+
+INIT_XMM sse2 ; functions emitted below use XMM registers and get the sse2 symbol suffix
+CAVS_QPEL_H avg ; emits the avg_cavs_qpel8_mc20 and avg_cavs_qpel8_h entry points
+CAVS_QPEL_H put ; emits the put_cavs_qpel8_mc20 and put_cavs_qpel8_h entry points
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -370,6 +370,25 @@ CAVS_MC(avg_,  8, mmxext)
 CAVS_MC(avg_, 16, mmxext)
 #endif /* HAVE_MMXEXT_INLINE */

+#if HAVE_SSE2_EXTERNAL
+/* Prototypes of the external SSE2 assembly routines: the mc20 entry point
+ * with a fixed row count and the 8-pixel-wide horizontal filter that takes
+ * the number of rows as an explicit argument. */
+#define DEF_QPEL(OP)                                                         \
+    void ff_ ## OP ## _cavs_qpel8_mc20_sse2(uint8_t *dst,                    \
+                                            const uint8_t *src,              \
+                                            ptrdiff_t stride);               \
+    void ff_ ## OP ## _cavs_qpel8_h_sse2(uint8_t *dst, const uint8_t *src,   \
+                                         ptrdiff_t stride, int h);
+
+DEF_QPEL(put)
+DEF_QPEL(avg)
+
+/* 16-pixel-wide mc20: run the 8-pixel-wide horizontal filter over 16 rows,
+ * first on the left half of the block and then on the right half. */
+#define QPEL_CAVS_XMM(OP, EXT)                                                \
+static void OP ## _cavs_qpel16_mc20_ ## EXT(uint8_t *dst, const uint8_t *src, \
+                                            ptrdiff_t stride)                 \
+{                                                                             \
+    for (int x = 0; x < 16; x += 8)                                           \
+        ff_ ## OP ## _cavs_qpel8_h_ ## EXT(dst + x, src + x, stride, 16);     \
+}
+
+QPEL_CAVS_XMM(put, sse2)
+QPEL_CAVS_XMM(avg, sse2)
+#endif
+
 av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
 {
    av_unused int cpu_flags = av_get_cpu_flags();
@@ -392,8 +411,13 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
 #endif
 #if HAVE_SSE2_EXTERNAL
    if (EXTERNAL_SSE2(cpu_flags)) {
-        c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
-        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
+        c->put_cavs_qpel_pixels_tab[0][ 0] = put_cavs_qpel16_mc00_sse2;
+        c->put_cavs_qpel_pixels_tab[0][ 2] = put_cavs_qpel16_mc20_sse2;
+        c->put_cavs_qpel_pixels_tab[1][ 2] = ff_put_cavs_qpel8_mc20_sse2;
+
+        c->avg_cavs_qpel_pixels_tab[0][ 0] = avg_cavs_qpel16_mc00_sse2;
+        c->avg_cavs_qpel_pixels_tab[0][ 2] = avg_cavs_qpel16_mc20_sse2;
+        c->avg_cavs_qpel_pixels_tab[1][ 2] = ff_avg_cavs_qpel8_mc20_sse2;

        c->cavs_idct8_add = cavs_idct8_add_sse2;
        c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;