avcodec/x86/h264_chromamc: Add SSSE3 RV40 chroma motion compensation functions

The only difference between it and the H.264/VC-1 versions is the bias constant, which depends on the shift parameters for RV40. This value ends up in a register and therefore one can reuse the H.264 code by setting the registers for RV40 and then jumping into the relevant H.264 function, making the four new functions cheap (just 256 bytes in total). This approach uses one jump more for the no-filter case and one jump less in the one-dimensional case than an approach using separate functions.

avg_chroma_mc4_c:        167.5 ( 1.00x)
avg_chroma_mc4_mmxext:    48.1 ( 3.48x)
avg_chroma_mc4_ssse3:     31.1 ( 5.39x)
avg_chroma_mc8_c:        325.5 ( 1.00x)
avg_chroma_mc8_mmxext:   103.2 ( 3.15x)
avg_chroma_mc8_ssse3:     33.5 ( 9.71x)
put_chroma_mc4_c:        137.4 ( 1.00x)
put_chroma_mc4_mmx:       44.5 ( 3.09x)
put_chroma_mc4_ssse3:     28.4 ( 4.83x)
put_chroma_mc8_c:        271.4 ( 1.00x)
put_chroma_mc8_mmx:       99.9 ( 2.72x)
put_chroma_mc8_ssse3:     30.6 ( 8.86x)

Reviewed-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-23 21:54:53 +02:00 · 2025-11-01 10:37:25 +01:00
parent c607aae2b9
commit cb054ee89b
3 changed files with 91 additions and 9 deletions
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -24,6 +24,7 @@
 * RV40 decoder motion compensation functions
 */

+#include "libavutil/attributes_internal.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "h264qpel.h"
@@ -283,7 +284,7 @@ static void avg_rv40_qpel8_mc33_c(uint8_t *dst, const uint8_t *src, ptrdiff_t st
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
 }

-static const int rv40_bias[4][4] = {
+attribute_visibility_hidden const int ff_rv40_bias[4][4] = {
    {  0, 16, 32, 16 },
    { 32, 28, 32, 28 },
    {  0, 32, 16, 32 },
@@ -300,7 +301,7 @@ static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst /*align 8*/,\
    const int C = (8-x) * (  y);\
    const int D = (  x) * (  y);\
    int i;\
-    int bias = rv40_bias[y>>1][x>>1];\
+    int bias = ff_rv40_bias[y>>1][x>>1];\
    \
    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
@@ -336,7 +337,7 @@ static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/,\
    const int C = (8-x) * (  y);\
    const int D = (  x) * (  y);\
    int i;\
-    int bias = rv40_bias[y>>1][x>>1];\
+    int bias = ff_rv40_bias[y>>1][x>>1];\
    \
    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -20,6 +20,7 @@
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************

+%include "config_components.asm"
 %include "libavutil/x86/x86util.asm"

 SECTION_RODATA
@@ -64,6 +65,8 @@ pw_28: times 8 dw 28
 cextern pw_32
 cextern pw_64

+cextern rv40_bias
+
 SECTION .text

 %macro mv0_pixels_mc8 0
@@ -447,11 +450,12 @@ chroma_mc4_mmx_func avg, rv40
 chroma_mc2_mmx_func avg, h264

 %macro chroma_mc8_ssse3_func 2-3
-cglobal %1_%2_chroma_mc8%3, 6, 7, 8
+cglobal %1_%2_chroma_mc8%3, 6, 7+UNIX64, 8
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
+..@%1_%2_chroma_mc8_no_filter_ %+ cpuname:
    mv0_pixels_mc8
    RET

@@ -462,6 +466,8 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    je .mx_is_zero

    ; general case, bilinear
+    movdqa        m5, [rnd_2d_%2]
+..@%1_%2_chroma_mc8_both_nonzero_ %+ cpuname:
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
@@ -473,7 +479,6 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8

    movd          m7, r6d
    movd          m6, r4d
-    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
@@ -517,12 +522,13 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    RET

 .my_is_zero:
+    movdqa        m6, [rnd_1d_%2]
+..@%1_%2_chroma_mc8_my_zero_ %+ cpuname:
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
-    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

@@ -554,12 +560,13 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    RET

 .mx_is_zero:
+    movdqa        m6, [rnd_1d_%2]
+..@%1_%2_chroma_mc8_mx_zero_ %+ cpuname:
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
-    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

@@ -592,7 +599,9 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
 %endmacro

 %macro chroma_mc4_ssse3_func 2
-cglobal %1_%2_chroma_mc4, 6, 7, 0
+cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 0
+    movq          m5, [pw_32]
+..@%1_%2_chroma_mc4_after_init_ %+ cpuname:
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
@@ -604,7 +613,6 @@ cglobal %1_%2_chroma_mc4, 6, 7, 0

    movd          m7, r6d
    movd          m6, r4d
-    movq          m5, [pw_32]
    movd          m0, [r1  ]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]
@@ -641,16 +649,79 @@ cglobal %1_%2_chroma_mc4, 6, 7, 0
    RET
 %endmacro

+%macro rv40_get_bias 1 ; dst reg: receives rv40_bias[my>>1][mx>>1] (mx in r4d, my in r5d) splatted via SPLATW; r4d/r5d hold their original values on exit
+%if !PIC || UNIX64
+    ; on UNIX64 we have enough volatile registers
+%if PIC && UNIX64
+    lea           r7, [rv40_bias] ; PIC: take the table address into a register first
+%endif
+    mov          r6d, r5d       ; work on a copy so my (r5d) stays intact
+    and          r6d, 6         ; &~1 for mx/my=[0,7]
+    lea          r6d, [r6d*4+r4d] ; (my&~1)*4 + mx
+    sar          r6d, 1         ; (my>>1)*4 + (mx>>1): element index into the 4x4 table
+%if PIC && UNIX64
+    movd          %1, [r7+4*r6] ; *4: table entries are 32-bit ints
+%else
+    movd          %1, [rv40_bias+4*r6] ; *4: table entries are 32-bit ints
+%endif
+%else  ; PIC && !UNIX64, de facto WIN64
+    lea           r6, [rv40_bias] ; r6 holds the table address, so r5 doubles as index scratch
+%ifidn r5d, r5m ; always false for currently supported calling conventions
+    push          r5
+%endif
+    and          r5d, 6         ; &~1 for mx/my=[0,7]
+    lea          r5d, [r5d*4+r4d] ; (my&~1)*4 + mx
+    sar          r5d, 1         ; (my>>1)*4 + (mx>>1): element index into the 4x4 table
+    movd          %1, [r6+4*r5] ; *4: table entries are 32-bit ints
+%ifidn r5d, r5m
+    pop           r5
+%else
+    mov          r5d, r5m       ; restore my, which was clobbered as index scratch above
+%endif
+%endif
+    SPLATW        %1, %1        ; replicate the bias word for the packed-word arithmetic of the chroma code
+%endmacro
+
+%macro rv40_chroma_mc8_func 1 ; put vs avg; emits rv40_%1_chroma_mc8, which only sets up the RV40 bias and then jumps into the %1_h264_chroma_mc8 code
+%if CONFIG_RV40_DECODER
+    cglobal rv40_%1_chroma_mc8, 6, 7+UNIX64, 8 ; same register counts as the h264 cglobal whose code (and RET) is reused
+    mov          r6d, r5d
+    or           r6d, r4d       ; zero only if mx == 0 and my == 0
+    jz           ..@%1_h264_chroma_mc8_no_filter_ %+ cpuname ; no filter, hence no bias needed
+    rv40_get_bias m5
+    ; the bilinear code expects bias in m5, the one-dimensional code in m6
+    mova          m6, m5
+    psraw         m6, 3         ; one-dimensional bias = two-dimensional bias >> 3
+    test         r5d, r5d       ; my == 0?
+    je           ..@%1_h264_chroma_mc8_my_zero_ %+ cpuname
+    test         r4d, r4d       ; mx == 0?
+    je           ..@%1_h264_chroma_mc8_mx_zero_ %+ cpuname
+    jmp          ..@%1_h264_chroma_mc8_both_nonzero_ %+ cpuname ; label sits after the h264 load of its own m5 constant
+%endif
+%endmacro
+
+%macro rv40_chroma_mc4_func 1 ; put vs avg; emits rv40_%1_chroma_mc4, which loads the RV40 bias and then jumps into the %1_h264_chroma_mc4 code
+%if CONFIG_RV40_DECODER
+    cglobal rv40_%1_chroma_mc4, 6, 7+UNIX64, 0 ; same register counts as the h264 cglobal whose code (and RET) is reused
+    rv40_get_bias m5            ; takes the place of the pw_32 load the h264 function does into m5
+    jmp           ..@%1_h264_chroma_mc4_after_init_ %+ cpuname ; label sits right after that pw_32 load
+%endif
+%endmacro
+
 %define CHROMAMC_AVG NOTHING
 INIT_XMM ssse3
 chroma_mc8_ssse3_func put, h264, _rnd
 chroma_mc8_ssse3_func put, vc1,  _nornd
+rv40_chroma_mc8_func put ; jumps to the ..@put_h264_chroma_mc8_* labels of the h264 instantiation above
 INIT_MMX ssse3
 chroma_mc4_ssse3_func put, h264
+rv40_chroma_mc4_func put ; jumps to the ..@put_h264_chroma_mc4_after_init_* label of the line above

 %define CHROMAMC_AVG DIRECT_AVG
 INIT_XMM ssse3
 chroma_mc8_ssse3_func avg, h264, _rnd
 chroma_mc8_ssse3_func avg, vc1,  _nornd
+rv40_chroma_mc8_func avg ; jumps to the ..@avg_h264_chroma_mc8_* labels of the h264 instantiation above
 INIT_MMX ssse3
 chroma_mc4_ssse3_func avg, h264
+rv40_chroma_mc4_func avg ; jumps to the ..@avg_h264_chroma_mc4_after_init_* label of the line above
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -178,6 +178,12 @@ DEFINE_FN(avg, 8, ssse3)

 DEFINE_FN(avg, 16, sse2)
 DEFINE_FN(avg, 16, ssse3)
+
+#define CHROMA_MC_FUNC(OP, SIZE, XMM) /* declare ff_rv40_<OP>_chroma_mc<SIZE>_<XMM>, then install it in c; needs a variable named c in scope */ \
+void ff_rv40_ ## OP ## _chroma_mc ## SIZE ## _ ## XMM(uint8_t *dst, const uint8_t *src,      \
+                                                      ptrdiff_t stride, int h, int x, int y);\
+    c->OP ## _chroma_pixels_tab[SIZE == 4] = ff_rv40_ ## OP ## _chroma_mc ## SIZE ## _ ## XMM /* slot 0 for SIZE 8, slot 1 for SIZE 4; the caller supplies the ';' */
+
 #endif /* HAVE_X86ASM */

 av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
@@ -204,6 +210,10 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
        QPEL_MC_SET(avg_, _sse2)
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
+        CHROMA_MC_FUNC(put, 8, ssse3);
+        CHROMA_MC_FUNC(put, 4, ssse3);
+        CHROMA_MC_FUNC(avg, 8, ssse3);
+        CHROMA_MC_FUNC(avg, 4, ssse3);
        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_ssse3;
        c->put_pixels_tab[1][15]        = put_rv40_qpel8_mc33_ssse3;
        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_ssse3;