1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-29 05:57:37 +02:00

avcodec/x86/cavsdsp: Add SSE2 mc20 horizontal motion compensation

Basically a direct port of the MMXEXT one. The main difference
is of course that one can process eight pixels (unpacked to words)
at a time, leading to speedups.

avg_cavs_qpel_pixels_tab[0][2]_c:                      700.1 ( 1.00x)
avg_cavs_qpel_pixels_tab[0][2]_mmxext:                 158.1 ( 4.43x)
avg_cavs_qpel_pixels_tab[0][2]_sse2:                    86.0 ( 8.14x)
avg_cavs_qpel_pixels_tab[1][2]_c:                      171.9 ( 1.00x)
avg_cavs_qpel_pixels_tab[1][2]_mmxext:                  39.4 ( 4.36x)
avg_cavs_qpel_pixels_tab[1][2]_sse2:                    21.7 ( 7.92x)
put_cavs_qpel_pixels_tab[0][2]_c:                      525.7 ( 1.00x)
put_cavs_qpel_pixels_tab[0][2]_mmxext:                 148.5 ( 3.54x)
put_cavs_qpel_pixels_tab[0][2]_sse2:                    75.2 ( 6.99x)
put_cavs_qpel_pixels_tab[1][2]_c:                      129.5 ( 1.00x)
put_cavs_qpel_pixels_tab[1][2]_mmxext:                  36.7 ( 3.53x)
put_cavs_qpel_pixels_tab[1][2]_sse2:                    19.0 ( 6.81x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-10-05 15:06:20 +02:00
parent 54598238e4
commit 74a88c0c11
3 changed files with 107 additions and 3 deletions

View File

@@ -152,7 +152,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_APV_DECODER) += x86/apv_dsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o x86/cavs_qpel.o
X86ASM-OBJS-$(CONFIG_CFHD_ENCODER) += x86/cfhdencdsp.o
X86ASM-OBJS-$(CONFIG_CFHD_DECODER) += x86/cfhddsp.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o

View File

@@ -0,0 +1,80 @@
;*****************************************************************************
;* SSE2-optimized CAVS QPEL code
;*****************************************************************************
;* Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
;* based on H.264 optimizations by Michael Niedermayer and Loren Merritt
;* Copyright (c) 2025 Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
cextern pw_4
cextern pw_5
SECTION .text
; op_avgh src_reg, dst_mem, tmp_reg
; Blend the bytes in the first argument with the pixels already stored at the
; second argument and write the blend back to that location.
; The third argument is a scratch register: it is overwritten with the old
; destination contents. The first argument is overwritten with the result.
%macro op_avgh 3
movh %3, %2 ; fetch the pixels currently in the destination
pavgb %1, %3 ; per-byte rounded average of new and existing pixels
movh %2, %1 ; store the averaged pixels
%endmacro
; op_puth src_reg, dst_mem [, unused]
; Plain store of the first argument to the second argument using movh.
; An optional third argument is accepted and ignored, so the macro can be
; invoked with the same argument list as op_avgh.
%macro op_puth 2-3
movh %2, %1
%endmacro
;-----------------------------------------------------------------------------
; CAVS_QPEL_H op        (op is "put" or "avg"; selects op_puth / op_avgh)
;
; Emits two entry points that share a single loop body:
;   void ff_<op>_cavs_qpel8_mc20(uint8_t *dst, const uint8_t *src,
;                                ptrdiff_t stride);           -- always 8 rows
;   void ff_<op>_cavs_qpel8_h   (uint8_t *dst, const uint8_t *src,
;                                ptrdiff_t stride, int h);    -- h rows
;
; Per row, eight source bytes at offsets -1, 0, +1 and +2 are widened to
; words and combined as
;   ((src[x] + src[x+1]) * pw_5 + pw_4 - (src[x-1] + src[x+2])) >> 3
; (pw_4 / pw_5 are external constants; presumably 4 and 5 in every word lane,
; as their names suggest). The result is saturated to unsigned bytes and
; handed to the op_<op>h store macro.
;
; Register roles inside the loop:
;   m0-m2  temporaries      m3  pw_4      m4  pw_5      m5  all zero
;-----------------------------------------------------------------------------
%macro CAVS_QPEL_H 1
cglobal %1_cavs_qpel8_mc20, 3,4,6, dst, src, stride, h
    mov             hd, 8
    ; Join the common body behind the prologue of the _h entry point below.
    ; Both entry points request the same number of GPRs (4) and XMM
    ; registers (6), so the shared body and its RET serve both.
    jmp             %1_cavs_qpel8_h_after_prologue

cglobal %1_cavs_qpel8_h, 4,4,6, dst, src, stride, h
%1_cavs_qpel8_h_after_prologue:
    mova            m3, [pw_4]
    mova            m4, [pw_5]
    pxor            m5, m5              ; zero: unpack partner and pack filler
.next_row:
    movh            m0, [srcq]
    movh            m1, [srcq+1]
    punpcklbw       m0, m5              ; bytes -> words
    punpcklbw       m1, m5
    paddw           m0, m1              ; two centre taps
    movh            m1, [srcq-1]
    movh            m2, [srcq+2]
    pmullw          m0, m4              ; centre sum times pw_5
    punpcklbw       m1, m5
    punpcklbw       m2, m5
    paddw           m0, m3              ; add pw_4 before the shift
    add             srcq, strideq       ; advance source to the next row
    paddw           m1, m2              ; two outer taps
    psubw           m0, m1
    psraw           m0, 3               ; arithmetic shift keeps the sign
    packuswb        m0, m5              ; saturate words to unsigned bytes
    op_%1h          m0, [dstq], m1      ; m1 is free again: scratch for avg
    add             dstq, strideq
    dec             hd
    jne             .next_row
    RET
%endmacro
; Instantiate the horizontal filter once per store operation (avg and put)
; with the SSE2 / XMM register setup selected by INIT_XMM.
INIT_XMM sse2
CAVS_QPEL_H avg
CAVS_QPEL_H put

View File

@@ -370,6 +370,25 @@ CAVS_MC(avg_, 8, mmxext)
CAVS_MC(avg_, 16, mmxext)
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_SSE2_EXTERNAL
/* Prototypes of the external SSE2 assembly routines, declared once per
 * operation (put / avg):
 *   ff_<op>_cavs_qpel8_mc20_sse2(dst, src, stride)   8-pixel-wide mc20 function
 *   ff_<op>_cavs_qpel8_h_sse2(dst, src, stride, h)   same filter with a caller-
 *                                                    supplied row count h
 * The final line of the macro deliberately has no line continuation, so the
 * invocations below can never be spliced into the macro body. */
#define DEF_QPEL(OPNAME) \
void ff_ ## OPNAME ## _cavs_qpel8_mc20_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
void ff_ ## OPNAME ## _cavs_qpel8_h_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h);

DEF_QPEL(put)
DEF_QPEL(avg)
/* Define <OP>_cavs_qpel16_mc20_<EXT>(): the 16-pixel-wide mc20 function,
 * assembled from the 8-pixel-wide ff_<OP>_cavs_qpel8_h_<EXT>() routine by
 * running it over 16 rows for the left and then the right 8-column half. */
#define QPEL_CAVS_XMM(OP, EXT)                                                 \
static void OP ## _cavs_qpel16_mc20_ ## EXT(uint8_t *dst, const uint8_t *src,  \
                                            ptrdiff_t stride)                  \
{                                                                              \
    for (int x = 0; x < 16; x += 8)                                            \
        ff_ ## OP ## _cavs_qpel8_h_ ## EXT(dst + x, src + x, stride, 16);      \
}

QPEL_CAVS_XMM(put, sse2)
QPEL_CAVS_XMM(avg, sse2)
#endif
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
{
av_unused int cpu_flags = av_get_cpu_flags();
@@ -392,8 +411,13 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
#endif
#if HAVE_SSE2_EXTERNAL
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
c->put_cavs_qpel_pixels_tab[0][ 0] = put_cavs_qpel16_mc00_sse2;
c->put_cavs_qpel_pixels_tab[0][ 2] = put_cavs_qpel16_mc20_sse2;
c->put_cavs_qpel_pixels_tab[1][ 2] = ff_put_cavs_qpel8_mc20_sse2;
c->avg_cavs_qpel_pixels_tab[0][ 0] = avg_cavs_qpel16_mc00_sse2;
c->avg_cavs_qpel_pixels_tab[0][ 2] = avg_cavs_qpel16_mc20_sse2;
c->avg_cavs_qpel_pixels_tab[1][ 2] = ff_avg_cavs_qpel8_mc20_sse2;
c->cavs_idct8_add = cavs_idct8_add_sse2;
c->idct_perm = FF_IDCT_PERM_TRANSPOSE;