You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-11-29 05:57:37 +02:00
avcodec/x86/cavsdsp: Add SSE2 mc20 horizontal motion compensation
Basically a direct port of the MMXEXT one. The main difference is of course that one can process eight pixels (unpacked to words) at a time, leading to speedups. avg_cavs_qpel_pixels_tab[0][2]_c: 700.1 ( 1.00x) avg_cavs_qpel_pixels_tab[0][2]_mmxext: 158.1 ( 4.43x) avg_cavs_qpel_pixels_tab[0][2]_sse2: 86.0 ( 8.14x) avg_cavs_qpel_pixels_tab[1][2]_c: 171.9 ( 1.00x) avg_cavs_qpel_pixels_tab[1][2]_mmxext: 39.4 ( 4.36x) avg_cavs_qpel_pixels_tab[1][2]_sse2: 21.7 ( 7.92x) put_cavs_qpel_pixels_tab[0][2]_c: 525.7 ( 1.00x) put_cavs_qpel_pixels_tab[0][2]_mmxext: 148.5 ( 3.54x) put_cavs_qpel_pixels_tab[0][2]_sse2: 75.2 ( 6.99x) put_cavs_qpel_pixels_tab[1][2]_c: 129.5 ( 1.00x) put_cavs_qpel_pixels_tab[1][2]_mmxext: 36.7 ( 3.53x) put_cavs_qpel_pixels_tab[1][2]_sse2: 19.0 ( 6.81x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -152,7 +152,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
|
||||
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_APV_DECODER) += x86/apv_dsp.o
|
||||
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
|
||||
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o x86/cavs_qpel.o
|
||||
X86ASM-OBJS-$(CONFIG_CFHD_ENCODER) += x86/cfhdencdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_CFHD_DECODER) += x86/cfhddsp.o
|
||||
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
|
||||
|
||||
80
libavcodec/x86/cavs_qpel.asm
Normal file
80
libavcodec/x86/cavs_qpel.asm
Normal file
@@ -0,0 +1,80 @@
|
||||
;*****************************************************************************
|
||||
;* SSE2-optimized CAVS QPEL code
|
||||
;*****************************************************************************
|
||||
;* Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
|
||||
;* based on H.264 optimizations by Michael Niedermayer and Loren Merritt
|
||||
;* Copyright (c) 2025 Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
cextern pw_4
|
||||
cextern pw_5
|
||||
|
||||
SECTION .text
|
||||
|
||||
; op_avgh result_reg, dst_mem, scratch_reg
; "avg" store operation: loads the current destination pixels from %2 into
; the scratch register %3, averages them byte-wise into %1 with pavgb, and
; writes the averaged result back to %2. Both memory accesses use movh.
%macro op_avgh 3
|
||||
; fetch the pixels already present at the destination
movh %3, %2
|
||||
; %1 = byte-wise average of the new result and the old destination
pavgb %1, %3
|
||||
; store the averaged pixels
movh %2, %1
|
||||
%endmacro
|
||||
|
||||
; op_puth result_reg, dst_mem [, unused]
; "put" store operation: writes %1 to %2 with movh. The optional third
; parameter is accepted and never referenced, so that op_puth can be invoked
; with the same argument list as op_avgh.
%macro op_puth 2-3
|
||||
movh %2, %1
|
||||
%endmacro
|
||||
|
||||
; CAVS_QPEL_H op
; Emits two entry points for the store operation %1 (instantiated below with
; "avg" and "put"), both sharing one loop body:
;   %1_cavs_qpel8_mc20 - fixed row count of 8 (sets r3d and jumps into the body)
;   %1_cavs_qpel8_h    - row count supplied by the caller as fourth argument
; Register roles in the body: r0 = dst, r1 = src, r2 = stride, r3d = rows left.
; Per row the body computes, on eight pixels widened to words,
;   ((src[x] + src[x+1]) * m4 + m3 - (src[x-1] + src[x+2])) >> 3
; and hands the packed bytes to op_%1h for storing.
; Note that it reads one byte before and two bytes past each 8-pixel row.
%macro CAVS_QPEL_H 1
|
||||
; ff_put_cavs_qpel8_mc20(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
|
||||
; cglobal arguments: 3 function args, 4 GPRs, 6 vector registers (m0-m5)
cglobal %1_cavs_qpel8_mc20, 3,4,6
|
||||
; fixed height: 8 rows
mov r3d, 8
|
||||
; continue in the shared body, skipping the second function's prologue
jmp %1_cavs_qpel8_h_after_prologue
|
||||
|
||||
; ff_put_cavs_qpel8_h(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h)
|
||||
cglobal %1_cavs_qpel8_h, 4,4,6
|
||||
; jump target for the mc20 entry point above; r3d already holds the row count
%1_cavs_qpel8_h_after_prologue:
|
||||
; m3 = rounding term, m4 = multiplier for the two centre pixels
; (pw_4 / pw_5 are external constants; judging by their names they hold
;  packed words of 4 and 5 - TODO confirm against their definition)
mova m3, [pw_4]
|
||||
mova m4, [pw_5]
|
||||
; m5 = 0, used to widen bytes to words and as the upper half when packing
pxor m5, m5
|
||||
.loop:
|
||||
; centre pair: src[x] and src[x+1]
movh m0, [r1]
|
||||
movh m1, [r1+1]
|
||||
; interleave with zero -> bytes become words
punpcklbw m0, m5
|
||||
punpcklbw m1, m5
|
||||
; m0 = src[x] + src[x+1]
paddw m0, m1
|
||||
; outer pair: src[x-1] and src[x+2]
movh m1, [r1-1]
|
||||
movh m2, [r1+2]
|
||||
; m0 = (src[x] + src[x+1]) * m4
pmullw m0, m4
|
||||
punpcklbw m1, m5
|
||||
punpcklbw m2, m5
|
||||
; add the rounding term
paddw m0, m3
|
||||
; advance src to the next row (all loads for this row are done)
add r1, r2
|
||||
; m1 = src[x-1] + src[x+2]
paddw m1, m2
|
||||
; subtract the outer pair from the weighted centre pair
psubw m0, m1
|
||||
; arithmetic shift right by 3 (the intermediate may be negative)
psraw m0, 3
|
||||
; pack words back to bytes with unsigned saturation; upper half comes from m5
packuswb m0, m5
|
||||
; expands to op_avgh or op_puth: store (or average-and-store) the row at dst;
; m1 is only used as scratch by the avg variant
op_%1h m0, [r0], m1
|
||||
; advance dst to the next row
add r0, r2
|
||||
; one row done; loop until the row counter reaches zero
dec r3d
|
||||
jne .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; Select XMM registers / the sse2 instruction set for the functions below and
; instantiate the horizontal filter once per store operation (avg and put).
; The C side declares the resulting symbols as
; ff_{put,avg}_cavs_qpel8_{mc20,h}_sse2.
INIT_XMM sse2
|
||||
CAVS_QPEL_H avg
|
||||
CAVS_QPEL_H put
|
||||
@@ -370,6 +370,25 @@ CAVS_MC(avg_, 8, mmxext)
|
||||
CAVS_MC(avg_, 16, mmxext)
|
||||
#endif /* HAVE_MMXEXT_INLINE */
|
||||
|
||||
#if HAVE_SSE2_EXTERNAL
|
||||
/*
 * Declare the prototypes of the two SSE2 assembly entry points for one store
 * operation OPNAME (put or avg):
 *   ff_<OPNAME>_cavs_qpel8_mc20_sse2(dst, src, stride)
 *   ff_<OPNAME>_cavs_qpel8_h_sse2(dst, src, stride, h)
 * NOTE(review): the last line of the macro ends with a line continuation, so
 * the definition relies on the following source line being empty.
 */
#define DEF_QPEL(OPNAME) \
|
||||
void ff_ ## OPNAME ## _cavs_qpel8_mc20_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
|
||||
void ff_ ## OPNAME ## _cavs_qpel8_h_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h); \
|
||||
|
||||
/* Emit the declarations for both store operations. */
DEF_QPEL(put)
|
||||
DEF_QPEL(avg)
|
||||
|
||||
/*
 * Define the static function <OPNAME>_cavs_qpel16_mc20_<XMM>(dst, src, stride).
 * It is built from two calls to the 8-pixel-wide assembly routine
 * ff_<OPNAME>_cavs_qpel8_h_<XMM>: one at column offset 0 and one at column
 * offset 8, each asked to process 16 rows.
 */
#define QPEL_CAVS_XMM(OPNAME, XMM) \
|
||||
static void OPNAME ## _cavs_qpel16_mc20_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
|
||||
{ \
|
||||
ff_ ## OPNAME ## _cavs_qpel8_h_ ## XMM(dst, src, stride, 16); \
|
||||
ff_ ## OPNAME ## _cavs_qpel8_h_ ## XMM(dst + 8, src + 8, stride, 16); \
|
||||
}
|
||||
|
||||
/* Instantiate the 16-wide wrappers for the put and avg SSE2 variants. */
QPEL_CAVS_XMM(put, sse2)
|
||||
QPEL_CAVS_XMM(avg, sse2)
|
||||
#endif
|
||||
|
||||
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
|
||||
{
|
||||
av_unused int cpu_flags = av_get_cpu_flags();
|
||||
@@ -392,8 +411,13 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
|
||||
#endif
|
||||
#if HAVE_SSE2_EXTERNAL
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[0][ 0] = put_cavs_qpel16_mc00_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[0][ 2] = put_cavs_qpel16_mc20_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[1][ 2] = ff_put_cavs_qpel8_mc20_sse2;
|
||||
|
||||
c->avg_cavs_qpel_pixels_tab[0][ 0] = avg_cavs_qpel16_mc00_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[0][ 2] = avg_cavs_qpel16_mc20_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[1][ 2] = ff_avg_cavs_qpel8_mc20_sse2;
|
||||
|
||||
c->cavs_idct8_add = cavs_idct8_add_sse2;
|
||||
c->idct_perm = FF_IDCT_PERM_TRANSPOSE;
|
||||
|
||||
Reference in New Issue
Block a user