1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00
Files
FFmpeg/libavcodec/x86/constants.h
Andreas Rheinhardt 650098955e avcodec/x86/cavs_qpel: Add SSE2 vertical motion compensation
This is not based on the MMXEXT one, because the latter is quite
suboptimal: Motion vector types mc01 and mc03 (vertical motion vectors
with remainder of one quarter or three quarter) use different neighboring
lines for interpolation: mc01 uses two lines above and two lines below,
mc03 one line above and three lines below. The MMXEXT code uses
a common macro for all of them and therefore reads six lines
before it processes them (even reading lines which are not used
at all), leading to severe register pressure.

Another difference to the old code is that the positive and negative
parts of the sum to calculate are accumulated separately and
the subtraction is performed with unsigned saturation, so
that one can avoid biasing the sum.

The fact that the mc01 and mc03 filter coefficients are mirrors
of each other has been exploited to reduce mc01 to mc03.

But of course the most important different difference between
this code and the MMXEXT one is that XMM registers allow to
process eight words at a time, ideal for 8x8 subblocks,
whereas the MMXEXT code processes them in 4x8 or 4x16 blocks.

Benchmarks:
avg_cavs_qpel_pixels_tab[0][4]_c:                      917.0 ( 1.00x)
avg_cavs_qpel_pixels_tab[0][4]_mmxext:                 222.0 ( 4.13x)
avg_cavs_qpel_pixels_tab[0][4]_sse2:                    89.0 (10.31x)
avg_cavs_qpel_pixels_tab[0][12]_c:                     885.7 ( 1.00x)
avg_cavs_qpel_pixels_tab[0][12]_mmxext:                223.2 ( 3.97x)
avg_cavs_qpel_pixels_tab[0][12]_sse2:                   88.5 (10.01x)
avg_cavs_qpel_pixels_tab[1][4]_c:                      222.4 ( 1.00x)
avg_cavs_qpel_pixels_tab[1][4]_mmxext:                  57.2 ( 3.89x)
avg_cavs_qpel_pixels_tab[1][4]_sse2:                    23.3 ( 9.55x)
avg_cavs_qpel_pixels_tab[1][12]_c:                     216.0 ( 1.00x)
avg_cavs_qpel_pixels_tab[1][12]_mmxext:                 57.4 ( 3.76x)
avg_cavs_qpel_pixels_tab[1][12]_sse2:                   22.6 ( 9.56x)
put_cavs_qpel_pixels_tab[0][4]_c:                      750.9 ( 1.00x)
put_cavs_qpel_pixels_tab[0][4]_mmxext:                 210.4 ( 3.57x)
put_cavs_qpel_pixels_tab[0][4]_sse2:                    84.2 ( 8.92x)
put_cavs_qpel_pixels_tab[0][12]_c:                     731.6 ( 1.00x)
put_cavs_qpel_pixels_tab[0][12]_mmxext:                210.7 ( 3.47x)
put_cavs_qpel_pixels_tab[0][12]_sse2:                   84.1 ( 8.70x)
put_cavs_qpel_pixels_tab[1][4]_c:                      191.7 ( 1.00x)
put_cavs_qpel_pixels_tab[1][4]_mmxext:                  53.8 ( 3.56x)
put_cavs_qpel_pixels_tab[1][4]_sse2:                    24.5 ( 7.83x)
put_cavs_qpel_pixels_tab[1][12]_c:                     179.1 ( 1.00x)
put_cavs_qpel_pixels_tab[1][12]_mmxext:                 53.9 ( 3.32x)
put_cavs_qpel_pixels_tab[1][12]_sse2:                   24.0 ( 7.47x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-10-08 20:40:08 +02:00

73 lines
2.2 KiB
C

/*
* MMX/SSE constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_CONSTANTS_H
#define AVCODEC_X86_CONSTANTS_H
#include <stdint.h>
#include "libavutil/x86/asm.h"
extern const ymm_reg ff_pw_1;
extern const ymm_reg ff_pw_2;
extern const xmm_reg ff_pw_3;
extern const ymm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_7;
extern const xmm_reg ff_pw_8;
extern const xmm_reg ff_pw_9;
extern const uint64_t ff_pw_15;
extern const xmm_reg ff_pw_16;
extern const xmm_reg ff_pw_18;
extern const xmm_reg ff_pw_20;
extern const xmm_reg ff_pw_32;
extern const uint64_t ff_pw_53;
extern const xmm_reg ff_pw_64;
extern const uint64_t ff_pw_128;
extern const ymm_reg ff_pw_255;
extern const ymm_reg ff_pw_256;
extern const ymm_reg ff_pw_512;
extern const ymm_reg ff_pw_1023;
extern const ymm_reg ff_pw_1024;
extern const ymm_reg ff_pw_2048;
extern const ymm_reg ff_pw_4095;
extern const ymm_reg ff_pw_4096;
extern const ymm_reg ff_pw_8192;
extern const ymm_reg ff_pw_m1;
extern const ymm_reg ff_pb_0;
extern const ymm_reg ff_pb_1;
extern const ymm_reg ff_pb_2;
extern const ymm_reg ff_pb_3;
extern const ymm_reg ff_pb_80;
extern const xmm_reg ff_pb_FC;
extern const ymm_reg ff_pb_FE;
extern const xmm_reg ff_ps_neg;
extern const ymm_reg ff_pd_1;
extern const ymm_reg ff_pd_16;
extern const ymm_reg ff_pd_32;
extern const ymm_reg ff_pd_64;
extern const ymm_reg ff_pd_8192;
extern const ymm_reg ff_pd_65535;
#endif /* AVCODEC_X86_CONSTANTS_H */