1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00
Files
FFmpeg/libavcodec/x86/constants.c
Andreas Rheinhardt 650098955e avcodec/x86/cavs_qpel: Add SSE2 vertical motion compensation
This is not based on the MMXEXT one, because the latter is quite
suboptimal: Motion vector types mc01 and mc03 (vertical motion vectors
with a remainder of one quarter or three quarters) use different neighboring
lines for interpolation: mc01 uses two lines above and two lines below,
mc03 one line above and three lines below. The MMXEXT code uses
a common macro for all of them and therefore reads six lines
before it processes them (even reading lines which are not used
at all), leading to severe register pressure.

Another difference to the old code is that the positive and negative
parts of the sum to calculate are accumulated separately and
the subtraction is performed with unsigned saturation, so
that one can avoid biasing the sum.

The fact that the mc01 and mc03 filter coefficients are mirrors
of each other has been exploited to reduce mc01 to mc03.

But of course the most important difference between
this code and the MMXEXT one is that XMM registers allow
processing eight words at a time, ideal for 8x8 subblocks,
whereas the MMXEXT code processes them in 4x8 or 4x16 blocks.

Benchmarks:
avg_cavs_qpel_pixels_tab[0][4]_c:                      917.0 ( 1.00x)
avg_cavs_qpel_pixels_tab[0][4]_mmxext:                 222.0 ( 4.13x)
avg_cavs_qpel_pixels_tab[0][4]_sse2:                    89.0 (10.31x)
avg_cavs_qpel_pixels_tab[0][12]_c:                     885.7 ( 1.00x)
avg_cavs_qpel_pixels_tab[0][12]_mmxext:                223.2 ( 3.97x)
avg_cavs_qpel_pixels_tab[0][12]_sse2:                   88.5 (10.01x)
avg_cavs_qpel_pixels_tab[1][4]_c:                      222.4 ( 1.00x)
avg_cavs_qpel_pixels_tab[1][4]_mmxext:                  57.2 ( 3.89x)
avg_cavs_qpel_pixels_tab[1][4]_sse2:                    23.3 ( 9.55x)
avg_cavs_qpel_pixels_tab[1][12]_c:                     216.0 ( 1.00x)
avg_cavs_qpel_pixels_tab[1][12]_mmxext:                 57.4 ( 3.76x)
avg_cavs_qpel_pixels_tab[1][12]_sse2:                   22.6 ( 9.56x)
put_cavs_qpel_pixels_tab[0][4]_c:                      750.9 ( 1.00x)
put_cavs_qpel_pixels_tab[0][4]_mmxext:                 210.4 ( 3.57x)
put_cavs_qpel_pixels_tab[0][4]_sse2:                    84.2 ( 8.92x)
put_cavs_qpel_pixels_tab[0][12]_c:                     731.6 ( 1.00x)
put_cavs_qpel_pixels_tab[0][12]_mmxext:                210.7 ( 3.47x)
put_cavs_qpel_pixels_tab[0][12]_sse2:                   84.1 ( 8.70x)
put_cavs_qpel_pixels_tab[1][4]_c:                      191.7 ( 1.00x)
put_cavs_qpel_pixels_tab[1][4]_mmxext:                  53.8 ( 3.56x)
put_cavs_qpel_pixels_tab[1][4]_sse2:                    24.5 ( 7.83x)
put_cavs_qpel_pixels_tab[1][12]_c:                     179.1 ( 1.00x)
put_cavs_qpel_pixels_tab[1][12]_mmxext:                 53.9 ( 3.32x)
put_cavs_qpel_pixels_tab[1][12]_sse2:                   24.0 ( 7.47x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-10-08 20:40:08 +02:00

95 lines
7.4 KiB
C

/*
* MMX/SSE/AVX constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h" // for xmm_reg
#include "constants.h"
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL,
0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL,
0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_7) = { 0x0007000700070007ULL, 0x0007000700070007ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
0x0100010001000100ULL, 0x0100010001000100ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL,
0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
0x0400040004000400ULL, 0x0400040004000400ULL};
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
0x0800080008000800ULL, 0x0800080008000800ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
0x1000100010001000ULL, 0x1000100010001000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
0x2000200020002000ULL, 0x2000200020002000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
/*
 * ff_pb_<N>: one 8-bit value replicated across every byte.
 *
 * Beware that the suffixes are not in a single radix: ff_pb_15 holds 0x0F
 * (decimal 15) per byte, whereas ff_pb_80, ff_pb_FE and ff_pb_FC hold
 * 0x80, 0xFE and 0xFC (hexadecimal suffix).
 *
 * NOTE(review): ff_pb_15 is an xmm_reg (two 64-bit initializers) but is
 * declared with 32-byte alignment; this is kept as found -- confirm whether
 * the extra alignment is intentional.
 */
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_0) = {
    0x0000000000000000ULL, 0x0000000000000000ULL,
    0x0000000000000000ULL, 0x0000000000000000ULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = {
    0x0101010101010101ULL, 0x0101010101010101ULL,
    0x0101010101010101ULL, 0x0101010101010101ULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = {
    0x0202020202020202ULL, 0x0202020202020202ULL,
    0x0202020202020202ULL, 0x0202020202020202ULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = {
    0x0303030303030303ULL, 0x0303030303030303ULL,
    0x0303030303030303ULL, 0x0303030303030303ULL
};
DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = {
    0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = {
    0x8080808080808080ULL, 0x8080808080808080ULL,
    0x8080808080808080ULL, 0x8080808080808080ULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = {
    0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
    0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL
};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FC) = {
    0xFCFCFCFCFCFCFCFCULL, 0xFCFCFCFCFCFCFCFCULL
};
/*
 * Constants built from one 32-bit value replicated across every 32-bit lane.
 *
 * ff_ps_neg sets only the top bit (0x80000000) of each lane; presumably it
 * is the sign mask for packed single-precision floats -- confirm with users.
 * ff_pd_<N> holds the integer N (decimal suffix) in each lane.
 */
DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = {
    0x8000000080000000ULL, 0x8000000080000000ULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = {
    0x0000000100000001ULL, 0x0000000100000001ULL,
    0x0000000100000001ULL, 0x0000000100000001ULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = {
    0x0000001000000010ULL, 0x0000001000000010ULL,
    0x0000001000000010ULL, 0x0000001000000010ULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = {
    0x0000002000000020ULL, 0x0000002000000020ULL,
    0x0000002000000020ULL, 0x0000002000000020ULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_64) = {
    0x0000004000000040ULL, 0x0000004000000040ULL,
    0x0000004000000040ULL, 0x0000004000000040ULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = {
    0x0000200000002000ULL, 0x0000200000002000ULL,
    0x0000200000002000ULL, 0x0000200000002000ULL
};
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535) = {
    0x0000FFFF0000FFFFULL, 0x0000FFFF0000FFFFULL,
    0x0000FFFF0000FFFFULL, 0x0000FFFF0000FFFFULL
};