You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-11-23 21:54:53 +02:00
This is not based on the MMXEXT one, because the latter is quite suboptimal: Motion vector types mc01 and mc03 (vertical motion vectors with remainder of one quarter or three quarters) use different neighboring lines for interpolation: mc01 uses two lines above and two lines below, mc03 one line above and three lines below. The MMXEXT code uses a common macro for all of them and therefore reads six lines before it processes them (even reading lines which are not used at all), leading to severe register pressure. Another difference from the old code is that the positive and negative parts of the sum to calculate are accumulated separately and the subtraction is performed with unsigned saturation, so that one can avoid biasing the sum. The fact that the mc01 and mc03 filter coefficients are mirrors of each other has been exploited to reduce mc01 to mc03. But of course the most important difference between this code and the MMXEXT one is that XMM registers allow processing eight words at a time, ideal for 8x8 subblocks, whereas the MMXEXT code processes them in 4x8 or 4x16 blocks.
Benchmarks: avg_cavs_qpel_pixels_tab[0][4]_c: 917.0 ( 1.00x) avg_cavs_qpel_pixels_tab[0][4]_mmxext: 222.0 ( 4.13x) avg_cavs_qpel_pixels_tab[0][4]_sse2: 89.0 (10.31x) avg_cavs_qpel_pixels_tab[0][12]_c: 885.7 ( 1.00x) avg_cavs_qpel_pixels_tab[0][12]_mmxext: 223.2 ( 3.97x) avg_cavs_qpel_pixels_tab[0][12]_sse2: 88.5 (10.01x) avg_cavs_qpel_pixels_tab[1][4]_c: 222.4 ( 1.00x) avg_cavs_qpel_pixels_tab[1][4]_mmxext: 57.2 ( 3.89x) avg_cavs_qpel_pixels_tab[1][4]_sse2: 23.3 ( 9.55x) avg_cavs_qpel_pixels_tab[1][12]_c: 216.0 ( 1.00x) avg_cavs_qpel_pixels_tab[1][12]_mmxext: 57.4 ( 3.76x) avg_cavs_qpel_pixels_tab[1][12]_sse2: 22.6 ( 9.56x) put_cavs_qpel_pixels_tab[0][4]_c: 750.9 ( 1.00x) put_cavs_qpel_pixels_tab[0][4]_mmxext: 210.4 ( 3.57x) put_cavs_qpel_pixels_tab[0][4]_sse2: 84.2 ( 8.92x) put_cavs_qpel_pixels_tab[0][12]_c: 731.6 ( 1.00x) put_cavs_qpel_pixels_tab[0][12]_mmxext: 210.7 ( 3.47x) put_cavs_qpel_pixels_tab[0][12]_sse2: 84.1 ( 8.70x) put_cavs_qpel_pixels_tab[1][4]_c: 191.7 ( 1.00x) put_cavs_qpel_pixels_tab[1][4]_mmxext: 53.8 ( 3.56x) put_cavs_qpel_pixels_tab[1][4]_sse2: 24.5 ( 7.83x) put_cavs_qpel_pixels_tab[1][12]_c: 179.1 ( 1.00x) put_cavs_qpel_pixels_tab[1][12]_mmxext: 53.9 ( 3.32x) put_cavs_qpel_pixels_tab[1][12]_sse2: 24.0 ( 7.47x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
95 lines
7.4 KiB
C
95 lines
7.4 KiB
C
/*
|
|
* MMX/SSE/AVX constants used across x86 dsp optimizations.
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "libavutil/mem_internal.h"
|
|
#include "libavutil/x86/asm.h" // for xmm_reg
|
|
#include "constants.h"
|
|
|
|
/*
 * Packed-word constants (ff_pw_N): every 16-bit lane of the initializer
 * holds the value N named in the suffix, written in decimal in the name and
 * in hex in the initializer (e.g. ff_pw_18 -> 0x0012, ff_pw_1019 -> 0x03FB).
 *
 * xmm_reg constants are initialized with two 64-bit values, ymm_reg constants
 * with four, and the plain uint64_t ones (ff_pw_15, ff_pw_53, ff_pw_128) with
 * a single 64-bit value, i.e. four words.
 *
 * NOTE(review): DECLARE_ALIGNED / DECLARE_ASM_ALIGNED are defined in a header
 * not shown here; the first argument looks like the alignment in bytes and
 * the ASM variant is presumably for symbols referenced from inline assembly
 * -- confirm against libavutil/mem_internal.h before switching a constant
 * from one macro to the other.
 */
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL,
|
|
0x0001000100010001ULL, 0x0001000100010001ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL,
|
|
0x0002000200020002ULL, 0x0002000200020002ULL };
|
|
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
|
|
DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
|
|
0x0004000400040004ULL, 0x0004000400040004ULL };
|
|
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
|
|
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_7) = { 0x0007000700070007ULL, 0x0007000700070007ULL };
|
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
|
|
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
|
|
/* 64-bit only: four words of 15 (0x000F). */
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
|
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
|
|
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
|
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL };
|
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
|
|
/* 64-bit only: four words of 53 (0x0035). */
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
|
|
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
|
|
/* 64-bit only: four words of 128 (0x0080). */
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
|
|
0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
|
|
0x0100010001000100ULL, 0x0100010001000100ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL,
|
|
0x0200020002000200ULL, 0x0200020002000200ULL };
|
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
|
|
0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
|
|
0x0400040004000400ULL, 0x0400040004000400ULL};
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
|
|
0x0800080008000800ULL, 0x0800080008000800ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
|
|
0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
|
|
0x1000100010001000ULL, 0x1000100010001000ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
|
|
0x2000200020002000ULL, 0x2000200020002000ULL };
|
|
/* "m1" = minus one: all bits set, i.e. 0xFFFF (-1 in two's complement) in every word. */
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
|
|
0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
|
|
|
|
/*
 * Packed-byte constants (ff_pb_N): every byte of the initializer holds the
 * value named in the suffix. The suffix is decimal for ff_pb_0 .. ff_pb_15
 * (ff_pb_15 -> 0x0F) and hexadecimal for ff_pb_80, ff_pb_FE and ff_pb_FC
 * (0x80, 0xFE, 0xFC).
 */
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL,
|
|
0x0000000000000000ULL, 0x0000000000000000ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL,
|
|
0x0101010101010101ULL, 0x0101010101010101ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = { 0x0202020202020202ULL, 0x0202020202020202ULL,
|
|
0x0202020202020202ULL, 0x0202020202020202ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL,
|
|
0x0303030303030303ULL, 0x0303030303030303ULL };
|
|
/*
 * NOTE(review): this is an xmm_reg (two 64-bit values) declared with 32-byte
 * alignment, whereas the other xmm_reg constants in this file use 16 --
 * confirm whether the over-alignment is intentional.
 */
DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL,
|
|
0x8080808080808080ULL, 0x8080808080808080ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
|
|
0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
|
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FC) = { 0xFCFCFCFCFCFCFCFCULL, 0xFCFCFCFCFCFCFCFCULL };
|
|
|
|
/*
 * Four 32-bit lanes, each with only the most significant bit set (0x80000000).
 * NOTE(review): the "ps" prefix suggests packed single-precision floats, in
 * which case this is the sign-bit mask (XOR to negate) -- confirm at the
 * use sites.
 */
DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL };
|
|
|
|
/*
 * Packed-dword constants (ff_pd_N): every 32-bit lane of the initializer
 * holds the decimal value N named in the suffix (e.g. ff_pd_8192 ->
 * 0x00002000, ff_pd_65535 -> 0x0000ffff). All are ymm_reg values, i.e. four
 * 64-bit initializers giving eight dword lanes.
 */
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL,
|
|
0x0000000100000001ULL, 0x0000000100000001ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL,
|
|
0x0000001000000010ULL, 0x0000001000000010ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
|
|
0x0000002000000020ULL, 0x0000002000000020ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_64) = { 0x0000004000000040ULL, 0x0000004000000040ULL,
|
|
0x0000004000000040ULL, 0x0000004000000040ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
|
|
0x0000200000002000ULL, 0x0000200000002000ULL };
|
|
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
|
|
0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
|