mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
hevcdsp: add x86 SIMD for MC
This commit is contained in:
parent
0cef06df07
commit
e7078e842d
@ -38,9 +38,9 @@
|
||||
#include "golomb.h"
|
||||
#include "hevc.h"
|
||||
|
||||
const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 2 };
|
||||
const uint8_t ff_hevc_qpel_extra_after[4] = { 0, 3, 4, 4 };
|
||||
const uint8_t ff_hevc_qpel_extra[4] = { 0, 6, 7, 6 };
|
||||
const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 3 };
|
||||
const uint8_t ff_hevc_qpel_extra_after[4] = { 0, 4, 4, 4 };
|
||||
const uint8_t ff_hevc_qpel_extra[4] = { 0, 7, 7, 7 };
|
||||
|
||||
static const uint8_t scan_1x1[1] = { 0 };
|
||||
|
||||
|
@ -740,7 +740,7 @@ typedef struct HEVCPredContext {
|
||||
} HEVCPredContext;
|
||||
|
||||
typedef struct HEVCLocalContext {
|
||||
DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 7) * MAX_PB_SIZE]);
|
||||
DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 24) * MAX_PB_SIZE]);
|
||||
uint8_t cabac_state[HEVC_CONTEXTS];
|
||||
|
||||
uint8_t first_qp_group;
|
||||
|
@ -89,7 +89,7 @@ static const int8_t transform[32][32] = {
|
||||
90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 },
|
||||
};
|
||||
|
||||
DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
|
||||
DECLARE_ALIGNED(16, const int16_t, ff_hevc_epel_coeffs[7][16]) = {
|
||||
{ -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
|
||||
{ -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
|
||||
{ -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
|
||||
@ -99,6 +99,28 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
|
||||
{ -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
|
||||
};
|
||||
|
||||
DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_coeffs8[7][16]) = {
|
||||
{ -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
|
||||
{ -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
|
||||
{ -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
|
||||
{ -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 },
|
||||
{ -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 },
|
||||
{ -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 },
|
||||
{ -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
|
||||
};
|
||||
|
||||
DECLARE_ALIGNED(16, const int16_t, ff_hevc_qpel_coeffs[3][8]) = {
|
||||
{ -1, 4, -10, 58, 17, -5, 1, 0 },
|
||||
{ -1, 4, -11, 40, 40, -11, 4, -1 },
|
||||
{ 0, 1, -5, 17, 58, -10, 4, -1 },
|
||||
};
|
||||
|
||||
DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_coeffs8[3][16]) = {
|
||||
{ -1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1, 0 },
|
||||
{ -1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11, 4, -1 },
|
||||
{ 0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4, -1 },
|
||||
};
|
||||
|
||||
#define BIT_DEPTH 8
|
||||
#include "hevcdsp_template.c"
|
||||
#undef BIT_DEPTH
|
||||
|
@ -118,6 +118,9 @@ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
|
||||
|
||||
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
|
||||
|
||||
extern const int8_t ff_hevc_epel_filters[7][16];
|
||||
extern const int16_t ff_hevc_epel_coeffs[7][16];
|
||||
extern const int8_t ff_hevc_epel_coeffs8[7][16];
|
||||
extern const int16_t ff_hevc_qpel_coeffs[3][8];
|
||||
extern const int8_t ff_hevc_qpel_coeffs8[3][16];
|
||||
|
||||
#endif /* AVCODEC_HEVCDSP_H */
|
||||
|
@ -1018,7 +1018,7 @@ static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
|
||||
int x, y;
|
||||
pixel *src = (pixel *)_src;
|
||||
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
|
||||
const int8_t *filter = ff_hevc_epel_filters[mx - 1];
|
||||
const int16_t *filter = ff_hevc_epel_coeffs[mx - 1];
|
||||
int8_t filter_0 = filter[0];
|
||||
int8_t filter_1 = filter[1];
|
||||
int8_t filter_2 = filter[2];
|
||||
@ -1040,7 +1040,7 @@ static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
|
||||
int x, y;
|
||||
pixel *src = (pixel *)_src;
|
||||
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
|
||||
const int8_t *filter = ff_hevc_epel_filters[my - 1];
|
||||
const int16_t *filter = ff_hevc_epel_coeffs[my - 1];
|
||||
int8_t filter_0 = filter[0];
|
||||
int8_t filter_1 = filter[1];
|
||||
int8_t filter_2 = filter[2];
|
||||
@ -1063,8 +1063,8 @@ static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
|
||||
int x, y;
|
||||
pixel *src = (pixel *)_src;
|
||||
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
|
||||
const int8_t *filter_h = ff_hevc_epel_filters[mx - 1];
|
||||
const int8_t *filter_v = ff_hevc_epel_filters[my - 1];
|
||||
const int16_t *filter_h = ff_hevc_epel_coeffs[mx - 1];
|
||||
const int16_t *filter_v = ff_hevc_epel_coeffs[my - 1];
|
||||
int8_t filter_0 = filter_h[0];
|
||||
int8_t filter_1 = filter_h[1];
|
||||
int8_t filter_2 = filter_h[2];
|
||||
|
@ -113,7 +113,8 @@ YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
|
||||
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
|
||||
YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o
|
||||
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
|
||||
YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_deblock.o
|
||||
YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_deblock.o \
|
||||
x86/hevc_mc.o
|
||||
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
|
||||
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
|
||||
YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
|
||||
|
851
libavcodec/x86/hevc_mc.asm
Normal file
851
libavcodec/x86/hevc_mc.asm
Normal file
@ -0,0 +1,851 @@
|
||||
;*****************************************************************************
|
||||
;* x86-optimized HEVC MC
|
||||
;* Copyright 2015 Anton Khirnov
|
||||
;*
|
||||
;* This file is part of Libav.
|
||||
;*
|
||||
;* Libav is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* Libav is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with Libav; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION .rodata
|
||||
|
||||
pw_1023: times 8 dw 1023
|
||||
|
||||
cextern hevc_qpel_coeffs
|
||||
cextern hevc_qpel_coeffs8
|
||||
|
||||
cextern hevc_epel_coeffs
|
||||
cextern hevc_epel_coeffs8
|
||||
|
||||
cextern pw_8
|
||||
cextern pw_16
|
||||
cextern pw_32
|
||||
cextern pw_64
|
||||
|
||||
SECTION .text
|
||||
|
||||
; %1: width
|
||||
; %2: bit depth
|
||||
%macro COMMON_DEFS 2
|
||||
%assign blocksize 8
|
||||
%assign nb_blocks ((%1 + blocksize - 1) / blocksize)
|
||||
%define last_block_truncated (blocksize * nb_blocks > %1)
|
||||
%if %2 > 8
|
||||
%define LOAD_BLOCK movu
|
||||
%define LOAD_HALFBLOCK movq
|
||||
%assign pixelsize 2
|
||||
%else
|
||||
%define LOAD_BLOCK movq
|
||||
%define LOAD_HALFBLOCK movd
|
||||
%assign pixelsize 1
|
||||
%endif
|
||||
%define STORE_BLOCK mova
|
||||
%define STORE_HALFBLOCK movq
|
||||
%endmacro
|
||||
|
||||
; %1: block index
|
||||
%macro BLOCK_DEFS 1
|
||||
%if last_block_truncated && %1 == nb_blocks - 1
|
||||
%define block_truncated 1
|
||||
%define LOAD LOAD_HALFBLOCK
|
||||
%define STORE STORE_HALFBLOCK
|
||||
%else
|
||||
%define block_truncated 0
|
||||
%define LOAD LOAD_BLOCK
|
||||
%define STORE STORE_BLOCK
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
|
||||
; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
|
||||
; pixel *src, ptrdiff_t srcstride,
|
||||
; int height, int mx, int my, int *mcbuffer)
|
||||
|
||||
; %1: block width
|
||||
; %2: bit depth
|
||||
; %3: log2 of height unroll
|
||||
%macro GET_PIXELS 3
|
||||
cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused
|
||||
|
||||
%assign shift 14 - %2
|
||||
COMMON_DEFS %1, %2
|
||||
|
||||
%if pixelsize == 1
|
||||
pxor m0, m0
|
||||
%endif
|
||||
|
||||
shr heightd, %3
|
||||
|
||||
.loop:
|
||||
|
||||
%assign i 0
|
||||
%rep (1 << %3)
|
||||
|
||||
%assign j 0
|
||||
%rep nb_blocks
|
||||
|
||||
BLOCK_DEFS j
|
||||
|
||||
LOAD m1, [srcq + j * pixelsize * blocksize]
|
||||
%if pixelsize == 1
|
||||
punpcklbw m1, m0
|
||||
%endif
|
||||
psllw m1, shift
|
||||
STORE [dstq + j * 2 * blocksize], m1
|
||||
|
||||
%assign j (j + 1)
|
||||
%endrep
|
||||
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
|
||||
%assign i (i + 1)
|
||||
%endrep
|
||||
|
||||
dec heightd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
GET_PIXELS 4, 8, 1
|
||||
GET_PIXELS 8, 8, 1
|
||||
GET_PIXELS 12, 8, 3
|
||||
GET_PIXELS 16, 8, 2
|
||||
GET_PIXELS 24, 8, 3
|
||||
GET_PIXELS 32, 8, 3
|
||||
GET_PIXELS 48, 8, 3
|
||||
GET_PIXELS 64, 8, 3
|
||||
|
||||
GET_PIXELS 4, 10, 1
|
||||
GET_PIXELS 8, 10, 1
|
||||
GET_PIXELS 12, 10, 3
|
||||
GET_PIXELS 16, 10, 2
|
||||
GET_PIXELS 24, 10, 3
|
||||
GET_PIXELS 32, 10, 3
|
||||
GET_PIXELS 48, 10, 3
|
||||
GET_PIXELS 64, 10, 3
|
||||
|
||||
; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
|
||||
; uint8_t *src, ptrdiff_t srcstride,
|
||||
; int height, int mx, int my, int *mcbuffer)
|
||||
|
||||
; 8-bit qpel interpolation
|
||||
; %1: block width
|
||||
; %2: 0 - horizontal; 1 - vertical
|
||||
%macro QPEL_8 2
|
||||
%if %2
|
||||
%define postfix v
|
||||
%define mvfrac myq
|
||||
%define coeffsaddr r5q
|
||||
%define pixstride srcstrideq
|
||||
%define pixstride3 r5q
|
||||
%define src_m3 r6q
|
||||
%else
|
||||
%define postfix h
|
||||
%define mvfrac mxq
|
||||
%define coeffsaddr r6q
|
||||
%define pixstride 1
|
||||
%define pixstride3 3
|
||||
%define src_m3 (srcq - 3)
|
||||
%endif
|
||||
|
||||
COMMON_DEFS %1, 8
|
||||
|
||||
cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my
|
||||
and mvfrac, 0x3
|
||||
dec mvfrac
|
||||
shl mvfrac, 4
|
||||
lea coeffsaddr, [hevc_qpel_coeffs8]
|
||||
mova m0, [coeffsaddr + mvfrac]
|
||||
|
||||
SPLATW m1, m0, 1
|
||||
SPLATW m2, m0, 2
|
||||
SPLATW m3, m0, 3
|
||||
SPLATW m0, m0, 0
|
||||
|
||||
%if %2
|
||||
lea pixstride3, [srcstrideq + 2 * srcstrideq]
|
||||
mov src_m3, srcq
|
||||
sub src_m3, pixstride3
|
||||
%endif
|
||||
|
||||
.loop
|
||||
|
||||
%assign i 0
|
||||
%rep nb_blocks
|
||||
|
||||
BLOCK_DEFS i
|
||||
|
||||
LOAD m4, [src_m3 + i * blocksize]
|
||||
LOAD m5, [src_m3 + i * blocksize + 1 * pixstride]
|
||||
punpcklbw m4, m5
|
||||
pmaddubsw m4, m0
|
||||
|
||||
LOAD m5, [src_m3 + i * blocksize + 2 * pixstride]
|
||||
LOAD m6, [srcq + i * blocksize]
|
||||
punpcklbw m5, m6
|
||||
pmaddubsw m5, m1
|
||||
paddsw m4, m5
|
||||
|
||||
LOAD m5, [srcq + i * blocksize + 1 * pixstride]
|
||||
LOAD m6, [srcq + i * blocksize + 2 * pixstride]
|
||||
punpcklbw m5, m6
|
||||
pmaddubsw m5, m2
|
||||
paddsw m4, m5
|
||||
|
||||
LOAD m5, [srcq + i * blocksize + pixstride3]
|
||||
LOAD m6, [srcq + i * blocksize + 4 * pixstride]
|
||||
punpcklbw m5, m6
|
||||
pmaddubsw m5, m3
|
||||
paddsw m4, m5
|
||||
|
||||
STORE [dstq + i * 2 * blocksize], m4
|
||||
|
||||
%assign i (i + 1)
|
||||
%endrep
|
||||
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
%if %2
|
||||
add src_m3, srcstrideq
|
||||
%endif
|
||||
|
||||
dec heightd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
QPEL_8 4, 0
|
||||
QPEL_8 8, 0
|
||||
QPEL_8 12, 0
|
||||
QPEL_8 16, 0
|
||||
QPEL_8 24, 0
|
||||
QPEL_8 32, 0
|
||||
QPEL_8 48, 0
|
||||
QPEL_8 64, 0
|
||||
|
||||
QPEL_8 4, 1
|
||||
QPEL_8 8, 1
|
||||
QPEL_8 12, 1
|
||||
QPEL_8 16, 1
|
||||
QPEL_8 24, 1
|
||||
QPEL_8 32, 1
|
||||
QPEL_8 48, 1
|
||||
QPEL_8 64, 1
|
||||
|
||||
; 16-bit qpel interpolation
|
||||
; %1: block width
|
||||
; %2: shift applied to the result
|
||||
; %3: 0 - horizontal; 1 - vertical
|
||||
%macro QPEL_16 3
|
||||
%if %3
|
||||
%define mvfrac myq
|
||||
%define pixstride srcstrideq
|
||||
%define pixstride3 sstride3q
|
||||
%define src_m3 srcm3q
|
||||
%else
|
||||
%define mvfrac mxq
|
||||
%define pixstride 2
|
||||
%define pixstride3 6
|
||||
%define src_m3 (srcq - 6)
|
||||
%endif
|
||||
|
||||
COMMON_DEFS %1, 16
|
||||
|
||||
and mvfrac, 0x3
|
||||
dec mvfrac
|
||||
shl mvfrac, 4
|
||||
lea coeffsregq, [hevc_qpel_coeffs]
|
||||
mova m0, [coeffsregq + mvfrac]
|
||||
|
||||
pshufd m1, m0, 0x55
|
||||
pshufd m2, m0, 0xaa
|
||||
pshufd m3, m0, 0xff
|
||||
pshufd m0, m0, 0x00
|
||||
|
||||
%if %3
|
||||
lea sstride3q, [srcstrideq + 2 * srcstrideq]
|
||||
mov srcm3q, srcq
|
||||
sub srcm3q, sstride3q
|
||||
%endif
|
||||
|
||||
.loop
|
||||
|
||||
%assign i 0
|
||||
%rep nb_blocks
|
||||
|
||||
BLOCK_DEFS i
|
||||
|
||||
LOAD m4, [src_m3 + i * 2 * blocksize]
|
||||
LOAD m5, [src_m3 + i * 2 * blocksize + 1 * pixstride]
|
||||
LOAD m6, [src_m3 + i * 2 * blocksize + 2 * pixstride]
|
||||
LOAD m7, [srcq + i * 2 * blocksize + 0 * pixstride]
|
||||
LOAD m8, [srcq + i * 2 * blocksize + 1 * pixstride]
|
||||
LOAD m9, [srcq + i * 2 * blocksize + 2 * pixstride]
|
||||
LOAD m10, [srcq + i * 2 * blocksize + pixstride3]
|
||||
LOAD m11, [srcq + i * 2 * blocksize + 4 * pixstride]
|
||||
|
||||
punpcklwd m12, m4, m5
|
||||
pmaddwd m12, m0
|
||||
|
||||
punpcklwd m13, m6, m7
|
||||
pmaddwd m13, m1
|
||||
paddd m12, m13
|
||||
|
||||
punpcklwd m13, m8, m9
|
||||
pmaddwd m13, m2
|
||||
paddd m12, m13
|
||||
|
||||
punpcklwd m13, m10, m11
|
||||
pmaddwd m13, m3
|
||||
paddd m12, m13
|
||||
psrad m12, %2
|
||||
|
||||
%if block_truncated == 0
|
||||
punpckhwd m4, m5
|
||||
pmaddwd m4, m0
|
||||
|
||||
punpckhwd m6, m7
|
||||
pmaddwd m6, m1
|
||||
paddd m4, m6
|
||||
|
||||
punpckhwd m8, m9
|
||||
pmaddwd m8, m2
|
||||
paddd m4, m8
|
||||
|
||||
punpckhwd m10, m11
|
||||
pmaddwd m10, m3
|
||||
paddd m4, m10
|
||||
|
||||
psrad m4, %2
|
||||
%endif
|
||||
packssdw m12, m4
|
||||
STORE [dstq + i * 2 * blocksize], m12
|
||||
|
||||
%assign i (i + 1)
|
||||
%endrep
|
||||
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
%if %3
|
||||
add srcm3q, srcstrideq
|
||||
%endif
|
||||
|
||||
dec heightd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
%macro QPEL_H_10 1
|
||||
cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg
|
||||
QPEL_16 %1, 2, 0
|
||||
%endmacro
|
||||
|
||||
INIT_XMM avx
|
||||
QPEL_H_10 4
|
||||
QPEL_H_10 8
|
||||
QPEL_H_10 12
|
||||
QPEL_H_10 16
|
||||
QPEL_H_10 24
|
||||
QPEL_H_10 32
|
||||
QPEL_H_10 48
|
||||
QPEL_H_10 64
|
||||
|
||||
%macro QPEL_V_10 1
|
||||
cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
|
||||
QPEL_16 %1, 2, 1
|
||||
%endmacro
|
||||
|
||||
INIT_XMM avx
|
||||
QPEL_V_10 4
|
||||
QPEL_V_10 8
|
||||
QPEL_V_10 12
|
||||
QPEL_V_10 16
|
||||
QPEL_V_10 24
|
||||
QPEL_V_10 32
|
||||
QPEL_V_10 48
|
||||
QPEL_V_10 64
|
||||
|
||||
; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
|
||||
; uint8_t *src, ptrdiff_t srcstride,
|
||||
; int height, int mx, int my, int *mcbuffer)
|
||||
|
||||
%macro QPEL_HV 1
|
||||
cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
|
||||
QPEL_16 %1, 6, 1
|
||||
%endmacro
|
||||
|
||||
INIT_XMM avx
|
||||
QPEL_HV 4
|
||||
QPEL_HV 8
|
||||
QPEL_HV 12
|
||||
QPEL_HV 16
|
||||
QPEL_HV 24
|
||||
QPEL_HV 32
|
||||
QPEL_HV 48
|
||||
QPEL_HV 64
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
||||
; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
|
||||
; uint8_t *src, ptrdiff_t srcstride,
|
||||
; int height, int mx, int my, int *mcbuffer)
|
||||
|
||||
; 8-bit epel interpolation
|
||||
; %1: block width
|
||||
; %2: 0 - horizontal; 1 - vertical
|
||||
%macro EPEL_8 2
|
||||
%if %2
|
||||
%define postfix v
|
||||
%define mvfrac myq
|
||||
%define coeffsaddr r5q
|
||||
%define pixstride srcstrideq
|
||||
%define pixstride3 r5q
|
||||
%else
|
||||
%define postfix h
|
||||
%define mvfrac mxq
|
||||
%define coeffsaddr r6q
|
||||
%define pixstride 1
|
||||
%define pixstride3 3
|
||||
%endif
|
||||
|
||||
COMMON_DEFS %1, 8
|
||||
|
||||
cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my
|
||||
and mvfrac, 0x7
|
||||
dec mvfrac
|
||||
shl mvfrac, 4
|
||||
lea coeffsaddr, [hevc_epel_coeffs8]
|
||||
movq m0, [coeffsaddr + mvfrac]
|
||||
|
||||
SPLATW m1, m0, 1
|
||||
SPLATW m0, m0, 0
|
||||
|
||||
%if %2
|
||||
lea pixstride3, [srcstrideq + 2 * srcstrideq]
|
||||
%endif
|
||||
sub srcq, pixstride
|
||||
|
||||
.loop
|
||||
|
||||
%assign i 0
|
||||
%rep nb_blocks
|
||||
|
||||
BLOCK_DEFS i
|
||||
|
||||
LOAD m2, [srcq + i * blocksize + 0 * pixstride]
|
||||
LOAD m3, [srcq + i * blocksize + 1 * pixstride]
|
||||
LOAD m4, [srcq + i * blocksize + 2 * pixstride]
|
||||
LOAD m5, [srcq + i * blocksize + pixstride3]
|
||||
|
||||
punpcklbw m2, m3
|
||||
punpcklbw m4, m5
|
||||
|
||||
pmaddubsw m2, m0
|
||||
pmaddubsw m4, m1
|
||||
|
||||
paddsw m2, m4
|
||||
|
||||
STORE [dstq + i * 2 * blocksize], m2
|
||||
|
||||
%assign i (i + 1)
|
||||
%endrep
|
||||
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
|
||||
dec heightd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
EPEL_8 4, 0
|
||||
EPEL_8 8, 0
|
||||
EPEL_8 12, 0
|
||||
EPEL_8 16, 0
|
||||
EPEL_8 24, 0
|
||||
EPEL_8 32, 0
|
||||
|
||||
EPEL_8 4, 1
|
||||
EPEL_8 8, 1
|
||||
EPEL_8 12, 1
|
||||
EPEL_8 16, 1
|
||||
EPEL_8 24, 1
|
||||
EPEL_8 32, 1
|
||||
|
||||
%macro EPEL_16 3
|
||||
%if %3
|
||||
%define mvfrac myq
|
||||
%define pixstride srcstrideq
|
||||
%define pixstride3 sstride3q
|
||||
%else
|
||||
%define mvfrac mxq
|
||||
%define pixstride 2
|
||||
%define pixstride3 6
|
||||
%endif
|
||||
|
||||
COMMON_DEFS %1, 16
|
||||
|
||||
and mvfrac, 0x7
|
||||
dec mvfrac
|
||||
shl mvfrac, 5
|
||||
lea coeffsregq, [hevc_epel_coeffs]
|
||||
mova m0, [coeffsregq + mvfrac]
|
||||
|
||||
pshufd m1, m0, 0x55
|
||||
pshufd m0, m0, 0x00
|
||||
|
||||
%if %3
|
||||
lea sstride3q, [srcstrideq + 2 * srcstrideq]
|
||||
%endif
|
||||
sub srcq, pixstride
|
||||
|
||||
.loop
|
||||
|
||||
%assign i 0
|
||||
%rep nb_blocks
|
||||
|
||||
BLOCK_DEFS i
|
||||
|
||||
LOAD m2, [srcq + i * 2 * blocksize + 0 * pixstride]
|
||||
LOAD m3, [srcq + i * 2 * blocksize + 1 * pixstride]
|
||||
LOAD m4, [srcq + i * 2 * blocksize + 2 * pixstride]
|
||||
LOAD m5, [srcq + i * 2 * blocksize + pixstride3]
|
||||
|
||||
punpcklwd m6, m2, m3
|
||||
punpcklwd m7, m4, m5
|
||||
pmaddwd m6, m0
|
||||
pmaddwd m7, m1
|
||||
paddd m6, m7
|
||||
psrad m6, %2
|
||||
|
||||
%if block_truncated == 0
|
||||
punpckhwd m2, m3
|
||||
punpckhwd m4, m5
|
||||
pmaddwd m2, m0
|
||||
pmaddwd m4, m1
|
||||
paddd m2, m4
|
||||
psrad m2, %2
|
||||
%endif
|
||||
packssdw m6, m2
|
||||
STORE [dstq + i * 2 * blocksize], m6
|
||||
|
||||
%assign i (i + 1)
|
||||
%endrep
|
||||
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
|
||||
dec heightd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
%macro EPEL_H_10 1
|
||||
cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
|
||||
EPEL_16 %1, 2, 0
|
||||
%endmacro
|
||||
|
||||
INIT_XMM avx
|
||||
EPEL_H_10 4
|
||||
EPEL_H_10 8
|
||||
EPEL_H_10 12
|
||||
EPEL_H_10 16
|
||||
EPEL_H_10 24
|
||||
EPEL_H_10 32
|
||||
|
||||
%macro EPEL_V_10 1
|
||||
cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
|
||||
EPEL_16 %1, 2, 1
|
||||
%endmacro
|
||||
|
||||
INIT_XMM avx
|
||||
EPEL_V_10 4
|
||||
EPEL_V_10 8
|
||||
EPEL_V_10 12
|
||||
EPEL_V_10 16
|
||||
EPEL_V_10 24
|
||||
EPEL_V_10 32
|
||||
|
||||
; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride,
|
||||
; int16_t *src, ptrdiff_t srcstride,
|
||||
; int height, int mx, int my, int *mcbuffer)
|
||||
|
||||
%macro EPEL_HV 1
|
||||
cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
|
||||
EPEL_16 %1, 6, 1
|
||||
%endmacro
|
||||
|
||||
INIT_XMM avx
|
||||
EPEL_HV 4
|
||||
EPEL_HV 8
|
||||
EPEL_HV 12
|
||||
EPEL_HV 16
|
||||
EPEL_HV 24
|
||||
EPEL_HV 32
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
||||
; hevc_put_unweighted_pred_<w>_<d>(pixel *dst, ptrdiff_t dststride,
|
||||
; int16_t *src, ptrdiff_t srcstride,
|
||||
; int height)
|
||||
|
||||
%macro AVG 5
|
||||
%if %3
|
||||
%if %4 == 4
|
||||
movq %5, %2
|
||||
paddsw %1, %5
|
||||
%else
|
||||
paddsw %1, %2
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; %1: 0 - one source; 1 - two sources
|
||||
; %2: width
|
||||
; %3: bit depth
|
||||
%macro PUT_PRED 3
|
||||
%if %1
|
||||
cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
|
||||
%else
|
||||
cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
|
||||
%endif
|
||||
|
||||
%assign shift 14 + %1 - %3
|
||||
%assign offset (1 << (shift - 1))
|
||||
%define offset_data pw_ %+ offset
|
||||
|
||||
mova m0, [offset_data]
|
||||
|
||||
%if %3 > 8
|
||||
%define STORE_BLOCK movu
|
||||
%define STORE_HALF movq
|
||||
|
||||
%assign pixel_max ((1 << %3) - 1)
|
||||
%define pw_pixel_max pw_ %+ pixel_max
|
||||
pxor m1, m1
|
||||
mova m2, [pw_pixel_max]
|
||||
%else
|
||||
%define STORE_BLOCK movq
|
||||
%define STORE_HALF movd
|
||||
%endif
|
||||
|
||||
.loop
|
||||
%assign i 0
|
||||
%rep (%2 + 7) / 8
|
||||
|
||||
%if (i + 1) * 8 > %2
|
||||
%define LOAD movq
|
||||
%define STORE STORE_HALF
|
||||
%else
|
||||
%define LOAD mova
|
||||
%define STORE STORE_BLOCK
|
||||
%endif
|
||||
|
||||
LOAD m3, [srcq + 16 * i]
|
||||
AVG m3, [src2q + 16 * i], %1, %3 - i * 8, m4
|
||||
|
||||
paddsw m3, m0
|
||||
psraw m3, shift
|
||||
|
||||
%if %3 == 8
|
||||
packuswb m3, m3
|
||||
STORE [dstq + 8 * i], m3
|
||||
%else
|
||||
CLIPW m3, m1, m2
|
||||
STORE [dstq + 16 * i], m3
|
||||
%endif
|
||||
%assign i (i + 1)
|
||||
%endrep
|
||||
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
%if %1
|
||||
add src2q, srcstrideq
|
||||
%endif
|
||||
|
||||
dec heightd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
PUT_PRED 0, 4, 8
|
||||
PUT_PRED 1, 4, 8
|
||||
PUT_PRED 0, 8, 8
|
||||
PUT_PRED 1, 8, 8
|
||||
PUT_PRED 0, 12, 8
|
||||
PUT_PRED 1, 12, 8
|
||||
PUT_PRED 0, 16, 8
|
||||
PUT_PRED 1, 16, 8
|
||||
PUT_PRED 0, 24, 8
|
||||
PUT_PRED 1, 24, 8
|
||||
PUT_PRED 0, 32, 8
|
||||
PUT_PRED 1, 32, 8
|
||||
PUT_PRED 0, 48, 8
|
||||
PUT_PRED 1, 48, 8
|
||||
PUT_PRED 0, 64, 8
|
||||
PUT_PRED 1, 64, 8
|
||||
|
||||
PUT_PRED 0, 4, 10
|
||||
PUT_PRED 1, 4, 10
|
||||
PUT_PRED 0, 8, 10
|
||||
PUT_PRED 1, 8, 10
|
||||
PUT_PRED 0, 12, 10
|
||||
PUT_PRED 1, 12, 10
|
||||
PUT_PRED 0, 16, 10
|
||||
PUT_PRED 1, 16, 10
|
||||
PUT_PRED 0, 24, 10
|
||||
PUT_PRED 1, 24, 10
|
||||
PUT_PRED 0, 32, 10
|
||||
PUT_PRED 1, 32, 10
|
||||
PUT_PRED 0, 48, 10
|
||||
PUT_PRED 1, 48, 10
|
||||
PUT_PRED 0, 64, 10
|
||||
PUT_PRED 1, 64, 10
|
||||
|
||||
%macro PUT_WEIGHTED_PRED 3
|
||||
%if %1
|
||||
cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
|
||||
%else
|
||||
cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
|
||||
%endif
|
||||
|
||||
and denomd, 0xff
|
||||
movsx weight0d, weight0w
|
||||
movsx offset0d, offset0w
|
||||
%if %1
|
||||
movsx weight1d, weight1w
|
||||
movsx offset1d, offset1w
|
||||
%endif
|
||||
|
||||
add denomd, 14 + %1 - %3
|
||||
movd m0, denomd
|
||||
|
||||
%if %3 > 8
|
||||
%assign pixel_max ((1 << %3) - 1)
|
||||
%define pw_pixel_max pw_ %+ pixel_max
|
||||
pxor m4, m4
|
||||
mova m5, [pw_pixel_max]
|
||||
|
||||
shl offset0d, %3 - 8
|
||||
%if %1
|
||||
shl offset1d, %3 - 8
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if %1
|
||||
lea offset0d, [offset0d + offset1d + 1]
|
||||
%else
|
||||
lea offset0d, [2 * offset0d + 1]
|
||||
%endif
|
||||
movd m1, offset0d
|
||||
SPLATD m1
|
||||
pslld m1, m0
|
||||
psrad m1, 1
|
||||
|
||||
movd m2, weight0d
|
||||
SPLATD m2
|
||||
%if %1
|
||||
movd m3, weight1d
|
||||
SPLATD m3
|
||||
%endif
|
||||
|
||||
.loop
|
||||
%assign i 0
|
||||
%rep (%2 + 3) / 4
|
||||
|
||||
pmovsxwd m6, [src0q + 8 * i]
|
||||
pmulld m6, m2
|
||||
|
||||
%if %1
|
||||
pmovsxwd m7, [src1q + 8 * i]
|
||||
pmulld m7, m3
|
||||
paddd m6, m7
|
||||
%endif
|
||||
|
||||
paddd m6, m1
|
||||
psrad m6, m0
|
||||
|
||||
packssdw m6, m6
|
||||
|
||||
%if %3 > 8
|
||||
CLIPW m6, m4, m5
|
||||
movq [dstq + 8 * i], m6
|
||||
%else
|
||||
packuswb m6, m6
|
||||
movd [dstq + 4 * i], m6
|
||||
%endif
|
||||
|
||||
%assign i (i + 1)
|
||||
%endrep
|
||||
|
||||
add dstq, dststrideq
|
||||
add src0q, srcstrideq
|
||||
%if %1
|
||||
add src1q, srcstrideq
|
||||
%endif
|
||||
|
||||
dec heightd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_64
|
||||
INIT_XMM sse4
|
||||
PUT_WEIGHTED_PRED 0, 4, 8
|
||||
PUT_WEIGHTED_PRED 1, 4, 8
|
||||
PUT_WEIGHTED_PRED 0, 8, 8
|
||||
PUT_WEIGHTED_PRED 1, 8, 8
|
||||
PUT_WEIGHTED_PRED 0, 12, 8
|
||||
PUT_WEIGHTED_PRED 1, 12, 8
|
||||
PUT_WEIGHTED_PRED 0, 16, 8
|
||||
PUT_WEIGHTED_PRED 1, 16, 8
|
||||
PUT_WEIGHTED_PRED 0, 24, 8
|
||||
PUT_WEIGHTED_PRED 1, 24, 8
|
||||
PUT_WEIGHTED_PRED 0, 32, 8
|
||||
PUT_WEIGHTED_PRED 1, 32, 8
|
||||
PUT_WEIGHTED_PRED 0, 48, 8
|
||||
PUT_WEIGHTED_PRED 1, 48, 8
|
||||
PUT_WEIGHTED_PRED 0, 64, 8
|
||||
PUT_WEIGHTED_PRED 1, 64, 8
|
||||
|
||||
PUT_WEIGHTED_PRED 0, 4, 10
|
||||
PUT_WEIGHTED_PRED 1, 4, 10
|
||||
PUT_WEIGHTED_PRED 0, 8, 10
|
||||
PUT_WEIGHTED_PRED 1, 8, 10
|
||||
PUT_WEIGHTED_PRED 0, 12, 10
|
||||
PUT_WEIGHTED_PRED 1, 12, 10
|
||||
PUT_WEIGHTED_PRED 0, 16, 10
|
||||
PUT_WEIGHTED_PRED 1, 16, 10
|
||||
PUT_WEIGHTED_PRED 0, 24, 10
|
||||
PUT_WEIGHTED_PRED 1, 24, 10
|
||||
PUT_WEIGHTED_PRED 0, 32, 10
|
||||
PUT_WEIGHTED_PRED 1, 32, 10
|
||||
PUT_WEIGHTED_PRED 0, 48, 10
|
||||
PUT_WEIGHTED_PRED 1, 48, 10
|
||||
PUT_WEIGHTED_PRED 0, 64, 10
|
||||
PUT_WEIGHTED_PRED 1, 64, 10
|
||||
|
||||
%endif ; ARCH_X86_64
|
@ -45,27 +45,260 @@ LFC_FUNCS(uint8_t, 10)
|
||||
LFL_FUNCS(uint8_t, 8)
|
||||
LFL_FUNCS(uint8_t, 10)
|
||||
|
||||
#define GET_PIXELS(width, depth, cf) \
|
||||
void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
|
||||
uint8_t *src, ptrdiff_t srcstride, \
|
||||
int height, int mx, int my, int16_t *mcbuffer);
|
||||
|
||||
GET_PIXELS(4, 8, sse2)
|
||||
GET_PIXELS(8, 8, sse2)
|
||||
GET_PIXELS(12, 8, sse2)
|
||||
GET_PIXELS(16, 8, sse2)
|
||||
GET_PIXELS(24, 8, sse2)
|
||||
GET_PIXELS(32, 8, sse2)
|
||||
GET_PIXELS(48, 8, sse2)
|
||||
GET_PIXELS(64, 8, sse2)
|
||||
|
||||
GET_PIXELS(4, 10, sse2)
|
||||
GET_PIXELS(8, 10, sse2)
|
||||
GET_PIXELS(12, 10, sse2)
|
||||
GET_PIXELS(16, 10, sse2)
|
||||
GET_PIXELS(24, 10, sse2)
|
||||
GET_PIXELS(32, 10, sse2)
|
||||
GET_PIXELS(48, 10, sse2)
|
||||
GET_PIXELS(64, 10, sse2)
|
||||
|
||||
/* those are independent of the bit depth, so declared separately */
|
||||
#define INTERP_HV_FUNC(width, cf) \
|
||||
void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
|
||||
int16_t *src, ptrdiff_t srcstride, \
|
||||
int height, int mx, int my, int16_t *mcbuffer); \
|
||||
void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
|
||||
int16_t *src, ptrdiff_t srcstride, \
|
||||
int height, int mx, int my, int16_t *mcbuffer);
|
||||
|
||||
INTERP_HV_FUNC(4, avx)
|
||||
INTERP_HV_FUNC(8, avx)
|
||||
INTERP_HV_FUNC(12, avx)
|
||||
INTERP_HV_FUNC(16, avx)
|
||||
INTERP_HV_FUNC(24, avx)
|
||||
INTERP_HV_FUNC(32, avx)
|
||||
INTERP_HV_FUNC(48, avx)
|
||||
INTERP_HV_FUNC(64, avx)
|
||||
|
||||
#if ARCH_X86_64
|
||||
#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
|
||||
static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
|
||||
uint8_t *src, ptrdiff_t srcstride, \
|
||||
int height, int mx, int my, int16_t *mcbuffer) \
|
||||
{ \
|
||||
const ptrdiff_t stride = FFALIGN(width + 7, 8); \
|
||||
ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
|
||||
height + 7, mx, my, mcbuffer); \
|
||||
ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride, \
|
||||
height, mx, my, mcbuffer); \
|
||||
}
|
||||
#else
|
||||
#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
|
||||
#endif
|
||||
|
||||
#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
|
||||
void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
|
||||
uint8_t *src, ptrdiff_t srcstride, \
|
||||
int height, int mx, int my, int16_t *mcbuffer); \
|
||||
void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
|
||||
uint8_t *src, ptrdiff_t srcstride, \
|
||||
int height, int mx, int my, int16_t *mcbuffer); \
|
||||
QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
|
||||
|
||||
QPEL_FUNCS(4, 8, ssse3, ssse3, avx)
|
||||
QPEL_FUNCS(8, 8, ssse3, ssse3, avx)
|
||||
QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
|
||||
QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
|
||||
QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
|
||||
QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
|
||||
QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
|
||||
QPEL_FUNCS(64, 8, ssse3, ssse3, avx)
|
||||
|
||||
QPEL_FUNCS(4, 10, avx, avx, avx)
|
||||
QPEL_FUNCS(8, 10, avx, avx, avx)
|
||||
QPEL_FUNCS(12, 10, avx, avx, avx)
|
||||
QPEL_FUNCS(16, 10, avx, avx, avx)
|
||||
QPEL_FUNCS(24, 10, avx, avx, avx)
|
||||
QPEL_FUNCS(32, 10, avx, avx, avx)
|
||||
QPEL_FUNCS(48, 10, avx, avx, avx)
|
||||
QPEL_FUNCS(64, 10, avx, avx, avx)
|
||||
|
||||
#if ARCH_X86_64
|
||||
#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
|
||||
static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
|
||||
uint8_t *src, ptrdiff_t srcstride, \
|
||||
int height, int mx, int my, int16_t *mcbuffer) \
|
||||
{ \
|
||||
const ptrdiff_t stride = FFALIGN(width + 3, 8); \
|
||||
ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride, \
|
||||
height + 3, mx, my, mcbuffer); \
|
||||
ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride, \
|
||||
height, mx, my, mcbuffer); \
|
||||
}
|
||||
#else
|
||||
#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
|
||||
#endif
|
||||
|
||||
#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
|
||||
void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
|
||||
uint8_t *src, ptrdiff_t srcstride, \
|
||||
int height, int mx, int my, int16_t *mcbuffer); \
|
||||
void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
|
||||
uint8_t *src, ptrdiff_t srcstride, \
|
||||
int height, int mx, int my, int16_t *mcbuffer); \
|
||||
EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
|
||||
|
||||
EPEL_FUNCS(4, 8, ssse3, ssse3, avx)
|
||||
EPEL_FUNCS(8, 8, ssse3, ssse3, avx)
|
||||
EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
|
||||
EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
|
||||
EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
|
||||
EPEL_FUNCS(32, 8, ssse3, ssse3, avx)
|
||||
|
||||
EPEL_FUNCS(4, 10, avx, avx, avx)
|
||||
EPEL_FUNCS(8, 10, avx, avx, avx)
|
||||
EPEL_FUNCS(12, 10, avx, avx, avx)
|
||||
EPEL_FUNCS(16, 10, avx, avx, avx)
|
||||
EPEL_FUNCS(24, 10, avx, avx, avx)
|
||||
EPEL_FUNCS(32, 10, avx, avx, avx)
|
||||
|
||||
/*
 * Declare the asm prototypes for the final prediction output stages of
 * one width/depth: the unweighted put / bi-directional average pair
 * (instruction set cf_uw) and the explicit weighted-prediction pair
 * (instruction set cf_w).  All read 16-bit intermediates and store the
 * reconstructed pixels into dst.
 */
#define PUT_PRED(width, depth, cf_uw, cf_w)                                                                        \
void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,          \
                                                                       int16_t *src, ptrdiff_t srcstride,          \
                                                                       int height);                                \
void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,      \
                                                                           int16_t *src1, int16_t *src2,           \
                                                                           ptrdiff_t srcstride, int height);       \
void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset, \
                                                                    uint8_t *dst, ptrdiff_t dststride,             \
                                                                    int16_t *src, ptrdiff_t srcstride,             \
                                                                    int height);                                   \
void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0,            \
                                                                        int16_t weight1, int16_t offset0,          \
                                                                        int16_t offset1,                           \
                                                                        uint8_t *dst, ptrdiff_t dststride,         \
                                                                        int16_t *src0, int16_t *src1,              \
                                                                        ptrdiff_t srcstride, int height);
/* Prediction output stages for every luma width, 8-bit:
 * unweighted variants in SSE2, weighted variants in SSE4. */
PUT_PRED(4,  8, sse2, sse4)
PUT_PRED(8,  8, sse2, sse4)
PUT_PRED(12, 8, sse2, sse4)
PUT_PRED(16, 8, sse2, sse4)
PUT_PRED(24, 8, sse2, sse4)
PUT_PRED(32, 8, sse2, sse4)
PUT_PRED(48, 8, sse2, sse4)
PUT_PRED(64, 8, sse2, sse4)

/* Same set for 10-bit content. */
PUT_PRED(4,  10, sse2, sse4)
PUT_PRED(8,  10, sse2, sse4)
PUT_PRED(12, 10, sse2, sse4)
PUT_PRED(16, 10, sse2, sse4)
PUT_PRED(24, 10, sse2, sse4)
PUT_PRED(32, 10, sse2, sse4)
PUT_PRED(48, 10, sse2, sse4)
PUT_PRED(64, 10, sse2, sse4)
/**
 * Install the x86 SIMD implementations into the HEVCDSPContext for the
 * given bit depth (8 or 10; other depths keep the C defaults).
 * A function pointer is only overridden when av_get_cpu_flags() reports
 * the instruction set its kernel requires.
 */
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

/* Fill a full luma table: entries 0..7 select the width-4..64 kernels. */
#define SET_LUMA_FUNCS(tabname, funcname, depth, cf)      \
    c->tabname[0] = funcname ## _4_  ## depth ## _ ## cf; \
    c->tabname[1] = funcname ## _8_  ## depth ## _ ## cf; \
    c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
    c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
    c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
    c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
    c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
    c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;

/* Fill a chroma table: only entries 1 and 3..7 (width-4..32 kernels)
 * are overridden; the remaining entries keep the C implementation. */
#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf)    \
    c->tabname[1] = funcname ## _4_  ## depth ## _ ## cf; \
    c->tabname[3] = funcname ## _8_  ## depth ## _ ## cf; \
    c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
    c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
    c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
    c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;

/* [v][h] indexes the fractional-position class: [0][0] = integer copy,
 * [0][1] = horizontal only, [1][0] = vertical only, [1][1] = both. */
#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS  (put_hevc_qpel[v][h], name, depth, cf)
#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)

    if (bit_depth == 8) {
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;

            SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
            SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);

            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     8, sse2);
            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 8, sse2);
            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     8, sse2);
            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
            SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
            SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
            SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;

            SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
            SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);

            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     10, sse2);
            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 10, sse2);
            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     10, sse2);
            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
        }
    }

#if ARCH_X86_64
    /* The luma loop filters, weighted prediction and the hv/10-bit
     * interpolation kernels are only installed on x86_64 builds. */
    if (bit_depth == 8) {
        if (EXTERNAL_SSSE3(cpu_flags)) {
            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
        }

        if (EXTERNAL_SSE4(cpu_flags)) {
            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     8, sse4);
            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     8, sse4);
            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 8, sse4);
            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
        }

        if (EXTERNAL_AVX(cpu_flags)) {
            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_SSSE3(cpu_flags)) {
            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
        }
        if (EXTERNAL_SSE4(cpu_flags)) {
            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     10, sse4);
            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     10, sse4);
            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 10, sse4);
            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
        }
    }
#endif /* ARCH_X86_64 */
}
Loading…
Reference in New Issue
Block a user