mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
rv40dsp x86: MMX/MMX2/3DNow/SSE2/SSSE3 implementations of MC
Code mostly inspired by vp8's MC, however:
- its MMX2 horizontal filter is worse because it can't take advantage of the coefficient redundancy
- that same coefficient redundancy allows better code for non-SSSE3 versions

Benchmark (rounded to tens of unit):
         V8x8  H8x8  2D8x8  V16x16  H16x16  2D16x16
C         445   358    985    1785    1559     3280
MMX*      219   271    478     714     929     1443
SSE2      131   158    294     425     515      892
SSSE3     120   122    248     387     390      763

End result is overall around a 15% speedup for SSSE3 version (on 6 sequences); all loop filter functions now take around 55% of decoding time, while luma MC dsp functions are around 6%, chroma ones are 1.3% and biweight around 2.3%.

Signed-off-by: Diego Biurrun <diego@biurrun.de>
This commit is contained in:
parent
706b998cdc
commit
110d0cdc9d
@ -1791,6 +1791,22 @@ QPEL_2TAP(avg_, 16, 3dnow)
|
|||||||
QPEL_2TAP(put_, 8, 3dnow)
|
QPEL_2TAP(put_, 8, 3dnow)
|
||||||
QPEL_2TAP(avg_, 8, 3dnow)
|
QPEL_2TAP(avg_, 8, 3dnow)
|
||||||
|
|
||||||
|
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
|
||||||
|
{
|
||||||
|
put_pixels8_xy2_mmx(dst, src, stride, 8);
|
||||||
|
}
|
||||||
|
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
|
||||||
|
{
|
||||||
|
put_pixels16_xy2_mmx(dst, src, stride, 16);
|
||||||
|
}
|
||||||
|
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
|
||||||
|
{
|
||||||
|
avg_pixels8_xy2_mmx(dst, src, stride, 8);
|
||||||
|
}
|
||||||
|
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
|
||||||
|
{
|
||||||
|
avg_pixels16_xy2_mmx(dst, src, stride, 16);
|
||||||
|
}
|
||||||
|
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
|
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
|
||||||
|
@ -199,6 +199,11 @@ void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
|
|||||||
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
|
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
|
||||||
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
|
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
|
||||||
|
|
||||||
|
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||||
|
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||||
|
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||||
|
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||||
|
|
||||||
void ff_mmx_idct(DCTELEM *block);
|
void ff_mmx_idct(DCTELEM *block);
|
||||||
void ff_mmxext_idct(DCTELEM *block);
|
void ff_mmxext_idct(DCTELEM *block);
|
||||||
|
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
;******************************************************************************
|
;******************************************************************************
|
||||||
;* MMX/SSE2-optimized functions for the RV40 decoder
|
;* MMX/SSE2-optimized functions for the RV40 decoder
|
||||||
|
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
|
||||||
|
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
|
||||||
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
|
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
|
||||||
;*
|
;*
|
||||||
;* This file is part of Libav.
|
;* This file is part of Libav.
|
||||||
@ -25,11 +27,319 @@
|
|||||||
SECTION_RODATA
|
SECTION_RODATA
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
shift_round: times 8 dw 1 << (16 - 6)
|
pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
|
||||||
cextern pw_16
|
|
||||||
|
sixtap_filter_hb_m: times 8 db 1, -5
|
||||||
|
times 8 db 52, 20
|
||||||
|
; multiplied by 2 to have the same shift
|
||||||
|
times 8 db 2, -10
|
||||||
|
times 8 db 40, 40
|
||||||
|
; back to normal
|
||||||
|
times 8 db 1, -5
|
||||||
|
times 8 db 20, 52
|
||||||
|
|
||||||
|
sixtap_filter_v_m: times 8 dw 1
|
||||||
|
times 8 dw -5
|
||||||
|
times 8 dw 52
|
||||||
|
times 8 dw 20
|
||||||
|
; multiplied by 2 to have the same shift
|
||||||
|
times 8 dw 2
|
||||||
|
times 8 dw -10
|
||||||
|
times 8 dw 40
|
||||||
|
times 8 dw 40
|
||||||
|
; back to normal
|
||||||
|
times 8 dw 1
|
||||||
|
times 8 dw -5
|
||||||
|
times 8 dw 20
|
||||||
|
times 8 dw 52
|
||||||
|
|
||||||
|
%ifdef PIC
|
||||||
|
%define sixtap_filter_hw picregq
|
||||||
|
%define sixtap_filter_hb picregq
|
||||||
|
%define sixtap_filter_v picregq
|
||||||
|
%define npicregs 1
|
||||||
|
%else
|
||||||
|
%define sixtap_filter_hw sixtap_filter_hw_m
|
||||||
|
%define sixtap_filter_hb sixtap_filter_hb_m
|
||||||
|
%define sixtap_filter_v sixtap_filter_v_m
|
||||||
|
%define npicregs 0
|
||||||
|
%endif
|
||||||
|
|
||||||
|
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
|
||||||
|
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
|
||||||
|
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
|
||||||
|
|
||||||
|
cextern pw_32
|
||||||
|
cextern pw_16
|
||||||
|
cextern pw_512
|
||||||
|
|
||||||
SECTION .text
|
SECTION .text
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; subpel MC functions:
|
||||||
|
;
|
||||||
|
; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
|
||||||
|
; uint8_t *src, int srcstride,
|
||||||
|
; int len, int m);
|
||||||
|
;----------------------------------------------------------------------
|
||||||
|
%macro LOAD 2
|
||||||
|
%if WIN64
|
||||||
|
movsxd %1q, %1d
|
||||||
|
%endif
|
||||||
|
%ifdef PIC
|
||||||
|
add %1q, picregq
|
||||||
|
%else
|
||||||
|
add %1q, %2
|
||||||
|
%endif
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro STORE 3
|
||||||
|
%ifidn %3, avg
|
||||||
|
movh %2, [dstq]
|
||||||
|
%endif
|
||||||
|
packuswb %1, %1
|
||||||
|
%ifidn %3, avg
|
||||||
|
%if cpuflag(3dnow)
|
||||||
|
pavgusb %1, %2
|
||||||
|
%else
|
||||||
|
pavgb %1, %2
|
||||||
|
%endif
|
||||||
|
%endif
|
||||||
|
movh [dstq], %1
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro FILTER_V 1
|
||||||
|
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
|
||||||
|
%ifdef PIC
|
||||||
|
lea picregq, [sixtap_filter_v_m]
|
||||||
|
%endif
|
||||||
|
pxor m7, m7
|
||||||
|
LOAD my, sixtap_filter_v
|
||||||
|
|
||||||
|
; read 5 lines
|
||||||
|
sub srcq, srcstrideq
|
||||||
|
sub srcq, srcstrideq
|
||||||
|
movh m0, [srcq]
|
||||||
|
movh m1, [srcq+srcstrideq]
|
||||||
|
movh m2, [srcq+srcstrideq*2]
|
||||||
|
lea srcq, [srcq+srcstrideq*2]
|
||||||
|
add srcq, srcstrideq
|
||||||
|
movh m3, [srcq]
|
||||||
|
movh m4, [srcq+srcstrideq]
|
||||||
|
punpcklbw m0, m7
|
||||||
|
punpcklbw m1, m7
|
||||||
|
punpcklbw m2, m7
|
||||||
|
punpcklbw m3, m7
|
||||||
|
punpcklbw m4, m7
|
||||||
|
|
||||||
|
%ifdef m8
|
||||||
|
mova m8, [myq+ 0]
|
||||||
|
mova m9, [myq+16]
|
||||||
|
mova m10, [myq+32]
|
||||||
|
mova m11, [myq+48]
|
||||||
|
%define COEFF05 m8
|
||||||
|
%define COEFF14 m9
|
||||||
|
%define COEFF2 m10
|
||||||
|
%define COEFF3 m11
|
||||||
|
%else
|
||||||
|
%define COEFF05 [myq+ 0]
|
||||||
|
%define COEFF14 [myq+16]
|
||||||
|
%define COEFF2 [myq+32]
|
||||||
|
%define COEFF3 [myq+48]
|
||||||
|
%endif
|
||||||
|
.nextrow:
|
||||||
|
mova m6, m1
|
||||||
|
movh m5, [srcq+2*srcstrideq] ; read new row
|
||||||
|
paddw m6, m4
|
||||||
|
punpcklbw m5, m7
|
||||||
|
pmullw m6, COEFF14
|
||||||
|
paddw m0, m5
|
||||||
|
pmullw m0, COEFF05
|
||||||
|
paddw m6, m0
|
||||||
|
mova m0, m1
|
||||||
|
paddw m6, [pw_32]
|
||||||
|
mova m1, m2
|
||||||
|
pmullw m2, COEFF2
|
||||||
|
paddw m6, m2
|
||||||
|
mova m2, m3
|
||||||
|
pmullw m3, COEFF3
|
||||||
|
paddw m6, m3
|
||||||
|
|
||||||
|
; round/clip/store
|
||||||
|
mova m3, m4
|
||||||
|
psraw m6, 6
|
||||||
|
mova m4, m5
|
||||||
|
STORE m6, m5, %1
|
||||||
|
|
||||||
|
; go to next line
|
||||||
|
add dstq, dststrideq
|
||||||
|
add srcq, srcstrideq
|
||||||
|
dec heightd ; next row
|
||||||
|
jg .nextrow
|
||||||
|
REP_RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro FILTER_H 1
|
||||||
|
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
|
||||||
|
%ifdef PIC
|
||||||
|
lea picregq, [sixtap_filter_v_m]
|
||||||
|
%endif
|
||||||
|
pxor m7, m7
|
||||||
|
LOAD mx, sixtap_filter_v
|
||||||
|
mova m6, [pw_32]
|
||||||
|
%ifdef m8
|
||||||
|
mova m8, [mxq+ 0]
|
||||||
|
mova m9, [mxq+16]
|
||||||
|
mova m10, [mxq+32]
|
||||||
|
mova m11, [mxq+48]
|
||||||
|
%define COEFF05 m8
|
||||||
|
%define COEFF14 m9
|
||||||
|
%define COEFF2 m10
|
||||||
|
%define COEFF3 m11
|
||||||
|
%else
|
||||||
|
%define COEFF05 [mxq+ 0]
|
||||||
|
%define COEFF14 [mxq+16]
|
||||||
|
%define COEFF2 [mxq+32]
|
||||||
|
%define COEFF3 [mxq+48]
|
||||||
|
%endif
|
||||||
|
.nextrow:
|
||||||
|
movq m0, [srcq-2]
|
||||||
|
movq m5, [srcq+3]
|
||||||
|
movq m1, [srcq-1]
|
||||||
|
movq m4, [srcq+2]
|
||||||
|
punpcklbw m0, m7
|
||||||
|
punpcklbw m5, m7
|
||||||
|
punpcklbw m1, m7
|
||||||
|
punpcklbw m4, m7
|
||||||
|
movq m2, [srcq-0]
|
||||||
|
movq m3, [srcq+1]
|
||||||
|
paddw m0, m5
|
||||||
|
paddw m1, m4
|
||||||
|
punpcklbw m2, m7
|
||||||
|
punpcklbw m3, m7
|
||||||
|
pmullw m0, COEFF05
|
||||||
|
pmullw m1, COEFF14
|
||||||
|
pmullw m2, COEFF2
|
||||||
|
pmullw m3, COEFF3
|
||||||
|
paddw m0, m6
|
||||||
|
paddw m1, m2
|
||||||
|
paddw m0, m3
|
||||||
|
paddw m0, m1
|
||||||
|
psraw m0, 6
|
||||||
|
STORE m0, m1, %1
|
||||||
|
|
||||||
|
; go to next line
|
||||||
|
add dstq, dststrideq
|
||||||
|
add srcq, srcstrideq
|
||||||
|
dec heightd ; next row
|
||||||
|
jg .nextrow
|
||||||
|
REP_RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%if ARCH_X86_32
|
||||||
|
INIT_MMX mmx
|
||||||
|
FILTER_V put
|
||||||
|
FILTER_H put
|
||||||
|
|
||||||
|
INIT_MMX mmx2
|
||||||
|
FILTER_V avg
|
||||||
|
FILTER_H avg
|
||||||
|
|
||||||
|
INIT_MMX 3dnow
|
||||||
|
FILTER_V avg
|
||||||
|
FILTER_H avg
|
||||||
|
%endif
|
||||||
|
|
||||||
|
INIT_XMM sse2
|
||||||
|
FILTER_H put
|
||||||
|
FILTER_H avg
|
||||||
|
FILTER_V put
|
||||||
|
FILTER_V avg
|
||||||
|
|
||||||
|
%macro FILTER_SSSE3 1
|
||||||
|
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
|
||||||
|
%ifdef PIC
|
||||||
|
lea picregq, [sixtap_filter_hb_m]
|
||||||
|
%endif
|
||||||
|
|
||||||
|
; read 5 lines
|
||||||
|
sub srcq, srcstrideq
|
||||||
|
LOAD my, sixtap_filter_hb
|
||||||
|
sub srcq, srcstrideq
|
||||||
|
movh m0, [srcq]
|
||||||
|
movh m1, [srcq+srcstrideq]
|
||||||
|
movh m2, [srcq+srcstrideq*2]
|
||||||
|
lea srcq, [srcq+srcstrideq*2]
|
||||||
|
add srcq, srcstrideq
|
||||||
|
mova m5, [myq]
|
||||||
|
movh m3, [srcq]
|
||||||
|
movh m4, [srcq+srcstrideq]
|
||||||
|
lea srcq, [srcq+2*srcstrideq]
|
||||||
|
|
||||||
|
.nextrow:
|
||||||
|
mova m6, m2
|
||||||
|
punpcklbw m0, m1
|
||||||
|
punpcklbw m6, m3
|
||||||
|
pmaddubsw m0, m5
|
||||||
|
pmaddubsw m6, [myq+16]
|
||||||
|
movh m7, [srcq] ; read new row
|
||||||
|
paddw m6, m0
|
||||||
|
mova m0, m1
|
||||||
|
mova m1, m2
|
||||||
|
mova m2, m3
|
||||||
|
mova m3, m4
|
||||||
|
mova m4, m7
|
||||||
|
punpcklbw m7, m3
|
||||||
|
pmaddubsw m7, m5
|
||||||
|
paddw m6, m7
|
||||||
|
pmulhrsw m6, [pw_512]
|
||||||
|
STORE m6, m7, %1
|
||||||
|
|
||||||
|
; go to next line
|
||||||
|
add dstq, dststrideq
|
||||||
|
add srcq, srcstrideq
|
||||||
|
dec heightd ; next row
|
||||||
|
jg .nextrow
|
||||||
|
REP_RET
|
||||||
|
|
||||||
|
cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
|
||||||
|
%ifdef PIC
|
||||||
|
lea picregq, [sixtap_filter_hb_m]
|
||||||
|
%endif
|
||||||
|
mova m3, [filter_h6_shuf2]
|
||||||
|
mova m4, [filter_h6_shuf3]
|
||||||
|
LOAD mx, sixtap_filter_hb
|
||||||
|
mova m5, [mxq] ; set up 6tap filter in bytes
|
||||||
|
mova m6, [mxq+16]
|
||||||
|
mova m7, [filter_h6_shuf1]
|
||||||
|
|
||||||
|
.nextrow:
|
||||||
|
movu m0, [srcq-2]
|
||||||
|
mova m1, m0
|
||||||
|
mova m2, m0
|
||||||
|
pshufb m0, m7
|
||||||
|
pshufb m1, m3
|
||||||
|
pshufb m2, m4
|
||||||
|
pmaddubsw m0, m5
|
||||||
|
pmaddubsw m1, m6
|
||||||
|
pmaddubsw m2, m5
|
||||||
|
paddw m0, m1
|
||||||
|
paddw m0, m2
|
||||||
|
pmulhrsw m0, [pw_512]
|
||||||
|
STORE m0, m1, %1
|
||||||
|
|
||||||
|
; go to next line
|
||||||
|
add dstq, dststrideq
|
||||||
|
add srcq, srcstrideq
|
||||||
|
dec heightd ; next row
|
||||||
|
jg .nextrow
|
||||||
|
REP_RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_XMM ssse3
|
||||||
|
FILTER_SSSE3 put
|
||||||
|
FILTER_SSSE3 avg
|
||||||
|
|
||||||
; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2
|
; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2
|
||||||
%macro RV40_WCORE 4-5
|
%macro RV40_WCORE 4-5
|
||||||
movh m4, [%3 + r6 + 0]
|
movh m4, [%3 + r6 + 0]
|
||||||
@ -143,7 +453,7 @@ SECTION .text
|
|||||||
%macro RV40_WEIGHT 3
|
%macro RV40_WEIGHT 3
|
||||||
cglobal rv40_weight_func_%1_%2, 6, 7, 8
|
cglobal rv40_weight_func_%1_%2, 6, 7, 8
|
||||||
%if cpuflag(ssse3)
|
%if cpuflag(ssse3)
|
||||||
mova m1, [shift_round]
|
mova m1, [pw_1024]
|
||||||
%else
|
%else
|
||||||
mova m1, [pw_16]
|
mova m1, [pw_16]
|
||||||
%endif
|
%endif
|
||||||
|
@ -22,8 +22,11 @@
|
|||||||
/**
|
/**
|
||||||
* @file
|
* @file
|
||||||
* RV40 decoder motion compensation functions x86-optimised
|
* RV40 decoder motion compensation functions x86-optimised
|
||||||
|
* 2,0 and 0,2 have h264 equivalents.
|
||||||
|
* 3,3 is bugged in the rv40 format and maps to _xy2 version
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "libavcodec/x86/dsputil_mmx.h"
|
||||||
#include "libavcodec/rv34dsp.h"
|
#include "libavcodec/rv34dsp.h"
|
||||||
|
|
||||||
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
|
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
|
||||||
@ -53,6 +56,132 @@ DECLARE_WEIGHT(mmx)
|
|||||||
DECLARE_WEIGHT(sse2)
|
DECLARE_WEIGHT(sse2)
|
||||||
DECLARE_WEIGHT(ssse3)
|
DECLARE_WEIGHT(ssse3)
|
||||||
|
|
||||||
|
/** @{ */
|
||||||
|
/**
|
||||||
|
* Define one qpel function.
|
||||||
|
* LOOPSIZE must be already set to the number of pixels processed per
|
||||||
|
* iteration in the inner loop of the called functions.
|
||||||
|
* HCOFF(x) and VCOFF(x) must be already defined so as to provide the offset into any
|
||||||
|
* array of coeffs used by the called function for the qpel position x.
|
||||||
|
*/
|
||||||
|
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT)                           \
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst,  \
                                                         uint8_t *src,  \
                                                         int stride)    \
{                                                                       \
    int i;                                                              \
    if (PH && PV) {                                                     \
        /* 2D position: horizontal pass into tmp, then vertical pass. */\
        /* The vertical filter reads 5 rows beyond the SIZE outputs.  */\
        DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)];           \
        /* The vertical filter starts 2 rows above its source.        */\
        uint8_t *tmpptr = tmp + SIZE * 2;                               \
        src -= stride * 2;                                              \
                                                                        \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride,    \
                                     SIZE + 5, HCOFF(PH));              \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i,   \
                                         SIZE, SIZE, VCOFF(PV));        \
    } else if (PV) {                                                    \
        /* Vertical-only position. */                                   \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i,     \
                                          stride, SIZE, VCOFF(PV));     \
    } else {                                                            \
        /* Horizontal-only position. */                                 \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i,     \
                                          stride, SIZE, HCOFF(PH));     \
    }                                                                   \
    /* No ';' after the closing brace: it would expand to an empty   */ \
    /* file-scope declaration, which ISO C does not allow.           */ \
}
|
||||||
|
|
||||||
|
/** Declare functions for sizes 8 and 16 and given operations
|
||||||
|
* and qpel position. */
|
||||||
|
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
|
||||||
|
QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \
|
||||||
|
QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
|
||||||
|
|
||||||
|
/** Declare all functions for all sizes and qpel positions */
|
||||||
|
#define QPEL_MC_DECL(OP, OPT) \
|
||||||
|
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
|
||||||
|
const uint8_t *src, \
|
||||||
|
ptrdiff_t srcStride, \
|
||||||
|
int len, int m); \
|
||||||
|
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
|
||||||
|
const uint8_t *src, \
|
||||||
|
ptrdiff_t srcStride, \
|
||||||
|
int len, int m); \
|
||||||
|
QPEL_FUNCS_DECL(OP, 0, 1, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 0, 3, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 1, 0, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 1, 1, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 1, 2, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 1, 3, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 2, 1, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 2, 2, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 2, 3, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 3, 0, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 3, 1, OPT) \
|
||||||
|
QPEL_FUNCS_DECL(OP, 3, 2, OPT)
|
||||||
|
/** @} */
|
||||||
|
|
||||||
|
#define LOOPSIZE 8
|
||||||
|
#define HCOFF(x) (32 * (x - 1))
|
||||||
|
#define VCOFF(x) (32 * (x - 1))
|
||||||
|
QPEL_MC_DECL(put_, _ssse3)
|
||||||
|
QPEL_MC_DECL(avg_, _ssse3)
|
||||||
|
|
||||||
|
#undef LOOPSIZE
|
||||||
|
#undef HCOFF
|
||||||
|
#undef VCOFF
|
||||||
|
#define LOOPSIZE 8
|
||||||
|
#define HCOFF(x) (64 * (x - 1))
|
||||||
|
#define VCOFF(x) (64 * (x - 1))
|
||||||
|
QPEL_MC_DECL(put_, _sse2)
|
||||||
|
QPEL_MC_DECL(avg_, _sse2)
|
||||||
|
|
||||||
|
#if ARCH_X86_32
|
||||||
|
#undef LOOPSIZE
|
||||||
|
#undef HCOFF
|
||||||
|
#undef VCOFF
|
||||||
|
#define LOOPSIZE 4
|
||||||
|
#define HCOFF(x) (64 * (x - 1))
|
||||||
|
#define VCOFF(x) (64 * (x - 1))
|
||||||
|
|
||||||
|
QPEL_MC_DECL(put_, _mmx)
|
||||||
|
|
||||||
|
#define ff_put_rv40_qpel_h_mmx2 ff_put_rv40_qpel_h_mmx
|
||||||
|
#define ff_put_rv40_qpel_v_mmx2 ff_put_rv40_qpel_v_mmx
|
||||||
|
QPEL_MC_DECL(avg_, _mmx2)
|
||||||
|
|
||||||
|
#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
|
||||||
|
#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
|
||||||
|
QPEL_MC_DECL(avg_, _3dnow)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/** @{ */
|
||||||
|
/** Set one function */
|
||||||
|
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
|
||||||
|
c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;
|
||||||
|
|
||||||
|
/** Set functions for sizes 8 and 16 and a given operation and qpel position */
|
||||||
|
#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
|
||||||
|
QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \
|
||||||
|
QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
|
||||||
|
|
||||||
|
/** Set all functions for all sizes and qpel positions */
|
||||||
|
#define QPEL_MC_SET(OP, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 0, 1, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 0, 3, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 1, 0, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 1, 1, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 1, 2, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 1, 3, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 2, 1, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 2, 2, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 2, 3, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 3, 0, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 3, 1, OPT) \
|
||||||
|
QPEL_FUNCS_SET (OP, 3, 2, OPT)
|
||||||
|
/** @} */
|
||||||
|
|
||||||
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
|
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
|
||||||
{
|
{
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
@ -65,25 +194,42 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
|
|||||||
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx;
|
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx;
|
||||||
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx;
|
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx;
|
||||||
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx;
|
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx;
|
||||||
|
c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx;
|
||||||
|
c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx;
|
||||||
|
c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx;
|
||||||
|
c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx;
|
||||||
|
#if ARCH_X86_32
|
||||||
|
QPEL_MC_SET(put_, _mmx)
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
if (mm_flags & AV_CPU_FLAG_MMX2) {
|
if (mm_flags & AV_CPU_FLAG_MMX2) {
|
||||||
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
|
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
|
||||||
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2;
|
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2;
|
||||||
|
#if ARCH_X86_32
|
||||||
|
QPEL_MC_SET(avg_, _mmx2)
|
||||||
|
#endif
|
||||||
} else if (mm_flags & AV_CPU_FLAG_3DNOW) {
|
} else if (mm_flags & AV_CPU_FLAG_3DNOW) {
|
||||||
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
|
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
|
||||||
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
|
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
|
||||||
|
#if ARCH_X86_32
|
||||||
|
QPEL_MC_SET(avg_, _3dnow)
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
if (mm_flags & AV_CPU_FLAG_SSE2) {
|
if (mm_flags & AV_CPU_FLAG_SSE2) {
|
||||||
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
|
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
|
||||||
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
|
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
|
||||||
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
|
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
|
||||||
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
|
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
|
||||||
|
QPEL_MC_SET(put_, _sse2)
|
||||||
|
QPEL_MC_SET(avg_, _sse2)
|
||||||
}
|
}
|
||||||
if (mm_flags & AV_CPU_FLAG_SSSE3) {
|
if (mm_flags & AV_CPU_FLAG_SSSE3) {
|
||||||
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
|
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
|
||||||
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
|
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
|
||||||
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
|
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
|
||||||
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
|
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
|
||||||
|
QPEL_MC_SET(put_, _ssse3)
|
||||||
|
QPEL_MC_SET(avg_, _ssse3)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user