mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-03-03 14:32:16 +02:00
x86/vc1dsp: Port vc1_*_hor_16b_shift2 to NASM format
Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
This commit is contained in:
parent
ebf648d490
commit
bcc223523e
@ -25,6 +25,7 @@
|
||||
cextern pw_4
|
||||
cextern pw_5
|
||||
cextern pw_9
|
||||
cextern pw_128
|
||||
|
||||
section .text
|
||||
|
||||
@ -319,6 +320,44 @@ cglobal vc1_h_loop_filter8, 3,5,8
|
||||
RET
|
||||
|
||||
%if HAVE_MMX_INLINE
|
||||
|
||||
; XXX some of these macros are not used right now, but they will be in the future
|
||||
; when more functions are ported.
|
||||
|
||||
%macro OP_PUT 2 ; dst, src
|
||||
%endmacro
|
||||
|
||||
%macro OP_AVG 2 ; dst, src
|
||||
pavgb %1, %2
|
||||
%endmacro
|
||||
|
||||
; Add the rounder held in m7 to m3 and m4, then arithmetic-shift every
; 16-bit lane of both registers right by the given count.
; Expects m7 to have been set up beforehand (see LOAD_ROUNDER_MMX).
%macro NORMALIZE_MMX 1 ; shift
|
||||
paddw m3, m7 ; +bias-r
|
||||
paddw m4, m7 ; +bias-r
|
||||
psraw m3, %1 ; signed shift, keeps the sign of negative intermediates
|
||||
psraw m4, %1
|
||||
%endmacro
|
||||
|
||||
; Pack the 16-bit lanes of m3 (low half) and m4 (high half) into unsigned
; saturated bytes in m3, combine with the destination through op
; (OP_PUT expands to nothing, OP_AVG to pavgb), then store m3 to dst.
%macro TRANSFER_DO_PACK 2 ; op, dst
|
||||
packuswb m3, m4 ; words -> bytes with unsigned saturation
|
||||
%1 m3, [%2] ; op against the current destination contents
|
||||
mova [%2], m3
|
||||
%endmacro
|
||||
|
||||
; Store the two unpacked 16-bit halves held in m3 and m4 to dst and
; dst + mmsize, first combining each half with the destination through op
; (OP_PUT expands to nothing, OP_AVG to pavgb).
%macro TRANSFER_DONT_PACK 2 ; op, dst
    %1               m3, [%2]
    ; The second half lives in m4 and is what gets stored at dst + mmsize,
    ; so op must be applied to m4 here (it was mistakenly applied to m3).
    %1               m4, [%2 + mmsize]
    mova           [%2], m3
    mova  [%2 + mmsize], m4
%endmacro
|
||||
|
||||
; see MSPEL_FILTER13_CORE for use as UNPACK macro
|
||||
%macro DO_UNPACK 1 ; reg
|
||||
punpcklbw %1, m0
|
||||
%endmacro
|
||||
%macro DONT_UNPACK 1 ; reg
|
||||
%endmacro
|
||||
|
||||
; Compute the rounder (32-r or 8-r) and unpack it to m7
|
||||
%macro LOAD_ROUNDER_MMX 1 ; round
|
||||
movd m7, %1
|
||||
@ -394,6 +433,57 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
|
||||
dec i
|
||||
jnz .loop
|
||||
REP_RET
|
||||
%undef rnd
|
||||
%undef shift
|
||||
%undef stride_neg2
|
||||
%undef stride_9minus4
|
||||
%undef i
|
||||
|
||||
; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
|
||||
; const int16_t *src, int rnd);
|
||||
; Data is already unpacked, so some operations can directly be made from
|
||||
; memory.
|
||||
; Emits vc1_<opname>_hor_16b_shift2 for the current INIT_* instruction set.
; For each of 8 rows it reads 16-bit samples s[] starting one sample before
; src, computes (9*(s[1]+s[2]) - (s[0]+s[3]) + rounder) >> 7 per lane, adds
; m6 back to undo the bias, packs the result to unsigned bytes and hands it
; to op (OP_PUT / OP_AVG) for the store to dst.
;   %1 = op macro applied against the destination, %2 = name infix (put/avg)
%macro HOR_16B_SHIFT2 2 ; op, opname
|
||||
cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
|
||||
mov hq, 8 ; row counter: the loop below runs 8 times
|
||||
||||
sub srcq, 2 ; step back 2 bytes, i.e. one 16-bit sample
|
||||
sub rndd, (-1+9+9-1) * 1024 ; add -1024 bias
|
||||
||||
LOAD_ROUNDER_MMX rndq ; biased rounder -> m7
|
||||
||||
mova m5, [pw_9] ; multiplier for the two centre taps
|
||||
||||
mova m6, [pw_128] ; added after the shift to remove the bias
|
||||
||||
; NOTE(review): m0 is zeroed here but is not read by any instruction visible
; in this macro, NORMALIZE_MMX or TRANSFER_DO_PACK - confirm it is needed.
pxor m0, m0
|
||||
|
||||
.loop:
|
||||
; m1/m2 collect the outer taps s[0]+s[3], m3/m4 the centre taps s[1]+s[2];
; the odd registers hold the first mmsize bytes, the even ones the next.
mova m1, [srcq + 2 * 0]
|
||||
mova m2, [srcq + 2 * 0 + mmsize]
|
||||
mova m3, [srcq + 2 * 1]
|
||||
mova m4, [srcq + 2 * 1 + mmsize]
|
||||
paddw m3, [srcq + 2 * 2]
|
||||
paddw m4, [srcq + 2 * 2 + mmsize]
|
||||
paddw m1, [srcq + 2 * 3]
|
||||
paddw m2, [srcq + 2 * 3 + mmsize]
|
||||
pmullw m3, m5 ; 9 * (s[1] + s[2])
|
||||
pmullw m4, m5
|
||||
psubw m3, m1 ; - (s[0] + s[3])
|
||||
psubw m4, m2
|
||||
NORMALIZE_MMX 7 ; + m7, then >> 7
|
||||
; remove bias
|
||||
paddw m3, m6
|
||||
paddw m4, m6
|
||||
TRANSFER_DO_PACK %1, dstq ; pack to bytes, apply op, store one row
|
||||
add srcq, 24 ; advance the source by 24 bytes per row
|
||||
add dstq, strideq
|
||||
dec hq
|
||||
jnz .loop
|
||||
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmx
|
||||
HOR_16B_SHIFT2 OP_PUT, put
|
||||
|
||||
INIT_MMX mmxext
|
||||
HOR_16B_SHIFT2 OP_AVG, avg
|
||||
%endif ; HAVE_MMX_INLINE
|
||||
|
||||
%macro INV_TRANS_INIT 0
|
||||
|
@ -38,6 +38,10 @@
|
||||
void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
|
||||
const uint8_t *src, x86_reg stride,
|
||||
int rnd, int64_t shift);
|
||||
void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
|
||||
const int16_t *src, int rnd);
|
||||
void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
|
||||
const int16_t *src, int rnd);
|
||||
|
||||
#define OP_PUT(S,D)
|
||||
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
|
||||
@ -70,55 +74,6 @@ void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
|
||||
"punpcklwd %%mm7, %%mm7 \n\t" \
|
||||
"punpckldq %%mm7, %%mm7 \n\t"
|
||||
|
||||
/**
|
||||
* Data is already unpacked, so some operations can directly be made from
|
||||
* memory.
|
||||
*/
|
||||
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
|
||||
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
|
||||
const int16_t *src, int rnd)\
|
||||
{\
|
||||
int h = 8;\
|
||||
\
|
||||
src -= 1;\
|
||||
rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
|
||||
__asm__ volatile(\
|
||||
LOAD_ROUNDER_MMX("%4")\
|
||||
"movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
|
||||
"movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
|
||||
"1: \n\t"\
|
||||
"movq 2*0+0(%1), %%mm1 \n\t"\
|
||||
"movq 2*0+8(%1), %%mm2 \n\t"\
|
||||
"movq 2*1+0(%1), %%mm3 \n\t"\
|
||||
"movq 2*1+8(%1), %%mm4 \n\t"\
|
||||
"paddw 2*3+0(%1), %%mm1 \n\t"\
|
||||
"paddw 2*3+8(%1), %%mm2 \n\t"\
|
||||
"paddw 2*2+0(%1), %%mm3 \n\t"\
|
||||
"paddw 2*2+8(%1), %%mm4 \n\t"\
|
||||
"pmullw %%mm5, %%mm3 \n\t"\
|
||||
"pmullw %%mm5, %%mm4 \n\t"\
|
||||
"psubw %%mm1, %%mm3 \n\t"\
|
||||
"psubw %%mm2, %%mm4 \n\t"\
|
||||
NORMALIZE_MMX("$7")\
|
||||
/* Remove bias */\
|
||||
"paddw %%mm6, %%mm3 \n\t"\
|
||||
"paddw %%mm6, %%mm4 \n\t"\
|
||||
TRANSFER_DO_PACK(OP)\
|
||||
"add $24, %1 \n\t"\
|
||||
"add %3, %2 \n\t"\
|
||||
"decl %0 \n\t"\
|
||||
"jnz 1b \n\t"\
|
||||
: "+r"(h), "+r" (src), "+r" (dst)\
|
||||
: "r"(stride), "m"(rnd)\
|
||||
NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
|
||||
: "memory"\
|
||||
);\
|
||||
}
|
||||
|
||||
VC1_HOR_16b_SHIFT2(OP_PUT, put_)
|
||||
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
|
||||
|
||||
|
||||
/**
|
||||
* Purely vertical or horizontal 1/2 shift interpolation.
|
||||
* Sacrifice mm6 for the *9 factor.
|
||||
@ -380,14 +335,14 @@ typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_
|
||||
* @param hmode Vertical filter.
|
||||
* @param rnd Rounding bias.
|
||||
*/
|
||||
#define VC1_MSPEL_MC(OP)\
|
||||
#define VC1_MSPEL_MC(OP, INSTR)\
|
||||
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
|
||||
int hmode, int vmode, int rnd)\
|
||||
{\
|
||||
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
|
||||
{ NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
|
||||
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
|
||||
{ NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
|
||||
{ NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
|
||||
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
|
||||
{ NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
|
||||
\
|
||||
@ -428,8 +383,8 @@ static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
|
||||
OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
|
||||
}
|
||||
|
||||
VC1_MSPEL_MC(put_)
|
||||
VC1_MSPEL_MC(avg_)
|
||||
VC1_MSPEL_MC(put_, mmx)
|
||||
VC1_MSPEL_MC(avg_, mmxext)
|
||||
|
||||
/** Macro to ease bicubic filter interpolation functions declarations */
|
||||
#define DECLARE_FUNCTION(a, b) \
|
||||
|
Loading…
x
Reference in New Issue
Block a user