mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-04-08 16:54:03 +02:00
vc1dsp: Port ff_vc1_put_ver_16b_shift2_mmx to yasm
This function is only used from within other inline-assembly functions, hence the HAVE_MMX_INLINE guard. Per recent discussions, we should not worry about the performance of builds that rely solely on inline assembly.
This commit is contained in:
parent
12628e3369
commit
ab5f43e634
@ -1,5 +1,6 @@
|
||||
;******************************************************************************
|
||||
;* VC1 deblocking optimizations
|
||||
;* VC1 DSP optimizations
|
||||
;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
|
||||
;* Copyright (c) 2009 David Conrad
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
@ -23,6 +24,7 @@
|
||||
|
||||
cextern pw_4
|
||||
cextern pw_5
|
||||
cextern pw_9
|
||||
|
||||
section .text
|
||||
|
||||
@ -315,3 +317,88 @@ cglobal vc1_h_loop_filter8, 3,5,8
|
||||
START_H_FILTER 8
|
||||
VC1_H_LOOP_FILTER 8
|
||||
RET
|
||||
|
||||
%if HAVE_MMX_INLINE
|
||||
; Apply the bias/rounder held in m7 to the two word accumulators
; (m3 and m4) and arithmetic-shift them back down to pixel range.
%macro NORMALIZE_MMX 1 ; shift
    paddw      m3, m7            ; m3 += bias - r
    psraw      m3, %1
    paddw      m4, m7            ; m4 += bias - r
    psraw      m4, %1
%endmacro
|
||||
|
||||
; Compute the rounder 32-r or 8-r and unpack it to m7:
; broadcast the 16-bit rounding constant to all four words of m7.
%macro LOAD_ROUNDER_MMX 1 ; round
    movd       m7, %1            ; m7 = 0000 0000 0000 rrrr
    punpcklwd  m7, m7            ; m7 = 0000 0000 rrrr rrrr
    punpckldq  m7, m7            ; m7 = rrrr rrrr rrrr rrrr
%endmacro
|
||||
|
||||
; Produce one output row of the 4-tap (-1, 9, 9, -1) vertical filter:
;   dst[%1] = (9*(a + b) - c - d + rounder) >> shift
; where a/b are the two already-unpacked center rows in m%3/m%4, and
; c/d are the rows at srcq - 2*stride and srcq + stride.  Loads are
; issued early and interleaved with the arithmetic so memory accesses
; pipeline; rows loaded here (m%2, m%5) become center taps of later
; invocations via the caller's register rotation.  Advances srcq by
; one source row.
%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
    paddw      m%3, m%4                   ; sum of the two center taps
    movh       m%2, [srcq + stride_neg2]  ; load row at src - 2*stride
    pmullw     m%3, m6                    ; *= 9 (m6 holds pw_9)
    punpcklbw  m%2, m0                    ; bytes -> words (m0 == 0)
    movh       m%5, [srcq + strideq]      ; load row at src + stride
    psubw      m%3, m%2                   ; -= outer tap (above)
    punpcklbw  m%5, m0                    ; bytes -> words
    paddw      m%3, m7                    ; += rounder (broadcast in m7)
    psubw      m%3, m%5                   ; -= outer tap (below)
    psraw      m%3, shift                 ; normalize
    movu       [dstq + %1], m%3           ; store 4 words at byte offset %1
    add        srcq, strideq              ; advance to the next source row
%endmacro
|
||||
|
||||
INIT_MMX mmx
; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
;                                    x86_reg stride, int rnd, int64_t shift)
; Vertical (-1, 9, 9, -1) filter producing 16-bit intermediates.
; Processes three 4-column strips of 8 output rows each; output rows
; are 24 bytes apart in dst (store offsets 0..168 step 24 below).
; Sacrificing m6 makes it possible to pipeline loads from src
%if ARCH_X86_32
cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
    DECLARE_REG_TMP     3, 4, 5
    %define rnd r3mp                      ; both extra args live on the stack
    %define shift qword r4m
%else ; X86_64
cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
    DECLARE_REG_TMP     4, 5, 6
    %define rnd r3d
    ; We need shift either in memory or in a mm reg as it's used in psraw
    ; On WIN64, the arg is already on the stack
    ; On UNIX64, m5 doesn't seem to be used
%if WIN64
    %define shift r4mp
%else ; UNIX64
    %define shift m5
    mova shift, r4q                       ; stash the shift count in mm5
%endif ; WIN64
%endif ; X86_32
%define stride_neg2     t0q               ; -2*stride: offset of the top outer tap
%define stride_9minus4  t1q               ; 9*stride - 4: per-strip src rewind
%define i               t2q               ; strip counter
    mov        stride_neg2, strideq
    neg        stride_neg2
    add        stride_neg2, stride_neg2   ; stride_neg2 = -2 * stride
    lea        stride_9minus4, [strideq * 9 - 4]
    mov        i, 3                       ; three 4-column strips
    LOAD_ROUNDER_MMX rnd                  ; broadcast rounder to m7
    mova       m6, [pw_9]                 ; center-tap multiplier
    pxor       m0, m0                     ; zero register for unpacking
.loop:
    ; Prime the pipeline with the first two (center-tap) source rows.
    movh       m2, [srcq]
    add        srcq, strideq
    movh       m3, [srcq]
    punpcklbw  m2, m0
    punpcklbw  m3, m0
    ; 8 output rows; m1-m4 rotate roles so rows loaded by one call are
    ; reused as center taps by the following calls.
    SHIFT2_LINE   0, 1, 2, 3, 4
    SHIFT2_LINE  24, 2, 3, 4, 1
    SHIFT2_LINE  48, 3, 4, 1, 2
    SHIFT2_LINE  72, 4, 1, 2, 3
    SHIFT2_LINE  96, 1, 2, 3, 4
    SHIFT2_LINE 120, 2, 3, 4, 1
    SHIFT2_LINE 144, 3, 4, 1, 2
    SHIFT2_LINE 168, 4, 1, 2, 3
    sub        srcq, stride_9minus4       ; back up 9 rows, advance 4 columns
    add        dstq, 8                    ; next 4 int16 output columns
    dec        i
    jnz .loop
    REP_RET
|
||||
%endif ; HAVE_MMX_INLINE
|
||||
|
@ -33,7 +33,11 @@
|
||||
#include "fpel.h"
|
||||
#include "vc1dsp.h"
|
||||
|
||||
#if HAVE_6REGS && HAVE_INLINE_ASM
|
||||
#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
|
||||
|
||||
void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
|
||||
const uint8_t *src, x86_reg stride,
|
||||
int rnd, int64_t shift);
|
||||
|
||||
#define OP_PUT(S,D)
|
||||
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
|
||||
@ -66,55 +70,6 @@
|
||||
"punpcklwd %%mm7, %%mm7 \n\t" \
|
||||
"punpckldq %%mm7, %%mm7 \n\t"
|
||||
|
||||
/* One output row of the 4-tap (-1, 9, 9, -1) vertical filter:
 *   dst[OFF] = (9*(mmR1 + mmR2) - src[-2*stride] - src[stride] + rounder) >> shift
 * Asm operands (see the constraint list in the caller):
 *   %0 = src, %1 = dst, %2 = stride, %3 = -2*stride, %4 = shift.
 * Loads for later invocations (R0/R3) are interleaved with the
 * arithmetic so memory accesses pipeline; advances src by one row. */
#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
    "paddw %%mm"#R2", %%mm"#R1" \n\t" \
    "movd (%0,%3), %%mm"#R0" \n\t" \
    "pmullw %%mm6, %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R0" \n\t" \
    "movd (%0,%2), %%mm"#R3" \n\t" \
    "psubw %%mm"#R0", %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R3" \n\t" \
    "paddw %%mm7, %%mm"#R1" \n\t" \
    "psubw %%mm"#R3", %%mm"#R1" \n\t" \
    "psraw %4, %%mm"#R1" \n\t" \
    "movq %%mm"#R1", "#OFF"(%1) \n\t" \
    "add %2, %0 \n\t"
|
||||
|
||||
/**
 * Sacrificing mm6 makes it possible to pipeline loads from src.
 *
 * Vertical (-1, 9, 9, -1) filter producing 16-bit intermediates:
 * three 4-column strips of 8 output rows each (output rows are 24
 * bytes apart, see the SHIFT2_LINE offsets).  Asm operand mapping:
 * %0 = src, %1 = dst, %2 = stride, %3 = -2*stride, %4 = shift,
 * %5 = rnd, %6 = 9*stride - 4 (per-strip src rewind).
 * NOTE(review): only REG_c and "memory" are declared clobbered;
 * mm0-mm7 are modified too -- presumably relying on this file's
 * MMX usage conventions; confirm against the other functions here.
 */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov $3, %%"REG_c" \n\t" /* three 4-column strips */
        LOAD_ROUNDER_MMX("%5")   /* broadcast rounder to mm7 */
        "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1: \n\t"
        /* prime the pipeline with the first two (center-tap) rows */
        "movd (%0), %%mm2 \n\t"
        "add %2, %0 \n\t"
        "movd (%0), %%mm3 \n\t"
        "punpcklbw %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm0, %%mm3 \n\t"
        /* mm1-mm4 rotate roles so freshly loaded rows are reused as
         * center taps by the following calls */
        SHIFT2_LINE( 0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub %6, %0 \n\t"        /* back up 9 rows, advance 4 columns */
        "add $8, %1 \n\t"        /* next 4 int16 output columns */
        "dec %%"REG_c" \n\t"
        "jnz 1b \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
          NAMED_CONSTRAINTS_ADD(ff_pw_9)
        : "%"REG_c, "memory"
    );
}
|
||||
|
||||
/**
|
||||
* Data is already unpacked, so some operations can directly be made from
|
||||
* memory.
|
||||
@ -430,7 +385,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
|
||||
int hmode, int vmode, int rnd)\
|
||||
{\
|
||||
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
|
||||
{ NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
|
||||
{ NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
|
||||
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
|
||||
{ NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
|
||||
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
|
||||
@ -780,4 +735,4 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
|
||||
dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
|
||||
dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
|
||||
}
|
||||
#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
|
||||
#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
|
||||
|
Loading…
x
Reference in New Issue
Block a user