mirror of https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00

port ape dsp functions from sse2 to mmx

now requires yasm

Originally committed as revision 20722 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent f415be684d
commit b10fa1bb8b
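The functions being ported are the int16 helpers that the APE decoder drives through DSPContext: add_int16, sub_int16 and scalarproduct_int16. As a reading aid, here is a minimal sketch of a call site using only the signatures visible in this diff; the struct is trimmed to the three members touched here, and the names Int16DSP and predictor_step are made up for the example, not taken from the commit.

#include <stdint.h>

/* Trimmed-down stand-in for the three DSPContext members touched by this
 * commit; the real DSPContext has many more fields. */
typedef struct {
    void    (*add_int16)(int16_t *v1, int16_t *v2, int order);
    void    (*sub_int16)(int16_t *v1, int16_t *v2, int order);
    int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2, int order, int shift);
} Int16DSP;

/* Illustrative caller (not from this commit): dot product of filter
 * coefficients against a history buffer, then an in-place update.
 * The SIMD loops below consume 32 bytes (SSE2) or 16 bytes (MMX2) per
 * iteration with no scalar tail, so callers presumably keep `order` a
 * multiple that both variants handle. */
static int32_t predictor_step(Int16DSP *dsp, int16_t *coeffs,
                              int16_t *history, int order)
{
    int32_t dot = dsp->scalarproduct_int16(coeffs, history, order, 0);
    dsp->add_int16(coeffs, history, order);   /* coeffs[i] += history[i] */
    return dot;
}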
libavcodec/x86/dsputil_mmx.c

@@ -2384,6 +2384,12 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
+int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2507,78 +2513,6 @@ void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, ui
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
-
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                              \n\t"
-        "movdqu   (%1,%2),   %%xmm0      \n\t"
-        "movdqu 16(%1,%2),   %%xmm1      \n\t"
-        "paddw    (%0,%2),   %%xmm0      \n\t"
-        "paddw  16(%0,%2),   %%xmm1      \n\t"
-        "movdqa   %%xmm0,    (%0,%2)     \n\t"
-        "movdqa   %%xmm1,  16(%0,%2)     \n\t"
-        "add        $32,     %2          \n\t"
-        "js 1b                           \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                              \n\t"
-        "movdqa   (%0,%2),   %%xmm0      \n\t"
-        "movdqa 16(%0,%2),   %%xmm2      \n\t"
-        "movdqu   (%1,%2),   %%xmm1      \n\t"
-        "movdqu 16(%1,%2),   %%xmm3      \n\t"
-        "psubw    %%xmm1,    %%xmm0      \n\t"
-        "psubw    %%xmm3,    %%xmm2      \n\t"
-        "movdqa   %%xmm0,    (%0,%2)     \n\t"
-        "movdqa   %%xmm2,  16(%0,%2)     \n\t"
-        "add        $32,     %2          \n\t"
-        "js 1b                           \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
-{
-    int res = 0;
-    DECLARE_ALIGNED_16(xmm_reg, sh);
-    x86_reg o = -(order << 1);
-
-    v1 += order;
-    v2 += order;
-    sh.a = shift;
-    __asm__ volatile(
-        "pxor      %%xmm7,  %%xmm7        \n\t"
-        "1:                               \n\t"
-        "movdqu    (%0,%3), %%xmm0        \n\t"
-        "movdqu  16(%0,%3), %%xmm1        \n\t"
-        "pmaddwd   (%1,%3), %%xmm0        \n\t"
-        "pmaddwd 16(%1,%3), %%xmm1        \n\t"
-        "paddd     %%xmm0,  %%xmm7        \n\t"
-        "paddd     %%xmm1,  %%xmm7        \n\t"
-        "add       $32,     %3            \n\t"
-        "js 1b                            \n\t"
-        "movhlps   %%xmm7,  %%xmm2        \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "psrad     %4,      %%xmm7        \n\t"
-        "pshuflw   $0x4E,   %%xmm7,%%xmm2 \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "movd      %%xmm7,  %2            \n\t"
-        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
-        : "m"(sh)
-    );
-    return res;
-}
-
 
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
@@ -3015,6 +2949,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
         }
     }
+    if(mm_flags & FF_MM_MMX2){
+#if HAVE_YASM
+        c->add_int16 = ff_add_int16_mmx2;
+        c->sub_int16 = ff_sub_int16_mmx2;
+        c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+#endif
+    }
     if(mm_flags & FF_MM_SSE){
         c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
         c->ac3_downmix = ac3_downmix_sse;
@@ -3033,9 +2974,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
         c->float_to_int16 = float_to_int16_sse2;
         c->float_to_int16_interleave = float_to_int16_interleave_sse2;
-        c->add_int16 = add_int16_sse2;
-        c->sub_int16 = sub_int16_sse2;
-        c->scalarproduct_int16 = scalarproduct_int16_sse2;
+#if HAVE_YASM
+        c->add_int16 = ff_add_int16_sse2;
+        c->sub_int16 = ff_sub_int16_sse2;
+        c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+#endif
     }
 }
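Net effect of the two init hunks: when yasm is available, the MMX2 versions are installed first and then overwritten by the SSE2 versions on CPUs that report SSE2; without yasm, neither #if HAVE_YASM block assigns anything and the generic C implementations stay in place. A rough sketch of the resulting selection, assuming the surrounding blocks are the FF_MM_MMX2 and FF_MM_SSE2 branches of dsputil_init_mmx (illustrative only, not code from this commit):

/* Selection order implied by the hunks above (sketch, not commit code). */
static void pick_int16_dsp(DSPContext *c, int mm_flags)
{
#if HAVE_YASM
    if (mm_flags & FF_MM_MMX2) {
        c->add_int16           = ff_add_int16_mmx2;
        c->sub_int16           = ff_sub_int16_mmx2;
        c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
    }
    if (mm_flags & FF_MM_SSE2) {   /* later block wins on SSE2-capable CPUs */
        c->add_int16           = ff_add_int16_sse2;
        c->sub_int16           = ff_sub_int16_sse2;
        c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
    }
#endif
    /* with yasm disabled at build time, the pointers keep the C
     * implementations installed by the generic dsputil init code */
}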
libavcodec/x86/dsputil_yasm.asm

@@ -99,6 +99,81 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
 
 
+
+%macro SCALARPRODUCT 1
+; void add_int16(int16_t * v1, int16_t * v2, int order)
+cglobal add_int16_%1, 3,3,2, v1, v2, order
+    shl orderq, 1
+    add v1q, orderq
+    add v2q, orderq
+    neg orderq
+.loop:
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    paddw   m0, [v1q + orderq]
+    paddw   m1, [v1q + orderq + mmsize]
+    mova    [v1q + orderq], m0
+    mova    [v1q + orderq + mmsize], m1
+    add     orderq, mmsize*2
+    jl .loop
+    REP_RET
+
+; void sub_int16(int16_t * v1, int16_t * v2, int order)
+cglobal sub_int16_%1, 3,3,4, v1, v2, order
+    shl orderq, 1
+    add v1q, orderq
+    add v2q, orderq
+    neg orderq
+.loop:
+    movu    m2, [v2q + orderq]
+    movu    m3, [v2q + orderq + mmsize]
+    mova    m0, [v1q + orderq]
+    mova    m1, [v1q + orderq + mmsize]
+    psubw   m0, m2
+    psubw   m1, m3
+    mova    [v1q + orderq], m0
+    mova    [v1q + orderq + mmsize], m1
+    add     orderq, mmsize*2
+    jl .loop
+    REP_RET
+
+; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
+    shl orderq, 1
+    add v1q, orderq
+    add v2q, orderq
+    neg orderq
+    movd    m3, shiftm
+    pxor    m2, m2
+.loop:
+    movu    m0, [v1q + orderq]
+    movu    m1, [v1q + orderq + mmsize]
+    pmaddwd m0, [v2q + orderq]
+    pmaddwd m1, [v2q + orderq + mmsize]
+    paddd   m2, m0
+    paddd   m2, m1
+    add     orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m2
+    paddd   m2, m0
+    psrad   m2, m3
+    pshuflw m0, m2, 0x4e
+%else
+    psrad   m2, m3
+    pshufw  m0, m2, 0x4e
+%endif
+    paddd   m2, m0
+    movd    eax, m2
+    RET
+%endmacro
+
+INIT_MMX
+SCALARPRODUCT mmx2
+INIT_XMM
+SCALARPRODUCT sse2
+
+
 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
     movq    mm0, [topq]
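For reference, a plain-C sketch of what the macro-generated functions compute (it mirrors the asm above and is not code from this commit). One behavioral detail: the SIMD code applies the shift to two partial sums after the multiply-accumulate loop rather than to each product, so results can differ from a per-product shift when shift is non-zero.

#include <stdint.h>

/* Reference semantics for the SCALARPRODUCT macro above (sketch only). */

static void add_int16_ref(int16_t *v1, const int16_t *v2, int order)
{
    while (order--)
        *v1++ += *v2++;            /* v1[i] += v2[i] */
}

static void sub_int16_ref(int16_t *v1, const int16_t *v2, int order)
{
    while (order--)
        *v1++ -= *v2++;            /* v1[i] -= v2[i] */
}

static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                       int order, int shift)
{
    int32_t res = 0;
    while (order--)
        res += (*v1++ * *v2++) >> shift;   /* per-product shift; the asm
                                              shifts partial sums instead */
    return res;
}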