mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-13 21:28:01 +02:00
x86: hpeldsp: Move half-pel assembly from dsputil to hpeldsp
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
28bc406c84
commit
8db00081a3
@ -53,4 +53,7 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
|
|||||||
hpel_funcs(avg, [2], 4);
|
hpel_funcs(avg, [2], 4);
|
||||||
hpel_funcs(avg, [3], 2);
|
hpel_funcs(avg, [3], 2);
|
||||||
hpel_funcs(avg_no_rnd,, 16);
|
hpel_funcs(avg_no_rnd,, 16);
|
||||||
|
|
||||||
|
if (ARCH_X86)
|
||||||
|
ff_hpeldsp_init_x86(c, flags);
|
||||||
}
|
}
|
||||||
|
@ -94,4 +94,6 @@ typedef struct HpelDSPContext {
|
|||||||
|
|
||||||
void ff_hpeldsp_init(HpelDSPContext *c, int flags);
|
void ff_hpeldsp_init(HpelDSPContext *c, int flags);
|
||||||
|
|
||||||
|
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
|
||||||
|
|
||||||
#endif /* AVCODEC_HPELDSP_H */
|
#endif /* AVCODEC_HPELDSP_H */
|
||||||
|
@ -10,6 +10,7 @@ OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
|
|||||||
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
|
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
|
||||||
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
|
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
|
||||||
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
|
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
|
||||||
|
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
|
||||||
OBJS-$(CONFIG_LPC) += x86/lpc.o
|
OBJS-$(CONFIG_LPC) += x86/lpc.o
|
||||||
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
|
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
|
||||||
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec.o
|
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec.o
|
||||||
@ -44,7 +45,7 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
|
|||||||
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
|
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
|
||||||
YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
|
YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
|
||||||
YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
|
YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
|
||||||
x86/hpeldsp.o \
|
x86/fpel.o \
|
||||||
x86/mpeg4qpel.o \
|
x86/mpeg4qpel.o \
|
||||||
x86/qpel.o
|
x86/qpel.o
|
||||||
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
|
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
|
||||||
@ -63,7 +64,10 @@ YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
|
|||||||
x86/h264_intrapred_10bit.o
|
x86/h264_intrapred_10bit.o
|
||||||
YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
|
YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
|
||||||
x86/h264_qpel_10bit.o \
|
x86/h264_qpel_10bit.o \
|
||||||
|
x86/fpel.o \
|
||||||
x86/qpel.o
|
x86/qpel.o
|
||||||
|
YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
|
||||||
|
x86/hpeldsp.o
|
||||||
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
|
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
|
||||||
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
|
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
|
||||||
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
|
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
|
||||||
|
@ -55,10 +55,6 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
|
|||||||
|
|
||||||
|
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
|
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
|
||||||
int dstStride, int src1Stride, int h);
|
int dstStride, int src1Stride, int h);
|
||||||
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
|
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
|
||||||
@ -66,53 +62,13 @@ void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
|
|||||||
int src1Stride, int h);
|
int src1Stride, int h);
|
||||||
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
|
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
|
||||||
int dstStride, int src1Stride, int h);
|
int dstStride, int src1Stride, int h);
|
||||||
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
|
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
|
||||||
int dstStride, int src1Stride, int h);
|
int dstStride, int src1Stride, int h);
|
||||||
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
|
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
|
||||||
int dstStride, int src1Stride, int h);
|
int dstStride, int src1Stride, int h);
|
||||||
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
|
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
|
||||||
int dstStride, int src1Stride, int h);
|
int dstStride, int src1Stride, int h);
|
||||||
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
|
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
|
|
||||||
const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
|
|
||||||
const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
|
|
||||||
const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
|
|
||||||
const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
|
||||||
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
|
|
||||||
ptrdiff_t line_size, int h);
|
ptrdiff_t line_size, int h);
|
||||||
|
|
||||||
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
|
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
|
||||||
@ -186,14 +142,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
|
|||||||
// using regr as temporary and for the output result
|
// using regr as temporary and for the output result
|
||||||
// first argument is unmodified and second is trashed
|
// first argument is unmodified and second is trashed
|
||||||
// regfe is supposed to contain 0xfefefefefefefefe
|
// regfe is supposed to contain 0xfefefefefefefefe
|
||||||
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
|
|
||||||
"movq "#rega", "#regr" \n\t" \
|
|
||||||
"pand "#regb", "#regr" \n\t" \
|
|
||||||
"pxor "#rega", "#regb" \n\t" \
|
|
||||||
"pand "#regfe", "#regb" \n\t" \
|
|
||||||
"psrlq $1, "#regb" \n\t" \
|
|
||||||
"paddb "#regb", "#regr" \n\t"
|
|
||||||
|
|
||||||
#define PAVGB_MMX(rega, regb, regr, regfe) \
|
#define PAVGB_MMX(rega, regb, regr, regfe) \
|
||||||
"movq "#rega", "#regr" \n\t" \
|
"movq "#rega", "#regr" \n\t" \
|
||||||
"por "#regb", "#regr" \n\t" \
|
"por "#regb", "#regr" \n\t" \
|
||||||
@ -203,20 +151,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
|
|||||||
"psubb "#regb", "#regr" \n\t"
|
"psubb "#regb", "#regr" \n\t"
|
||||||
|
|
||||||
// mm6 is supposed to contain 0xfefefefefefefefe
|
// mm6 is supposed to contain 0xfefefefefefefefe
|
||||||
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
|
|
||||||
"movq "#rega", "#regr" \n\t" \
|
|
||||||
"movq "#regc", "#regp" \n\t" \
|
|
||||||
"pand "#regb", "#regr" \n\t" \
|
|
||||||
"pand "#regd", "#regp" \n\t" \
|
|
||||||
"pxor "#rega", "#regb" \n\t" \
|
|
||||||
"pxor "#regc", "#regd" \n\t" \
|
|
||||||
"pand %%mm6, "#regb" \n\t" \
|
|
||||||
"pand %%mm6, "#regd" \n\t" \
|
|
||||||
"psrlq $1, "#regb" \n\t" \
|
|
||||||
"psrlq $1, "#regd" \n\t" \
|
|
||||||
"paddb "#regb", "#regr" \n\t" \
|
|
||||||
"paddb "#regd", "#regp" \n\t"
|
|
||||||
|
|
||||||
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
|
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
|
||||||
"movq "#rega", "#regr" \n\t" \
|
"movq "#rega", "#regr" \n\t" \
|
||||||
"movq "#regc", "#regp" \n\t" \
|
"movq "#regc", "#regp" \n\t" \
|
||||||
@ -231,22 +165,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
|
|||||||
"psubb "#regb", "#regr" \n\t" \
|
"psubb "#regb", "#regr" \n\t" \
|
||||||
"psubb "#regd", "#regp" \n\t"
|
"psubb "#regd", "#regp" \n\t"
|
||||||
|
|
||||||
/***********************************/
|
|
||||||
/* MMX no rounding */
|
|
||||||
#define NO_RND 1
|
|
||||||
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
|
|
||||||
#define SET_RND MOVQ_WONE
|
|
||||||
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
|
|
||||||
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
|
|
||||||
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
|
|
||||||
|
|
||||||
#include "dsputil_rnd_template.c"
|
|
||||||
|
|
||||||
#undef DEF
|
|
||||||
#undef SET_RND
|
|
||||||
#undef PAVGBP
|
|
||||||
#undef PAVGB
|
|
||||||
#undef NO_RND
|
|
||||||
/***********************************/
|
/***********************************/
|
||||||
/* MMX rounding */
|
/* MMX rounding */
|
||||||
|
|
||||||
@ -254,6 +172,7 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
|
|||||||
#define SET_RND MOVQ_WTWO
|
#define SET_RND MOVQ_WTWO
|
||||||
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
|
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
|
||||||
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
|
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
|
||||||
|
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
|
||||||
|
|
||||||
#include "dsputil_rnd_template.c"
|
#include "dsputil_rnd_template.c"
|
||||||
|
|
||||||
@ -268,31 +187,21 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
|
|||||||
|
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
|
|
||||||
/***********************************/
|
|
||||||
/* 3Dnow specific */
|
|
||||||
|
|
||||||
#define DEF(x) x ## _3dnow
|
|
||||||
|
|
||||||
#include "dsputil_avg_template.c"
|
|
||||||
|
|
||||||
#undef DEF
|
|
||||||
|
|
||||||
/***********************************/
|
/***********************************/
|
||||||
/* MMXEXT specific */
|
/* MMXEXT specific */
|
||||||
|
|
||||||
#define DEF(x) x ## _mmxext
|
//FIXME the following could be optimized too ...
|
||||||
|
static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
|
||||||
#include "dsputil_avg_template.c"
|
int line_size, int h)
|
||||||
|
{
|
||||||
#undef DEF
|
ff_avg_pixels8_mmxext(block, pixels, line_size, h);
|
||||||
|
ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* HAVE_YASM */
|
#endif /* HAVE_YASM */
|
||||||
|
|
||||||
|
|
||||||
#if HAVE_INLINE_ASM
|
#if HAVE_INLINE_ASM
|
||||||
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
|
|
||||||
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
|
|
||||||
|
|
||||||
/***********************************/
|
/***********************************/
|
||||||
/* standard MMX */
|
/* standard MMX */
|
||||||
|
|
||||||
@ -1369,14 +1278,6 @@ void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
|
|||||||
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
|
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
|
|
||||||
do { \
|
|
||||||
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
|
|
||||||
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
|
|
||||||
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
|
|
||||||
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
||||||
int mm_flags)
|
int mm_flags)
|
||||||
{
|
{
|
||||||
@ -1392,14 +1293,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
|||||||
c->clear_blocks = clear_blocks_mmx;
|
c->clear_blocks = clear_blocks_mmx;
|
||||||
c->draw_edges = draw_edges_mmx;
|
c->draw_edges = draw_edges_mmx;
|
||||||
|
|
||||||
SET_HPEL_FUNCS(put, [0], 16, mmx);
|
|
||||||
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
|
|
||||||
SET_HPEL_FUNCS(avg, [0], 16, mmx);
|
|
||||||
SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
|
|
||||||
SET_HPEL_FUNCS(put, [1], 8, mmx);
|
|
||||||
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
|
|
||||||
SET_HPEL_FUNCS(avg, [1], 8, mmx);
|
|
||||||
|
|
||||||
switch (avctx->idct_algo) {
|
switch (avctx->idct_algo) {
|
||||||
case FF_IDCT_AUTO:
|
case FF_IDCT_AUTO:
|
||||||
case FF_IDCT_SIMPLEMMX:
|
case FF_IDCT_SIMPLEMMX:
|
||||||
@ -1445,34 +1338,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
|
|||||||
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
|
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
|
||||||
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
|
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
|
||||||
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
|
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
|
||||||
|
|
||||||
if (!high_bit_depth) {
|
|
||||||
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
|
|
||||||
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
|
|
||||||
|
|
||||||
c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
|
|
||||||
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
|
|
||||||
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
|
|
||||||
|
|
||||||
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
|
|
||||||
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
|
|
||||||
|
|
||||||
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
|
|
||||||
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
|
|
||||||
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
|
||||||
if (!high_bit_depth) {
|
|
||||||
c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
|
|
||||||
c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
|
|
||||||
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
|
|
||||||
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
|
|
||||||
|
|
||||||
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
|
|
||||||
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif /* HAVE_YASM */
|
#endif /* HAVE_YASM */
|
||||||
|
|
||||||
#if HAVE_INLINE_ASM
|
#if HAVE_INLINE_ASM
|
||||||
@ -1484,12 +1349,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
|
|||||||
#endif /* HAVE_INLINE_ASM */
|
#endif /* HAVE_INLINE_ASM */
|
||||||
|
|
||||||
#if HAVE_MMXEXT_EXTERNAL
|
#if HAVE_MMXEXT_EXTERNAL
|
||||||
if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
|
|
||||||
avctx->codec_id == AV_CODEC_ID_THEORA)) {
|
|
||||||
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
|
|
||||||
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* slower than cmov version on AMD */
|
/* slower than cmov version on AMD */
|
||||||
if (!(mm_flags & AV_CPU_FLAG_3DNOW))
|
if (!(mm_flags & AV_CPU_FLAG_3DNOW))
|
||||||
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
|
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
|
||||||
@ -1505,46 +1364,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
|
|||||||
#endif /* HAVE_MMXEXT_EXTERNAL */
|
#endif /* HAVE_MMXEXT_EXTERNAL */
|
||||||
}
|
}
|
||||||
|
|
||||||
static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
|
|
||||||
int mm_flags)
|
|
||||||
{
|
|
||||||
#if HAVE_YASM
|
|
||||||
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
|
|
||||||
|
|
||||||
if (!high_bit_depth) {
|
|
||||||
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
|
|
||||||
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
|
|
||||||
|
|
||||||
c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
|
|
||||||
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
|
|
||||||
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
|
|
||||||
|
|
||||||
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
|
|
||||||
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
|
|
||||||
|
|
||||||
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
|
|
||||||
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
|
|
||||||
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
|
|
||||||
|
|
||||||
if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
|
||||||
c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
|
|
||||||
c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
|
|
||||||
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
|
|
||||||
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
|
|
||||||
|
|
||||||
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
|
|
||||||
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
|
|
||||||
avctx->codec_id == AV_CODEC_ID_THEORA)) {
|
|
||||||
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
|
|
||||||
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
|
|
||||||
}
|
|
||||||
#endif /* HAVE_YASM */
|
|
||||||
}
|
|
||||||
|
|
||||||
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
|
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
|
||||||
int mm_flags)
|
int mm_flags)
|
||||||
{
|
{
|
||||||
@ -1578,15 +1397,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
|
|||||||
#endif /* HAVE_SSE2_INLINE */
|
#endif /* HAVE_SSE2_INLINE */
|
||||||
|
|
||||||
#if HAVE_SSE2_EXTERNAL
|
#if HAVE_SSE2_EXTERNAL
|
||||||
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
|
|
||||||
// these functions are slower than mmx on AMD, but faster on Intel
|
|
||||||
if (!high_bit_depth) {
|
|
||||||
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
|
|
||||||
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
|
|
||||||
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
|
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
|
||||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
|
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
|
||||||
if (mm_flags & AV_CPU_FLAG_ATOM) {
|
if (mm_flags & AV_CPU_FLAG_ATOM) {
|
||||||
@ -1644,9 +1454,6 @@ av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
|
|||||||
if (mm_flags & AV_CPU_FLAG_MMXEXT)
|
if (mm_flags & AV_CPU_FLAG_MMXEXT)
|
||||||
dsputil_init_mmxext(c, avctx, mm_flags);
|
dsputil_init_mmxext(c, avctx, mm_flags);
|
||||||
|
|
||||||
if (mm_flags & AV_CPU_FLAG_3DNOW)
|
|
||||||
dsputil_init_3dnow(c, avctx, mm_flags);
|
|
||||||
|
|
||||||
if (mm_flags & AV_CPU_FLAG_SSE)
|
if (mm_flags & AV_CPU_FLAG_SSE)
|
||||||
dsputil_init_sse(c, avctx, mm_flags);
|
dsputil_init_sse(c, avctx, mm_flags);
|
||||||
|
|
||||||
|
@ -25,212 +25,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// put_pixels
|
// put_pixels
|
||||||
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
__asm__ volatile(
|
|
||||||
"lea (%3, %3), %%"REG_a" \n\t"
|
|
||||||
".p2align 3 \n\t"
|
|
||||||
"1: \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
"movq 1(%1), %%mm1 \n\t"
|
|
||||||
"movq (%1, %3), %%mm2 \n\t"
|
|
||||||
"movq 1(%1, %3), %%mm3 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, (%2) \n\t"
|
|
||||||
"movq %%mm5, (%2, %3) \n\t"
|
|
||||||
"add %%"REG_a", %1 \n\t"
|
|
||||||
"add %%"REG_a", %2 \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
"movq 1(%1), %%mm1 \n\t"
|
|
||||||
"movq (%1, %3), %%mm2 \n\t"
|
|
||||||
"movq 1(%1, %3), %%mm3 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, (%2) \n\t"
|
|
||||||
"movq %%mm5, (%2, %3) \n\t"
|
|
||||||
"add %%"REG_a", %1 \n\t"
|
|
||||||
"add %%"REG_a", %2 \n\t"
|
|
||||||
"subl $4, %0 \n\t"
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
|
||||||
:"r"((x86_reg)line_size)
|
|
||||||
:REG_a, "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
__asm__ volatile(
|
|
||||||
"testl $1, %0 \n\t"
|
|
||||||
" jz 1f \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
"movq (%2), %%mm1 \n\t"
|
|
||||||
"add %4, %1 \n\t"
|
|
||||||
"add $8, %2 \n\t"
|
|
||||||
PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
|
|
||||||
"movq %%mm4, (%3) \n\t"
|
|
||||||
"add %5, %3 \n\t"
|
|
||||||
"decl %0 \n\t"
|
|
||||||
".p2align 3 \n\t"
|
|
||||||
"1: \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
"movq (%2), %%mm1 \n\t"
|
|
||||||
"add %4, %1 \n\t"
|
|
||||||
"movq (%1), %%mm2 \n\t"
|
|
||||||
"movq 8(%2), %%mm3 \n\t"
|
|
||||||
"add %4, %1 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, (%3) \n\t"
|
|
||||||
"add %5, %3 \n\t"
|
|
||||||
"movq %%mm5, (%3) \n\t"
|
|
||||||
"add %5, %3 \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
"movq 16(%2), %%mm1 \n\t"
|
|
||||||
"add %4, %1 \n\t"
|
|
||||||
"movq (%1), %%mm2 \n\t"
|
|
||||||
"movq 24(%2), %%mm3 \n\t"
|
|
||||||
"add %4, %1 \n\t"
|
|
||||||
"add $32, %2 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, (%3) \n\t"
|
|
||||||
"add %5, %3 \n\t"
|
|
||||||
"movq %%mm5, (%3) \n\t"
|
|
||||||
"add %5, %3 \n\t"
|
|
||||||
"subl $4, %0 \n\t"
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
|
|
||||||
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
|
||||||
#else
|
|
||||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
|
||||||
#endif
|
|
||||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
|
||||||
:"memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
__asm__ volatile(
|
|
||||||
"lea (%3, %3), %%"REG_a" \n\t"
|
|
||||||
".p2align 3 \n\t"
|
|
||||||
"1: \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
"movq 1(%1), %%mm1 \n\t"
|
|
||||||
"movq (%1, %3), %%mm2 \n\t"
|
|
||||||
"movq 1(%1, %3), %%mm3 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, (%2) \n\t"
|
|
||||||
"movq %%mm5, (%2, %3) \n\t"
|
|
||||||
"movq 8(%1), %%mm0 \n\t"
|
|
||||||
"movq 9(%1), %%mm1 \n\t"
|
|
||||||
"movq 8(%1, %3), %%mm2 \n\t"
|
|
||||||
"movq 9(%1, %3), %%mm3 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, 8(%2) \n\t"
|
|
||||||
"movq %%mm5, 8(%2, %3) \n\t"
|
|
||||||
"add %%"REG_a", %1 \n\t"
|
|
||||||
"add %%"REG_a", %2 \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
"movq 1(%1), %%mm1 \n\t"
|
|
||||||
"movq (%1, %3), %%mm2 \n\t"
|
|
||||||
"movq 1(%1, %3), %%mm3 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, (%2) \n\t"
|
|
||||||
"movq %%mm5, (%2, %3) \n\t"
|
|
||||||
"movq 8(%1), %%mm0 \n\t"
|
|
||||||
"movq 9(%1), %%mm1 \n\t"
|
|
||||||
"movq 8(%1, %3), %%mm2 \n\t"
|
|
||||||
"movq 9(%1, %3), %%mm3 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, 8(%2) \n\t"
|
|
||||||
"movq %%mm5, 8(%2, %3) \n\t"
|
|
||||||
"add %%"REG_a", %1 \n\t"
|
|
||||||
"add %%"REG_a", %2 \n\t"
|
|
||||||
"subl $4, %0 \n\t"
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
|
||||||
:"r"((x86_reg)line_size)
|
|
||||||
:REG_a, "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
__asm__ volatile(
|
|
||||||
"testl $1, %0 \n\t"
|
|
||||||
" jz 1f \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
"movq (%2), %%mm1 \n\t"
|
|
||||||
"movq 8(%1), %%mm2 \n\t"
|
|
||||||
"movq 8(%2), %%mm3 \n\t"
|
|
||||||
"add %4, %1 \n\t"
|
|
||||||
"add $16, %2 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, (%3) \n\t"
|
|
||||||
"movq %%mm5, 8(%3) \n\t"
|
|
||||||
"add %5, %3 \n\t"
|
|
||||||
"decl %0 \n\t"
|
|
||||||
".p2align 3 \n\t"
|
|
||||||
"1: \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
"movq (%2), %%mm1 \n\t"
|
|
||||||
"movq 8(%1), %%mm2 \n\t"
|
|
||||||
"movq 8(%2), %%mm3 \n\t"
|
|
||||||
"add %4, %1 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, (%3) \n\t"
|
|
||||||
"movq %%mm5, 8(%3) \n\t"
|
|
||||||
"add %5, %3 \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
"movq 16(%2), %%mm1 \n\t"
|
|
||||||
"movq 8(%1), %%mm2 \n\t"
|
|
||||||
"movq 24(%2), %%mm3 \n\t"
|
|
||||||
"add %4, %1 \n\t"
|
|
||||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
||||||
"movq %%mm4, (%3) \n\t"
|
|
||||||
"movq %%mm5, 8(%3) \n\t"
|
|
||||||
"add %5, %3 \n\t"
|
|
||||||
"add $32, %2 \n\t"
|
|
||||||
"subl $2, %0 \n\t"
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
|
|
||||||
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
|
||||||
#else
|
|
||||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
|
||||||
#endif
|
|
||||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
|
||||||
:"memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
__asm__ volatile(
|
|
||||||
"lea (%3, %3), %%"REG_a" \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
".p2align 3 \n\t"
|
|
||||||
"1: \n\t"
|
|
||||||
"movq (%1, %3), %%mm1 \n\t"
|
|
||||||
"movq (%1, %%"REG_a"),%%mm2 \n\t"
|
|
||||||
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
|
||||||
"movq %%mm4, (%2) \n\t"
|
|
||||||
"movq %%mm5, (%2, %3) \n\t"
|
|
||||||
"add %%"REG_a", %1 \n\t"
|
|
||||||
"add %%"REG_a", %2 \n\t"
|
|
||||||
"movq (%1, %3), %%mm1 \n\t"
|
|
||||||
"movq (%1, %%"REG_a"),%%mm0 \n\t"
|
|
||||||
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
|
||||||
"movq %%mm4, (%2) \n\t"
|
|
||||||
"movq %%mm5, (%2, %3) \n\t"
|
|
||||||
"add %%"REG_a", %1 \n\t"
|
|
||||||
"add %%"REG_a", %2 \n\t"
|
|
||||||
"subl $4, %0 \n\t"
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
|
||||||
:"r"((x86_reg)line_size)
|
|
||||||
:REG_a, "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
{
|
{
|
||||||
MOVQ_ZERO(mm7);
|
MOVQ_ZERO(mm7);
|
||||||
@ -297,27 +91,6 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff
|
|||||||
:REG_a, "memory");
|
:REG_a, "memory");
|
||||||
}
|
}
|
||||||
|
|
||||||
// avg_pixels
|
|
||||||
static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
JUMPALIGN();
|
|
||||||
do {
|
|
||||||
__asm__ volatile(
|
|
||||||
"movd %0, %%mm0 \n\t"
|
|
||||||
"movd %1, %%mm1 \n\t"
|
|
||||||
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
||||||
"movd %%mm2, %0 \n\t"
|
|
||||||
:"+m"(*block)
|
|
||||||
:"m"(*pixels)
|
|
||||||
:"memory");
|
|
||||||
pixels += line_size;
|
|
||||||
block += line_size;
|
|
||||||
}
|
|
||||||
while (--h);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef NO_RND
|
|
||||||
// in case more speed is needed - unrolling would certainly help
|
// in case more speed is needed - unrolling would certainly help
|
||||||
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
{
|
{
|
||||||
@ -337,7 +110,6 @@ static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t l
|
|||||||
}
|
}
|
||||||
while (--h);
|
while (--h);
|
||||||
}
|
}
|
||||||
#endif // NO_RND
|
|
||||||
|
|
||||||
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
{
|
{
|
||||||
@ -362,141 +134,6 @@ static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t
|
|||||||
while (--h);
|
while (--h);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef NO_RND
|
|
||||||
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
JUMPALIGN();
|
|
||||||
do {
|
|
||||||
__asm__ volatile(
|
|
||||||
"movq %1, %%mm0 \n\t"
|
|
||||||
"movq 1%1, %%mm1 \n\t"
|
|
||||||
"movq %0, %%mm3 \n\t"
|
|
||||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
||||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
||||||
"movq %%mm0, %0 \n\t"
|
|
||||||
:"+m"(*block)
|
|
||||||
:"m"(*pixels)
|
|
||||||
:"memory");
|
|
||||||
pixels += line_size;
|
|
||||||
block += line_size;
|
|
||||||
} while (--h);
|
|
||||||
}
|
|
||||||
#endif // NO_RND
|
|
||||||
|
|
||||||
static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
JUMPALIGN();
|
|
||||||
do {
|
|
||||||
__asm__ volatile(
|
|
||||||
"movq %1, %%mm0 \n\t"
|
|
||||||
"movq %2, %%mm1 \n\t"
|
|
||||||
"movq %0, %%mm3 \n\t"
|
|
||||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
||||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
||||||
"movq %%mm0, %0 \n\t"
|
|
||||||
:"+m"(*dst)
|
|
||||||
:"m"(*src1), "m"(*src2)
|
|
||||||
:"memory");
|
|
||||||
dst += dstStride;
|
|
||||||
src1 += src1Stride;
|
|
||||||
src2 += 8;
|
|
||||||
} while (--h);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
JUMPALIGN();
|
|
||||||
do {
|
|
||||||
__asm__ volatile(
|
|
||||||
"movq %1, %%mm0 \n\t"
|
|
||||||
"movq 1%1, %%mm1 \n\t"
|
|
||||||
"movq %0, %%mm3 \n\t"
|
|
||||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
||||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
||||||
"movq %%mm0, %0 \n\t"
|
|
||||||
"movq 8%1, %%mm0 \n\t"
|
|
||||||
"movq 9%1, %%mm1 \n\t"
|
|
||||||
"movq 8%0, %%mm3 \n\t"
|
|
||||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
||||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
||||||
"movq %%mm0, 8%0 \n\t"
|
|
||||||
:"+m"(*block)
|
|
||||||
:"m"(*pixels)
|
|
||||||
:"memory");
|
|
||||||
pixels += line_size;
|
|
||||||
block += line_size;
|
|
||||||
} while (--h);
|
|
||||||
}
|
|
||||||
|
|
||||||
static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
JUMPALIGN();
|
|
||||||
do {
|
|
||||||
__asm__ volatile(
|
|
||||||
"movq %1, %%mm0 \n\t"
|
|
||||||
"movq %2, %%mm1 \n\t"
|
|
||||||
"movq %0, %%mm3 \n\t"
|
|
||||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
||||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
||||||
"movq %%mm0, %0 \n\t"
|
|
||||||
"movq 8%1, %%mm0 \n\t"
|
|
||||||
"movq 8%2, %%mm1 \n\t"
|
|
||||||
"movq 8%0, %%mm3 \n\t"
|
|
||||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
||||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
||||||
"movq %%mm0, 8%0 \n\t"
|
|
||||||
:"+m"(*dst)
|
|
||||||
:"m"(*src1), "m"(*src2)
|
|
||||||
:"memory");
|
|
||||||
dst += dstStride;
|
|
||||||
src1 += src1Stride;
|
|
||||||
src2 += 16;
|
|
||||||
} while (--h);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
||||||
{
|
|
||||||
MOVQ_BFE(mm6);
|
|
||||||
__asm__ volatile(
|
|
||||||
"lea (%3, %3), %%"REG_a" \n\t"
|
|
||||||
"movq (%1), %%mm0 \n\t"
|
|
||||||
".p2align 3 \n\t"
|
|
||||||
"1: \n\t"
|
|
||||||
"movq (%1, %3), %%mm1 \n\t"
|
|
||||||
"movq (%1, %%"REG_a"), %%mm2 \n\t"
|
|
||||||
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
|
||||||
"movq (%2), %%mm3 \n\t"
|
|
||||||
OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
|
|
||||||
"movq (%2, %3), %%mm3 \n\t"
|
|
||||||
OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
|
|
||||||
"movq %%mm0, (%2) \n\t"
|
|
||||||
"movq %%mm1, (%2, %3) \n\t"
|
|
||||||
"add %%"REG_a", %1 \n\t"
|
|
||||||
"add %%"REG_a", %2 \n\t"
|
|
||||||
|
|
||||||
"movq (%1, %3), %%mm1 \n\t"
|
|
||||||
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
|
||||||
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
|
||||||
"movq (%2), %%mm3 \n\t"
|
|
||||||
OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
|
|
||||||
"movq (%2, %3), %%mm3 \n\t"
|
|
||||||
OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
|
|
||||||
"movq %%mm2, (%2) \n\t"
|
|
||||||
"movq %%mm1, (%2, %3) \n\t"
|
|
||||||
"add %%"REG_a", %1 \n\t"
|
|
||||||
"add %%"REG_a", %2 \n\t"
|
|
||||||
|
|
||||||
"subl $4, %0 \n\t"
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
|
||||||
:"r"((x86_reg)line_size)
|
|
||||||
:REG_a, "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
// this routine is 'slightly' suboptimal but mostly unused
|
// this routine is 'slightly' suboptimal but mostly unused
|
||||||
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
{
|
{
|
||||||
@ -573,21 +210,11 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff
|
|||||||
}
|
}
|
||||||
|
|
||||||
//FIXME optimize
|
//FIXME optimize
|
||||||
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
|
||||||
DEF(put, pixels8_y2)(block , pixels , line_size, h);
|
|
||||||
DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
||||||
DEF(put, pixels8_xy2)(block , pixels , line_size, h);
|
DEF(put, pixels8_xy2)(block , pixels , line_size, h);
|
||||||
DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
|
||||||
DEF(avg, pixels8_y2)(block , pixels , line_size, h);
|
|
||||||
DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
||||||
DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
|
DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
|
||||||
DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
||||||
|
106
libavcodec/x86/fpel.asm
Normal file
106
libavcodec/x86/fpel.asm
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
;******************************************************************************
|
||||||
|
;* MMX optimized DSP utils
|
||||||
|
;* Copyright (c) 2008 Loren Merritt
|
||||||
|
;* Copyright (c) 2003-2013 Michael Niedermayer
|
||||||
|
;* Copyright (c) 2013 Daniel Kang
|
||||||
|
;*
|
||||||
|
;* This file is part of Libav.
|
||||||
|
;*
|
||||||
|
;* Libav is free software; you can redistribute it and/or
|
||||||
|
;* modify it under the terms of the GNU Lesser General Public
|
||||||
|
;* License as published by the Free Software Foundation; either
|
||||||
|
;* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
;*
|
||||||
|
;* Libav is distributed in the hope that it will be useful,
|
||||||
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
;* Lesser General Public License for more details.
|
||||||
|
;*
|
||||||
|
;* You should have received a copy of the GNU Lesser General Public
|
||||||
|
;* License along with Libav; if not, write to the Free Software
|
||||||
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
;******************************************************************************
|
||||||
|
|
||||||
|
%include "libavutil/x86/x86util.asm"
|
||||||
|
|
||||||
|
SECTION .text
|
||||||
|
|
||||||
|
INIT_MMX mmxext

;------------------------------------------------------------------------------
; void pixels(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; PIXELS48 <op>, <width>
;   %1 = put | avg : "put" copies the source rows, "avg" additionally averages
;                    them (pavgb) with what is already stored in block.
;   %2 = 4 | 8     : row width in bytes; width 4 moves with movh, width 8
;                    with mova.
;
; Register use (argument order of the prototype above):
;   r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 3 * line_size.
;
; Four rows are handled per iteration and the loop only ends when the counter
; reaches exactly zero, so h has to be a non-zero multiple of 4.
;------------------------------------------------------------------------------
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
    ; NOTE(review): the prototype comment declares line_size as ptrdiff_t, in
    ; which case this 32->64 bit sign extension looks redundant - confirm.
    movsxdifnidn r2, r2d
    lea          r4, [r2*3]             ; r4 = offset of the fourth row
.loop:
    ; fetch four source rows
    OP           m0, [r1]
    OP           m1, [r1+r2]
    OP           m2, [r1+r2*2]
    OP           m3, [r1+r4]
    lea          r1, [r1+r2*4]          ; pixels += 4 * line_size
%ifidn %1, avg
    ; blend with the rows currently held in the destination
    pavgb        m0, [r0]
    pavgb        m1, [r0+r2]
    pavgb        m2, [r0+r2*2]
    pavgb        m3, [r0+r4]
%endif
    ; write the four rows back
    OP         [r0], m0
    OP      [r0+r2], m1
    OP    [r0+r2*2], m2
    OP      [r0+r4], m3
    sub         r3d, 4                  ; h -= 4, sets ZF for the branch below
    lea          r0, [r0+r2*4]          ; block += 4 * line_size (lea keeps the flags)
    jnz .loop
    RET
%endmacro

PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8
|
||||||
|
|
||||||
|
|
||||||
|
INIT_XMM sse2

;------------------------------------------------------------------------------
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Copies h rows of 16 bytes from pixels to block, four rows per iteration.
;   r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 3 * line_size.
; The source is read with movu (no alignment requirement); the destination is
; written with mova, so every destination row must be 16-byte aligned.
; h has to be a non-zero multiple of 4: the loop exits only on a zero counter.
;------------------------------------------------------------------------------
cglobal put_pixels16, 4,5,4
    lea          r4, [r2*3]             ; r4 = offset of the fourth row
.loop:
    ; load four source rows
    movu         m0, [r1]
    movu         m1, [r1+r2]
    movu         m2, [r1+r2*2]
    movu         m3, [r1+r4]
    lea          r1, [r1+r2*4]          ; pixels += 4 * line_size
    ; store them to the destination
    mova       [r0], m0
    mova    [r0+r2], m1
    mova  [r0+r2*2], m2
    mova    [r0+r4], m3
    sub         r3d, 4                  ; h -= 4, sets ZF for the branch below
    lea          r0, [r0+r2*4]          ; block += 4 * line_size (lea keeps the flags)
    jnz .loop
    REP_RET
|
||||||
|
|
||||||
|
;------------------------------------------------------------------------------
; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Averages h rows of 16 bytes from pixels into block (pavgb, i.e. the byte
; average rounds up), four rows per iteration.
;   r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 3 * line_size.
; The source is read with movu; the destination is used both as a pavgb memory
; operand and as a mova target, so destination rows must be 16-byte aligned.
; h has to be a non-zero multiple of 4: the loop exits only on a zero counter.
;------------------------------------------------------------------------------
cglobal avg_pixels16, 4,5,4
    lea          r4, [r2*3]             ; r4 = offset of the fourth row
.loop:
    ; load four source rows
    movu         m0, [r1]
    movu         m1, [r1+r2]
    movu         m2, [r1+r2*2]
    movu         m3, [r1+r4]
    lea          r1, [r1+r2*4]          ; pixels += 4 * line_size
    ; blend with the rows currently held in the destination
    pavgb        m0, [r0]
    pavgb        m1, [r0+r2]
    pavgb        m2, [r0+r2*2]
    pavgb        m3, [r0+r4]
    ; write the blended rows back
    mova       [r0], m0
    mova    [r0+r2], m1
    mova  [r0+r2*2], m2
    mova    [r0+r4], m3
    sub         r3d, 4                  ; h -= 4, sets ZF for the branch below
    lea          r0, [r0+r2*4]          ; block += 4 * line_size (lea keeps the flags)
    jnz .loop
    REP_RET
|
408
libavcodec/x86/hpeldsp_init.c
Normal file
408
libavcodec/x86/hpeldsp_init.c
Normal file
@ -0,0 +1,408 @@
|
|||||||
|
/*
|
||||||
|
* MMX optimized DSP utils
|
||||||
|
* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||||
|
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||||
|
*
|
||||||
|
* This file is part of Libav.
|
||||||
|
*
|
||||||
|
* Libav is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Libav is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with Libav; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*
|
||||||
|
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "libavutil/cpu.h"
|
||||||
|
#include "libavutil/x86/asm.h"
|
||||||
|
#include "libavcodec/hpeldsp.h"
|
||||||
|
#include "dsputil_mmx.h"
|
||||||
|
|
||||||
|
/*
 * Prototypes of half-pel routines that are not defined in this file.
 * They all share the hpeldsp signature (block, pixels, line_size, h) and are
 * only referenced from the HAVE_YASM sections of this file, so they are
 * presumably implemented in the external assembly sources - verify.
 */

/* put: horizontal (x2) half-pel interpolation */
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);

/* put: vertical (y2) half-pel interpolation */
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);

/* avg: full-pel, x2, y2 and xy2 variants */
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
|
||||||
|
|
||||||
|
|
||||||
|
#if HAVE_INLINE_ASM
|
||||||
|
|
||||||
|
/* Emit an 8-byte alignment directive at the current position. */
#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)

/* Clear MMX register regd (xor with itself). */
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

/*
 * Fill regd with 0xfe in every byte: pcmpeqd against itself sets all bits,
 * and the byte-wise self-addition turns each 0xff into 0xfe.
 */
#define MOVQ_BFE(regd)                                          \
    __asm__ volatile (                                          \
        "pcmpeqd %%"#regd", %%"#regd"       \n\t"               \
        "paddb   %%"#regd", %%"#regd"       \n\t" ::)

/*
 * MOVQ_BONE(regd): every byte of regd = 1.
 * MOVQ_WTWO(regd): every 16-bit word of regd = 2.
 */
#ifdef PIC
/*
 * For a shared library it is better to build the constants in registers than
 * to load them from memory. Both start from pcmpeqd (all bits set, i.e. -1)
 * and shift every word right by 15, leaving 1 in each word.
 */
#define MOVQ_BONE(regd)                                         \
    __asm__ volatile (                                          \
        "pcmpeqd  %%"#regd", %%"#regd"      \n\t"               \
        "psrlw          $15, %%"#regd"      \n\t"               \
        "packuswb %%"#regd", %%"#regd"      \n\t" ::)

#define MOVQ_WTWO(regd)                                         \
    __asm__ volatile (                                          \
        "pcmpeqd %%"#regd", %%"#regd"       \n\t"               \
        "psrlw         $15, %%"#regd"       \n\t"               \
        "psllw          $1, %%"#regd"       \n\t"::)

#else
/* Non-PIC builds load the constants straight from ff_bone / ff_wtwo. */
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#endif
|
||||||
|
|
||||||
|
/*
 * Byte-wise averages built from plain MMX logic operations.
 *
 * The single-pair macros PAVGB_* take (rega, regb, regr, regfe):
 *   - regr is used as temporary and receives the result,
 *   - rega (first argument) is left unmodified,
 *   - regb (second argument) is trashed,
 *   - regfe is supposed to contain 0xfefefefefefefefe.
 *
 * With t = ((rega ^ regb) & 0xfe) >> 1 per byte:
 *   PAVGB_MMX_NO_RND : regr = (rega & regb) + t   (average rounded down)
 *   PAVGB_MMX        : regr = (rega | regb) - t   (average rounded up)
 */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)               \
    "movq  "#rega",  "#regr"            \n\t"                   \
    "pand  "#regb",  "#regr"            \n\t"                   \
    "pxor  "#rega",  "#regb"            \n\t"                   \
    "pand  "#regfe", "#regb"            \n\t"                   \
    "psrlq $1,       "#regb"            \n\t"                   \
    "paddb "#regb",  "#regr"            \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)                      \
    "movq  "#rega",  "#regr"            \n\t"                   \
    "por   "#regb",  "#regr"            \n\t"                   \
    "pxor  "#rega",  "#regb"            \n\t"                   \
    "pand  "#regfe", "#regb"            \n\t"                   \
    "psrlq $1,       "#regb"            \n\t"                   \
    "psubb "#regb",  "#regr"            \n\t"

/*
 * Paired versions: the same computation for two independent register pairs,
 * with the instructions of both pairs interleaved.
 *   regr = avg(rega, regb), regp = avg(regc, regd); regb and regd are trashed.
 * mm6 is supposed to contain 0xfefefefefefefefe.
 */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega",  "#regr"            \n\t"                   \
    "movq  "#regc",  "#regp"            \n\t"                   \
    "pand  "#regb",  "#regr"            \n\t"                   \
    "pand  "#regd",  "#regp"            \n\t"                   \
    "pxor  "#rega",  "#regb"            \n\t"                   \
    "pxor  "#regc",  "#regd"            \n\t"                   \
    "pand  %%mm6,    "#regb"            \n\t"                   \
    "pand  %%mm6,    "#regd"            \n\t"                   \
    "psrlq $1,       "#regb"            \n\t"                   \
    "psrlq $1,       "#regd"            \n\t"                   \
    "paddb "#regb",  "#regr"            \n\t"                   \
    "paddb "#regd",  "#regp"            \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega",  "#regr"            \n\t"                   \
    "movq  "#regc",  "#regp"            \n\t"                   \
    "por   "#regb",  "#regr"            \n\t"                   \
    "por   "#regd",  "#regp"            \n\t"                   \
    "pxor  "#rega",  "#regb"            \n\t"                   \
    "pxor  "#regc",  "#regd"            \n\t"                   \
    "pand  %%mm6,    "#regb"            \n\t"                   \
    "pand  %%mm6,    "#regd"            \n\t"                   \
    "psrlq $1,       "#regd"            \n\t"                   \
    "psrlq $1,       "#regb"            \n\t"                   \
    "psubb "#regb",  "#regr"            \n\t"                   \
    "psubb "#regd",  "#regp"            \n\t"
|
||||||
|
|
||||||
|
/*
 * hpeldsp_rnd_template.c is compiled twice. Each pass chooses the function
 * name suffix (DEF), the rounding constant loader (SET_RND) and the averaging
 * primitives (PAVGB / PAVGBP) the template expands.
 * OP_AVG is defined once in the first pass and deliberately kept for the
 * second one; it is only removed after both passes.
 */

/***********************************/
/* MMX no rounding                 */
/* generates x_no_rnd_y_mmx        */
#define NO_RND 1
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
/* NOTE(review): MOVQ_WONE is not defined in this file; presumably it comes
 * from dsputil_mmx.h - confirm. */
#define SET_RND MOVQ_WONE
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "hpeldsp_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef NO_RND

/***********************************/
/* MMX rounding                    */
/* generates x_y_mmx               */
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)

#include "hpeldsp_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG
|
||||||
|
|
||||||
|
#endif /* HAVE_INLINE_ASM */
|
||||||
|
|
||||||
|
|
||||||
|
#if HAVE_YASM

/*
 * NOTE(review): presumably hpeldsp_avg_template.c refers to
 * ff_put_pixels8_mmx; this alias redirects that name to the mmxext
 * routine - confirm against the template.
 */
#define ff_put_pixels8_mmx ff_put_pixels8_mmxext

/*
 * hpeldsp_avg_template.c is instantiated once per CPU flavour; DEF() appends
 * the flavour suffix to every name the template passes through it.
 */

/***********************************/
/* 3Dnow specific                  */
#define DEF(x) x ## _3dnow
#include "hpeldsp_avg_template.c"
#undef DEF

/***********************************/
/* MMXEXT specific                 */
#define DEF(x) x ## _mmxext
#include "hpeldsp_avg_template.c"
#undef DEF

#endif /* HAVE_YASM */
|
||||||
|
|
||||||
|
|
||||||
|
#if HAVE_INLINE_ASM
|
||||||
|
/*
 * Name aliases for the full-pel "put" routines.
 * A plain copy involves no averaging, so the no_rnd names map onto the
 * regular copy functions, and the mmxext names map onto the mmx ones.
 */

/* no_rnd -> regular copy */
#define put_no_rnd_pixels8_mmx      put_pixels8_mmx
#define put_no_rnd_pixels16_mmx     put_pixels16_mmx

/* mmxext -> mmx */
#define put_pixels4_mmxext          put_pixels4_mmx
#define put_pixels8_mmxext          put_pixels8_mmx
#define put_pixels16_mmxext         put_pixels16_mmx
#define put_no_rnd_pixels8_mmxext   put_no_rnd_pixels8_mmx
#define put_no_rnd_pixels16_mmxext  put_no_rnd_pixels16_mmx
|
||||||
|
|
||||||
|
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
|
||||||
|
ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
__asm__ volatile (
|
||||||
|
"lea (%3, %3), %%"REG_a" \n\t"
|
||||||
|
".p2align 3 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"movq (%1 ), %%mm0 \n\t"
|
||||||
|
"movq (%1, %3), %%mm1 \n\t"
|
||||||
|
"movq %%mm0, (%2) \n\t"
|
||||||
|
"movq %%mm1, (%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
"movq (%1 ), %%mm0 \n\t"
|
||||||
|
"movq (%1, %3), %%mm1 \n\t"
|
||||||
|
"movq %%mm0, (%2) \n\t"
|
||||||
|
"movq %%mm1, (%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
"subl $4, %0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
: "+g"(h), "+r"(pixels), "+r"(block)
|
||||||
|
: "r"((x86_reg)line_size)
|
||||||
|
: "%"REG_a, "memory"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
|
||||||
|
ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
__asm__ volatile (
|
||||||
|
"lea (%3, %3), %%"REG_a" \n\t"
|
||||||
|
".p2align 3 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"movq (%1 ), %%mm0 \n\t"
|
||||||
|
"movq 8(%1 ), %%mm4 \n\t"
|
||||||
|
"movq (%1, %3), %%mm1 \n\t"
|
||||||
|
"movq 8(%1, %3), %%mm5 \n\t"
|
||||||
|
"movq %%mm0, (%2) \n\t"
|
||||||
|
"movq %%mm4, 8(%2) \n\t"
|
||||||
|
"movq %%mm1, (%2, %3) \n\t"
|
||||||
|
"movq %%mm5, 8(%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
"movq (%1 ), %%mm0 \n\t"
|
||||||
|
"movq 8(%1 ), %%mm4 \n\t"
|
||||||
|
"movq (%1, %3), %%mm1 \n\t"
|
||||||
|
"movq 8(%1, %3), %%mm5 \n\t"
|
||||||
|
"movq %%mm0, (%2) \n\t"
|
||||||
|
"movq %%mm4, 8(%2) \n\t"
|
||||||
|
"movq %%mm1, (%2, %3) \n\t"
|
||||||
|
"movq %%mm5, 8(%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
"subl $4, %0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
: "+g"(h), "+r"(pixels), "+r"(block)
|
||||||
|
: "r"((x86_reg)line_size)
|
||||||
|
: "%"REG_a, "memory"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
#endif /* HAVE_INLINE_ASM */
|
||||||
|
|
||||||
|
/* 16-pixel-wide full-pel copy / average routines, SSE2 flavour; these are
 * installed by hpeldsp_init_sse2(). */
void ff_put_pixels16_sse2(uint8_t *block,
                          const uint8_t *pixels,
                          ptrdiff_t line_size,
                          int h);
void ff_avg_pixels16_sse2(uint8_t *block,
                          const uint8_t *pixels,
                          ptrdiff_t line_size,
                          int h);
|
||||||
|
|
||||||
|
/*
 * Fill one four-entry row of a function table of the context `c` that is in
 * scope at the expansion site: entry 0 is the full-pel routine, entries 1-3
 * the _x2, _y2 and _xy2 half-pel routines.
 *
 *   PFX  - table / function prefix (put, put_no_rnd, avg, avg_no_rnd)
 *   IDX  - row subscript including brackets, e.g. [0]; left empty for a
 *          table that has a single row
 *   SIZE - block width that is part of the function names (16 or 8)
 *   CPU  - function name suffix (e.g. mmx)
 */
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                       \
    do {                                                                          \
        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU;   \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU;   \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU;   \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU;   \
    } while (0)
|
||||||
|
|
||||||
|
/**
 * Install the inline-asm MMX routines into the context.
 * Does nothing when inline assembly is unavailable; flags and mm_flags are
 * not consulted.
 */
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_INLINE_ASM
    /* 16-pixel-wide blocks */
    SET_HPEL_FUNCS(put,        [0], 16, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
    SET_HPEL_FUNCS(avg,        [0], 16, mmx);
    SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);

    /* 8-pixel-wide blocks */
    SET_HPEL_FUNCS(put,        [1],  8, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
    SET_HPEL_FUNCS(avg,        [1],  8, mmx);
#endif /* HAVE_INLINE_ASM */
}
|
||||||
|
|
||||||
|
/**
 * Install the MMXEXT routines into the context.
 *
 * @param c        context whose function tables are updated
 * @param flags    codec flags; only CODEC_FLAG_BITEXACT is examined
 * @param mm_flags CPU flags (not consulted here)
 *
 * Does nothing unless external (yasm) assembly is available.
 */
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_YASM
    const int bitexact = flags & CODEC_FLAG_BITEXACT;

    /* 16-pixel-wide blocks */
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
    c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;

    /* 8-pixel-wide blocks */
    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;

    if (!bitexact) {
        /* Only installed when bit-exact output is not requested
         * (presumably these variants are not bit-exact - verify). */
        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;

        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
    } else if (CONFIG_VP3_DECODER) {
        /* Bit-exact mode: the "_exact" 8-wide no_rnd routines are installed,
         * but only when the VP3 decoder is compiled in. */
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
    }
#endif /* HAVE_YASM */
}
|
||||||
|
|
||||||
|
/**
 * Install the 3DNow! routines into the context.
 *
 * @param c        context whose function tables are updated
 * @param flags    codec flags; only CODEC_FLAG_BITEXACT is examined
 * @param mm_flags CPU flags (not consulted here)
 *
 * Does nothing unless external (yasm) assembly is available.
 */
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_YASM
    const int bitexact = flags & CODEC_FLAG_BITEXACT;

    /* 16-pixel-wide blocks */
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
    c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;

    /* 8-pixel-wide blocks */
    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
    c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;

    if (!bitexact) {
        /* Only installed when bit-exact output is not requested
         * (presumably these variants are not bit-exact - verify). */
        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;

        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
    } else if (CONFIG_VP3_DECODER) {
        /* Bit-exact mode: the "_exact" 8-wide no_rnd routines are installed,
         * but only when the VP3 decoder is compiled in. */
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
    }
#endif /* HAVE_YASM */
}
|
||||||
|
|
||||||
|
/**
 * Install the SSE2 16-pixel-wide full-pel routines into the context.
 *
 * @param c        context whose function tables are updated
 * @param flags    codec flags (not consulted here)
 * @param mm_flags CPU flags; AV_CPU_FLAG_SSE2SLOW suppresses the installation
 *
 * Does nothing unless external (yasm) assembly is available.
 */
static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_YASM
    /* these functions are slower than mmx on AMD, but faster on Intel,
     * so leave the tables untouched on CPUs flagged as SSE2SLOW */
    if (mm_flags & AV_CPU_FLAG_SSE2SLOW)
        return;

    c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
    /* a plain copy has no rounding step, so no_rnd shares the routine */
    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
#endif /* HAVE_YASM */
}
|
||||||
|
|
||||||
|
/**
 * x86 entry point: query the CPU flags once and run every per-extension
 * initializer the CPU supports.
 *
 * The initializers run in the order MMX, MMXEXT, 3DNow!, SSE2; each one may
 * overwrite table entries set by an earlier one, so the last supported
 * extension that provides a given entry wins.
 *
 * @param c     context whose function tables are updated
 * @param flags codec flags, handed through to the initializers
 */
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
    const int cpu_flags = av_get_cpu_flags();

    if (cpu_flags & AV_CPU_FLAG_MMX)
        hpeldsp_init_mmx(c, flags, cpu_flags);

    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
        hpeldsp_init_mmxext(c, flags, cpu_flags);

    if (cpu_flags & AV_CPU_FLAG_3DNOW)
        hpeldsp_init_3dnow(c, flags, cpu_flags);

    if (cpu_flags & AV_CPU_FLAG_SSE2)
        hpeldsp_init_sse2(c, flags, cpu_flags);
}
|
428
libavcodec/x86/hpeldsp_rnd_template.c
Normal file
428
libavcodec/x86/hpeldsp_rnd_template.c
Normal file
@ -0,0 +1,428 @@
|
|||||||
|
/*
|
||||||
|
* DSP utils mmx functions are compiled twice for rnd/no_rnd
|
||||||
|
* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||||
|
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||||
|
*
|
||||||
|
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
|
||||||
|
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
|
||||||
|
* and improved by Zdenek Kabelac <kabi@users.sf.net>
|
||||||
|
*
|
||||||
|
* This file is part of Libav.
|
||||||
|
*
|
||||||
|
* Libav is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Libav is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with Libav; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
// put_pixels
|
||||||
|
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
MOVQ_BFE(mm6);
|
||||||
|
__asm__ volatile(
|
||||||
|
"lea (%3, %3), %%"REG_a" \n\t"
|
||||||
|
".p2align 3 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"movq (%1), %%mm0 \n\t"
|
||||||
|
"movq 1(%1), %%mm1 \n\t"
|
||||||
|
"movq (%1, %3), %%mm2 \n\t"
|
||||||
|
"movq 1(%1, %3), %%mm3 \n\t"
|
||||||
|
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||||
|
"movq %%mm4, (%2) \n\t"
|
||||||
|
"movq %%mm5, (%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
"movq (%1), %%mm0 \n\t"
|
||||||
|
"movq 1(%1), %%mm1 \n\t"
|
||||||
|
"movq (%1, %3), %%mm2 \n\t"
|
||||||
|
"movq 1(%1, %3), %%mm3 \n\t"
|
||||||
|
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||||
|
"movq %%mm4, (%2) \n\t"
|
||||||
|
"movq %%mm5, (%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
"subl $4, %0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||||
|
:"r"((x86_reg)line_size)
|
||||||
|
:REG_a, "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
MOVQ_BFE(mm6);
|
||||||
|
__asm__ volatile(
|
||||||
|
"lea (%3, %3), %%"REG_a" \n\t"
|
||||||
|
".p2align 3 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"movq (%1), %%mm0 \n\t"
|
||||||
|
"movq 1(%1), %%mm1 \n\t"
|
||||||
|
"movq (%1, %3), %%mm2 \n\t"
|
||||||
|
"movq 1(%1, %3), %%mm3 \n\t"
|
||||||
|
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||||
|
"movq %%mm4, (%2) \n\t"
|
||||||
|
"movq %%mm5, (%2, %3) \n\t"
|
||||||
|
"movq 8(%1), %%mm0 \n\t"
|
||||||
|
"movq 9(%1), %%mm1 \n\t"
|
||||||
|
"movq 8(%1, %3), %%mm2 \n\t"
|
||||||
|
"movq 9(%1, %3), %%mm3 \n\t"
|
||||||
|
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||||
|
"movq %%mm4, 8(%2) \n\t"
|
||||||
|
"movq %%mm5, 8(%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
"movq (%1), %%mm0 \n\t"
|
||||||
|
"movq 1(%1), %%mm1 \n\t"
|
||||||
|
"movq (%1, %3), %%mm2 \n\t"
|
||||||
|
"movq 1(%1, %3), %%mm3 \n\t"
|
||||||
|
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||||
|
"movq %%mm4, (%2) \n\t"
|
||||||
|
"movq %%mm5, (%2, %3) \n\t"
|
||||||
|
"movq 8(%1), %%mm0 \n\t"
|
||||||
|
"movq 9(%1), %%mm1 \n\t"
|
||||||
|
"movq 8(%1, %3), %%mm2 \n\t"
|
||||||
|
"movq 9(%1, %3), %%mm3 \n\t"
|
||||||
|
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||||
|
"movq %%mm4, 8(%2) \n\t"
|
||||||
|
"movq %%mm5, 8(%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
"subl $4, %0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||||
|
:"r"((x86_reg)line_size)
|
||||||
|
:REG_a, "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
MOVQ_BFE(mm6);
|
||||||
|
__asm__ volatile(
|
||||||
|
"lea (%3, %3), %%"REG_a" \n\t"
|
||||||
|
"movq (%1), %%mm0 \n\t"
|
||||||
|
".p2align 3 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"movq (%1, %3), %%mm1 \n\t"
|
||||||
|
"movq (%1, %%"REG_a"),%%mm2 \n\t"
|
||||||
|
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
||||||
|
"movq %%mm4, (%2) \n\t"
|
||||||
|
"movq %%mm5, (%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
"movq (%1, %3), %%mm1 \n\t"
|
||||||
|
"movq (%1, %%"REG_a"),%%mm0 \n\t"
|
||||||
|
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
||||||
|
"movq %%mm4, (%2) \n\t"
|
||||||
|
"movq %%mm5, (%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
"subl $4, %0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||||
|
:"r"((x86_reg)line_size)
|
||||||
|
:REG_a, "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
MOVQ_ZERO(mm7);
|
||||||
|
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
|
||||||
|
__asm__ volatile(
|
||||||
|
"movq (%1), %%mm0 \n\t"
|
||||||
|
"movq 1(%1), %%mm4 \n\t"
|
||||||
|
"movq %%mm0, %%mm1 \n\t"
|
||||||
|
"movq %%mm4, %%mm5 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm0 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm4 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm1 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm5 \n\t"
|
||||||
|
"paddusw %%mm0, %%mm4 \n\t"
|
||||||
|
"paddusw %%mm1, %%mm5 \n\t"
|
||||||
|
"xor %%"REG_a", %%"REG_a" \n\t"
|
||||||
|
"add %3, %1 \n\t"
|
||||||
|
".p2align 3 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
||||||
|
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
||||||
|
"movq %%mm0, %%mm1 \n\t"
|
||||||
|
"movq %%mm2, %%mm3 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm0 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm2 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm1 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm3 \n\t"
|
||||||
|
"paddusw %%mm2, %%mm0 \n\t"
|
||||||
|
"paddusw %%mm3, %%mm1 \n\t"
|
||||||
|
"paddusw %%mm6, %%mm4 \n\t"
|
||||||
|
"paddusw %%mm6, %%mm5 \n\t"
|
||||||
|
"paddusw %%mm0, %%mm4 \n\t"
|
||||||
|
"paddusw %%mm1, %%mm5 \n\t"
|
||||||
|
"psrlw $2, %%mm4 \n\t"
|
||||||
|
"psrlw $2, %%mm5 \n\t"
|
||||||
|
"packuswb %%mm5, %%mm4 \n\t"
|
||||||
|
"movq %%mm4, (%2, %%"REG_a") \n\t"
|
||||||
|
"add %3, %%"REG_a" \n\t"
|
||||||
|
|
||||||
|
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
||||||
|
"movq 1(%1, %%"REG_a"), %%mm4 \n\t"
|
||||||
|
"movq %%mm2, %%mm3 \n\t"
|
||||||
|
"movq %%mm4, %%mm5 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm2 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm4 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm3 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm5 \n\t"
|
||||||
|
"paddusw %%mm2, %%mm4 \n\t"
|
||||||
|
"paddusw %%mm3, %%mm5 \n\t"
|
||||||
|
"paddusw %%mm6, %%mm0 \n\t"
|
||||||
|
"paddusw %%mm6, %%mm1 \n\t"
|
||||||
|
"paddusw %%mm4, %%mm0 \n\t"
|
||||||
|
"paddusw %%mm5, %%mm1 \n\t"
|
||||||
|
"psrlw $2, %%mm0 \n\t"
|
||||||
|
"psrlw $2, %%mm1 \n\t"
|
||||||
|
"packuswb %%mm1, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%2, %%"REG_a") \n\t"
|
||||||
|
"add %3, %%"REG_a" \n\t"
|
||||||
|
|
||||||
|
"subl $2, %0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
:"+g"(h), "+S"(pixels)
|
||||||
|
:"D"(block), "r"((x86_reg)line_size)
|
||||||
|
:REG_a, "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
// avg_pixels
|
||||||
|
#ifndef NO_RND
|
||||||
|
// in case more speed is needed - unroling would certainly help
|
||||||
|
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
MOVQ_BFE(mm6);
|
||||||
|
JUMPALIGN();
|
||||||
|
do {
|
||||||
|
__asm__ volatile(
|
||||||
|
"movq %0, %%mm0 \n\t"
|
||||||
|
"movq %1, %%mm1 \n\t"
|
||||||
|
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||||
|
"movq %%mm2, %0 \n\t"
|
||||||
|
:"+m"(*block)
|
||||||
|
:"m"(*pixels)
|
||||||
|
:"memory");
|
||||||
|
pixels += line_size;
|
||||||
|
block += line_size;
|
||||||
|
}
|
||||||
|
while (--h);
|
||||||
|
}
|
||||||
|
#endif // NO_RND
|
||||||
|
|
||||||
|
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
MOVQ_BFE(mm6);
|
||||||
|
JUMPALIGN();
|
||||||
|
do {
|
||||||
|
__asm__ volatile(
|
||||||
|
"movq %0, %%mm0 \n\t"
|
||||||
|
"movq %1, %%mm1 \n\t"
|
||||||
|
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||||
|
"movq %%mm2, %0 \n\t"
|
||||||
|
"movq 8%0, %%mm0 \n\t"
|
||||||
|
"movq 8%1, %%mm1 \n\t"
|
||||||
|
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||||
|
"movq %%mm2, 8%0 \n\t"
|
||||||
|
:"+m"(*block)
|
||||||
|
:"m"(*pixels)
|
||||||
|
:"memory");
|
||||||
|
pixels += line_size;
|
||||||
|
block += line_size;
|
||||||
|
}
|
||||||
|
while (--h);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef NO_RND
|
||||||
|
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
MOVQ_BFE(mm6);
|
||||||
|
JUMPALIGN();
|
||||||
|
do {
|
||||||
|
__asm__ volatile(
|
||||||
|
"movq %1, %%mm0 \n\t"
|
||||||
|
"movq 1%1, %%mm1 \n\t"
|
||||||
|
"movq %0, %%mm3 \n\t"
|
||||||
|
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||||
|
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||||
|
"movq %%mm0, %0 \n\t"
|
||||||
|
:"+m"(*block)
|
||||||
|
:"m"(*pixels)
|
||||||
|
:"memory");
|
||||||
|
pixels += line_size;
|
||||||
|
block += line_size;
|
||||||
|
} while (--h);
|
||||||
|
}
|
||||||
|
#endif // NO_RND
|
||||||
|
|
||||||
|
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
MOVQ_BFE(mm6);
|
||||||
|
JUMPALIGN();
|
||||||
|
do {
|
||||||
|
__asm__ volatile(
|
||||||
|
"movq %1, %%mm0 \n\t"
|
||||||
|
"movq 1%1, %%mm1 \n\t"
|
||||||
|
"movq %0, %%mm3 \n\t"
|
||||||
|
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||||
|
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||||
|
"movq %%mm0, %0 \n\t"
|
||||||
|
"movq 8%1, %%mm0 \n\t"
|
||||||
|
"movq 9%1, %%mm1 \n\t"
|
||||||
|
"movq 8%0, %%mm3 \n\t"
|
||||||
|
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||||
|
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||||
|
"movq %%mm0, 8%0 \n\t"
|
||||||
|
:"+m"(*block)
|
||||||
|
:"m"(*pixels)
|
||||||
|
:"memory");
|
||||||
|
pixels += line_size;
|
||||||
|
block += line_size;
|
||||||
|
} while (--h);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
MOVQ_BFE(mm6);
|
||||||
|
__asm__ volatile(
|
||||||
|
"lea (%3, %3), %%"REG_a" \n\t"
|
||||||
|
"movq (%1), %%mm0 \n\t"
|
||||||
|
".p2align 3 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"movq (%1, %3), %%mm1 \n\t"
|
||||||
|
"movq (%1, %%"REG_a"), %%mm2 \n\t"
|
||||||
|
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
||||||
|
"movq (%2), %%mm3 \n\t"
|
||||||
|
OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
|
||||||
|
"movq (%2, %3), %%mm3 \n\t"
|
||||||
|
OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
|
||||||
|
"movq %%mm0, (%2) \n\t"
|
||||||
|
"movq %%mm1, (%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
|
||||||
|
"movq (%1, %3), %%mm1 \n\t"
|
||||||
|
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
||||||
|
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
||||||
|
"movq (%2), %%mm3 \n\t"
|
||||||
|
OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
|
||||||
|
"movq (%2, %3), %%mm3 \n\t"
|
||||||
|
OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
|
||||||
|
"movq %%mm2, (%2) \n\t"
|
||||||
|
"movq %%mm1, (%2, %3) \n\t"
|
||||||
|
"add %%"REG_a", %1 \n\t"
|
||||||
|
"add %%"REG_a", %2 \n\t"
|
||||||
|
|
||||||
|
"subl $4, %0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||||
|
:"r"((x86_reg)line_size)
|
||||||
|
:REG_a, "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
// this routine is 'slightly' suboptimal but mostly unused
|
||||||
|
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||||
|
{
|
||||||
|
MOVQ_ZERO(mm7);
|
||||||
|
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
|
||||||
|
__asm__ volatile(
|
||||||
|
"movq (%1), %%mm0 \n\t"
|
||||||
|
"movq 1(%1), %%mm4 \n\t"
|
||||||
|
"movq %%mm0, %%mm1 \n\t"
|
||||||
|
"movq %%mm4, %%mm5 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm0 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm4 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm1 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm5 \n\t"
|
||||||
|
"paddusw %%mm0, %%mm4 \n\t"
|
||||||
|
"paddusw %%mm1, %%mm5 \n\t"
|
||||||
|
"xor %%"REG_a", %%"REG_a" \n\t"
|
||||||
|
"add %3, %1 \n\t"
|
||||||
|
".p2align 3 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
||||||
|
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
||||||
|
"movq %%mm0, %%mm1 \n\t"
|
||||||
|
"movq %%mm2, %%mm3 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm0 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm2 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm1 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm3 \n\t"
|
||||||
|
"paddusw %%mm2, %%mm0 \n\t"
|
||||||
|
"paddusw %%mm3, %%mm1 \n\t"
|
||||||
|
"paddusw %%mm6, %%mm4 \n\t"
|
||||||
|
"paddusw %%mm6, %%mm5 \n\t"
|
||||||
|
"paddusw %%mm0, %%mm4 \n\t"
|
||||||
|
"paddusw %%mm1, %%mm5 \n\t"
|
||||||
|
"psrlw $2, %%mm4 \n\t"
|
||||||
|
"psrlw $2, %%mm5 \n\t"
|
||||||
|
"movq (%2, %%"REG_a"), %%mm3 \n\t"
|
||||||
|
"packuswb %%mm5, %%mm4 \n\t"
|
||||||
|
"pcmpeqd %%mm2, %%mm2 \n\t"
|
||||||
|
"paddb %%mm2, %%mm2 \n\t"
|
||||||
|
OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
|
||||||
|
"movq %%mm5, (%2, %%"REG_a") \n\t"
|
||||||
|
"add %3, %%"REG_a" \n\t"
|
||||||
|
|
||||||
|
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
||||||
|
"movq 1(%1, %%"REG_a"), %%mm4 \n\t"
|
||||||
|
"movq %%mm2, %%mm3 \n\t"
|
||||||
|
"movq %%mm4, %%mm5 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm2 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm4 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm3 \n\t"
|
||||||
|
"punpckhbw %%mm7, %%mm5 \n\t"
|
||||||
|
"paddusw %%mm2, %%mm4 \n\t"
|
||||||
|
"paddusw %%mm3, %%mm5 \n\t"
|
||||||
|
"paddusw %%mm6, %%mm0 \n\t"
|
||||||
|
"paddusw %%mm6, %%mm1 \n\t"
|
||||||
|
"paddusw %%mm4, %%mm0 \n\t"
|
||||||
|
"paddusw %%mm5, %%mm1 \n\t"
|
||||||
|
"psrlw $2, %%mm0 \n\t"
|
||||||
|
"psrlw $2, %%mm1 \n\t"
|
||||||
|
"movq (%2, %%"REG_a"), %%mm3 \n\t"
|
||||||
|
"packuswb %%mm1, %%mm0 \n\t"
|
||||||
|
"pcmpeqd %%mm2, %%mm2 \n\t"
|
||||||
|
"paddb %%mm2, %%mm2 \n\t"
|
||||||
|
OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
|
||||||
|
"movq %%mm1, (%2, %%"REG_a") \n\t"
|
||||||
|
"add %3, %%"REG_a" \n\t"
|
||||||
|
|
||||||
|
"subl $2, %0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
:"+g"(h), "+S"(pixels)
|
||||||
|
:"D"(block), "r"((x86_reg)line_size)
|
||||||
|
:REG_a, "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
//FIXME optimize
|
||||||
|
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
||||||
|
DEF(put, pixels8_y2)(block , pixels , line_size, h);
|
||||||
|
DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
||||||
|
DEF(put, pixels8_xy2)(block , pixels , line_size, h);
|
||||||
|
DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
||||||
|
DEF(avg, pixels8_y2)(block , pixels , line_size, h);
|
||||||
|
DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
||||||
|
DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
|
||||||
|
DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
||||||
|
}
|
@ -174,83 +174,3 @@ cglobal %1_pixels16_l2, 6,6
|
|||||||
INIT_MMX mmxext
|
INIT_MMX mmxext
|
||||||
PIXELS16_L2 put
|
PIXELS16_L2 put
|
||||||
PIXELS16_L2 avg
|
PIXELS16_L2 avg
|
||||||
|
|
||||||
INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; PIXELS48 op, width
;   %1 = put | avg : plain copy, or pavgb of the source rows with the rows
;                    already stored in block
;   %2 = 4 | 8     : block width in bytes; selects movh (width 4) or mova
;                    as the load/store instruction OP
;
; Registers follow the prototype above: r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 holds 3 * line_size.
; Four rows are handled per iteration and the loop exits only when the row
; counter reaches exactly zero, so h has to be a positive multiple of 4.
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
    movsxdifnidn r2, r2d
    lea          r4, [r2*3]             ; r4 = 3 * line_size
.loop:
    OP           m0, [r1]               ; load four source rows
    OP           m1, [r1+r2]
    OP           m2, [r1+r2*2]
    OP           m3, [r1+r4]
    lea          r1, [r1+r2*4]          ; pixels += 4 * line_size
%ifidn %1, avg
    pavgb        m0, [r0]               ; avg only: blend with destination
    pavgb        m1, [r0+r2]
    pavgb        m2, [r0+r2*2]
    pavgb        m3, [r0+r4]
%endif
    OP           [r0], m0               ; store four destination rows
    OP           [r0+r2], m1
    OP           [r0+r2*2], m2
    OP           [r0+r4], m3
    sub          r3d, 4                 ; sets the flags tested by jne
    lea          r0, [r0+r2*4]          ; block += 4 * line_size (lea keeps flags)
    jne          .loop
    RET
%endmacro

PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8
|
|
||||||
|
|
||||||
|
|
||||||
INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Copies a 16-byte-wide block, four rows per iteration.
; r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 3 * line_size.
; Source rows are read with movu and destination rows are written with mova,
; so block rows are presumably required to be 16-byte aligned while pixels
; may be unaligned - confirm against x86inc's mova/movu mapping.
; The loop exits only when the row counter reaches exactly zero, so h has to
; be a positive multiple of 4.
cglobal put_pixels16, 4,5,4
    lea          r4, [r2*3]             ; r4 = 3 * line_size
.loop:
    movu         m0, [r1]               ; load four source rows
    movu         m1, [r1+r2]
    movu         m2, [r1+r2*2]
    movu         m3, [r1+r4]
    lea          r1, [r1+r2*4]          ; pixels += 4 * line_size
    mova         [r0], m0               ; store four destination rows
    mova         [r0+r2], m1
    mova         [r0+r2*2], m2
    mova         [r0+r4], m3
    sub          r3d, 4                 ; sets the flags tested by jnz
    lea          r0, [r0+r2*4]          ; block += 4 * line_size (lea keeps flags)
    jnz          .loop
    REP_RET
|
|
||||||
|
|
||||||
; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Blends a 16-byte-wide source block into the destination, four rows per
; iteration: every destination row becomes pavgb(source row, destination row).
; r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 3 * line_size.
; Source rows are read with movu; destination rows are used as pavgb memory
; operands and written with mova, so block rows are presumably required to
; be 16-byte aligned - confirm against x86inc's mova/movu mapping.
; The loop exits only when the row counter reaches exactly zero, so h has to
; be a positive multiple of 4.
cglobal avg_pixels16, 4,5,4
    lea          r4, [r2*3]             ; r4 = 3 * line_size
.loop:
    movu         m0, [r1]               ; load four source rows
    movu         m1, [r1+r2]
    movu         m2, [r1+r2*2]
    movu         m3, [r1+r4]
    lea          r1, [r1+r2*4]          ; pixels += 4 * line_size
    pavgb        m0, [r0]               ; blend with the destination rows
    pavgb        m1, [r0+r2]
    pavgb        m2, [r0+r2*2]
    pavgb        m3, [r0+r4]
    mova         [r0], m0               ; store four destination rows
    mova         [r0+r2], m1
    mova         [r0+r2*2], m2
    mova         [r0+r4], m3
    sub          r3d, 4                 ; sets the flags tested by jnz
    lea          r0, [r0+r2*4]          ; block += 4 * line_size (lea keeps flags)
    jnz          .loop
    REP_RET
|
|
||||||
|
Loading…
Reference in New Issue
Block a user