mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
x86/dsputilenc: port sum_abs_dctelem functions to yasm
Signed-off-by: James Almer <jamrial@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
ddeb58b90c
commit
61eea421b2
@ -487,3 +487,45 @@ cglobal pix_norm1, 2, 4
|
||||
movd eax, m1
|
||||
RET
|
||||
|
||||
%macro DCT_SAD4 1
|
||||
mova m2, [blockq+%1+0 ]
|
||||
mova m3, [blockq+%1+16]
|
||||
mova m4, [blockq+%1+32]
|
||||
mova m5, [blockq+%1+48]
|
||||
ABS1_SUM m2, m6, m0
|
||||
ABS1_SUM m3, m6, m1
|
||||
ABS1_SUM m4, m6, m0
|
||||
ABS1_SUM m5, m6, m1
|
||||
%endmacro
|
||||
|
||||
;-----------------------------------------------
|
||||
;int ff_sum_abs_dctelem(int16_t *block)
|
||||
;-----------------------------------------------
|
||||
; %1 = number of xmm registers used
|
||||
|
||||
%macro SUM_ABS_DCTELEM 1
|
||||
cglobal sum_abs_dctelem, 1, 1, %1, block
|
||||
pxor m0, m0
|
||||
pxor m1, m1
|
||||
DCT_SAD4 0
|
||||
%if mmsize == 8
|
||||
DCT_SAD4 8
|
||||
%endif
|
||||
DCT_SAD4 64
|
||||
%if mmsize == 8
|
||||
DCT_SAD4 72
|
||||
%endif
|
||||
paddusw m0, m1
|
||||
HSUM m0, m1, eax
|
||||
and eax, 0xFFFF
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmx
|
||||
SUM_ABS_DCTELEM 0
|
||||
INIT_MMX mmxext
|
||||
SUM_ABS_DCTELEM 0
|
||||
INIT_XMM sse2
|
||||
SUM_ABS_DCTELEM 7
|
||||
INIT_XMM ssse3
|
||||
SUM_ABS_DCTELEM 6
|
||||
|
@ -38,6 +38,10 @@ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
|
||||
int stride);
|
||||
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
|
||||
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
|
||||
int ff_sum_abs_dctelem_mmx(int16_t *block);
|
||||
int ff_sum_abs_dctelem_mmxext(int16_t *block);
|
||||
int ff_sum_abs_dctelem_sse2(int16_t *block);
|
||||
int ff_sum_abs_dctelem_ssse3(int16_t *block);
|
||||
|
||||
#if HAVE_INLINE_ASM
|
||||
|
||||
@ -759,118 +763,6 @@ static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1,
|
||||
*left = src2[w - 1];
|
||||
}
|
||||
|
||||
#define MMABS_MMX(a,z) \
|
||||
"pxor " #z ", " #z " \n\t" \
|
||||
"pcmpgtw " #a ", " #z " \n\t" \
|
||||
"pxor " #z ", " #a " \n\t" \
|
||||
"psubw " #z ", " #a " \n\t"
|
||||
|
||||
#define MMABS_MMXEXT(a, z) \
|
||||
"pxor " #z ", " #z " \n\t" \
|
||||
"psubw " #a ", " #z " \n\t" \
|
||||
"pmaxsw " #z ", " #a " \n\t"
|
||||
|
||||
#define MMABS_SSSE3(a,z) \
|
||||
"pabsw " #a ", " #a " \n\t"
|
||||
|
||||
#define MMABS_SUM(a,z, sum) \
|
||||
MMABS(a,z) \
|
||||
"paddusw " #a ", " #sum " \n\t"
|
||||
|
||||
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
|
||||
* up to about 100k on extreme inputs. But that's very unlikely to occur in
|
||||
* natural video, and it's even more unlikely to not have any alternative
|
||||
* mvs/modes with lower cost. */
|
||||
#define HSUM_MMX(a, t, dst) \
|
||||
"movq " #a ", " #t " \n\t" \
|
||||
"psrlq $32, " #a " \n\t" \
|
||||
"paddusw " #t ", " #a " \n\t" \
|
||||
"movq " #a ", " #t " \n\t" \
|
||||
"psrlq $16, " #a " \n\t" \
|
||||
"paddusw " #t ", " #a " \n\t" \
|
||||
"movd " #a ", " #dst " \n\t" \
|
||||
|
||||
#define HSUM_MMXEXT(a, t, dst) \
|
||||
"pshufw $0x0E, " #a ", " #t " \n\t" \
|
||||
"paddusw " #t ", " #a " \n\t" \
|
||||
"pshufw $0x01, " #a ", " #t " \n\t" \
|
||||
"paddusw " #t ", " #a " \n\t" \
|
||||
"movd " #a ", " #dst " \n\t" \
|
||||
|
||||
#define HSUM_SSE2(a, t, dst) \
|
||||
"movhlps " #a ", " #t " \n\t" \
|
||||
"paddusw " #t ", " #a " \n\t" \
|
||||
"pshuflw $0x0E, " #a ", " #t " \n\t" \
|
||||
"paddusw " #t ", " #a " \n\t" \
|
||||
"pshuflw $0x01, " #a ", " #t " \n\t" \
|
||||
"paddusw " #t ", " #a " \n\t" \
|
||||
"movd " #a ", " #dst " \n\t" \
|
||||
|
||||
#define DCT_SAD4(m, mm, o) \
|
||||
"mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
|
||||
"mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
|
||||
"mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
|
||||
"mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
|
||||
MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
|
||||
MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
|
||||
MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
|
||||
MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \
|
||||
|
||||
#define DCT_SAD_MMX \
|
||||
"pxor %%mm0, %%mm0 \n\t" \
|
||||
"pxor %%mm1, %%mm1 \n\t" \
|
||||
DCT_SAD4(q, %%mm, 0) \
|
||||
DCT_SAD4(q, %%mm, 8) \
|
||||
DCT_SAD4(q, %%mm, 64) \
|
||||
DCT_SAD4(q, %%mm, 72) \
|
||||
"paddusw %%mm1, %%mm0 \n\t" \
|
||||
HSUM(%%mm0, %%mm1, %0)
|
||||
|
||||
#define DCT_SAD_SSE2 \
|
||||
"pxor %%xmm0, %%xmm0 \n\t" \
|
||||
"pxor %%xmm1, %%xmm1 \n\t" \
|
||||
DCT_SAD4(dqa, %%xmm, 0) \
|
||||
DCT_SAD4(dqa, %%xmm, 64) \
|
||||
"paddusw %%xmm1, %%xmm0 \n\t" \
|
||||
HSUM(%%xmm0, %%xmm1, %0)
|
||||
|
||||
#define DCT_SAD_FUNC(cpu) \
|
||||
static int sum_abs_dctelem_ ## cpu(int16_t *block) \
|
||||
{ \
|
||||
int sum; \
|
||||
__asm__ volatile ( \
|
||||
DCT_SAD \
|
||||
:"=r"(sum) \
|
||||
:"r"(block)); \
|
||||
return sum & 0xFFFF; \
|
||||
}
|
||||
|
||||
#define DCT_SAD DCT_SAD_MMX
|
||||
#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
|
||||
#define MMABS(a, z) MMABS_MMX(a, z)
|
||||
DCT_SAD_FUNC(mmx)
|
||||
#undef MMABS
|
||||
#undef HSUM
|
||||
|
||||
#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
|
||||
#define MMABS(a, z) MMABS_MMXEXT(a, z)
|
||||
DCT_SAD_FUNC(mmxext)
|
||||
#undef HSUM
|
||||
#undef DCT_SAD
|
||||
|
||||
#define DCT_SAD DCT_SAD_SSE2
|
||||
#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
|
||||
DCT_SAD_FUNC(sse2)
|
||||
#undef MMABS
|
||||
|
||||
#if HAVE_SSSE3_INLINE
|
||||
#define MMABS(a, z) MMABS_SSSE3(a, z)
|
||||
DCT_SAD_FUNC(ssse3)
|
||||
#undef MMABS
|
||||
#endif
|
||||
#undef HSUM
|
||||
#undef DCT_SAD
|
||||
|
||||
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
|
||||
int size)
|
||||
{
|
||||
@ -1012,8 +904,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
||||
c->fdct = ff_fdct_mmx;
|
||||
|
||||
c->diff_bytes = diff_bytes_mmx;
|
||||
c->sum_abs_dctelem = sum_abs_dctelem_mmx;
|
||||
|
||||
c->sse[0] = sse16_mmx;
|
||||
c->sse[1] = sse8_mmx;
|
||||
c->vsad[4] = vsad_intra16_mmx;
|
||||
@ -1041,7 +931,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
||||
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
|
||||
c->fdct = ff_fdct_mmxext;
|
||||
|
||||
c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
|
||||
c->vsad[4] = vsad_intra16_mmxext;
|
||||
|
||||
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
||||
@ -1055,8 +944,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
||||
if (!high_bit_depth &&
|
||||
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
|
||||
c->fdct = ff_fdct_sse2;
|
||||
|
||||
c->sum_abs_dctelem = sum_abs_dctelem_sse2;
|
||||
}
|
||||
|
||||
#if HAVE_SSSE3_INLINE
|
||||
@ -1065,7 +952,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
||||
c->try_8x8basis = try_8x8basis_ssse3;
|
||||
}
|
||||
c->add_8x8basis = add_8x8basis_ssse3;
|
||||
c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
|
||||
}
|
||||
#endif
|
||||
#endif /* HAVE_INLINE_ASM */
|
||||
@ -1073,15 +959,18 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
||||
if (EXTERNAL_MMX(cpu_flags)) {
|
||||
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
|
||||
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
|
||||
c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
|
||||
}
|
||||
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
|
||||
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
|
||||
c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
c->sse[0] = ff_sse16_sse2;
|
||||
c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
|
||||
|
||||
#if HAVE_ALIGNED_STACK
|
||||
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
|
||||
@ -1089,9 +978,12 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
||||
#endif
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
|
||||
if (EXTERNAL_SSSE3(cpu_flags)) {
|
||||
c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
|
||||
#if HAVE_ALIGNED_STACK
|
||||
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
|
||||
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
|
||||
#endif
|
||||
}
|
||||
|
||||
ff_dsputil_init_pix_mmx(c, avctx);
|
||||
|
Loading…
Reference in New Issue
Block a user