diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 2ca05f69ea..e5665ac781 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -57,8 +57,10 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
 
 #endif /* HAVE_6REGS */
 
-#if HAVE_INLINE_ASM
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_EXTERNAL
+void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
+                             const uint16_t dct_offset[64]); /* defined in mpegvideoencdsp.asm */
+
 static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
 {
     const int intra = s->c.mb_intra;
@@ -67,56 +69,9 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
 
     s->dct_count[intra]++;
 
-    __asm__ volatile(
-        "pxor %%xmm6, %%xmm6 \n\t"
-        "1: \n\t"
-        "pxor %%xmm0, %%xmm0 \n\t"
-        "pxor %%xmm1, %%xmm1 \n\t"
-        "movdqa (%0), %%xmm2 \n\t"
-        "movdqa 16(%0), %%xmm3 \n\t"
-        "pcmpgtw %%xmm2, %%xmm0 \n\t"
-        "pcmpgtw %%xmm3, %%xmm1 \n\t"
-        "pxor %%xmm0, %%xmm2 \n\t"
-        "pxor %%xmm1, %%xmm3 \n\t"
-        "psubw %%xmm0, %%xmm2 \n\t"
-        "psubw %%xmm1, %%xmm3 \n\t"
-        "movdqa %%xmm2, %%xmm4 \n\t"
-        "movdqa %%xmm3, %%xmm5 \n\t"
-        "psubusw (%2), %%xmm2 \n\t"
-        "psubusw 16(%2), %%xmm3 \n\t"
-        "pxor %%xmm0, %%xmm2 \n\t"
-        "pxor %%xmm1, %%xmm3 \n\t"
-        "psubw %%xmm0, %%xmm2 \n\t"
-        "psubw %%xmm1, %%xmm3 \n\t"
-        "movdqa %%xmm2, (%0) \n\t"
-        "movdqa %%xmm3, 16(%0) \n\t"
-        "movdqa %%xmm4, %%xmm2 \n\t"
-        "movdqa %%xmm5, %%xmm0 \n\t"
-        "punpcklwd %%xmm6, %%xmm4 \n\t"
-        "punpckhwd %%xmm6, %%xmm2 \n\t"
-        "punpcklwd %%xmm6, %%xmm5 \n\t"
-        "punpckhwd %%xmm6, %%xmm0 \n\t"
-        "paddd (%1), %%xmm4 \n\t"
-        "paddd 16(%1), %%xmm2 \n\t"
-        "paddd 32(%1), %%xmm5 \n\t"
-        "paddd 48(%1), %%xmm0 \n\t"
-        "movdqa %%xmm4, (%1) \n\t"
-        "movdqa %%xmm2, 16(%1) \n\t"
-        "movdqa %%xmm5, 32(%1) \n\t"
-        "movdqa %%xmm0, 48(%1) \n\t"
-        "add $32, %0 \n\t"
-        "add $64, %1 \n\t"
-        "add $32, %2 \n\t"
-        "cmp %3, %0 \n\t"
-        " jb 1b \n\t"
-        : "+r" (block), "+r" (sum), "+r" (offset)
-        : "r"(block+64)
-        XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                          "%xmm4", "%xmm5", "%xmm6")
-    );
+    ff_mpv_denoise_dct_sse2(block, sum, offset); /* external asm takes over the former inline asm loop */
 }
-#endif /* HAVE_SSE2_INLINE */
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_SSE2_EXTERNAL */
 
 av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 {
@@ -129,7 +84,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 #if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
 #endif
+#if HAVE_SSE2_EXTERNAL
             s->denoise_dct = denoise_dct_sse2;
+#endif /* HAVE_SSE2_EXTERNAL */
         }
 #if HAVE_6REGS && HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index d12646ae54..0e86a5304c 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -24,6 +24,52 @@
 %include "libavutil/x86/x86util.asm"
 
 SECTION .text
+
+INIT_XMM sse2                   ; m0..m7 are XMM registers; symbol gets the _sse2 suffix
+cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset ; 3 args, 4 GPRs, 7 XMM regs
+    pxor            m6, m6              ; m6 = 0: zero source for compares and unpacks
+    lea             r3, [sumq+256]      ; loop end pointer: sum + 256 bytes
+.loop:                                  ; one pass = 32 bytes of block, 64 bytes of sum
+    mova            m2, [blockq]        ; words 0..7 of this pass
+    mova            m3, [blockq+16]     ; words 8..15 of this pass
+    mova            m0, m6
+    mova            m1, m6
+    pcmpgtw         m0, m2              ; m0 = 0xFFFF where the word is negative, else 0
+    pcmpgtw         m1, m3
+    pxor            m2, m0              ; (x ^ mask) - mask: negate the negative words
+    pxor            m3, m1
+    psubw           m2, m0              ; m2 = absolute values
+    psubw           m3, m1              ; m3 = absolute values
+    psubusw         m4, m2, [offsetq]   ; m4 = m2 - offset, unsigned saturating (floors at 0)
+    psubusw         m5, m3, [offsetq+16]
+    pxor            m4, m0              ; reapply the original sign
+    pxor            m5, m1
+    add             offsetq, 32
+    psubw           m4, m0
+    psubw           m5, m1
+    mova            [blockq], m4        ; write the reduced values back to block
+    mova            [blockq+16], m5
+    mova            m0, m2              ; sign masks are dead now; reuse m0/m1 for abs copies
+    mova            m1, m3
+    add             blockq, 32
+    punpcklwd       m0, m6              ; interleave with zero: low 4 words -> 4 dwords
+    punpckhwd       m2, m6              ; high 4 words -> 4 dwords
+    punpcklwd       m1, m6
+    punpckhwd       m3, m6
+    paddd           m0, [sumq]          ; add the absolute values to the running sums
+    paddd           m2, [sumq+16]
+    paddd           m1, [sumq+32]
+    paddd           m3, [sumq+48]
+    mova            [sumq], m0          ; store the updated sums
+    mova            [sumq+16], m2
+    mova            [sumq+32], m1
+    mova            [sumq+48], m3
+    add             sumq, 64
+    cmp             sumq, r3
+    jb              .loop               ; repeat until sumq reaches r3
+    RET
+
+
 ; int ff_pix_sum16(const uint8_t *pix, ptrdiff_t line_size)
 ; %1 = number of loops
 ; %2 = number of GPRs used