You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-11-23 21:54:53 +02:00
avcodec/x86/mpegvideoenc: Port denoise_dct_sse2 to external assembly
Reviewed-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -57,8 +57,10 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
|
||||
|
||||
#endif /* HAVE_6REGS */
|
||||
|
||||
#if HAVE_INLINE_ASM
|
||||
#if HAVE_SSE2_INLINE
|
||||
#if HAVE_SSE2_EXTERNAL
|
||||
void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
|
||||
const uint16_t dct_offset[64]);
|
||||
|
||||
static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
|
||||
{
|
||||
const int intra = s->c.mb_intra;
|
||||
@@ -67,56 +69,9 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
|
||||
|
||||
s->dct_count[intra]++;
|
||||
|
||||
__asm__ volatile(
|
||||
"pxor %%xmm6, %%xmm6 \n\t"
|
||||
"1: \n\t"
|
||||
"pxor %%xmm0, %%xmm0 \n\t"
|
||||
"pxor %%xmm1, %%xmm1 \n\t"
|
||||
"movdqa (%0), %%xmm2 \n\t"
|
||||
"movdqa 16(%0), %%xmm3 \n\t"
|
||||
"pcmpgtw %%xmm2, %%xmm0 \n\t"
|
||||
"pcmpgtw %%xmm3, %%xmm1 \n\t"
|
||||
"pxor %%xmm0, %%xmm2 \n\t"
|
||||
"pxor %%xmm1, %%xmm3 \n\t"
|
||||
"psubw %%xmm0, %%xmm2 \n\t"
|
||||
"psubw %%xmm1, %%xmm3 \n\t"
|
||||
"movdqa %%xmm2, %%xmm4 \n\t"
|
||||
"movdqa %%xmm3, %%xmm5 \n\t"
|
||||
"psubusw (%2), %%xmm2 \n\t"
|
||||
"psubusw 16(%2), %%xmm3 \n\t"
|
||||
"pxor %%xmm0, %%xmm2 \n\t"
|
||||
"pxor %%xmm1, %%xmm3 \n\t"
|
||||
"psubw %%xmm0, %%xmm2 \n\t"
|
||||
"psubw %%xmm1, %%xmm3 \n\t"
|
||||
"movdqa %%xmm2, (%0) \n\t"
|
||||
"movdqa %%xmm3, 16(%0) \n\t"
|
||||
"movdqa %%xmm4, %%xmm2 \n\t"
|
||||
"movdqa %%xmm5, %%xmm0 \n\t"
|
||||
"punpcklwd %%xmm6, %%xmm4 \n\t"
|
||||
"punpckhwd %%xmm6, %%xmm2 \n\t"
|
||||
"punpcklwd %%xmm6, %%xmm5 \n\t"
|
||||
"punpckhwd %%xmm6, %%xmm0 \n\t"
|
||||
"paddd (%1), %%xmm4 \n\t"
|
||||
"paddd 16(%1), %%xmm2 \n\t"
|
||||
"paddd 32(%1), %%xmm5 \n\t"
|
||||
"paddd 48(%1), %%xmm0 \n\t"
|
||||
"movdqa %%xmm4, (%1) \n\t"
|
||||
"movdqa %%xmm2, 16(%1) \n\t"
|
||||
"movdqa %%xmm5, 32(%1) \n\t"
|
||||
"movdqa %%xmm0, 48(%1) \n\t"
|
||||
"add $32, %0 \n\t"
|
||||
"add $64, %1 \n\t"
|
||||
"add $32, %2 \n\t"
|
||||
"cmp %3, %0 \n\t"
|
||||
" jb 1b \n\t"
|
||||
: "+r" (block), "+r" (sum), "+r" (offset)
|
||||
: "r"(block+64)
|
||||
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6")
|
||||
);
|
||||
ff_mpv_denoise_dct_sse2(block, sum, offset);
|
||||
}
|
||||
#endif /* HAVE_SSE2_INLINE */
|
||||
#endif /* HAVE_INLINE_ASM */
|
||||
#endif /* HAVE_SSE2_EXTERNAL */
|
||||
|
||||
av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
|
||||
{
|
||||
@@ -129,7 +84,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
|
||||
#if HAVE_6REGS
|
||||
s->dct_quantize = dct_quantize_sse2;
|
||||
#endif
|
||||
#if HAVE_SSE2_EXTERNAL
|
||||
s->denoise_dct = denoise_dct_sse2;
|
||||
#endif
|
||||
}
|
||||
#if HAVE_6REGS && HAVE_SSSE3_INLINE
|
||||
if (INLINE_SSSE3(cpu_flags))
|
||||
|
||||
@@ -24,6 +24,52 @@
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
INIT_XMM sse2
|
||||
; void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
;                              const uint16_t dct_offset[64])
;
; For each of the 64 coefficients in block:
;   - adds |coeff| (zero-extended to 32 bits) to the matching sum entry;
;   - replaces coeff with sign(coeff) * max(|coeff| - offset, 0), where the
;     subtraction is an unsigned saturating 16-bit subtract.
; Each loop iteration handles 16 coefficients (32 bytes of block/offset,
; 64 bytes of sum), so the loop runs 4 times.
; NOTE(review): all memory accesses use mova / SSE memory operands, so the
; three buffers are presumably required to be 16-byte aligned -- confirm
; against the callers.
cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset
|
||||
pxor m6, m6 ; m6 = 0: comparand for the sign test and high half for the unpacks
|
||||
lea r3, [sumq+256] ; r3 = end of sum (64 entries * 4 bytes), used as loop bound
|
||||
.loop:
|
||||
mova m2, [blockq] ; coefficients 0..7 of this iteration
|
||||
mova m3, [blockq+16] ; coefficients 8..15 of this iteration
|
||||
mova m0, m6
|
||||
mova m1, m6
|
||||
pcmpgtw m0, m2 ; m0 = sign mask: all-ones in each word where coeff < 0
|
||||
pcmpgtw m1, m3 ; m1 = sign mask for the second vector
|
||||
pxor m2, m0 ; (x ^ mask) - mask negates the negative words ...
|
||||
pxor m3, m1
|
||||
psubw m2, m0 ; ... so m2 = |coeff|
|
||||
psubw m3, m1 ; m3 = |coeff|
|
||||
psubusw m4, m2, [offsetq] ; m4 = |coeff| - offset, saturating at 0; m2 keeps |coeff|
|
||||
psubusw m5, m3, [offsetq+16] ; same for the second vector; m3 keeps |coeff|
|
||||
pxor m4, m0 ; restore the original sign using the same mask trick
|
||||
pxor m5, m1
|
||||
add offsetq, 32 ; advance offset by 16 words (interleaved with the SIMD work)
|
||||
psubw m4, m0
|
||||
psubw m5, m1
|
||||
mova [blockq], m4 ; write the denoised coefficients back in place
|
||||
mova [blockq+16], m5
|
||||
mova m0, m2 ; copy |coeff| (sign masks in m0/m1 are no longer needed)
|
||||
mova m1, m3
|
||||
add blockq, 32 ; advance block by 16 words
|
||||
punpcklwd m0, m6 ; zero-extend low 4 words of first vector to dwords
|
||||
|
||||
punpckhwd m2, m6 ; zero-extend high 4 words of first vector to dwords
|
||||
punpcklwd m1, m6 ; zero-extend low 4 words of second vector to dwords
|
||||
|
||||
punpckhwd m3, m6 ; zero-extend high 4 words of second vector to dwords
|
||||
paddd m0, [sumq] ; accumulate |coeff| into the 16 matching sum entries
|
||||
paddd m2, [sumq+16]
|
||||
|
||||
paddd m1, [sumq+32]
|
||||
|
||||
paddd m3, [sumq+48]
|
||||
|
||||
mova [sumq], m0 ; store the updated sums
|
||||
mova [sumq+16], m2
|
||||
|
||||
mova [sumq+32], m1
|
||||
|
||||
mova [sumq+48], m3
|
||||
|
||||
add sumq, 64 ; advance sum by 16 dwords
|
||||
cmp sumq, r3
|
||||
|
||||
jb .loop ; repeat until sum reaches the end pointer computed above
|
||||
RET
|
||||
|
||||
|
||||
; int ff_pix_sum16(const uint8_t *pix, ptrdiff_t line_size)
|
||||
; %1 = number of loops
|
||||
; %2 = number of GPRs used
|
||||
|
||||
Reference in New Issue
Block a user