avcodec/x86/mpegvideoencdsp: Port add_8x8basis_ssse3() to ASM
Both GCC and Clang completely unroll the unlikely loop at -O3, leading to
code-size bloat; their code is also suboptimal, as they do not make use of
pmulhrsw (even with -mssse3). This commit therefore ports the whole function
to external assembly. The new function occupies 176B here vs 1406B for GCC.

Benchmarks for a testcase with a huge qscale (note that the C version is
unrolled just like the unlikely loop in the SSSE3 version):

add_8x8basis_c:           43.4 ( 1.00x)
add_8x8basis_ssse3 (old): 43.6 ( 1.00x)
add_8x8basis_ssse3 (new): 11.9 ( 3.63x)

Reviewed-by: Kieran Kunhya <kieran@kunhya.com>
Reviewed-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
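For orientation, here is a minimal C sketch of the operation being ported, mirroring the scalar fallback that this commit removes from the old inline-asm version. The name add_8x8basis_ref and the basis_shift/recon_shift parameters are illustrative stand-ins for FFmpeg's internal BASIS_SHIFT/RECON_SHIFT constants. The SSSE3 code obtains the same rounded multiply-and-shift with pmulhrsw, which computes (a * b + 0x4000) >> 15 per signed 16-bit lane, once scale has been broadcast and pre-shifted.

#include <stdint.h>

/* Reference behaviour of add_8x8basis(): add a scaled basis function back
 * onto the 8x8 block of residual coefficients, rounding to nearest on the
 * final right shift.  basis_shift/recon_shift stand in for FFmpeg's
 * BASIS_SHIFT/RECON_SHIFT. */
void add_8x8basis_ref(int16_t rem[64], const int16_t basis[64],
                      int scale, int basis_shift, int recon_shift)
{
    const int shift = basis_shift - recon_shift;

    for (int i = 0; i < 64; i++)
        rem[i] += (basis[i] * scale + (1 << (shift - 1))) >> shift;
}

The new .huge_scale path keeps out-of-range scales in SIMD by combining pmullw with pmulhrsw instead of falling back to an unrolled C loop as the inline-asm version did, which is where the 3.63x speedup on the huge-qscale testcase comes from.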
@@ -25,6 +25,58 @@
 SECTION .text
 
+; void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
+INIT_XMM ssse3
+cglobal add_8x8basis, 3, 3+ARCH_X86_64, 4, rem, basis, scale
+    movd         m0, scaled
+    add          scaled, 1024
+    add          basisq, 128
+    add          remq, 128
+%if ARCH_X86_64
+%define OFF r3q
+    mov          r3q, -128
+    cmp          scaled, 2047
+%else
+%define OFF r2q
+    cmp          scaled, 2047
+    mov          r2q, -128
+%endif
+    ja .huge_scale
+
+    punpcklwd    m0, m0
+    pshufd       m0, m0, 0x0
+    psllw        m0, 5
+.loop1:
+    mova         m1, [basisq+OFF]
+    mova         m2, [basisq+OFF+16]
+    pmulhrsw     m1, m0
+    pmulhrsw     m2, m0
+    paddw        m1, [remq+OFF]
+    paddw        m2, [remq+OFF+16]
+    mova         [remq+OFF], m1
+    mova         [remq+OFF+16], m2
+    add          OFF, 32
+    js .loop1
+    RET
+
+.huge_scale:
+    pslld        m0, 6
+    punpcklwd    m0, m0
+    pshufd       m1, m0, 0x55
+    psrlw        m0, 1
+    pshufd       m0, m0, 0x0
+.loop2:
+    mova         m2, [basisq+OFF]
+    pmulhrsw     m3, m2, m0
+    pmullw       m2, m1
+    paddw        m2, m3
+    paddw        m2, [remq+OFF]
+    mova         [remq+OFF], m2
+    add          OFF, 16
+    js .loop2
+    RET
+
 
 INIT_XMM sse2
 cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset
     pxor         m6, m6
@@ -32,6 +32,7 @@ void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
 int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
+void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale);
 
 #if HAVE_INLINE_ASM
 #if HAVE_SSSE3_INLINE
@@ -83,41 +84,6 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c
     );
     return i;
 }
 
-static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
-{
-    x86_reg i=0;
-
-    if (FFABS(scale) < 1024) {
-        scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT);
-        __asm__ volatile(
-            "movd  %3, %%xmm2               \n\t"
-            "punpcklwd %%xmm2, %%xmm2       \n\t"
-            "pshufd $0, %%xmm2, %%xmm2      \n\t"
-            ".p2align 4                     \n\t"
-            "1:                             \n\t"
-            "movdqa  (%1, %0), %%xmm0       \n\t"
-            "movdqa  16(%1, %0), %%xmm1     \n\t"
-            "pmulhrsw %%xmm2, %%xmm0        \n\t"
-            "pmulhrsw %%xmm2, %%xmm1        \n\t"
-            "paddw  (%2, %0), %%xmm0        \n\t"
-            "paddw  16(%2, %0), %%xmm1      \n\t"
-            "movdqa  %%xmm0, (%2, %0)       \n\t"
-            "movdqa  %%xmm1, 16(%2, %0)     \n\t"
-            "add $32, %0                    \n\t"
-            "cmp $128, %0                   \n\t" // FIXME optimize & bench
-            " jb 1b                         \n\t"
-            : "+r" (i)
-            : "r"(basis), "r"(rem), "g"(scale)
-              XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
-        );
-    } else {
-        for (i=0; i<8*8; i++) {
-            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
-        }
-    }
-}
-
 #endif /* HAVE_SSSE3_INLINE */
 
 /* Draw the edges of width 'w' of an image of size width, height */
@@ -227,15 +193,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
             c->draw_edges = draw_edges_mmx;
         }
     }
+#endif /* HAVE_INLINE_ASM */
 
+    if (X86_SSSE3(cpu_flags)) {
 #if HAVE_SSSE3_INLINE
-    if (INLINE_SSSE3(cpu_flags)) {
         if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
             c->try_8x8basis = try_8x8basis_ssse3;
         }
-        c->add_8x8basis = add_8x8basis_ssse3;
-    }
 #endif /* HAVE_SSSE3_INLINE */
+#if HAVE_SSSE3_EXTERNAL
+        c->add_8x8basis = ff_add_8x8basis_ssse3;
+#endif
+    }
 
-#endif /* HAVE_INLINE_ASM */
 }