1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00

avcodec/x86/simple_idct: Port to SSE2

Before this commit, the (32-bit only) simple IDCT came in three
versions: a pure MMX IDCT, plus idct-put and idct-add versions
that use SSE2 for the put and add stages but still use pure MMX
for the actual IDCT.

This commit ports said IDCT to SSE2; this was entirely trivial
for the IDCT1-5 and IDCT7 parts (where one can directly use
the full register width) and was easy for IDCT6 and IDCT8
(involving a few movhps and pshufds). Unfortunately, DC_COND_INIT
and Z_COND_INIT still use only the lower half of the registers.

This saved 4658B here; the benchmarking option of the dct test tool
showed a 15% speedup.

Reviewed-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-11-04 14:25:54 +01:00
parent 625f5c993c
commit ade54335b2
4 changed files with 629 additions and 624 deletions

View File

@@ -90,7 +90,7 @@ static const struct algo idct_tab_arch[] = {
#endif
#else
#if HAVE_SSE2_EXTERNAL
-{ "SIMPLE-SSE2", ff_simple_idct_mmx, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2},
+{ "SIMPLE-SSE2", ff_simple_idct_sse2, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2},
#endif
#endif
#endif

View File

@@ -76,7 +76,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
-c->idct = ff_simple_idct_mmx;
+c->idct = ff_simple_idct_sse2;
c->idct_put = ff_simple_idct_put_sse2;
c->idct_add = ff_simple_idct_add_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;

File diff suppressed because it is too large Load Diff

View File

@@ -22,10 +22,7 @@
#include <stddef.h>
#include <stdint.h>
-void ff_simple_idct_mmx(int16_t *block);
-void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_sse2(int16_t *block);
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);