From ddd74276f85b3c53809d59ffc640b7b45f5a125f Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Wed, 12 Nov 2025 21:42:32 +0100 Subject: [PATCH] avfilter/x86/vf_fspp: Port ff_column_fidct_mmx() to SSE2 It gains a lot because it has to operate on eight words; it also saves 608B of .text here. Old benchmarks: column_fidct_c: 3365.7 ( 1.00x) column_fidct_mmx: 1784.6 ( 1.89x) New benchmarks: column_fidct_c: 3361.5 ( 1.00x) column_fidct_sse2: 801.1 ( 4.20x) Signed-off-by: Andreas Rheinhardt --- libavfilter/x86/vf_fspp.asm | 209 ++++++++++++++++----------------- libavfilter/x86/vf_fspp_init.c | 4 +- tests/checkasm/vf_fspp.c | 4 +- 3 files changed, 107 insertions(+), 110 deletions(-) diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm index f61efc99f8..3f37911722 100644 --- a/libavfilter/x86/vf_fspp.asm +++ b/libavfilter/x86/vf_fspp.asm @@ -26,18 +26,18 @@ SECTION_RODATA cextern fspp_dither +pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2 +pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4 +pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14) +pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14) +pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2 +pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13) +pw_2: times 8 dw 2 pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14) -pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14) pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13) -pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13) pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13) -pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14) -pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14) -pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13) pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13) -pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13) pw_4: times 4 dw 4 -pw_2: times 4 dw 2 SECTION .text @@ -191,82 +191,83 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q mova [thrq+16*7], m3 RET -%macro COLUMN_FDCT 1-3 0, 0 - movq m1, [srcq+DCTSIZE*0*2] - movq m7, [srcq+DCTSIZE*3*2] - movq m0, m1 +%macro COLUMN_FDCT 1 + mova m1, [srcq+DCTSIZE*0*2] + mova m7, [srcq+DCTSIZE*3*2] + mova m0, m1 paddw m1, [srcq+DCTSIZE*7*2] - movq m3, m7 + mova m3, m7 paddw m7, [srcq+DCTSIZE*4*2] - movq m5, m1 - movq m6, [srcq+DCTSIZE*1*2] + mova m5, m1 + mova m6, [srcq+DCTSIZE*1*2] psubw m1, m7 - movq m2, [srcq+DCTSIZE*2*2] - movq m4, m6 + mova m2, [srcq+DCTSIZE*2*2] + mova m4, m6 paddw m6, [srcq+DCTSIZE*6*2] paddw m5, m7 paddw m2, [srcq+DCTSIZE*5*2] - movq m7, m6 + mova m7, m6 paddw m6, m2 psubw m7, m2 - movq m2, m5 + mova m2, m5 paddw m5, m6 psubw m2, m6 paddw m7, m1 - movq m6, [thrq+4*16+%2] + mova m6, [thrq+4*16] psllw m7, 1 - psubw m5, [thrq+%2] + psubw m5, [thrq] psubw m2, m6 - paddusw m5, [thrq+%2] + paddusw m5, [thrq] paddusw m2, m6 pmulhw m7, [pw_5A82] - paddw m5, [thrq+%2] + paddw m5, [thrq] paddw m2, m6 - psubusw m5, [thrq+%2] + psubusw m5, [thrq] psubusw m2, m6 paddw m5, [pw_2] - movq m6, m2 + mova m6, m2 paddw m2, m5 psubw m5, m6 - movq m6, m1 + mova m6, m1 paddw m1, m7 - psubw m1, [thrq+2*16+%2] + psubw m1, [thrq+2*16] psubw m6, m7 - movq m7, [thrq+6*16+%2] + mova m7, [thrq+6*16] psraw m5, 2 - paddusw m1, [thrq+2*16+%2] + paddusw m1, [thrq+2*16] psubw m6, m7 - paddw m1, [thrq+2*16+%2] + paddw m1, [thrq+2*16] paddusw m6, m7 - psubusw m1, [thrq+2*16+%2] + psubusw m1, [thrq+2*16] paddw m6, m7 psubw m3, [srcq+DCTSIZE*4*2] psubusw m6, m7 - movq m7, m1 + mova m7, m1 psraw m2, 2 psubw m4, [srcq+DCTSIZE*6*2] psubw m1, m6 psubw m0, [srcq+DCTSIZE*7*2] paddw m6, m7 psraw m6, 2 - movq m7, m2 + mova m7, m2 pmulhw m1, [pw_5A82] paddw m2, m6 - movq [rsp], m2 + mova [rsp], m2 psubw m7, m6 - movq m2, [srcq+DCTSIZE*2*2] + mova m2, [srcq+DCTSIZE*2*2] psubw m1, m6 psubw m2, [srcq+DCTSIZE*5*2] - movq m6, m5 - movq [rsp+8*3], m7 + mova m6, m5 + mova [rsp+16*3], m7 paddw m3, m2 paddw m2, m4 paddw m4, m0 - movq m7, m3 + mova m7, m3 psubw m3, m4 psllw m7, 1 pmulhw m3, [pw_61F8] psllw m4, 2 + add srcq, 32 pmulhw m7, [pw_4546] psllw m2, 1 pmulhw m4, [pw_539F] @@ -274,25 +275,25 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q pmulhw m2, [pw_5A82] psubw m6, m1 paddw m7, m3 - movq [rsp+8], m5 + mova [rsp+16], m5 paddw m4, m3 - movq m3, [thrq+3*16+%2] - movq m1, m0 - movq [rsp+8*2], m6 + mova m3, [thrq+3*16] + mova m1, m0 + mova [rsp+16*2], m6 psubw m1, m2 paddw m0, m2 - movq m5, m1 - movq m2, [thrq+5*16+%2] + mova m5, m1 + mova m2, [thrq+5*16] psubw m1, m7 paddw m5, m7 psubw m1, m3 - movq m7, [thrq+16+%2] + mova m7, [thrq+16] psubw m5, m2 - movq m6, m0 + mova m6, m0 paddw m0, m4 paddusw m1, m3 psubw m6, m4 - movq m4, [thrq+7*16+%2] + mova m4, [thrq+7*16] psubw m0, m7 psubw m6, m4 paddusw m5, m2 @@ -303,27 +304,32 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q psubusw m1, m3 psubusw m5, m2 psubusw m6, m4 - movq m4, m1 + mova m4, m1 por m4, m5 paddusw m0, m7 por m4, m6 paddw m0, m7 packssdw m4, m4 psubusw m0, m7 - movd tmpd, m4 - or tmpd, tmpd +%if ARCH_X86_64 + movq tmpq, m4 +%else + packssdw m4, m4 + movd tmpd, m4 +%endif + or tmpq, tmpq jnz %1 - movq m4, [rsp] + mova m4, [rsp] psraw m3, m0, 2 mova m5, [outq+DCTSIZE*0*2] pmulhw m1, m0, [pw_7642] pmulhw m2, m0, [pw_4546] pmulhw m0, [pw_5A82] paddw m5, m4 - movq m6, [rsp+8] + mova m6, [rsp+16] psubw m2, m1 psubw m4, m3 - movq m7, [outq+DCTSIZE*1*2] + mova m7, [outq+DCTSIZE*1*2] paddw m5, m3 psubw m1, m3 mova [outq+DCTSIZE*7*2], m4 @@ -331,38 +337,37 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q paddw m2, m0 mova [outq+DCTSIZE*0*2], m5 paddw m7, m6 - movq m3, [rsp+8*2] + mova m3, [rsp+16*2] psubw m6, m1 - movq m4, [outq+DCTSIZE*2*2] + mova m4, [outq+DCTSIZE*2*2] paddw m7, m1 - movq [outq], m5 + mova [outq], m5 paddw m4, m3 - movq [outq+DCTSIZE*6*2], m6 + mova [outq+DCTSIZE*6*2], m6 psubw m3, m0 - movq m5, [outq+DCTSIZE*5*2] + mova m5, [outq+DCTSIZE*5*2] paddw m4, m0 - movq m6, [outq+DCTSIZE*3*2] + mova m6, [outq+DCTSIZE*3*2] paddw m5, m3 - movq m0, [rsp+8*3] - add srcq, 8+%3 - movq [outq+DCTSIZE*1*2], m7 + mova m0, [rsp+16*3] + mova [outq+DCTSIZE*1*2], m7 paddw m6, m0 - movq [outq+DCTSIZE*2*2], m4 + mova [outq+DCTSIZE*2*2], m4 paddw m0, m2 - movq m7, [outq+DCTSIZE*4*2] + mova m7, [outq+DCTSIZE*4*2] psubw m6, m2 - movq [outq+DCTSIZE*5*2], m5 + mova [outq+DCTSIZE*5*2], m5 paddw m7, m0 - movq [outq+DCTSIZE*3*2], m6 - movq [outq+DCTSIZE*4*2], m7 - add outq, 8+%3 + mova [outq+DCTSIZE*3*2], m6 + mova [outq+DCTSIZE*4*2], m7 + add outq, 32 %endmacro -%macro COLUMN_IDCT 0-1 0 - movq m3, m5 +%macro COLUMN_IDCT 0 + mova m3, m5 psubw m5, m1 paddw m3, m1 - movq m2, m0 + mova m2, m0 psubw m0, m6 psllw m1, m5, 1 pmulhw m1, [pw_AC62] @@ -370,72 +375,64 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q pmulhw m5, [pw_7642] paddw m2, m6 pmulhw m0, [pw_4546] - movq m7, m2 - movq m4, [rsp] + mova m7, m2 + mova m4, [rsp] psubw m2, m3 paddw m7, m3 pmulhw m2, [pw_5A82] - movq m6, m4 + mova m6, m4 psraw m7, 2 paddw m4, [outq] psubw m6, m7 - movq m3, [rsp+8] + mova m3, [rsp+16] paddw m4, m7 - movq [outq+DCTSIZE*7*2], m6 + mova [outq+DCTSIZE*7*2], m6 paddw m1, m5 - movq [outq], m4 + mova [outq], m4 psubw m1, m7 - movq m7, [rsp+8*2] + mova m7, [rsp+16*2] psubw m0, m5 - movq m6, [rsp+8*3] - movq m5, m3 + mova m6, [rsp+16*3] + mova m5, m3 paddw m3, [outq+DCTSIZE*1*2] psubw m5, m1 psubw m2, m1 paddw m3, m1 - movq [outq+DCTSIZE*6*2], m5 - movq m4, m7 + mova [outq+DCTSIZE*6*2], m5 + mova m4, m7 paddw m7, [outq+DCTSIZE*2*2] psubw m4, m2 paddw m4, [outq+DCTSIZE*5*2] paddw m7, m2 - movq [outq+DCTSIZE*1*2], m3 + mova [outq+DCTSIZE*1*2], m3 paddw m0, m2 - movq [outq+DCTSIZE*2*2], m7 - movq m1, m6 + mova [outq+DCTSIZE*2*2], m7 + mova m1, m6 paddw m6, [outq+DCTSIZE*4*2] psubw m1, m0 paddw m1, [outq+DCTSIZE*3*2] paddw m6, m0 - movq [outq+DCTSIZE*5*2], m4 - add srcq, 8+%1 - movq [outq+DCTSIZE*4*2], m6 - movq [outq+DCTSIZE*3*2], m1 - add outq, 8+%1 + mova [outq+DCTSIZE*5*2], m4 + mova [outq+DCTSIZE*4*2], m6 + mova [outq+DCTSIZE*3*2], m1 + add outq, 32 %endmacro -INIT_MMX mmx -;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); -cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp -.fdct1: - COLUMN_FDCT .idct1 - jmp .fdct2 +;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); +cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp +.fdct: + COLUMN_FDCT .idct + sub cntd, 2 + jg .fdct + RET -.idct1: +.idct: COLUMN_IDCT - -.fdct2: - COLUMN_FDCT .idct2, 8, 16 sub cntd, 2 - jg .fdct1 - RET - -.idct2: - COLUMN_IDCT 16 - sub cntd, 2 - jg .fdct1 + jg .fdct RET +INIT_MMX mmx ;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3 add strideq, strideq diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c index ee875547d2..c7a9b1799e 100644 --- a/libavfilter/x86/vf_fspp_init.c +++ b/libavfilter/x86/vf_fspp_init.c @@ -30,7 +30,7 @@ void ff_store_slice2_sse2(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); -void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); +void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); @@ -39,7 +39,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s) int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMX(cpu_flags)) { - s->column_fidct = ff_column_fidct_mmx; s->row_idct = ff_row_idct_mmx; s->row_fdct = ff_row_fdct_mmx; } @@ -47,5 +46,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s) s->store_slice = ff_store_slice_sse2; s->store_slice2 = ff_store_slice2_sse2; s->mul_thrmat = ff_mul_thrmat_sse2; + s->column_fidct = ff_column_fidct_sse2; } } diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c index de407739d9..ccb9cd1e7d 100644 --- a/tests/checkasm/vf_fspp.c +++ b/tests/checkasm/vf_fspp.c @@ -136,8 +136,8 @@ static void check_column_fidct(void) NB_BLOCKS = 8, ///< arbitrary }; FSPPDSPContext fspp; - declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data, - int16_t *output, int cnt); + declare_func(void, int16_t *thr_adr, int16_t *data, + int16_t *output, int cnt); ff_fsppdsp_init(&fspp);