1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00

avfilter/x86/vf_fspp: Port ff_column_fidct_mmx() to SSE2

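It gains a lot because the function has to operate on eight 16-bit words
at a time: a single XMM register holds all eight, whereas the MMX version
needed two passes of four words each. It also saves 608B of .text here.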

Old benchmarks:
column_fidct_c:                                       3365.7 ( 1.00x)
column_fidct_mmx:                                     1784.6 ( 1.89x)

New benchmarks:
column_fidct_c:                                       3361.5 ( 1.00x)
column_fidct_sse2:                                     801.1 ( 4.20x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-11-12 21:42:32 +01:00
parent 68b11cde82
commit ddd74276f8
3 changed files with 107 additions and 110 deletions

View File

@@ -26,18 +26,18 @@
SECTION_RODATA SECTION_RODATA
cextern fspp_dither cextern fspp_dither
pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2
pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4
pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14)
pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14)
pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2
pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13)
pw_2: times 8 dw 2
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14) pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13) pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13) pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13) pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_4: times 4 dw 4 pw_4: times 4 dw 4
pw_2: times 4 dw 2
SECTION .text SECTION .text
@@ -191,82 +191,83 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
mova [thrq+16*7], m3 mova [thrq+16*7], m3
RET RET
%macro COLUMN_FDCT 1-3 0, 0 %macro COLUMN_FDCT 1
movq m1, [srcq+DCTSIZE*0*2] mova m1, [srcq+DCTSIZE*0*2]
movq m7, [srcq+DCTSIZE*3*2] mova m7, [srcq+DCTSIZE*3*2]
movq m0, m1 mova m0, m1
paddw m1, [srcq+DCTSIZE*7*2] paddw m1, [srcq+DCTSIZE*7*2]
movq m3, m7 mova m3, m7
paddw m7, [srcq+DCTSIZE*4*2] paddw m7, [srcq+DCTSIZE*4*2]
movq m5, m1 mova m5, m1
movq m6, [srcq+DCTSIZE*1*2] mova m6, [srcq+DCTSIZE*1*2]
psubw m1, m7 psubw m1, m7
movq m2, [srcq+DCTSIZE*2*2] mova m2, [srcq+DCTSIZE*2*2]
movq m4, m6 mova m4, m6
paddw m6, [srcq+DCTSIZE*6*2] paddw m6, [srcq+DCTSIZE*6*2]
paddw m5, m7 paddw m5, m7
paddw m2, [srcq+DCTSIZE*5*2] paddw m2, [srcq+DCTSIZE*5*2]
movq m7, m6 mova m7, m6
paddw m6, m2 paddw m6, m2
psubw m7, m2 psubw m7, m2
movq m2, m5 mova m2, m5
paddw m5, m6 paddw m5, m6
psubw m2, m6 psubw m2, m6
paddw m7, m1 paddw m7, m1
movq m6, [thrq+4*16+%2] mova m6, [thrq+4*16]
psllw m7, 1 psllw m7, 1
psubw m5, [thrq+%2] psubw m5, [thrq]
psubw m2, m6 psubw m2, m6
paddusw m5, [thrq+%2] paddusw m5, [thrq]
paddusw m2, m6 paddusw m2, m6
pmulhw m7, [pw_5A82] pmulhw m7, [pw_5A82]
paddw m5, [thrq+%2] paddw m5, [thrq]
paddw m2, m6 paddw m2, m6
psubusw m5, [thrq+%2] psubusw m5, [thrq]
psubusw m2, m6 psubusw m2, m6
paddw m5, [pw_2] paddw m5, [pw_2]
movq m6, m2 mova m6, m2
paddw m2, m5 paddw m2, m5
psubw m5, m6 psubw m5, m6
movq m6, m1 mova m6, m1
paddw m1, m7 paddw m1, m7
psubw m1, [thrq+2*16+%2] psubw m1, [thrq+2*16]
psubw m6, m7 psubw m6, m7
movq m7, [thrq+6*16+%2] mova m7, [thrq+6*16]
psraw m5, 2 psraw m5, 2
paddusw m1, [thrq+2*16+%2] paddusw m1, [thrq+2*16]
psubw m6, m7 psubw m6, m7
paddw m1, [thrq+2*16+%2] paddw m1, [thrq+2*16]
paddusw m6, m7 paddusw m6, m7
psubusw m1, [thrq+2*16+%2] psubusw m1, [thrq+2*16]
paddw m6, m7 paddw m6, m7
psubw m3, [srcq+DCTSIZE*4*2] psubw m3, [srcq+DCTSIZE*4*2]
psubusw m6, m7 psubusw m6, m7
movq m7, m1 mova m7, m1
psraw m2, 2 psraw m2, 2
psubw m4, [srcq+DCTSIZE*6*2] psubw m4, [srcq+DCTSIZE*6*2]
psubw m1, m6 psubw m1, m6
psubw m0, [srcq+DCTSIZE*7*2] psubw m0, [srcq+DCTSIZE*7*2]
paddw m6, m7 paddw m6, m7
psraw m6, 2 psraw m6, 2
movq m7, m2 mova m7, m2
pmulhw m1, [pw_5A82] pmulhw m1, [pw_5A82]
paddw m2, m6 paddw m2, m6
movq [rsp], m2 mova [rsp], m2
psubw m7, m6 psubw m7, m6
movq m2, [srcq+DCTSIZE*2*2] mova m2, [srcq+DCTSIZE*2*2]
psubw m1, m6 psubw m1, m6
psubw m2, [srcq+DCTSIZE*5*2] psubw m2, [srcq+DCTSIZE*5*2]
movq m6, m5 mova m6, m5
movq [rsp+8*3], m7 mova [rsp+16*3], m7
paddw m3, m2 paddw m3, m2
paddw m2, m4 paddw m2, m4
paddw m4, m0 paddw m4, m0
movq m7, m3 mova m7, m3
psubw m3, m4 psubw m3, m4
psllw m7, 1 psllw m7, 1
pmulhw m3, [pw_61F8] pmulhw m3, [pw_61F8]
psllw m4, 2 psllw m4, 2
add srcq, 32
pmulhw m7, [pw_4546] pmulhw m7, [pw_4546]
psllw m2, 1 psllw m2, 1
pmulhw m4, [pw_539F] pmulhw m4, [pw_539F]
@@ -274,25 +275,25 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
pmulhw m2, [pw_5A82] pmulhw m2, [pw_5A82]
psubw m6, m1 psubw m6, m1
paddw m7, m3 paddw m7, m3
movq [rsp+8], m5 mova [rsp+16], m5
paddw m4, m3 paddw m4, m3
movq m3, [thrq+3*16+%2] mova m3, [thrq+3*16]
movq m1, m0 mova m1, m0
movq [rsp+8*2], m6 mova [rsp+16*2], m6
psubw m1, m2 psubw m1, m2
paddw m0, m2 paddw m0, m2
movq m5, m1 mova m5, m1
movq m2, [thrq+5*16+%2] mova m2, [thrq+5*16]
psubw m1, m7 psubw m1, m7
paddw m5, m7 paddw m5, m7
psubw m1, m3 psubw m1, m3
movq m7, [thrq+16+%2] mova m7, [thrq+16]
psubw m5, m2 psubw m5, m2
movq m6, m0 mova m6, m0
paddw m0, m4 paddw m0, m4
paddusw m1, m3 paddusw m1, m3
psubw m6, m4 psubw m6, m4
movq m4, [thrq+7*16+%2] mova m4, [thrq+7*16]
psubw m0, m7 psubw m0, m7
psubw m6, m4 psubw m6, m4
paddusw m5, m2 paddusw m5, m2
@@ -303,27 +304,32 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
psubusw m1, m3 psubusw m1, m3
psubusw m5, m2 psubusw m5, m2
psubusw m6, m4 psubusw m6, m4
movq m4, m1 mova m4, m1
por m4, m5 por m4, m5
paddusw m0, m7 paddusw m0, m7
por m4, m6 por m4, m6
paddw m0, m7 paddw m0, m7
packssdw m4, m4 packssdw m4, m4
psubusw m0, m7 psubusw m0, m7
%if ARCH_X86_64
movq tmpq, m4
%else
packssdw m4, m4
movd tmpd, m4 movd tmpd, m4
or tmpd, tmpd %endif
or tmpq, tmpq
jnz %1 jnz %1
movq m4, [rsp] mova m4, [rsp]
psraw m3, m0, 2 psraw m3, m0, 2
mova m5, [outq+DCTSIZE*0*2] mova m5, [outq+DCTSIZE*0*2]
pmulhw m1, m0, [pw_7642] pmulhw m1, m0, [pw_7642]
pmulhw m2, m0, [pw_4546] pmulhw m2, m0, [pw_4546]
pmulhw m0, [pw_5A82] pmulhw m0, [pw_5A82]
paddw m5, m4 paddw m5, m4
movq m6, [rsp+8] mova m6, [rsp+16]
psubw m2, m1 psubw m2, m1
psubw m4, m3 psubw m4, m3
movq m7, [outq+DCTSIZE*1*2] mova m7, [outq+DCTSIZE*1*2]
paddw m5, m3 paddw m5, m3
psubw m1, m3 psubw m1, m3
mova [outq+DCTSIZE*7*2], m4 mova [outq+DCTSIZE*7*2], m4
@@ -331,38 +337,37 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m2, m0 paddw m2, m0
mova [outq+DCTSIZE*0*2], m5 mova [outq+DCTSIZE*0*2], m5
paddw m7, m6 paddw m7, m6
movq m3, [rsp+8*2] mova m3, [rsp+16*2]
psubw m6, m1 psubw m6, m1
movq m4, [outq+DCTSIZE*2*2] mova m4, [outq+DCTSIZE*2*2]
paddw m7, m1 paddw m7, m1
movq [outq], m5 mova [outq], m5
paddw m4, m3 paddw m4, m3
movq [outq+DCTSIZE*6*2], m6 mova [outq+DCTSIZE*6*2], m6
psubw m3, m0 psubw m3, m0
movq m5, [outq+DCTSIZE*5*2] mova m5, [outq+DCTSIZE*5*2]
paddw m4, m0 paddw m4, m0
movq m6, [outq+DCTSIZE*3*2] mova m6, [outq+DCTSIZE*3*2]
paddw m5, m3 paddw m5, m3
movq m0, [rsp+8*3] mova m0, [rsp+16*3]
add srcq, 8+%3 mova [outq+DCTSIZE*1*2], m7
movq [outq+DCTSIZE*1*2], m7
paddw m6, m0 paddw m6, m0
movq [outq+DCTSIZE*2*2], m4 mova [outq+DCTSIZE*2*2], m4
paddw m0, m2 paddw m0, m2
movq m7, [outq+DCTSIZE*4*2] mova m7, [outq+DCTSIZE*4*2]
psubw m6, m2 psubw m6, m2
movq [outq+DCTSIZE*5*2], m5 mova [outq+DCTSIZE*5*2], m5
paddw m7, m0 paddw m7, m0
movq [outq+DCTSIZE*3*2], m6 mova [outq+DCTSIZE*3*2], m6
movq [outq+DCTSIZE*4*2], m7 mova [outq+DCTSIZE*4*2], m7
add outq, 8+%3 add outq, 32
%endmacro %endmacro
%macro COLUMN_IDCT 0-1 0 %macro COLUMN_IDCT 0
movq m3, m5 mova m3, m5
psubw m5, m1 psubw m5, m1
paddw m3, m1 paddw m3, m1
movq m2, m0 mova m2, m0
psubw m0, m6 psubw m0, m6
psllw m1, m5, 1 psllw m1, m5, 1
pmulhw m1, [pw_AC62] pmulhw m1, [pw_AC62]
@@ -370,72 +375,64 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
pmulhw m5, [pw_7642] pmulhw m5, [pw_7642]
paddw m2, m6 paddw m2, m6
pmulhw m0, [pw_4546] pmulhw m0, [pw_4546]
movq m7, m2 mova m7, m2
movq m4, [rsp] mova m4, [rsp]
psubw m2, m3 psubw m2, m3
paddw m7, m3 paddw m7, m3
pmulhw m2, [pw_5A82] pmulhw m2, [pw_5A82]
movq m6, m4 mova m6, m4
psraw m7, 2 psraw m7, 2
paddw m4, [outq] paddw m4, [outq]
psubw m6, m7 psubw m6, m7
movq m3, [rsp+8] mova m3, [rsp+16]
paddw m4, m7 paddw m4, m7
movq [outq+DCTSIZE*7*2], m6 mova [outq+DCTSIZE*7*2], m6
paddw m1, m5 paddw m1, m5
movq [outq], m4 mova [outq], m4
psubw m1, m7 psubw m1, m7
movq m7, [rsp+8*2] mova m7, [rsp+16*2]
psubw m0, m5 psubw m0, m5
movq m6, [rsp+8*3] mova m6, [rsp+16*3]
movq m5, m3 mova m5, m3
paddw m3, [outq+DCTSIZE*1*2] paddw m3, [outq+DCTSIZE*1*2]
psubw m5, m1 psubw m5, m1
psubw m2, m1 psubw m2, m1
paddw m3, m1 paddw m3, m1
movq [outq+DCTSIZE*6*2], m5 mova [outq+DCTSIZE*6*2], m5
movq m4, m7 mova m4, m7
paddw m7, [outq+DCTSIZE*2*2] paddw m7, [outq+DCTSIZE*2*2]
psubw m4, m2 psubw m4, m2
paddw m4, [outq+DCTSIZE*5*2] paddw m4, [outq+DCTSIZE*5*2]
paddw m7, m2 paddw m7, m2
movq [outq+DCTSIZE*1*2], m3 mova [outq+DCTSIZE*1*2], m3
paddw m0, m2 paddw m0, m2
movq [outq+DCTSIZE*2*2], m7 mova [outq+DCTSIZE*2*2], m7
movq m1, m6 mova m1, m6
paddw m6, [outq+DCTSIZE*4*2] paddw m6, [outq+DCTSIZE*4*2]
psubw m1, m0 psubw m1, m0
paddw m1, [outq+DCTSIZE*3*2] paddw m1, [outq+DCTSIZE*3*2]
paddw m6, m0 paddw m6, m0
movq [outq+DCTSIZE*5*2], m4 mova [outq+DCTSIZE*5*2], m4
add srcq, 8+%1 mova [outq+DCTSIZE*4*2], m6
movq [outq+DCTSIZE*4*2], m6 mova [outq+DCTSIZE*3*2], m1
movq [outq+DCTSIZE*3*2], m1 add outq, 32
add outq, 8+%1
%endmacro %endmacro
INIT_MMX mmx ;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp
cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp .fdct:
.fdct1: COLUMN_FDCT .idct
COLUMN_FDCT .idct1 sub cntd, 2
jmp .fdct2 jg .fdct
RET
.idct1: .idct:
COLUMN_IDCT COLUMN_IDCT
.fdct2:
COLUMN_FDCT .idct2, 8, 16
sub cntd, 2 sub cntd, 2
jg .fdct1 jg .fdct
RET
.idct2:
COLUMN_IDCT 16
sub cntd, 2
jg .fdct1
RET RET
INIT_MMX mmx
;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); ;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3 cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
add strideq, strideq add strideq, strideq

View File

@@ -30,7 +30,7 @@ void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
@@ -39,7 +39,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
int cpu_flags = av_get_cpu_flags(); int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) { if (EXTERNAL_MMX(cpu_flags)) {
s->column_fidct = ff_column_fidct_mmx;
s->row_idct = ff_row_idct_mmx; s->row_idct = ff_row_idct_mmx;
s->row_fdct = ff_row_fdct_mmx; s->row_fdct = ff_row_fdct_mmx;
} }
@@ -47,5 +46,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
s->store_slice = ff_store_slice_sse2; s->store_slice = ff_store_slice_sse2;
s->store_slice2 = ff_store_slice2_sse2; s->store_slice2 = ff_store_slice2_sse2;
s->mul_thrmat = ff_mul_thrmat_sse2; s->mul_thrmat = ff_mul_thrmat_sse2;
s->column_fidct = ff_column_fidct_sse2;
} }
} }

View File

@@ -136,7 +136,7 @@ static void check_column_fidct(void)
NB_BLOCKS = 8, ///< arbitrary NB_BLOCKS = 8, ///< arbitrary
}; };
FSPPDSPContext fspp; FSPPDSPContext fspp;
declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data, declare_func(void, int16_t *thr_adr, int16_t *data,
int16_t *output, int cnt); int16_t *output, int cnt);
ff_fsppdsp_init(&fspp); ff_fsppdsp_init(&fspp);