1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00

avfilter/x86/vf_fspp: Port ff_column_fidct_mmx() to SSE2

The function gains a lot from the port because it has to operate on eight words
at a time, which fit into a single XMM register; it also saves 608B of .text here.

Old benchmarks:
column_fidct_c:                                       3365.7 ( 1.00x)
column_fidct_mmx:                                     1784.6 ( 1.89x)

New benchmarks:
column_fidct_c:                                       3361.5 ( 1.00x)
column_fidct_sse2:                                     801.1 ( 4.20x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-11-12 21:42:32 +01:00
parent 68b11cde82
commit ddd74276f8
3 changed files with 107 additions and 110 deletions

View File

@@ -26,18 +26,18 @@
; Read-only data: 16-bit multipliers and rounding terms used as packed-word
; memory operands by the transform code in this file (for example
; "pmulhw m7, [pw_5A82]" and "paddw m5, [pw_2]").
; Every label is a single word value repeated with "times N dw":
;   times 8 -> 8 words = 16 bytes
;   times 4 -> 4 words =  8 bytes
; The trailing comment on each line gives the real number behind the constant.
; FIX(x, n) / FIX64(x, n) appear to denote x scaled by 2^n and rounded
; (1.414213562 * 2^14 ~= 0x5A82); the macros themselves are not defined in
; this view.
; NOTE(review): pw_4546, pw_61F8, pw_539F, pw_5A82, pw_7642, pw_AC62 and pw_2
; each occur twice below, once with "times 8" and once with "times 4". This
; looks like the added and removed lines of a diff shown without +/- markers;
; presumably only one copy of each label exists in the real file -- confirm
; against the repository.
SECTION_RODATA
; fspp_dither is declared external here; its definition is outside this view.
cextern fspp_dither
; --- 8-word (16-byte) copies ---
pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2
pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4
pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14)
pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14)
pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2
; 0xAC62 has its top bit set, i.e. it is negative when read as a signed word.
pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13)
pw_2: times 8 dw 2
; --- 4-word (8-byte) copies ---
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
; 0x61F8 = 4 * 0x187E
pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
; 0x4546 = 2 * 0x22A3
pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
; 0x7642 = 2 * 0x3B21
pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_4: times 4 dw 4
pw_2: times 4 dw 2
SECTION .text
@@ -191,82 +191,83 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
mova [thrq+16*7], m3
RET
%macro COLUMN_FDCT 1-3 0, 0
movq m1, [srcq+DCTSIZE*0*2]
movq m7, [srcq+DCTSIZE*3*2]
movq m0, m1
%macro COLUMN_FDCT 1
mova m1, [srcq+DCTSIZE*0*2]
mova m7, [srcq+DCTSIZE*3*2]
mova m0, m1
paddw m1, [srcq+DCTSIZE*7*2]
movq m3, m7
mova m3, m7
paddw m7, [srcq+DCTSIZE*4*2]
movq m5, m1
movq m6, [srcq+DCTSIZE*1*2]
mova m5, m1
mova m6, [srcq+DCTSIZE*1*2]
psubw m1, m7
movq m2, [srcq+DCTSIZE*2*2]
movq m4, m6
mova m2, [srcq+DCTSIZE*2*2]
mova m4, m6
paddw m6, [srcq+DCTSIZE*6*2]
paddw m5, m7
paddw m2, [srcq+DCTSIZE*5*2]
movq m7, m6
mova m7, m6
paddw m6, m2
psubw m7, m2
movq m2, m5
mova m2, m5
paddw m5, m6
psubw m2, m6
paddw m7, m1
movq m6, [thrq+4*16+%2]
mova m6, [thrq+4*16]
psllw m7, 1
psubw m5, [thrq+%2]
psubw m5, [thrq]
psubw m2, m6
paddusw m5, [thrq+%2]
paddusw m5, [thrq]
paddusw m2, m6
pmulhw m7, [pw_5A82]
paddw m5, [thrq+%2]
paddw m5, [thrq]
paddw m2, m6
psubusw m5, [thrq+%2]
psubusw m5, [thrq]
psubusw m2, m6
paddw m5, [pw_2]
movq m6, m2
mova m6, m2
paddw m2, m5
psubw m5, m6
movq m6, m1
mova m6, m1
paddw m1, m7
psubw m1, [thrq+2*16+%2]
psubw m1, [thrq+2*16]
psubw m6, m7
movq m7, [thrq+6*16+%2]
mova m7, [thrq+6*16]
psraw m5, 2
paddusw m1, [thrq+2*16+%2]
paddusw m1, [thrq+2*16]
psubw m6, m7
paddw m1, [thrq+2*16+%2]
paddw m1, [thrq+2*16]
paddusw m6, m7
psubusw m1, [thrq+2*16+%2]
psubusw m1, [thrq+2*16]
paddw m6, m7
psubw m3, [srcq+DCTSIZE*4*2]
psubusw m6, m7
movq m7, m1
mova m7, m1
psraw m2, 2
psubw m4, [srcq+DCTSIZE*6*2]
psubw m1, m6
psubw m0, [srcq+DCTSIZE*7*2]
paddw m6, m7
psraw m6, 2
movq m7, m2
mova m7, m2
pmulhw m1, [pw_5A82]
paddw m2, m6
movq [rsp], m2
mova [rsp], m2
psubw m7, m6
movq m2, [srcq+DCTSIZE*2*2]
mova m2, [srcq+DCTSIZE*2*2]
psubw m1, m6
psubw m2, [srcq+DCTSIZE*5*2]
movq m6, m5
movq [rsp+8*3], m7
mova m6, m5
mova [rsp+16*3], m7
paddw m3, m2
paddw m2, m4
paddw m4, m0
movq m7, m3
mova m7, m3
psubw m3, m4
psllw m7, 1
pmulhw m3, [pw_61F8]
psllw m4, 2
add srcq, 32
pmulhw m7, [pw_4546]
psllw m2, 1
pmulhw m4, [pw_539F]
@@ -274,25 +275,25 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
pmulhw m2, [pw_5A82]
psubw m6, m1
paddw m7, m3
movq [rsp+8], m5
mova [rsp+16], m5
paddw m4, m3
movq m3, [thrq+3*16+%2]
movq m1, m0
movq [rsp+8*2], m6
mova m3, [thrq+3*16]
mova m1, m0
mova [rsp+16*2], m6
psubw m1, m2
paddw m0, m2
movq m5, m1
movq m2, [thrq+5*16+%2]
mova m5, m1
mova m2, [thrq+5*16]
psubw m1, m7
paddw m5, m7
psubw m1, m3
movq m7, [thrq+16+%2]
mova m7, [thrq+16]
psubw m5, m2
movq m6, m0
mova m6, m0
paddw m0, m4
paddusw m1, m3
psubw m6, m4
movq m4, [thrq+7*16+%2]
mova m4, [thrq+7*16]
psubw m0, m7
psubw m6, m4
paddusw m5, m2
@@ -303,27 +304,32 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
psubusw m1, m3
psubusw m5, m2
psubusw m6, m4
movq m4, m1
mova m4, m1
por m4, m5
paddusw m0, m7
por m4, m6
paddw m0, m7
packssdw m4, m4
psubusw m0, m7
movd tmpd, m4
or tmpd, tmpd
%if ARCH_X86_64
movq tmpq, m4
%else
packssdw m4, m4
movd tmpd, m4
%endif
or tmpq, tmpq
jnz %1
movq m4, [rsp]
mova m4, [rsp]
psraw m3, m0, 2
mova m5, [outq+DCTSIZE*0*2]
pmulhw m1, m0, [pw_7642]
pmulhw m2, m0, [pw_4546]
pmulhw m0, [pw_5A82]
paddw m5, m4
movq m6, [rsp+8]
mova m6, [rsp+16]
psubw m2, m1
psubw m4, m3
movq m7, [outq+DCTSIZE*1*2]
mova m7, [outq+DCTSIZE*1*2]
paddw m5, m3
psubw m1, m3
mova [outq+DCTSIZE*7*2], m4
@@ -331,38 +337,37 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m2, m0
mova [outq+DCTSIZE*0*2], m5
paddw m7, m6
movq m3, [rsp+8*2]
mova m3, [rsp+16*2]
psubw m6, m1
movq m4, [outq+DCTSIZE*2*2]
mova m4, [outq+DCTSIZE*2*2]
paddw m7, m1
movq [outq], m5
mova [outq], m5
paddw m4, m3
movq [outq+DCTSIZE*6*2], m6
mova [outq+DCTSIZE*6*2], m6
psubw m3, m0
movq m5, [outq+DCTSIZE*5*2]
mova m5, [outq+DCTSIZE*5*2]
paddw m4, m0
movq m6, [outq+DCTSIZE*3*2]
mova m6, [outq+DCTSIZE*3*2]
paddw m5, m3
movq m0, [rsp+8*3]
add srcq, 8+%3
movq [outq+DCTSIZE*1*2], m7
mova m0, [rsp+16*3]
mova [outq+DCTSIZE*1*2], m7
paddw m6, m0
movq [outq+DCTSIZE*2*2], m4
mova [outq+DCTSIZE*2*2], m4
paddw m0, m2
movq m7, [outq+DCTSIZE*4*2]
mova m7, [outq+DCTSIZE*4*2]
psubw m6, m2
movq [outq+DCTSIZE*5*2], m5
mova [outq+DCTSIZE*5*2], m5
paddw m7, m0
movq [outq+DCTSIZE*3*2], m6
movq [outq+DCTSIZE*4*2], m7
add outq, 8+%3
mova [outq+DCTSIZE*3*2], m6
mova [outq+DCTSIZE*4*2], m7
add outq, 32
%endmacro
%macro COLUMN_IDCT 0-1 0
movq m3, m5
%macro COLUMN_IDCT 0
mova m3, m5
psubw m5, m1
paddw m3, m1
movq m2, m0
mova m2, m0
psubw m0, m6
psllw m1, m5, 1
pmulhw m1, [pw_AC62]
@@ -370,72 +375,64 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
pmulhw m5, [pw_7642]
paddw m2, m6
pmulhw m0, [pw_4546]
movq m7, m2
movq m4, [rsp]
mova m7, m2
mova m4, [rsp]
psubw m2, m3
paddw m7, m3
pmulhw m2, [pw_5A82]
movq m6, m4
mova m6, m4
psraw m7, 2
paddw m4, [outq]
psubw m6, m7
movq m3, [rsp+8]
mova m3, [rsp+16]
paddw m4, m7
movq [outq+DCTSIZE*7*2], m6
mova [outq+DCTSIZE*7*2], m6
paddw m1, m5
movq [outq], m4
mova [outq], m4
psubw m1, m7
movq m7, [rsp+8*2]
mova m7, [rsp+16*2]
psubw m0, m5
movq m6, [rsp+8*3]
movq m5, m3
mova m6, [rsp+16*3]
mova m5, m3
paddw m3, [outq+DCTSIZE*1*2]
psubw m5, m1
psubw m2, m1
paddw m3, m1
movq [outq+DCTSIZE*6*2], m5
movq m4, m7
mova [outq+DCTSIZE*6*2], m5
mova m4, m7
paddw m7, [outq+DCTSIZE*2*2]
psubw m4, m2
paddw m4, [outq+DCTSIZE*5*2]
paddw m7, m2
movq [outq+DCTSIZE*1*2], m3
mova [outq+DCTSIZE*1*2], m3
paddw m0, m2
movq [outq+DCTSIZE*2*2], m7
movq m1, m6
mova [outq+DCTSIZE*2*2], m7
mova m1, m6
paddw m6, [outq+DCTSIZE*4*2]
psubw m1, m0
paddw m1, [outq+DCTSIZE*3*2]
paddw m6, m0
movq [outq+DCTSIZE*5*2], m4
add srcq, 8+%1
movq [outq+DCTSIZE*4*2], m6
movq [outq+DCTSIZE*3*2], m1
add outq, 8+%1
mova [outq+DCTSIZE*5*2], m4
mova [outq+DCTSIZE*4*2], m6
mova [outq+DCTSIZE*3*2], m1
add outq, 32
%endmacro
; column_fidct entry point: loops over the input, running COLUMN_FDCT on each
; step and COLUMN_IDCT only when COLUMN_FDCT branches to the label passed to it.
; NOTE(review): two versions are interleaved in this region -- an MMX one using
; the labels .fdct1/.fdct2/.idct1/.idct2 and an SSE2 one using .fdct/.idct.
; This looks like removed and added diff lines shown without +/- markers; the
; comments below describe each line on that assumption -- confirm against the
; repository before relying on the exact line order.
INIT_MMX mmx
;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
; MMX variant: 4 arguments (thr, src, out, cnt), 5 GPRs in total (tmp is the
; extra one), 0 XMM registers, 32 bytes of stack. The macros address that stack
; as four 8-byte slots ([rsp], [rsp+8], [rsp+8*2], [rsp+8*3]).
cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
.fdct1:
; First MMX pass with the default macro arguments; jumps to .idct1 when the
; macro's "jnz %1" test finds a nonzero thresholded value.
COLUMN_FDCT .idct1
jmp .fdct2
;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
; SSE2 variant: same arguments, 8 XMM registers, 64 bytes of stack. The macros
; address that stack as four 16-byte slots ([rsp], [rsp+16], [rsp+16*2],
; [rsp+16*3]).
cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp
.fdct:
; Single SSE2 pass per loop iteration; jumps to .idct on a nonzero test result,
; otherwise the macro finishes the output itself and execution falls through.
COLUMN_FDCT .idct
; The counter drops by 2 per iteration; loop while it stays positive (signed).
sub cntd, 2
jg .fdct
RET
.idct1:
.idct:
; Reached only via the jump out of COLUMN_FDCT; completes the output from the
; values COLUMN_FDCT left in registers and in the stack slots.
COLUMN_IDCT
.fdct2:
; Second MMX pass: the extra macro arguments (8, 16) are the additional
; thr offset (%2) and the additional src/out advance (%3).
COLUMN_FDCT .idct2, 8, 16
sub cntd, 2
jg .fdct1
RET
.idct2:
; MMX second-pass counterpart of .idct1; 16 is the additional src/out advance.
COLUMN_IDCT 16
; Same loop tail as after COLUMN_FDCT: decrement by 2, continue while > 0.
sub cntd, 2
jg .fdct1
jg .fdct
RET
INIT_MMX mmx
;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
add strideq, strideq

View File

@@ -30,7 +30,7 @@ void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
@@ -39,7 +39,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
s->column_fidct = ff_column_fidct_mmx;
s->row_idct = ff_row_idct_mmx;
s->row_fdct = ff_row_fdct_mmx;
}
@@ -47,5 +46,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
s->store_slice = ff_store_slice_sse2;
s->store_slice2 = ff_store_slice2_sse2;
s->mul_thrmat = ff_mul_thrmat_sse2;
s->column_fidct = ff_column_fidct_sse2;
}
}

View File

@@ -136,8 +136,8 @@ static void check_column_fidct(void)
NB_BLOCKS = 8, ///< arbitrary
};
FSPPDSPContext fspp;
declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data,
int16_t *output, int cnt);
declare_func(void, int16_t *thr_adr, int16_t *data,
int16_t *output, int cnt);
ff_fsppdsp_init(&fspp);