1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-24 13:56:33 +02:00

avcodec: [loongarch] Optimize vp9_lpf/idct with LSX.

ffmpeg -i ../10_vp9_1080p_30fps_3Mbps.webm -f rawvideo -y /dev/null -an
before:294fps
after :567fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
Jin Bo 2021-12-18 22:27:56 +08:00 committed by Michael Niedermayer
parent 2fd914e079
commit fea299f876
5 changed files with 4626 additions and 1 deletions

View File

@ -13,4 +13,6 @@ LASX-OBJS-$(CONFIG_H264PRED) += loongarch/h264_intrapred_lasx.o
LSX-OBJS-$(CONFIG_VP8_DECODER) += loongarch/vp8_mc_lsx.o \ LSX-OBJS-$(CONFIG_VP8_DECODER) += loongarch/vp8_mc_lsx.o \
loongarch/vp8_lpf_lsx.o loongarch/vp8_lpf_lsx.o
LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
loongarch/vp9_intra_lsx.o loongarch/vp9_intra_lsx.o \
loongarch/vp9_lpf_lsx.o \
loongarch/vp9_idct_lsx.o

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -71,6 +71,15 @@
dsp->intra_pred[tx][TOP_DC_PRED] = ff_dc_top_##sz##_lsx; \ dsp->intra_pred[tx][TOP_DC_PRED] = ff_dc_top_##sz##_lsx; \
dsp->intra_pred[tx][TM_VP8_PRED] = ff_tm_##sz##_lsx; \ dsp->intra_pred[tx][TM_VP8_PRED] = ff_tm_##sz##_lsx; \
/*
 * Point all four itxfm_add[tx] entries (DCT_DCT, ADST_DCT, DCT_ADST and
 * ADST_ADST) at the single routine nm##_add_lsx.
 *
 * tx: first index into dsp->itxfm_add
 * nm: function-name prefix; "_add_lsx" is pasted onto it
 *
 * A variable named dsp must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and deliberately has no trailing
 * semicolon, so "init_idct(...);" expands to exactly one statement and can
 * be used as the body of an unbraced if/else.
 */
#define init_idct(tx, nm)                                  \
    do {                                                   \
        dsp->itxfm_add[tx][DCT_DCT]   =                    \
        dsp->itxfm_add[tx][ADST_DCT]  =                    \
        dsp->itxfm_add[tx][DCT_ADST]  =                    \
        dsp->itxfm_add[tx][ADST_ADST] = nm##_add_lsx;      \
    } while (0)
/*
 * Install ff_idct_idct_<sz>_add_lsx into the DCT_DCT entry of
 * dsp->itxfm_add[tx]; the other three entries of that row are not touched.
 *
 * tx: first index into dsp->itxfm_add
 * sz: size token pasted into the function name (e.g. 8x8)
 *
 * A variable named dsp must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and deliberately has no trailing
 * semicolon, so "init_itxfm(...);" expands to exactly one statement and can
 * be used as the body of an unbraced if/else.
 */
#define init_itxfm(tx, sz)                                             \
    do {                                                               \
        dsp->itxfm_add[tx][DCT_DCT] = ff_idct_idct_##sz##_add_lsx;     \
    } while (0)
av_cold void ff_vp9dsp_init_loongarch(VP9DSPContext *dsp, int bpp) av_cold void ff_vp9dsp_init_loongarch(VP9DSPContext *dsp, int bpp)
{ {
int cpu_flags = av_get_cpu_flags(); int cpu_flags = av_get_cpu_flags();
@ -86,8 +95,30 @@ av_cold void ff_vp9dsp_init_loongarch(VP9DSPContext *dsp, int bpp)
init_intra_pred1_lsx(TX_32X32, 32x32); init_intra_pred1_lsx(TX_32X32, 32x32);
init_intra_pred2_lsx(TX_4X4, 4x4); init_intra_pred2_lsx(TX_4X4, 4x4);
init_intra_pred2_lsx(TX_8X8, 8x8); init_intra_pred2_lsx(TX_8X8, 8x8);
init_itxfm(TX_8X8, 8x8);
init_itxfm(TX_16X16, 16x16);
init_idct(TX_32X32, ff_idct_idct_32x32);
dsp->loop_filter_8[0][0] = ff_loop_filter_h_4_8_lsx;
dsp->loop_filter_8[0][1] = ff_loop_filter_v_4_8_lsx;
dsp->loop_filter_8[1][0] = ff_loop_filter_h_8_8_lsx;
dsp->loop_filter_8[1][1] = ff_loop_filter_v_8_8_lsx;
dsp->loop_filter_8[2][0] = ff_loop_filter_h_16_8_lsx;
dsp->loop_filter_8[2][1] = ff_loop_filter_v_16_8_lsx;
dsp->loop_filter_16[0] = ff_loop_filter_h_16_16_lsx;
dsp->loop_filter_16[1] = ff_loop_filter_v_16_16_lsx;
dsp->loop_filter_mix2[0][0][0] = ff_loop_filter_h_44_16_lsx;
dsp->loop_filter_mix2[0][0][1] = ff_loop_filter_v_44_16_lsx;
dsp->loop_filter_mix2[0][1][0] = ff_loop_filter_h_48_16_lsx;
dsp->loop_filter_mix2[0][1][1] = ff_loop_filter_v_48_16_lsx;
dsp->loop_filter_mix2[1][0][0] = ff_loop_filter_h_84_16_lsx;
dsp->loop_filter_mix2[1][0][1] = ff_loop_filter_v_84_16_lsx;
dsp->loop_filter_mix2[1][1][0] = ff_loop_filter_h_88_16_lsx;
dsp->loop_filter_mix2[1][1][1] = ff_loop_filter_v_88_16_lsx;
} }
} }
#undef init_subpel1 #undef init_subpel1
#undef init_subpel2 #undef init_subpel2
#undef init_subpel3 #undef init_subpel3
@ -95,3 +126,5 @@ av_cold void ff_vp9dsp_init_loongarch(VP9DSPContext *dsp, int bpp)
#undef init_fpel #undef init_fpel
#undef init_intra_pred1_lsx #undef init_intra_pred1_lsx
#undef init_intra_pred2_lsx #undef init_intra_pred2_lsx
#undef init_idct
#undef init_itxfm

View File

@ -140,5 +140,43 @@ void ff_tm_16x16_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
const uint8_t *top); const uint8_t *top);
void ff_tm_32x32_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, void ff_tm_32x32_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
const uint8_t *top); const uint8_t *top);
/*
 * LSX loop-filter entry points. ff_vp9dsp_init_loongarch() stores them in
 * the VP9DSPContext tables loop_filter_8, loop_filter_16 and
 * loop_filter_mix2; in every table the "_h_" variant goes into the slot
 * with final index 0 and the "_v_" variant into the slot with final index 1.
 *
 * All functions share one signature (dst, stride, e, i, h).
 * NOTE(review): e, i and h are presumably the edge, interior and
 * high-edge-variance thresholds of the VP9 loop filter -- confirm against
 * the VP9DSPContext documentation.
 */

/* Installed as loop_filter_8[2][0] (h) and loop_filter_8[2][1] (v). */
void ff_loop_filter_h_16_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
void ff_loop_filter_v_16_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
/* Installed as loop_filter_8[0][0] (h) and loop_filter_8[0][1] (v). */
void ff_loop_filter_h_4_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
void ff_loop_filter_v_4_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
/* Installed as loop_filter_mix2[0][0][0] (h) and loop_filter_mix2[0][0][1] (v). */
void ff_loop_filter_h_44_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
void ff_loop_filter_v_44_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
/* Installed as loop_filter_8[1][0] (h) and loop_filter_8[1][1] (v). */
void ff_loop_filter_h_8_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
void ff_loop_filter_v_8_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
/* Installed as loop_filter_mix2[1][1][0] (h) and loop_filter_mix2[1][1][1] (v). */
void ff_loop_filter_h_88_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
void ff_loop_filter_v_88_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
/* Installed as loop_filter_mix2[1][0][0] (h) and loop_filter_mix2[1][0][1] (v). */
void ff_loop_filter_h_84_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
void ff_loop_filter_v_84_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
/* Installed as loop_filter_mix2[0][1][0] (h) and loop_filter_mix2[0][1][1] (v). */
void ff_loop_filter_h_48_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
void ff_loop_filter_v_48_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
/* Installed as loop_filter_16[0] (h) and loop_filter_16[1] (v). */
void ff_loop_filter_h_16_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
void ff_loop_filter_v_16_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
int32_t i, int32_t h);
/*
 * LSX inverse-transform entry points stored in VP9DSPContext.itxfm_add by
 * ff_vp9dsp_init_loongarch(): the 8x8 and 16x16 routines fill only the
 * DCT_DCT slot of their transform size, while the 32x32 routine fills all
 * four slots (DCT_DCT, ADST_DCT, DCT_ADST, ADST_ADST) of TX_32X32.
 *
 * NOTE(review): going by the names, each routine presumably applies an
 * inverse DCT in both dimensions to block and adds the result to dst, with
 * eob being the end-of-block coefficient position -- confirm against the
 * implementation in vp9_idct_lsx.c.
 */
void ff_idct_idct_8x8_add_lsx(uint8_t *dst, ptrdiff_t stride,
int16_t *block, int eob);
void ff_idct_idct_16x16_add_lsx(uint8_t *dst, ptrdiff_t stride,
int16_t *block, int eob);
void ff_idct_idct_32x32_add_lsx(uint8_t *dst, ptrdiff_t stride,
int16_t *block, int eob);
#endif /* AVCODEC_LOONGARCH_VP9DSP_LOONGARCH_H */ #endif /* AVCODEC_LOONGARCH_VP9DSP_LOONGARCH_H */