mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
avcodec/h264: add sse2 versions of previous idct functions
Kaby Lake Pentium:
- ff_h264_idct_add_8_sse2: ~1.18x faster than mmxext
- ff_h264_idct_dc_add_8_sse2: ~1.07x faster than mmxext
This commit is contained in:
parent
27460dfebc
commit
7aa90b4e94
@ -1140,8 +1140,6 @@ IDCT_DC_DEQUANT 0
|
||||
INIT_MMX sse2
|
||||
IDCT_DC_DEQUANT 7
|
||||
|
||||
INIT_XMM avx
|
||||
|
||||
; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
|
||||
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
|
||||
movd %3, [%7]
|
||||
@ -1170,6 +1168,10 @@ INIT_XMM avx
|
||||
packuswb m1, m1
|
||||
%endmacro
|
||||
|
||||
%macro IDCT_XMM 1
|
||||
|
||||
INIT_XMM %1
|
||||
|
||||
cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
|
||||
movsxdifnidn stride_q, stride_d
|
||||
IDCT4_ADD dst_q, block_q, stride_q
|
||||
@ -1182,3 +1184,8 @@ cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
|
||||
DC_ADD_INIT r3
|
||||
DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
|
||||
RET
|
||||
|
||||
%endmacro
|
||||
|
||||
IDCT_XMM sse2
|
||||
IDCT_XMM avx
|
||||
|
@ -32,9 +32,11 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
|
||||
int stride);
|
||||
|
||||
IDCT_ADD_FUNC(, 8, mmx)
|
||||
IDCT_ADD_FUNC(, 8, sse2)
|
||||
IDCT_ADD_FUNC(, 8, avx)
|
||||
IDCT_ADD_FUNC(, 10, sse2)
|
||||
IDCT_ADD_FUNC(_dc, 8, mmxext)
|
||||
IDCT_ADD_FUNC(_dc, 8, sse2)
|
||||
IDCT_ADD_FUNC(_dc, 8, avx)
|
||||
IDCT_ADD_FUNC(_dc, 10, mmxext)
|
||||
IDCT_ADD_FUNC(8_dc, 8, mmxext)
|
||||
@ -316,6 +318,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
|
||||
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_sse2;
|
||||
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
|
||||
}
|
||||
|
||||
c->h264_idct_add = ff_h264_idct_add_8_sse2;
|
||||
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_sse2;
|
||||
}
|
||||
if (EXTERNAL_SSSE3(cpu_flags)) {
|
||||
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
|
||||
|
Loading…
Reference in New Issue
Block a user