You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-10-06 05:47:18 +02:00
avcodec/x86/h264_qpel: Add ff_{avg,put}_pixels16_l2_shift5_sse2
Up until now this function was emulated via two calls to ff_{avg,put}_pixels8_l2_shift5_mmxext(). Adding a dedicated function proved beneficial both size-wise and performance-wise: The new functions take 192B, yet the simplified calls save 256B with GCC and 320B with Clang here. This change will also allow further optimizations. Old benchmarks: avg_h264_qpel_16_mc12_8_c: 1735.8 ( 1.00x) avg_h264_qpel_16_mc12_8_sse2: 300.8 ( 5.77x) avg_h264_qpel_16_mc12_8_ssse3: 233.3 ( 7.44x) avg_h264_qpel_16_mc32_8_c: 1777.9 ( 1.00x) avg_h264_qpel_16_mc32_8_sse2: 275.6 ( 6.45x) avg_h264_qpel_16_mc32_8_ssse3: 235.7 ( 7.54x) put_h264_qpel_16_mc12_8_c: 1808.2 ( 1.00x) put_h264_qpel_16_mc12_8_sse2: 267.2 ( 6.77x) put_h264_qpel_16_mc12_8_ssse3: 231.9 ( 7.80x) put_h264_qpel_16_mc32_8_c: 1766.9 ( 1.00x) put_h264_qpel_16_mc32_8_sse2: 272.9 ( 6.47x) put_h264_qpel_16_mc32_8_ssse3: 229.5 ( 7.70x) New benchmarks: avg_h264_qpel_16_mc12_8_c: 1742.3 ( 1.00x) avg_h264_qpel_16_mc12_8_sse2: 240.3 ( 7.25x) avg_h264_qpel_16_mc12_8_ssse3: 214.8 ( 8.11x) avg_h264_qpel_16_mc32_8_c: 1748.0 ( 1.00x) avg_h264_qpel_16_mc32_8_sse2: 238.0 ( 7.35x) avg_h264_qpel_16_mc32_8_ssse3: 209.2 ( 8.35x) put_h264_qpel_16_mc12_8_c: 2014.4 ( 1.00x) put_h264_qpel_16_mc12_8_sse2: 243.7 ( 8.27x) put_h264_qpel_16_mc12_8_ssse3: 211.5 ( 9.52x) put_h264_qpel_16_mc32_8_c: 1800.0 ( 1.00x) put_h264_qpel_16_mc32_8_sse2: 238.8 ( 7.54x) put_h264_qpel_16_mc32_8_ssse3: 206.7 ( 8.71x) Reviewed-by: James Almer <jamrial@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -68,7 +68,8 @@ void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, in
|
||||
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
|
||||
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
|
||||
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
|
||||
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);
|
||||
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
|
||||
void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
|
||||
|
||||
DEF_QPEL(avg)
|
||||
DEF_QPEL(put)
|
||||
@@ -104,12 +105,6 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(u
|
||||
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
|
||||
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
|
||||
}\
|
||||
\
|
||||
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\
|
||||
{\
|
||||
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
|
||||
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
|
||||
}\
|
||||
|
||||
|
||||
#if ARCH_X86_64
|
||||
@@ -191,6 +186,9 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin
|
||||
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
|
||||
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
|
||||
|
||||
#define ff_put_pixels16_l2_shift5_mmxext ff_put_pixels16_l2_shift5_sse2
|
||||
#define ff_avg_pixels16_l2_shift5_mmxext ff_avg_pixels16_l2_shift5_sse2
|
||||
|
||||
#define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
|
||||
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
|
||||
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
|
||||
|
@@ -734,15 +734,19 @@ PIXELS4_L2_SHIFT5 put
|
||||
PIXELS4_L2_SHIFT5 avg
|
||||
|
||||
|
||||
%macro PIXELS8_L2_SHIFT5 1
|
||||
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
|
||||
%macro PIXELS_L2_SHIFT5 2
|
||||
%if cpuflag(sse2)
|
||||
cglobal %1_pixels%2_l2_shift5, 6, 6, 4 ; dst, src16, src8, dstStride, src8Stride, h
|
||||
%else
|
||||
cglobal %1_pixels%2_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
|
||||
%endif
|
||||
movsxdifnidn r3, r3d
|
||||
movsxdifnidn r4, r4d
|
||||
.loop:
|
||||
mova m0, [r1]
|
||||
mova m1, [r1+8]
|
||||
mova m2, [r1+48]
|
||||
mova m3, [r1+48+8]
|
||||
movu m0, [r1]
|
||||
movu m1, [r1+%2]
|
||||
movu m2, [r1+48]
|
||||
movu m3, [r1+48+%2]
|
||||
psraw m0, 5
|
||||
psraw m1, 5
|
||||
psraw m2, 5
|
||||
@@ -751,8 +755,8 @@ cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
|
||||
packuswb m2, m3
|
||||
pavgb m0, [r2]
|
||||
pavgb m2, [r2+r4]
|
||||
op_%1 m0, [r0], m4
|
||||
op_%1 m2, [r0+r3], m5
|
||||
op_%1 m0, [r0], m1
|
||||
op_%1 m2, [r0+r3], m1
|
||||
lea r2, [r2+2*r4]
|
||||
add r1, 48*2
|
||||
lea r0, [r0+2*r3]
|
||||
@@ -762,9 +766,12 @@ cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
PIXELS8_L2_SHIFT5 put
|
||||
PIXELS8_L2_SHIFT5 avg
|
||||
PIXELS_L2_SHIFT5 put, 8
|
||||
PIXELS_L2_SHIFT5 avg, 8
|
||||
|
||||
INIT_XMM sse2
|
||||
PIXELS_L2_SHIFT5 put, 16
|
||||
PIXELS_L2_SHIFT5 avg, 16
|
||||
|
||||
%if ARCH_X86_64
|
||||
%macro QPEL16_H_LOWPASS_L2_OP 1
|
||||
|
Reference in New Issue
Block a user