1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-10-06 05:47:18 +02:00

avcodec/x86/h264_qpel: Port pixels8_l2_shift5 from MMXEXT to SSE2

This abides by the ABI (no missing emms) and yields a tiny
performance improvement here.

Old benchmarks:
avg_h264_qpel_8_mc12_8_c:                              419.9 ( 1.00x)
avg_h264_qpel_8_mc12_8_sse2:                            78.9 ( 5.32x)
avg_h264_qpel_8_mc12_8_ssse3:                           71.7 ( 5.86x)
avg_h264_qpel_8_mc32_8_c:                              429.1 ( 1.00x)
avg_h264_qpel_8_mc32_8_sse2:                            76.9 ( 5.58x)
avg_h264_qpel_8_mc32_8_ssse3:                           73.4 ( 5.84x)
put_h264_qpel_8_mc12_8_c:                              424.0 ( 1.00x)
put_h264_qpel_8_mc12_8_sse2:                            78.6 ( 5.40x)
put_h264_qpel_8_mc12_8_ssse3:                           70.6 ( 6.00x)
put_h264_qpel_8_mc32_8_c:                              425.7 ( 1.00x)
put_h264_qpel_8_mc32_8_sse2:                            75.2 ( 5.66x)
put_h264_qpel_8_mc32_8_ssse3:                           70.4 ( 6.05x)

New benchmarks:
avg_h264_qpel_8_mc12_8_c:                              425.7 ( 1.00x)
avg_h264_qpel_8_mc12_8_sse2:                            77.5 ( 5.49x)
avg_h264_qpel_8_mc12_8_ssse3:                           69.8 ( 6.10x)
avg_h264_qpel_8_mc32_8_c:                              423.7 ( 1.00x)
avg_h264_qpel_8_mc32_8_sse2:                            74.6 ( 5.68x)
avg_h264_qpel_8_mc32_8_ssse3:                           71.9 ( 5.89x)
put_h264_qpel_8_mc12_8_c:                              422.2 ( 1.00x)
put_h264_qpel_8_mc12_8_sse2:                            75.8 ( 5.57x)
put_h264_qpel_8_mc12_8_ssse3:                           67.9 ( 6.22x)
put_h264_qpel_8_mc32_8_c:                              421.8 ( 1.00x)
put_h264_qpel_8_mc32_8_sse2:                            72.6 ( 5.81x)
put_h264_qpel_8_mc32_8_ssse3:                           67.7 ( 6.23x)

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-09-29 22:59:10 +02:00
parent 4ac9162beb
commit 697da64c8e
2 changed files with 32 additions and 16 deletions

View File

@@ -70,7 +70,7 @@ void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_
void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\
void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\
DEF_QPEL(avg)
@@ -186,8 +186,8 @@ SSSE3_HV2_LOWPASS_WRAPPER(put)
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_put_pixels16_l2_shift5_mmxext ff_put_pixels16_l2_shift5_sse2
#define ff_avg_pixels16_l2_shift5_mmxext ff_avg_pixels16_l2_shift5_sse2
#define ff_put_pixels4_l2_shift5_sse2 ff_put_pixels4_l2_shift5_mmxext
#define ff_avg_pixels4_l2_shift5_sse2 ff_avg_pixels4_l2_shift5_mmxext
#define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
@@ -309,7 +309,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uin
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((uintptr_t)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_sse2(dst, halfV+2, halfHV, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
@@ -319,7 +319,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uin
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((uintptr_t)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_sse2(dst, halfV+3, halfHV, stride);\
}\
#define H264_MC(QPEL, SIZE, MMX, ALIGN)\

View File

@@ -781,13 +781,30 @@ INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
;-----------------------------------------------------------------------------
; void ff_{put,avg}_pixels8_l2_shift5(uint8_t *dst, const int16_t *src16,
;                                     const uint8_t *src8, int dstStride)
;
; %1 = put or avg; selects the op_%1h store helper (defined elsewhere in
;      this file).
;
; Processes 8 rows of 8 pixels, two rows per loop iteration:
;   - loads two rows of 16-bit values from src16 (rows are 48 bytes apart),
;     arithmetic-shifts them right by 5 and packs them to unsigned bytes
;     with saturation,
;   - averages the 16 packed bytes with 16 bytes from src8 (two consecutive
;     8-byte rows),
;   - hands each 8-byte half to op_%1h for the rows at dst and dst+dstStride.
;
; Registers: r0 = dst, r1 = src16, r2 = src8, r3 = dstStride,
;            r4 = remaining row count (scratch, not an argument).
;
; NOTE: [r2] is used directly as a pavgb memory operand, so when assembled
; with non-VEX XMM registers src8 has to be 16-byte aligned.
;-----------------------------------------------------------------------------
%macro PIXELS8_L2_SHIFT5 1
; Only four arguments exist; r4 is a pure scratch counter, so declare
; 4 args / 5 GPRs instead of asking the prologue to load a fifth argument.
cglobal %1_pixels8_l2_shift5, 4, 5, 3 ; dst, src16, src8, dstStride
    movsxdifnidn  r3, r3d               ; dstStride is a 32-bit int: sign-extend
    mov          r4d, 8                 ; rows left to process
.loop:
    movu          m0, [r1]              ; row n   (8 x int16)
    movu          m1, [r1+48]           ; row n+1 (8 x int16)
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m1                ; low half = row n, high half = row n+1
    pavgb         m0, [r2]              ; average with two rows of src8
    pshufd        m1, m0, 0xee          ; low half of m1 is high half of m0
    op_%1h        m0, [r0], m2
    op_%1h        m1, [r0+r3], m2
    add           r1, 48*2              ; advance src16 by two rows
    add           r2, 8*2               ; advance src8 by two rows
    lea           r0, [r0+2*r3]         ; advance dst by two rows
    sub          r4d, 2
    jne .loop
    RET
%endmacro
%macro PIXELS_L2_SHIFT5 2
%if cpuflag(sse2)
%macro PIXELS16_L2_SHIFT5 2
cglobal %1_pixels%2_l2_shift5, 5, 5, 4 ; dst, src16, src8, dstStride
%else
cglobal %1_pixels%2_l2_shift5, 5, 5 ; dst, src16, src8, dstStride
%endif
movsxdifnidn r3, r3d
mov r4d, %2
.loop:
@@ -813,13 +830,12 @@ cglobal %1_pixels%2_l2_shift5, 5, 5 ; dst, src16, src8, dstStride
RET
%endmacro
INIT_MMX mmxext
PIXELS_L2_SHIFT5 put, 8
PIXELS_L2_SHIFT5 avg, 8
INIT_XMM sse2
PIXELS_L2_SHIFT5 put, 16
PIXELS_L2_SHIFT5 avg, 16
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
PIXELS16_L2_SHIFT5 put, 16
PIXELS16_L2_SHIFT5 avg, 16
%if ARCH_X86_64
%macro QPEL16_H_LOWPASS_L2_OP 1