1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-10-06 05:47:18 +02:00

avcodec/x86/h264_qpel: Add ff_{avg,put}_h264_qpel16_h_lowpass_l2_sse2()

These functions are currently emulated via four calls to the versions
for 8x8 blocks. In fact, the size savings from the simplified calls
in h264_qpel.c (GCC 1344B, Clang 1280B) more than outweigh the size
of the added functions (512B) here.

It is also beneficial performance-wise. Old benchmarks:
avg_h264_qpel_16_mc11_8_c:                            1414.1 ( 1.00x)
avg_h264_qpel_16_mc11_8_sse2:                          206.2 ( 6.86x)
avg_h264_qpel_16_mc11_8_ssse3:                         177.7 ( 7.96x)
avg_h264_qpel_16_mc13_8_c:                            1417.0 ( 1.00x)
avg_h264_qpel_16_mc13_8_sse2:                          207.4 ( 6.83x)
avg_h264_qpel_16_mc13_8_ssse3:                         178.2 ( 7.95x)
avg_h264_qpel_16_mc21_8_c:                            1632.8 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2:                          349.3 ( 4.67x)
avg_h264_qpel_16_mc21_8_ssse3:                         291.3 ( 5.60x)
avg_h264_qpel_16_mc23_8_c:                            1640.2 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2:                          351.3 ( 4.67x)
avg_h264_qpel_16_mc23_8_ssse3:                         290.8 ( 5.64x)
avg_h264_qpel_16_mc31_8_c:                            1411.7 ( 1.00x)
avg_h264_qpel_16_mc31_8_sse2:                          203.4 ( 6.94x)
avg_h264_qpel_16_mc31_8_ssse3:                         178.9 ( 7.89x)
avg_h264_qpel_16_mc33_8_c:                            1409.7 ( 1.00x)
avg_h264_qpel_16_mc33_8_sse2:                          204.6 ( 6.89x)
avg_h264_qpel_16_mc33_8_ssse3:                         178.1 ( 7.92x)
put_h264_qpel_16_mc11_8_c:                            1391.0 ( 1.00x)
put_h264_qpel_16_mc11_8_sse2:                          197.4 ( 7.05x)
put_h264_qpel_16_mc11_8_ssse3:                         176.1 ( 7.90x)
put_h264_qpel_16_mc13_8_c:                            1395.9 ( 1.00x)
put_h264_qpel_16_mc13_8_sse2:                          196.7 ( 7.10x)
put_h264_qpel_16_mc13_8_ssse3:                         177.7 ( 7.85x)
put_h264_qpel_16_mc21_8_c:                            1609.5 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2:                          341.1 ( 4.72x)
put_h264_qpel_16_mc21_8_ssse3:                         289.2 ( 5.57x)
put_h264_qpel_16_mc23_8_c:                            1604.0 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2:                          340.9 ( 4.71x)
put_h264_qpel_16_mc23_8_ssse3:                         289.6 ( 5.54x)
put_h264_qpel_16_mc31_8_c:                            1390.2 ( 1.00x)
put_h264_qpel_16_mc31_8_sse2:                          194.6 ( 7.14x)
put_h264_qpel_16_mc31_8_ssse3:                         176.4 ( 7.88x)
put_h264_qpel_16_mc33_8_c:                            1400.4 ( 1.00x)
put_h264_qpel_16_mc33_8_sse2:                          198.5 ( 7.06x)
put_h264_qpel_16_mc33_8_ssse3:                         176.2 ( 7.95x)

New benchmarks:
avg_h264_qpel_16_mc11_8_c:                            1413.3 ( 1.00x)
avg_h264_qpel_16_mc11_8_sse2:                          171.8 ( 8.23x)
avg_h264_qpel_16_mc11_8_ssse3:                         173.0 ( 8.17x)
avg_h264_qpel_16_mc13_8_c:                            1423.2 ( 1.00x)
avg_h264_qpel_16_mc13_8_sse2:                          172.0 ( 8.27x)
avg_h264_qpel_16_mc13_8_ssse3:                         173.4 ( 8.21x)
avg_h264_qpel_16_mc21_8_c:                            1641.3 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2:                          322.1 ( 5.10x)
avg_h264_qpel_16_mc21_8_ssse3:                         291.3 ( 5.63x)
avg_h264_qpel_16_mc23_8_c:                            1629.1 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2:                          323.0 ( 5.04x)
avg_h264_qpel_16_mc23_8_ssse3:                         293.3 ( 5.55x)
avg_h264_qpel_16_mc31_8_c:                            1409.2 ( 1.00x)
avg_h264_qpel_16_mc31_8_sse2:                          172.0 ( 8.19x)
avg_h264_qpel_16_mc31_8_ssse3:                         173.7 ( 8.11x)
avg_h264_qpel_16_mc33_8_c:                            1402.5 ( 1.00x)
avg_h264_qpel_16_mc33_8_sse2:                          172.5 ( 8.13x)
avg_h264_qpel_16_mc33_8_ssse3:                         173.6 ( 8.08x)
put_h264_qpel_16_mc11_8_c:                            1393.7 ( 1.00x)
put_h264_qpel_16_mc11_8_sse2:                          170.4 ( 8.18x)
put_h264_qpel_16_mc11_8_ssse3:                         178.2 ( 7.82x)
put_h264_qpel_16_mc13_8_c:                            1398.0 ( 1.00x)
put_h264_qpel_16_mc13_8_sse2:                          170.2 ( 8.21x)
put_h264_qpel_16_mc13_8_ssse3:                         178.6 ( 7.83x)
put_h264_qpel_16_mc21_8_c:                            1619.6 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2:                          320.6 ( 5.05x)
put_h264_qpel_16_mc21_8_ssse3:                         297.2 ( 5.45x)
put_h264_qpel_16_mc23_8_c:                            1617.4 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2:                          320.0 ( 5.05x)
put_h264_qpel_16_mc23_8_ssse3:                         297.4 ( 5.44x)
put_h264_qpel_16_mc31_8_c:                            1389.7 ( 1.00x)
put_h264_qpel_16_mc31_8_sse2:                          169.9 ( 8.18x)
put_h264_qpel_16_mc31_8_ssse3:                         178.1 ( 7.80x)
put_h264_qpel_16_mc33_8_c:                            1394.0 ( 1.00x)
put_h264_qpel_16_mc33_8_sse2:                          170.9 ( 8.16x)
put_h264_qpel_16_mc33_8_ssse3:                         176.9 ( 7.88x)

Notice that the SSSE3 versions of mc21 and mc23 benefit from
an optimized version of hv2_lowpass.

Also notice that there is no SSE2 version of the purely horizontal
motion compensation. This means that src2 is currently always aligned
when calling the SSE2 functions (and that srcStride is always equal
to the block width). Yet this has not been exploited (yet).

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-09-29 19:51:47 +02:00
parent 4880fa4dca
commit cd077e88d1
2 changed files with 75 additions and 3 deletions

View File

@@ -59,6 +59,7 @@ void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel16_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
@@ -177,9 +178,6 @@ ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, i
SSSE3_HV2_LOWPASS_WRAPPER(avg)
SSSE3_HV2_LOWPASS_WRAPPER(put)
QPEL_H264_H16(avg_, sse2)
QPEL_H264_H16(put_, sse2)
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2

View File

@@ -276,6 +276,80 @@ QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
%macro QPEL16_H_LOWPASS_L2 1
%if ARCH_X86_64
cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,9 ; dst, src, src2, dstStride, srcStride
mova m8, [pw_16]
%define PW_16 m8
%else
cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, srcStride
%define PW_16 [pw_16]
%endif
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 16
pxor m7, m7
mova m6, [pw_5]
.loop:
movu m0, [r1]
movu m2, [r1+1]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpcklbw m2, m7
punpckhbw m1, m7
punpckhbw m3, m7
paddw m0, m2
paddw m1, m3
psllw m0, 2
psllw m1, 2
movu m2, [r1-1]
movu m4, [r1+2]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpcklbw m4, m7
punpckhbw m3, m7
punpckhbw m5, m7
paddw m2, m4
paddw m3, m5
psubw m0, m2
psubw m1, m3
pmullw m0, m6
pmullw m1, m6
movu m2, [r1-2]
movu m4, [r1+3]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpcklbw m4, m7
punpckhbw m3, m7
punpckhbw m5, m7
paddw m2, m4
paddw m3, m5
paddw m0, m2
paddw m1, m3
paddw m0, PW_16
paddw m1, PW_16
psraw m0, 5
psraw m1, 5
packuswb m0, m1
movu m4, [r2]
pavgb m0, m4
op_%1 m0, [r0], m4
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
RET
%endmacro
INIT_XMM sse2
QPEL16_H_LOWPASS_L2 put
QPEL16_H_LOWPASS_L2 avg
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d