diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index c49a866c5d..75caac8805 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -59,6 +59,7 @@ void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *
 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
+void ff_ ## OPNAME ## _h264_qpel16_h_lowpass_l2_sse2(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
 void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
 void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
@@ -177,9 +178,6 @@ ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, i
 SSSE3_HV2_LOWPASS_WRAPPER(avg)
 SSSE3_HV2_LOWPASS_WRAPPER(put)
 
-QPEL_H264_H16(avg_, sse2)
-QPEL_H264_H16(put_, sse2)
-
 #define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
 #define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
 #define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index dc55a8ad93..101ab21647 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -276,6 +276,80 @@ QPEL8_H_LOWPASS_L2_OP put
 QPEL8_H_LOWPASS_L2_OP avg
 
 
+%macro QPEL16_H_LOWPASS_L2 1
+%if ARCH_X86_64
+cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,9 ; dst, src, src2, dstStride, src2Stride
+    mova          m8, [pw_16]      ; x86-64 only: keep the pw_16 constant in a register (m8)
+%define PW_16 m8
+%else
+cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
+%define PW_16 [pw_16]
+%endif
+    movsxdifnidn  r3, r3d          ; r3 = dstStride
+    movsxdifnidn  r4, r4d          ; r4 = src2Stride
+    mov          r5d, 16           ; row counter: the loop below runs 16 times
+    pxor          m7, m7           ; m7 = 0, interleaved below to widen bytes to words
+    mova          m6, [pw_5]       ; multiplier constant (presumably 5 in every word; defined elsewhere)
+.loop:                             ; one 16-byte output row per iteration
+    movu          m0, [r1]         ; 16 bytes at src+0
+    movu          m2, [r1+1]       ; 16 bytes at src+1
+    mova          m1, m0
+    mova          m3, m2
+    punpcklbw     m0, m7           ; low 8 bytes of src+0 as words
+    punpcklbw     m2, m7           ; low 8 bytes of src+1 as words
+    punpckhbw     m1, m7           ; high 8 bytes of src+0 as words
+    punpckhbw     m3, m7           ; high 8 bytes of src+1 as words
+    paddw         m0, m2           ; s[0] + s[1] (low half)
+    paddw         m1, m3           ; s[0] + s[1] (high half)
+    psllw         m0, 2            ; 4 * (s[0] + s[1])
+    psllw         m1, 2
+    movu          m2, [r1-1]       ; 16 bytes at src-1
+    movu          m4, [r1+2]       ; 16 bytes at src+2
+    mova          m3, m2
+    mova          m5, m4
+    punpcklbw     m2, m7
+    punpcklbw     m4, m7
+    punpckhbw     m3, m7
+    punpckhbw     m5, m7
+    paddw         m2, m4           ; s[-1] + s[2] (low half)
+    paddw         m3, m5           ; s[-1] + s[2] (high half)
+    psubw         m0, m2           ; 4 * (s[0] + s[1]) - (s[-1] + s[2])
+    psubw         m1, m3
+    pmullw        m0, m6           ; scale by pw_5
+    pmullw        m1, m6
+    movu          m2, [r1-2]       ; 16 bytes at src-2
+    movu          m4, [r1+3]       ; 16 bytes at src+3
+    mova          m3, m2
+    mova          m5, m4
+    punpcklbw     m2, m7
+    punpcklbw     m4, m7
+    punpckhbw     m3, m7
+    punpckhbw     m5, m7
+    paddw         m2, m4           ; s[-2] + s[3] (low half)
+    paddw         m3, m5           ; s[-2] + s[3] (high half)
+    paddw         m0, m2           ; add the outer taps
+    paddw         m1, m3
+    paddw         m0, PW_16        ; add pw_16 before the shift (presumably the rounding bias)
+    paddw         m1, PW_16
+    psraw         m0, 5            ; arithmetic shift right by 5
+    psraw         m1, 5
+    packuswb      m0, m1           ; saturate both halves to unsigned bytes, 16 results in m0
+    movu          m4, [r2]         ; 16 bytes from src2
+    pavgb         m0, m4           ; rounded byte average of the filter result and src2
+    op_%1         m0, [r0], m4     ; op_put / op_avg (defined elsewhere) writes the row to dst
+    add           r0, r3           ; dst  += dstStride
+    add           r1, r3           ; src  += dstStride as well - NOTE(review): assumes src stride == dstStride; confirm with callers
+    add           r2, r4           ; src2 += src2Stride
+    dec          r5d
+    jg .loop                       ; repeat until all 16 rows are done
+    RET
+%endmacro
+
+INIT_XMM sse2
+QPEL16_H_LOWPASS_L2 put
+QPEL16_H_LOWPASS_L2 avg
+
+
 %macro QPEL8_H_LOWPASS_L2_OP_XMM 1
 cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
     movsxdifnidn r3, r3d