1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-10-06 05:47:18 +02:00
Files
FFmpeg/libavcodec/x86/h264_qpel_8bit.asm
Andreas Rheinhardt 35aaf697e9 avcodec/x86/h264_qpel_8bit: Replace qpel8_h_lowpass_l2 MMXEXT by SSE2
Using xmm registers here is very natural, as it allows to
operate on eight words at a time. It also saves 48B here
and does not clobber the MMX state.

Old benchmarks (only tests affected by the modified function are shown):
avg_h264_qpel_8_mc11_8_c:                              352.2 ( 1.00x)
avg_h264_qpel_8_mc11_8_sse2:                            70.4 ( 5.00x)
avg_h264_qpel_8_mc11_8_ssse3:                           53.9 ( 6.53x)
avg_h264_qpel_8_mc13_8_c:                              353.3 ( 1.00x)
avg_h264_qpel_8_mc13_8_sse2:                            72.8 ( 4.86x)
avg_h264_qpel_8_mc13_8_ssse3:                           53.8 ( 6.57x)
avg_h264_qpel_8_mc21_8_c:                              404.0 ( 1.00x)
avg_h264_qpel_8_mc21_8_sse2:                           116.1 ( 3.48x)
avg_h264_qpel_8_mc21_8_ssse3:                           94.3 ( 4.28x)
avg_h264_qpel_8_mc23_8_c:                              398.9 ( 1.00x)
avg_h264_qpel_8_mc23_8_sse2:                           118.6 ( 3.36x)
avg_h264_qpel_8_mc23_8_ssse3:                           94.8 ( 4.21x)
avg_h264_qpel_8_mc31_8_c:                              352.7 ( 1.00x)
avg_h264_qpel_8_mc31_8_sse2:                            71.4 ( 4.94x)
avg_h264_qpel_8_mc31_8_ssse3:                           53.8 ( 6.56x)
avg_h264_qpel_8_mc33_8_c:                              354.0 ( 1.00x)
avg_h264_qpel_8_mc33_8_sse2:                            70.6 ( 5.01x)
avg_h264_qpel_8_mc33_8_ssse3:                           53.7 ( 6.59x)
avg_h264_qpel_16_mc11_8_c:                            1417.0 ( 1.00x)
avg_h264_qpel_16_mc11_8_sse2:                          276.9 ( 5.12x)
avg_h264_qpel_16_mc11_8_ssse3:                         178.8 ( 7.92x)
avg_h264_qpel_16_mc13_8_c:                            1427.3 ( 1.00x)
avg_h264_qpel_16_mc13_8_sse2:                          277.4 ( 5.14x)
avg_h264_qpel_16_mc13_8_ssse3:                         179.7 ( 7.94x)
avg_h264_qpel_16_mc21_8_c:                            1634.1 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2:                          421.3 ( 3.88x)
avg_h264_qpel_16_mc21_8_ssse3:                         291.2 ( 5.61x)
avg_h264_qpel_16_mc23_8_c:                            1627.0 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2:                          420.8 ( 3.87x)
avg_h264_qpel_16_mc23_8_ssse3:                         291.0 ( 5.59x)
avg_h264_qpel_16_mc31_8_c:                            1418.4 ( 1.00x)
avg_h264_qpel_16_mc31_8_sse2:                          278.5 ( 5.09x)
avg_h264_qpel_16_mc31_8_ssse3:                         178.6 ( 7.94x)
avg_h264_qpel_16_mc33_8_c:                            1407.3 ( 1.00x)
avg_h264_qpel_16_mc33_8_sse2:                          277.6 ( 5.07x)
avg_h264_qpel_16_mc33_8_ssse3:                         179.9 ( 7.82x)
put_h264_qpel_8_mc11_8_c:                              348.1 ( 1.00x)
put_h264_qpel_8_mc11_8_sse2:                            69.1 ( 5.04x)
put_h264_qpel_8_mc11_8_ssse3:                           53.8 ( 6.47x)
put_h264_qpel_8_mc13_8_c:                              349.3 ( 1.00x)
put_h264_qpel_8_mc13_8_sse2:                            69.7 ( 5.01x)
put_h264_qpel_8_mc13_8_ssse3:                           53.7 ( 6.51x)
put_h264_qpel_8_mc21_8_c:                              398.5 ( 1.00x)
put_h264_qpel_8_mc21_8_sse2:                           115.0 ( 3.46x)
put_h264_qpel_8_mc21_8_ssse3:                           95.3 ( 4.18x)
put_h264_qpel_8_mc23_8_c:                              399.9 ( 1.00x)
put_h264_qpel_8_mc23_8_sse2:                           120.8 ( 3.31x)
put_h264_qpel_8_mc23_8_ssse3:                           95.4 ( 4.19x)
put_h264_qpel_8_mc31_8_c:                              350.4 ( 1.00x)
put_h264_qpel_8_mc31_8_sse2:                            69.6 ( 5.03x)
put_h264_qpel_8_mc31_8_ssse3:                           54.2 ( 6.47x)
put_h264_qpel_8_mc33_8_c:                              353.1 ( 1.00x)
put_h264_qpel_8_mc33_8_sse2:                            71.0 ( 4.97x)
put_h264_qpel_8_mc33_8_ssse3:                           54.2 ( 6.51x)
put_h264_qpel_16_mc11_8_c:                            1384.2 ( 1.00x)
put_h264_qpel_16_mc11_8_sse2:                          272.9 ( 5.07x)
put_h264_qpel_16_mc11_8_ssse3:                         178.3 ( 7.76x)
put_h264_qpel_16_mc13_8_c:                            1393.6 ( 1.00x)
put_h264_qpel_16_mc13_8_sse2:                          271.1 ( 5.14x)
put_h264_qpel_16_mc13_8_ssse3:                         178.3 ( 7.82x)
put_h264_qpel_16_mc21_8_c:                            1612.6 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2:                          416.5 ( 3.87x)
put_h264_qpel_16_mc21_8_ssse3:                         289.1 ( 5.58x)
put_h264_qpel_16_mc23_8_c:                            1621.3 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2:                          416.9 ( 3.89x)
put_h264_qpel_16_mc23_8_ssse3:                         289.4 ( 5.60x)
put_h264_qpel_16_mc31_8_c:                            1408.4 ( 1.00x)
put_h264_qpel_16_mc31_8_sse2:                          273.5 ( 5.15x)
put_h264_qpel_16_mc31_8_ssse3:                         176.9 ( 7.96x)
put_h264_qpel_16_mc33_8_c:                            1396.4 ( 1.00x)
put_h264_qpel_16_mc33_8_sse2:                          276.3 ( 5.05x)
put_h264_qpel_16_mc33_8_ssse3:                         176.4 ( 7.92x)

New benchmarks:
avg_h264_qpel_8_mc11_8_c:                              352.1 ( 1.00x)
avg_h264_qpel_8_mc11_8_sse2:                            52.5 ( 6.71x)
avg_h264_qpel_8_mc11_8_ssse3:                           53.9 ( 6.54x)
avg_h264_qpel_8_mc13_8_c:                              350.8 ( 1.00x)
avg_h264_qpel_8_mc13_8_sse2:                            54.7 ( 6.42x)
avg_h264_qpel_8_mc13_8_ssse3:                           54.3 ( 6.46x)
avg_h264_qpel_8_mc21_8_c:                              400.1 ( 1.00x)
avg_h264_qpel_8_mc21_8_sse2:                            98.6 ( 4.06x)
avg_h264_qpel_8_mc21_8_ssse3:                           95.5 ( 4.19x)
avg_h264_qpel_8_mc23_8_c:                              400.4 ( 1.00x)
avg_h264_qpel_8_mc23_8_sse2:                           101.4 ( 3.95x)
avg_h264_qpel_8_mc23_8_ssse3:                           95.9 ( 4.18x)
avg_h264_qpel_8_mc31_8_c:                              352.4 ( 1.00x)
avg_h264_qpel_8_mc31_8_sse2:                            52.9 ( 6.67x)
avg_h264_qpel_8_mc31_8_ssse3:                           54.4 ( 6.48x)
avg_h264_qpel_8_mc33_8_c:                              354.5 ( 1.00x)
avg_h264_qpel_8_mc33_8_sse2:                            52.9 ( 6.70x)
avg_h264_qpel_8_mc33_8_ssse3:                           54.4 ( 6.52x)
avg_h264_qpel_16_mc11_8_c:                            1420.4 ( 1.00x)
avg_h264_qpel_16_mc11_8_sse2:                          204.8 ( 6.93x)
avg_h264_qpel_16_mc11_8_ssse3:                         177.9 ( 7.98x)
avg_h264_qpel_16_mc13_8_c:                            1409.8 ( 1.00x)
avg_h264_qpel_16_mc13_8_sse2:                          206.4 ( 6.83x)
avg_h264_qpel_16_mc13_8_ssse3:                         178.0 ( 7.92x)
avg_h264_qpel_16_mc21_8_c:                            1634.1 ( 1.00x)
avg_h264_qpel_16_mc21_8_sse2:                          349.6 ( 4.67x)
avg_h264_qpel_16_mc21_8_ssse3:                         290.0 ( 5.63x)
avg_h264_qpel_16_mc23_8_c:                            1624.1 ( 1.00x)
avg_h264_qpel_16_mc23_8_sse2:                          350.0 ( 4.64x)
avg_h264_qpel_16_mc23_8_ssse3:                         291.9 ( 5.56x)
avg_h264_qpel_16_mc31_8_c:                            1407.2 ( 1.00x)
avg_h264_qpel_16_mc31_8_sse2:                          205.8 ( 6.84x)
avg_h264_qpel_16_mc31_8_ssse3:                         178.2 ( 7.90x)
avg_h264_qpel_16_mc33_8_c:                            1400.5 ( 1.00x)
avg_h264_qpel_16_mc33_8_sse2:                          206.3 ( 6.79x)
avg_h264_qpel_16_mc33_8_ssse3:                         179.4 ( 7.81x)
put_h264_qpel_8_mc11_8_c:                              349.7 ( 1.00x)
put_h264_qpel_8_mc11_8_sse2:                            50.2 ( 6.96x)
put_h264_qpel_8_mc11_8_ssse3:                           51.3 ( 6.82x)
put_h264_qpel_8_mc13_8_c:                              349.8 ( 1.00x)
put_h264_qpel_8_mc13_8_sse2:                            50.7 ( 6.90x)
put_h264_qpel_8_mc13_8_ssse3:                           51.7 ( 6.76x)
put_h264_qpel_8_mc21_8_c:                              398.0 ( 1.00x)
put_h264_qpel_8_mc21_8_sse2:                            96.5 ( 4.13x)
put_h264_qpel_8_mc21_8_ssse3:                           92.3 ( 4.31x)
put_h264_qpel_8_mc23_8_c:                              401.4 ( 1.00x)
put_h264_qpel_8_mc23_8_sse2:                           102.3 ( 3.92x)
put_h264_qpel_8_mc23_8_ssse3:                           92.8 ( 4.32x)
put_h264_qpel_8_mc31_8_c:                              349.4 ( 1.00x)
put_h264_qpel_8_mc31_8_sse2:                            50.8 ( 6.88x)
put_h264_qpel_8_mc31_8_ssse3:                           51.8 ( 6.75x)
put_h264_qpel_8_mc33_8_c:                              351.1 ( 1.00x)
put_h264_qpel_8_mc33_8_sse2:                            52.2 ( 6.73x)
put_h264_qpel_8_mc33_8_ssse3:                           51.7 ( 6.79x)
put_h264_qpel_16_mc11_8_c:                            1391.1 ( 1.00x)
put_h264_qpel_16_mc11_8_sse2:                          196.6 ( 7.07x)
put_h264_qpel_16_mc11_8_ssse3:                         178.2 ( 7.81x)
put_h264_qpel_16_mc13_8_c:                            1385.2 ( 1.00x)
put_h264_qpel_16_mc13_8_sse2:                          195.6 ( 7.08x)
put_h264_qpel_16_mc13_8_ssse3:                         176.6 ( 7.84x)
put_h264_qpel_16_mc21_8_c:                            1607.5 ( 1.00x)
put_h264_qpel_16_mc21_8_sse2:                          341.0 ( 4.71x)
put_h264_qpel_16_mc21_8_ssse3:                         289.1 ( 5.56x)
put_h264_qpel_16_mc23_8_c:                            1616.7 ( 1.00x)
put_h264_qpel_16_mc23_8_sse2:                          340.8 ( 4.74x)
put_h264_qpel_16_mc23_8_ssse3:                         288.6 ( 5.60x)
put_h264_qpel_16_mc31_8_c:                            1397.6 ( 1.00x)
put_h264_qpel_16_mc31_8_sse2:                          197.3 ( 7.08x)
put_h264_qpel_16_mc31_8_ssse3:                         175.4 ( 7.97x)
put_h264_qpel_16_mc33_8_c:                            1394.3 ( 1.00x)
put_h264_qpel_16_mc33_8_sse2:                          197.7 ( 7.05x)
put_h264_qpel_16_mc33_8_ssse3:                         175.2 ( 7.96x)

As can be seen, the SSE2 version is often neck and neck with the SSSE3
version (which also benefits from a better hv2_lowpass SSSE3
implementation for mc21 and mc23) for eight-byte block sizes.
Unsurprisingly, SSSE3 beats SSE2 for 16x16 blocks: For SSE2,
these blocks are processed by calling the 8x8 function four times
whereas SSSE3 has a dedicated function (on x64).
This implementation should also be extendable to an AVX version
for 16x16 blocks.

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-10-04 07:06:33 +02:00

829 lines
20 KiB
NASM

;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
cextern pw_16
cextern pw_5
cextern pb_0
SECTION .text
; void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
; ptrdiff_t line_size)
;
; Rounding average of a 4x4 block into dst: block = avg(block, pixels).
; pavgb computes (a + b + 1) >> 1 per byte; all four rows are done
; straight-line, no loop.
INIT_MMX mmxext
cglobal avg_pixels4, 3,4
lea r3, [r2*3]                          ; r3 = 3 * line_size
movh m0, [r1]                           ; movh = 4-byte load under INIT_MMX (half register)
movh m1, [r1+r2]                        ; rows 0..3 of src
movh m2, [r1+r2*2]
movh m3, [r1+r3]
pavgb m0, [r0]                          ; average with the matching dst rows
pavgb m1, [r0+r2]
pavgb m2, [r0+r2*2]
pavgb m3, [r0+r3]
movh [r0], m0                           ; write back averaged rows
movh [r0+r2], m1
movh [r0+r2*2], m2
movh [r0+r3], m3
RET
; Store primitives used by every function below.
; The *h variants store half a register via movh (4 bytes in MMX mode,
; 8 bytes in XMM mode); the plain variants store a full register.
; %1 = result register, %2 = destination memory, %3 = scratch (avgh only).
%macro op_avgh 3
movh %3, %2                             ; scratch = current dst contents
pavgb %1, %3                            ; result = avg(result, dst)
movh %2, %1
%endmacro
%macro op_avg 2-3                       ; full-width avg-store; %3 unused
pavgb %1, %2                            ; pavgb reads %2 straight from memory
mova %2, %1
%endmacro
%macro op_puth 2-3                      ; plain half-register store; %3 unused
movh %2, %1
%endmacro
%macro op_put 2-3                       ; plain full-register store; %3 unused
mova %2, %1
%endmacro
; void ff_put/avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
; ptrdiff_t stride)
;
; dst = op(avg(src1, src2)) for a 4x4 block. src1 and dst use `stride` (r3);
; src2 is a packed buffer whose rows sit 4 bytes apart ([r2+0/4/8/12]).
%macro PIXELS4_L2 1
%define OP op_%1h
cglobal %1_pixels4_l2, 4,4
mova m0, [r1]                           ; 8-byte MMX loads; only the low 4 bytes are stored
mova m1, [r1+r3]
lea r1, [r1+2*r3]
pavgb m0, [r2]                          ; rows 0 and 1 of packed src2
pavgb m1, [r2+4]
OP m0, [r0], m3
OP m1, [r0+r3], m3
lea r0, [r0+2*r3]
mova m0, [r1]                           ; rows 2 and 3
mova m1, [r1+r3]
pavgb m0, [r2+8]
pavgb m1, [r2+12]
OP m0, [r0], m3
OP m1, [r0+r3], m3
RET
%endmacro
INIT_MMX mmxext
PIXELS4_L2 put
PIXELS4_L2 avg
; H.264 horizontal 6-tap (1,-5,20,20,-5,1) lowpass on a 4x4 block:
; dst[x] = clip(((src[x-2]+src[x+3]) - 5*(src[x-1]+src[x+2])
;                + 20*(src[x]+src[x+1]) + 16) >> 5)
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
pxor m7, m7                             ; zero reg for byte->word unpacking
mova m4, [pw_5]
mova m5, [pw_16]
mov r4d, 4                              ; 4 rows
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0                            ; m1 = src[-1] + src[+2]  (inner pair)
paddw m2, m3                            ; m2 = src[ 0] + src[+1]  (center pair)
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3                            ; m0 = src[-2] + src[+3]  (outer pair)
psllw m2, 2
psubw m2, m1                            ; 4*center - inner
pmullw m2, m4                           ; 20*center - 5*inner
paddw m0, m5                            ; + 16 rounding
paddw m0, m2
psraw m0, 5
packuswb m0, m0                         ; clip to unsigned bytes
op_%1h m0, [r0], m6
add r0, r2
add r1, r3
dec r4d
jg .loop
RET
%endmacro
INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
; SSSE3 horizontal 6-tap lowpass on an 8x8 block. One unaligned 16-byte
; load per row covers src[-2..13]; it is unpacked to words in m0 (src[-2..5])
; and m1 (src[6..13]), and palignr on the m1:m0 pair lines up the six taps.
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8
pxor m7, m7                             ; zero reg for unpacking
mova m6, [pw_5]
.loop:
movu m1, [r1-2]                         ; bytes src[-2..13]
mova m0, m1
punpckhbw m1, m7                        ; words src[6..13]
punpcklbw m0, m7                        ; words src[-2..5]
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2                       ; words src[-1..6]
palignr m3, m0, 4                       ; words src[ 0..7]
palignr m2, m0, 6                       ; words src[ 1..8]
palignr m1, m0, 8                       ; words src[ 2..9]
palignr m5, m0, 10                      ; words src[ 3..10]
paddw m0, m5                            ; outer pair:  src[x-2] + src[x+3]
paddw m2, m3                            ; center pair: src[x]   + src[x+1]
paddw m1, m4                            ; inner pair:  src[x-1] + src[x+2]
psllw m2, 2
psubw m2, m1                            ; 4*center - inner
paddw m0, [pw_16]                       ; + 16 rounding
pmullw m2, m6                           ; 20*center - 5*inner
paddw m2, m0
psraw m2, 5
packuswb m2, m2                         ; clip; low 8 bytes are the row
op_%1h m2, [r0], m4
add r1, r3
add r0, r2
dec r4d
jne .loop
RET
%endmacro
INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
; 4x4 horizontal 6-tap lowpass, then rounding average with a second source:
; dst = op(avg(lowpass(src), src2)). Note dst and src both advance by r3;
; src2 advances by r4 (the declared srcStride acts as src2's stride).
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
pxor m7, m7                             ; zero reg for unpacking
mova m4, [pw_5]
mova m5, [pw_16]
mov r5d, 4                              ; 4 rows
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0                            ; src[-1] + src[+2]  (inner pair)
paddw m2, m3                            ; src[ 0] + src[+1]  (center pair)
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3                            ; src[-2] + src[+3]  (outer pair)
psllw m2, 2
psubw m2, m1                            ; 4*center - inner
pmullw m2, m4                           ; 20*center - 5*inner
paddw m0, m5                            ; + 16 rounding
paddw m0, m2
movh m3, [r2]                           ; second source row
psraw m0, 5
packuswb m0, m0
pavgb m0, m3                            ; average lowpassed row with src2
op_%1h m0, [r0], m6
add r0, r3                              ; dst and src share stride r3
add r1, r3
add r2, r4
dec r5d
jg .loop
RET
%endmacro
INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
; SSE2 8x8 horizontal 6-tap lowpass + average with src2; one xmm register
; holds a whole row as eight words (this is the replacement for the old
; MMXEXT version -- see the commit message above).
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,6 ; dst, src, src2, dstStride, srcStride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mova m3, [pw_16]
mov r5d, 8                              ; 8 rows
pxor m5, m5                             ; zero reg: unpack source and pack partner
mova m4, [pw_5]
.loop:
movh m0, [r1]                           ; 8 bytes src[0..7]
movh m1, [r1+1]                         ; src[1..8]
punpcklbw m0, m5
punpcklbw m1, m5
paddw m0, m1                            ; center pair: src[x] + src[x+1]
psllw m0, 2
movh m1, [r1-1]
movh m2, [r1+2]
punpcklbw m1, m5
punpcklbw m2, m5
paddw m1, m2                            ; inner pair: src[x-1] + src[x+2]
psubw m0, m1                            ; 4*center - inner
pmullw m0, m4                           ; 20*center - 5*inner
movh m1, [r1-2]
movh m2, [r1+3]
punpcklbw m1, m5
punpcklbw m2, m5
paddw m0, m1                            ; + outer pair src[x-2] + src[x+3]
paddw m0, m2
paddw m0, m3                            ; + 16 rounding
psraw m0, 5
packuswb m0, m5                         ; clipped row in the low 8 bytes
movh m2, [r2]                           ; second source row
pavgb m0, m2                            ; average with src2
op_%1h m0, [r0], m2
add r0, r3                              ; dst and src share stride r3
add r1, r3
add r2, r4
dec r5d
jg .loop
RET
%endmacro
INIT_XMM sse2
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
; SSSE3 8x8 horizontal 6-tap lowpass + average with src2. Same palignr tap
; alignment as qpel8_h_lowpass above, with an extra pavgb against [r2]
; before the store. dst and src both advance by r3; src2 by r4.
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8
pxor m7, m7                             ; zero reg for unpacking
mova m6, [pw_5]
.loop:
lddqu m1, [r1-2]                        ; unaligned 16-byte load: src[-2..13]
mova m0, m1
punpckhbw m1, m7                        ; words src[6..13]
punpcklbw m0, m7                        ; words src[-2..5]
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2                       ; words src[-1..6]
palignr m3, m0, 4                       ; words src[ 0..7]
palignr m2, m0, 6                       ; words src[ 1..8]
palignr m1, m0, 8                       ; words src[ 2..9]
palignr m5, m0, 10                      ; words src[ 3..10]
paddw m0, m5                            ; outer pair:  src[x-2] + src[x+3]
paddw m2, m3                            ; center pair: src[x]   + src[x+1]
paddw m1, m4                            ; inner pair:  src[x-1] + src[x+2]
psllw m2, 2
movh m3, [r2]                           ; second source row
psubw m2, m1                            ; 4*center - inner
paddw m0, [pw_16]                       ; + 16 rounding
pmullw m2, m6                           ; 20*center - 5*inner
paddw m2, m0
psraw m2, 5
packuswb m2, m2
pavgb m2, m3                            ; average with src2
op_%1h m2, [r0], m4
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
RET
%endmacro
INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
;
; Emit one output row of the vertical 6-tap lowpass. On entry m0..m4 hold
; the previous five source rows as words (oldest in m0); the next row is
; loaded into m5, then clip((m0+m5 - 5*(m1+m4) + 20*(m2+m3) + 16) >> 5)
; is stored via op_%1h. The trailing SWAP rotates the register window so
; the next invocation sees the rows shifted down by one.
%macro FILT_V 1
mova m6, m2
movh m5, [r1]                           ; fetch next source row
paddw m6, m3                            ; center pair (rows n, n+1)
psllw m6, 2
psubw m6, m1
psubw m6, m4                            ; 4*center - inner pair
punpcklbw m5, m7
pmullw m6, [pw_5]                       ; 20*center - 5*inner
paddw m0, [pw_16]                       ; + 16 rounding
add r1, r3
paddw m0, m5                            ; + outer pair (oldest + newest row)
paddw m6, m0
psraw m6, 5
packuswb m6, m6
op_%1h m6, [r0], m0 ; 1
add r0, r2
SWAP 0, 1, 2, 3, 4, 5                   ; slide 5-row window down by one
%endmacro
; Vertical 6-tap lowpass on a 4x4 block: prime m0..m4 with the five rows
; starting at src - 2*srcStride, then emit four output rows via FILT_V.
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3                              ; rewind src two rows for the top taps
sub r1, r3
pxor m7, m7                             ; zero reg for unpacking
movh m0, [r1]                           ; load the initial five rows
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
RET
%endmacro
INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
; Vertical 6-tap lowpass on an 8-wide column of height 8 or 16 (r4d = h).
; The SSE2 variant rewinds src two rows itself; the non-SSE2 branch (unused
; here -- only the INIT_XMM sse2 instantiations below exist in this file)
; presumably expects a pre-adjusted src pointer from its caller.
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3                              ; rewind src for the top taps
sub r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
%endif
pxor m7, m7                             ; zero reg for unpacking
movh m0, [r1]                           ; prime the five-row window
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1                               ; rows 0..7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
cmp r4d, 16
jne .end
FILT_V %1                               ; rows 8..15 only when h == 16
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
.end:
RET
%endmacro
INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
; All functions that use this are required to have args:
; src, tmp, srcStride (r0, r1, r2)
;
; Vertical 6-tap pass of the HV (center) filter: like FILT_V, but the
; result is kept as unclipped 16-bit words (with the +16 rounding term
; already folded in, no shift) and stored to the tmp buffer at [r1+%1]
; for the later horizontal pass.
%macro FILT_HV 1 ; offset
mova m6, m2
movh m5, [r0]                           ; fetch next source row
paddw m6, m3                            ; center pair
psllw m6, 2
paddw m0, [pw_16]                       ; rounding folded into the intermediate
psubw m6, m1
psubw m6, m4                            ; 4*center - inner pair
punpcklbw m5, m7
pmullw m6, [pw_5]                       ; 20*center - 5*inner
paddw m0, m5                            ; + outer pair
add r0, r2
paddw m6, m0
mova [r1+%1], m6                        ; store intermediate row, no clip/shift
SWAP 0, 1, 2, 3, 4, 5                   ; slide 5-row window down by one
%endmacro
; HV (center) lowpass for 4x4, split into two passes over a tmp buffer of
; 16-bit words with a 24-byte row stride: _v filters vertically into tmp,
; _h filters tmp horizontally and writes clipped pixels.
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
movsxdifnidn r2, r2d
pxor m7, m7                             ; zero reg for unpacking
movh m0, [r0]                           ; five rows loaded from r0 onward; no rewind
movh m1, [r0+r2]                        ; here, so the caller presumably passes src
lea r0, [r0+2*r2]                       ; already adjusted for the top taps
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*24                            ; one tmp row per call, 24-byte stride
FILT_HV 1*24
FILT_HV 2*24
FILT_HV 3*24
RET
; Horizontal pass: tap-pair sums a = t[0]+t[5], b = t[1]+t[4], c = t[2]+t[3]
; are combined as ((((a-b)>>2)-b+c)>>2+c)>>6, a shift/add form of
; (a - 5*b + 20*c) >> 10 that avoids a second multiply.
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
movsxdifnidn r2, r2d
mov r3d, 4                              ; 4 rows
.loop:
mova m0, [r0]                           ; a = t[0] + t[5]
paddw m0, [r0+10]
mova m1, [r0+2]                         ; b = t[1] + t[4]
paddw m1, [r0+8]
mova m2, [r0+4]                         ; c = t[2] + t[3]
paddw m2, [r0+6]
psubw m0, m1
psraw m0, 2                             ; (a - b) >> 2
psubw m0, m1
paddsw m0, m2                           ; saturating add guards the intermediate
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0                         ; clip to unsigned bytes
op_%1h m0, [r1], m7
add r0, 24                              ; next tmp row
add r1, r2
dec r3d
jnz .loop
RET
%endmacro
INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
; Vertical pass of the 8/16-wide HV lowpass: writes unclipped 16-bit rows
; to tmp at a 48-byte row stride; r3d (size) selects 8 or 16 rows. Only a
; "put" variant is instantiated -- this pass only fills tmp, so put vs avg
; is decided in the second (hv2) pass.
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
movsxdifnidn r2, r2d
pxor m7, m7                             ; zero reg for unpacking
movh m0, [r0]                           ; prime the five-row window from r0 onward
movh m1, [r0+r2]                        ; (no rewind: caller presumably pre-adjusts src)
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*48                            ; rows 0..7, 48-byte tmp stride
FILT_HV 1*48
FILT_HV 2*48
FILT_HV 3*48
FILT_HV 4*48
FILT_HV 5*48
FILT_HV 6*48
FILT_HV 7*48
cmp r3d, 16
jne .end
FILT_HV 8*48                            ; rows 8..15 only when size == 16
FILT_HV 9*48
FILT_HV 10*48
FILT_HV 11*48
FILT_HV 12*48
FILT_HV 13*48
FILT_HV 14*48
FILT_HV 15*48
.end:
RET
%endmacro
INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
; MMX second (horizontal) pass of the HV lowpass: reads 16-bit intermediates
; from tmp (48-byte row stride) and produces 8 clipped pixels per row in two
; 4-wide halves (m0.. = outputs 0..3, m3.. = outputs 4..7); r3d = h rows.
; Uses the shift/add form ((((a-b)>>2)-b+c)>>2+c)>>6 of (a-5b+20c)>>10.
%macro QPEL8OR16_HV2_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 4,4 ; dst, tmp, dstStride, h
movsxdifnidn r2, r2d
.loop:
mova m0, [r1]                           ; words t[0..3]
mova m3, [r1+8]                         ; words t[4..7]
mova m1, [r1+2]                         ; words t[1..4]
mova m4, [r1+10]                        ; words t[5..8]
paddw m0, m4                            ; a_lo = t[x]   + t[x+5], x = 0..3
paddw m1, m3                            ; b_lo = t[x+1] + t[x+4]
paddw m3, [r1+18]                       ; a_hi = t[x]   + t[x+5], x = 4..7
paddw m4, [r1+16]                       ; b_hi = t[x+1] + t[x+4]
mova m2, [r1+4]
mova m5, [r1+12]
paddw m2, [r1+6]                        ; c_lo = t[x+2] + t[x+3]
paddw m5, [r1+14]                       ; c_hi
psubw m0, m1                            ; both halves filtered in lockstep
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddsw m0, m2                           ; saturating add guards the intermediate
paddsw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m0, m3                         ; clip both halves into one 8-byte row
op_%1 m0, [r0], m7
add r1, 48                              ; next tmp row
add r0, r2
dec r3d
jne .loop
RET
%endmacro
INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
%ifidn %1, avg
cglobal %1_h264_qpel8_hv2_lowpass, 3,4,7 ; dst, tmp, dstStride
%else
cglobal %1_h264_qpel8_hv2_lowpass, 3,4,6 ; dst, tmp, dstStride
%endif
movsxdifnidn r2, r2d
mov r3d, 8
.loop:
mova m1, [r1+16]
mova m0, [r1]
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m5, m0, 10
palignr m4, m0, 8
palignr m3, m0, 6
palignr m2, m0, 4
palignr m1, m0, 2
paddw m0, m5
paddw m1, m4
paddw m2, m3
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddw m0, m2
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0
op_%1h m0, [r0], m6
add r1, 48
add r0, r2
dec r3d
jne .loop
RET
cglobal %1_h264_qpel16_hv2_lowpass, 3,4,8 ; dst, tmp, dstStride
movsxdifnidn r2, r2d
mov r3d, 16
.loop:
mova m4, [r1+32]
mova m5, [r1+16]
mova m7, [r1]
mova m3, m4
mova m2, m4
mova m1, m4
mova m0, m4
palignr m0, m5, 10
palignr m1, m5, 8
palignr m2, m5, 6
palignr m3, m5, 4
palignr m4, m5, 2
paddw m0, m5
paddw m1, m4
paddw m2, m3
mova m6, m5
mova m4, m5
mova m3, m5
palignr m4, m7, 8
palignr m6, m7, 2
palignr m3, m7, 10
paddw m4, m6
mova m6, m5
palignr m5, m7, 6
palignr m6, m7, 4
paddw m3, m7
paddw m5, m6
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddw m0, m2
paddw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m3, m0
op_%1 m3, [r0], m7
add r1, 48
add r0, r2
dec r3d
jne .loop
RET
%endmacro
INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
; Combine 16-bit intermediates (src16, 24-byte row stride) with an 8-bit
; source: dst = op(avg(clip(src16 >> 5), src8)); src8 rows are packed at a
; 4-byte stride, dst uses dstStride. 4x4 block, fully unrolled.
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,4,4 ; dst, src16, src8, dstStride
movsxdifnidn r3, r3d
mova m0, [r1]                           ; rows 0 and 1 of the word buffer
mova m1, [r1+24]
psraw m0, 5                             ; descale
psraw m1, 5
packuswb m0, m0                         ; clip to unsigned bytes
packuswb m1, m1
pavgb m0, [r2]                          ; average with packed src8 rows
pavgb m1, [r2+4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
lea r0, [r0+r3*2]
mova m0, [r1+48]                        ; rows 2 and 3
mova m1, [r1+72]
psraw m0, 5
psraw m1, 5
packuswb m0, m0
packuswb m1, m1
pavgb m0, [r2+2*4]
pavgb m1, [r2+3*4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
RET
%endmacro
INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
; %2-wide (8 = two MMX half-loads per row, 16 = SSE2) variant of
; pixels_l2_shift5: dst = op(avg(clip(src16 >> 5), src8)). src16 rows have
; a 48-byte stride, src8 rows a %2-byte stride; two rows per iteration,
; r4d counts down from %2 (square block: height == width).
%macro PIXELS_L2_SHIFT5 2
%if cpuflag(sse2)
cglobal %1_pixels%2_l2_shift5, 5, 5, 4 ; dst, src16, src8, dstStride
%else
cglobal %1_pixels%2_l2_shift5, 5, 5 ; dst, src16, src8, dstStride
%endif
movsxdifnidn r3, r3d
mov r4d, %2                             ; row counter
.loop:
movu m0, [r1]                           ; row 0: low and high word halves
movu m1, [r1+%2]
movu m2, [r1+48]                        ; row 1
movu m3, [r1+48+%2]
psraw m0, 5                             ; descale
psraw m1, 5
psraw m2, 5
psraw m3, 5
packuswb m0, m1                         ; clip halves into one register per row
packuswb m2, m3
pavgb m0, [r2]                          ; average with src8 rows
pavgb m2, [r2+%2]
op_%1 m0, [r0], m1
op_%1 m2, [r0+r3], m1
lea r2, [r2+2*%2]
add r1, 48*2                            ; advance two src16 rows
lea r0, [r0+2*r3]
sub r4d, 2
jne .loop
RET
%endmacro
INIT_MMX mmxext
PIXELS_L2_SHIFT5 put, 8
PIXELS_L2_SHIFT5 avg, 8
INIT_XMM sse2
PIXELS_L2_SHIFT5 put, 16
PIXELS_L2_SHIFT5 avg, 16
%if ARCH_X86_64
; x86-64-only SSSE3 (needs xmm8-15): full 16-wide horizontal 6-tap lowpass
; plus average with src2 in one pass. Two unaligned loads cover src[-2..13]
; and src[6..21]; the low (outputs 0..7) and high (outputs 8..15) halves are
; filtered in parallel. dst and src both advance by r3; src2 by r4.
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 16                             ; 16 rows
pxor m15, m15                           ; zero reg for unpacking
mova m14, [pw_5]
mova m13, [pw_16]
.loop:
lddqu m1, [r1+6]                        ; bytes src[6..21] (high-half taps)
lddqu m7, [r1-2]                        ; bytes src[-2..13] (low-half taps)
mova m0, m1
punpckhbw m1, m15                       ; words src[14..21]
punpcklbw m0, m15                       ; words src[ 6..13]
punpcklbw m7, m15                       ; words src[-2..5]
mova m2, m1
mova m6, m0
mova m3, m1
mova m8, m0
mova m4, m1
mova m9, m0
mova m12, m0
mova m11, m1
palignr m11, m0, 10                     ; src[11..18]
palignr m12, m7, 10                     ; src[ 3..10]
palignr m4, m0, 2                       ; src[ 7..14]
palignr m9, m7, 2                       ; src[-1..6]
palignr m3, m0, 4                       ; src[ 8..15]
palignr m8, m7, 4                       ; src[ 0..7]
palignr m2, m0, 6                       ; src[ 9..16]
palignr m6, m7, 6                       ; src[ 1..8]
paddw m11, m0                           ; outer pair, high half
palignr m1, m0, 8                       ; src[10..17]
palignr m0, m7, 8                       ; src[ 2..9]
paddw m7, m12                           ; outer pair, low half
paddw m2, m3                            ; center pair, high
paddw m6, m8                            ; center pair, low
paddw m1, m4                            ; inner pair, high
paddw m0, m9                            ; inner pair, low
psllw m2, 2
psllw m6, 2
psubw m2, m1                            ; 4*center - inner
psubw m6, m0
paddw m11, m13                          ; + 16 rounding
paddw m7, m13
pmullw m2, m14                          ; 20*center - 5*inner
pmullw m6, m14
lddqu m3, [r2]                          ; second source row
paddw m2, m11
paddw m6, m7
psraw m2, 5
psraw m6, 5
packuswb m6, m2                         ; low | high -> 16 clipped bytes
pavgb m6, m3                            ; average with src2
op_%1 m6, [r0], m11
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
RET
%endmacro
INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif