1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00
Files
FFmpeg/libavcodec/x86/h264_chromamc.asm
Andreas Rheinhardt 79080a547a avcodec/x86/h264_chromamc: Use xmm regs in chroma_mc4 SSSE3 functions
Doubling the register size makes it possible to avoid two pmaddubsw.
It is also ABI compliant (the old version lacked an emms)
and the average versions no longer rely on padding (the old versions
used pavgb with a memory operand reading eight bytes,
although only four are needed).

Old benchmarks (the latter four refer to RV40):
avg_h264_chroma_mc4_8_c:                               145.7 ( 1.00x)
avg_h264_chroma_mc4_8_ssse3:                            32.3 ( 4.51x)
put_h264_chroma_mc4_8_c:                               136.1 ( 1.00x)
put_h264_chroma_mc4_8_ssse3:                            29.0 ( 4.70x)
avg_chroma_mc4_c:                                      162.1 ( 1.00x)
avg_chroma_mc4_ssse3:                                   31.1 ( 5.22x)
put_chroma_mc4_c:                                      137.5 ( 1.00x)
put_chroma_mc4_ssse3:                                   28.6 ( 4.81x)

New benchmarks:
avg_h264_chroma_mc4_8_c:                               146.7 ( 1.00x)
avg_h264_chroma_mc4_8_ssse3:                            26.5 ( 5.53x)
put_h264_chroma_mc4_8_c:                               136.8 ( 1.00x)
put_h264_chroma_mc4_8_ssse3:                            22.5 ( 6.09x)
avg_chroma_mc4_c:                                      165.5 ( 1.00x)
avg_chroma_mc4_ssse3:                                   27.2 ( 6.08x)
put_chroma_mc4_c:                                      138.1 ( 1.00x)
put_chroma_mc4_ssse3:                                   23.2 ( 5.96x)

Reviewed-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-06 02:16:28 +01:00

408 lines
11 KiB
NASM

;******************************************************************************
;* MMX/SSSE3-optimized functions for H.264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;* 2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "config_components.asm"
%include "libavutil/x86/x86util.asm"
; Read-only constants. Everything declared with cextern is defined outside
; this file; pw_28 is the only constant defined here.
SECTION_RODATA
cextern pw_3
cextern pw_4
cextern pw_8
; eight words of 28; selected as rnd_2d_vc1 (bilinear rounding for VC-1)
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64
; lookup table read by rv40_get_bias
cextern rv40_bias
SECTION .text
;-----------------------------------------------------------------------------
; mv0_pixels_mc8
; Unfiltered 8-pixel-wide copy (put) or average-with-destination (avg), used
; when both mx and my are zero.
;
; In:       r0 = dst, r1 = src, r2 = stride (applied to both dst and src),
;           r3d = number of rows
; Clobbers: r4, m0, m1 (plus m2, m3 when averaging), flags
;
; Four rows are handled per iteration and the loop only exits when r3d
; reaches exactly zero, so r3d has to be a positive multiple of 4.
;
; The macro uses the x86inc m# registers rather than hard-coded mm# ones:
; it is expanded inside INIT_XMM functions that return with a plain RET and
; never execute emms, so touching MMX registers there would leave the
; x87/MMX state dirty for the caller.
;
; When averaging, dst is first loaded with movq (exactly 8 bytes) into a
; register; handing [r0] straight to a 128-bit pavgb would read 16 bytes and
; require 16-byte alignment.  CHROMAMC_AVG expands to NOTHING for the put
; variants, in which case the dst loads are not emitted at all.
;-----------------------------------------------------------------------------
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2]          ; r4 = distance covered by two rows
.next4rows:
%rep 2                                ; two rows per repetition, four per loop
    movq          m0, [r1   ]
    movq          m1, [r1+r2]
    add           r1, r4
%ifnidn CHROMAMC_AVG, NOTHING
    movq          m2, [r0   ]         ; current dst rows, 8 bytes each
    movq          m3, [r0+r2]
%endif
    CHROMAMC_AVG  m0, m2
    CHROMAMC_AVG  m1, m3
    movq     [r0   ], m0
    movq     [r0+r2], m1
    add           r0, r4
%endrep
    sub          r3d, 4
    jne .next4rows
%endmacro
;-----------------------------------------------------------------------------
; chroma_mc2_mmx_func op, codec
; Emits <op>_<codec>_chroma_mc2: bilinear interpolation of a 2-pixel-wide
; block, one row per loop iteration.
;   %1 = put or avg (only used in the symbol name; averaging itself is
;        selected through CHROMAMC_AVG4)
;   %2 = codec name, selects the rounding constant rnd_2d_%2
; Registers: r0 = dst, r1 = src, r2 = stride, r3d = rows, r4d = x, r5d = y
; Weights:   A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy
; Clobbers:  r4d, r5d, r6d, m0-m3, m5-m7, flags
;-----------------------------------------------------------------------------
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
    ; Build two dwords, each holding a pair of 16-bit weights.
    mov          r6d, r4d
    shl          r4d, 16
    add          r4d, 8
    sub          r4d, r6d             ; x<<16 | (8-x)
    imul         r5d, r4d             ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d             ; x*(8-y)<<16 | (8-x)*(8-y)

    pxor          m7, m7              ; zero, for unpacking and packing
    movd          m2, [r1]
    movd          m5, r4d
    movd          m6, r5d
    punpcklbw     m2, m7
    punpckldq     m5, m5              ; m5 = {A,B,A,B}
    punpckldq     m6, m6              ; m6 = {C,D,C,D}
    pshufw        m2, m2, 0x94        ; m2 = src[0,1,1,2] of the first row

.row_loop:
    add           r1, r2
    movd          m0, [r1]
    mova          m1, m2              ; upper row, prepared in the previous pass
    punpcklbw     m0, m7
    pmaddwd       m1, m5              ; m1 = A*src[0,1] + B*src[1,2]
    pshufw        m0, m0, 0x94        ; m0 = src[0,1,1,2] of the lower row
    paddw         m1, [rnd_2d_%2]
    mova          m2, m0              ; lower row becomes next pass' upper row
    pmaddwd       m0, m6
    paddw         m1, m0              ; m1 += C*src[0,1] + D*src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w             ; only two pixels are written
    add           r0, r2
    dec          r3d
    jnz .row_loop
    RET
%endmacro
; Rounding constants, looked up by codec name as rnd_1d_%2 / rnd_2d_%2.
; The "1d" value is added before the >>3 of the one-dimensional filters,
; the "2d" value before the >>6 of the bilinear filter.
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1 pw_3
%define rnd_2d_vc1 pw_28
; NOTHING a, b[, c]: accepts the same operands as the averaging macros but
; expands to no code. Bound to CHROMAMC_AVG / CHROMAMC_AVG4 for "put".
%macro NOTHING 2-3
%endmacro
; DIRECT_AVG dst, src: dst = PAVGB(dst, src); src is used as given
; (register or memory operand, depending on the call site).
%macro DIRECT_AVG 2
PAVGB %1, %2
%endmacro
; COPY_AVG dst, tmp, mem: load mem into the scratch register tmp with movd
; (a 4-byte read), then dst = PAVGB(dst, tmp). Clobbers tmp.
%macro COPY_AVG 3
movd %2, %3
PAVGB %1, %2
%endmacro
; MMXEXT instantiations of the 2-pixel-wide function.
; put: both averaging hooks expand to nothing.
INIT_MMX mmxext
%define CHROMAMC_AVG NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc2_mmx_func put, h264
; avg: chroma_mc2_mmx_func only uses CHROMAMC_AVG4, which loads dst into a
; scratch register before averaging.
%define CHROMAMC_AVG DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc2_mmx_func avg, h264
;-----------------------------------------------------------------------------
; chroma_mc8_ssse3_func op, codec[, suffix]
; Emits <op>_<codec>_chroma_mc8<suffix>: 8-pixel-wide chroma interpolation.
;   %1 = put or avg, %2 = codec name (selects rnd_1d_%2 / rnd_2d_%2),
;   %3 = optional suffix appended to the symbol name
; Registers as used below:
;   r0 = dst, r1 = src, r2 = stride, r3d = rows, r4d = mx (x), r5d = my (y)
; Four code paths: no filter (mx == my == 0), horizontal only (my == 0),
; vertical only (mx == 0) and full bilinear. The filtering paths process two
; rows per iteration and loop while r3d stays positive after "sub r3d, 2".
;
; The ..@ labels are extra entry points: the rv40 wrappers in this file jump
; to the h264 ones with the rounding vector already placed in m5 (bilinear)
; or m6 (one-dimensional). That is why each label sits directly after the
; movdqa that loads the default rounding constant.
;-----------------------------------------------------------------------------
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7+UNIX64, 8
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
..@%1_%2_chroma_mc8_no_filter_ %+ cpuname:
mv0_pixels_mc8
RET
.at_least_one_non_zero:
test r5d, r5d
je .my_is_zero
test r4d, r4d
je .mx_is_zero
; general case, bilinear
movdqa m5, [rnd_2d_%2]
..@%1_%2_chroma_mc8_both_nonzero_ %+ cpuname:
; Build the byte-pair weights for pmaddubsw:
;   m7 = words of (8-y)*x<<8 | (8-y)*(8-x)   (applied to the upper row)
;   m6 = words of    y *x<<8 |    y *(8-x)   (applied to the lower row)
mov r6d, r4d
shl r4d, 8
sub r4, r6
mov r6, 8
add r4, 8 ; x*255+8 = x<<8 | (8-x)
sub r6d, r5d
imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
movd m7, r6d
movd m6, r4d
movq m0, [r1 ]
movq m1, [r1+1]
pshuflw m7, m7, 0
pshuflw m6, m6, 0
; m0 = first row with src[i] and src[i+1] interleaved byte-wise
punpcklbw m0, m1
movlhps m7, m7
movlhps m6, m6
.next2rows:
; On entry m0 holds the interleaved row above the two rows loaded here.
movq m1, [r1+r2*1 ]
movq m2, [r1+r2*1+1]
movq m3, [r1+r2*2 ]
movq m4, [r1+r2*2+1]
lea r1, [r1+r2*2]
punpcklbw m1, m2
; the middle row is needed twice: as lower row (m1) and as upper row (m2)
movdqa m2, m1
punpcklbw m3, m4
; keep an unweighted copy of the last row for the next iteration
movdqa m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
psrlw m1, 6
movdqa m0, m4
psrlw m3, 6
%ifidn %1, avg
; current dst rows: first row in the low half, second row in the high half
movq m2, [r0 ]
movhps m2, [r0+r2]
%endif
packuswb m1, m3
CHROMAMC_AVG m1, m2
movq [r0 ], m1
movhps [r0+r2], m1
sub r3d, 2
; lea leaves the flags alone, so jg still tests the result of the sub
lea r0, [r0+r2*2]
jg .next2rows
RET
.my_is_zero:
; horizontal-only filter
movdqa m6, [rnd_1d_%2]
..@%1_%2_chroma_mc8_my_zero_ %+ cpuname:
mov r5d, r4d
shl r4d, 8
add r4, 8
sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
movd m7, r4d
pshuflw m7, m7, 0
movlhps m7, m7
.next2xrows:
movq m0, [r1 ]
movq m1, [r1 +1]
movq m2, [r1+r2 ]
movq m3, [r1+r2+1]
; interleave src[i] with src[i+1] for each of the two rows
punpcklbw m0, m1
punpcklbw m2, m3
pmaddubsw m0, m7
pmaddubsw m2, m7
%ifidn %1, avg
movq m4, [r0 ]
movhps m4, [r0+r2]
%endif
paddw m0, m6
paddw m2, m6
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
CHROMAMC_AVG m0, m4
movq [r0 ], m0
movhps [r0+r2], m0
sub r3d, 2
; both lea keep the flags of the sub intact for the jg below
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
jg .next2xrows
RET
.mx_is_zero:
; vertical-only filter
movdqa m6, [rnd_1d_%2]
..@%1_%2_chroma_mc8_mx_zero_ %+ cpuname:
mov r4d, r5d
shl r5d, 8
add r5, 8
sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
movd m7, r5d
pshuflw m7, m7, 0
movlhps m7, m7
.next2yrows:
movq m0, [r1 ]
movq m1, [r1+r2 ]
; the middle row is the lower tap of output row 0 and the upper tap of row 1
movdqa m2, m1
movq m3, [r1+r2*2]
lea r1, [r1+r2*2]
; interleave each row with the row below it
punpcklbw m0, m1
punpcklbw m2, m3
pmaddubsw m0, m7
pmaddubsw m2, m7
%ifidn %1, avg
movq m4, [r0 ]
movhps m4, [r0+r2]
%endif
paddw m0, m6
paddw m2, m6
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
CHROMAMC_AVG m0, m4
movq [r0 ], m0
movhps [r0+r2], m0
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2yrows
RET
%endmacro
;-----------------------------------------------------------------------------
; chroma_mc4_ssse3_func op, codec
; Emits <op>_<codec>_chroma_mc4: 4-pixel-wide bilinear chroma interpolation,
; two rows per iteration, always through the bilinear path.
;   %1 = put or avg, %2 = codec name (only used in the symbol name; the
;        rounding constant is hard-coded to pw_32)
; Registers as used below:
;   r0 = dst, r1 = src, r2 = stride, r3d = rows, r4d = x, r5d = y
; Weights: A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy
;
; ..@<op>_<codec>_chroma_mc4_after_init_ is an extra entry point: the rv40
; wrapper jumps to it with its own rounding vector already in m5.
;-----------------------------------------------------------------------------
%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 8
mova m5, [pw_32]
..@%1_%2_chroma_mc4_after_init_ %+ cpuname:
; the loads of the first source row are interleaved with the weight setup
mov r6d, r4d
shl r4d, 8
movd m0, [r1]
sub r6d, 8
sub r4d, r6d ; x << 8 | (8-x)
mov r6d, r5d
shl r5d, 16
movd m1, [r1+1]
sub r6d, 8
sub r5d, r6d ; y << 16 | (8-y)
imul r4d, r5d ; xy << 24 | (8-x)y << 16 | x(8-y) << 8 | (8-x)(8-y)
add r1, r2
movd m6, r4d ; ABCD
punpcklwd m6, m6 ; ABABCDCD
pshufd m7, m6, 0x55 ; CDCDCDCDCDCDCDCD
; m0 = first row with src[i] and src[i+1] interleaved (8 bytes)
punpcklbw m0, m1
pshufd m6, m6, 0x0 ; ABABABABABABABAB
.next2rows:
; On entry m0 (low half) holds the interleaved row above the rows loaded here.
movd m1, [r1]
movd m2, [r1+1]
movd m3, [r1+r2]
movd m4, [r1+r2+1]
punpcklbw m1, m2
; m0 = upper rows of both output rows (previous row | first new row)
punpcklqdq m0, m1
pmaddubsw m0, m6
punpcklbw m3, m4
; m1 = lower rows of both output rows (first new row | second new row)
punpcklqdq m1, m3
pmaddubsw m1, m7
%ifidn %1, avg
; exactly four dst bytes per row are read
movd m2, [r0]
movd m4, [r0+r2]
%endif
paddw m0, m5
lea r1, [r1+r2*2]
paddw m0, m1
psrlw m0, 6
packuswb m0, m0
; after packing: dword 0 = first output row, dword 1 = second output row
pshufd m1, m0, 0x1
%ifidn %1, avg
pavgb m0, m2
pavgb m1, m4
%endif
sub r3d, 2
; none of the instructions up to jg modifies the flags set by the sub
movd [r0], m0
movd [r0+r2], m1
; the last loaded row becomes the upper row of the next iteration
mova m0, m3
lea r0, [r0+r2*2]
jg .next2rows
RET
%endmacro
;-----------------------------------------------------------------------------
; rv40_get_bias dst
; Loads the RV40 rounding bias for the current (mx, my) into the vector
; register %1 and replicates it with SPLATW.
;   In:  r4d = mx, r5d = my
;   Table index = ((my & 6)*4 + mx) >> 1; the load address is
;   rv40_bias + 4*index.
;   Clobbers r6 (and r7 when PIC && UNIX64) and the flags. On the
;   PIC && !UNIX64 path r5d is used as scratch and then restored, so r4d and
;   r5d hold mx and my again when the macro ends.
; NOTE(review): the element layout of rv40_bias is not visible in this file;
; the 4-byte scale in the address is taken as given.
;-----------------------------------------------------------------------------
%macro rv40_get_bias 1 ; dst reg
%if !PIC || UNIX64
; on UNIX64 we have enough volatile registers
%if PIC && UNIX64
; the table base goes into a register so that it can be combined with an index
lea r7, [rv40_bias]
%endif
mov r6d, r5d
and r6d, 6 ; &~1 for mx/my=[0,7]
lea r6d, [r6d*4+r4d]
sar r6d, 1
%if PIC && UNIX64
movd %1, [r7+4*r6]
%else
movd %1, [rv40_bias+4*r6]
%endif
%else ; PIC && !UNIX64, de facto WIN64
; r6 holds the table base here, so r5 doubles as the index register
lea r6, [rv40_bias]
%ifidn r5d, r5m ; always false for currently supported calling conventions
push r5
%endif
and r5d, 6 ; &~1 for mx/my=[0,7]
lea r5d, [r5d*4+r4d]
sar r5d, 1
movd %1, [r6+4*r5]
%ifidn r5d, r5m
pop r5
%else
; reload my from its argument slot
mov r5d, r5m
%endif
%endif
SPLATW %1, %1
%endmacro
;-----------------------------------------------------------------------------
; rv40_chroma_mc8_func op            (op = put or avg)
; Emits rv40_<op>_chroma_mc8 when the RV40 decoder is enabled. The function
; only selects the RV40 rounding bias and then continues inside the matching
; <op>_h264_chroma_mc8 function through its ..@ entry labels, so it has to be
; instantiated after that function with the same cpuname.
;   r4d = mx, r5d = my
;   m5 = bias for the bilinear path, m6 = bias >> 3 for the 1-D paths
;-----------------------------------------------------------------------------
%macro rv40_chroma_mc8_func 1 ; put vs avg
%if CONFIG_RV40_DECODER
cglobal rv40_%1_chroma_mc8, 6, 7+UNIX64, 8
    mov          r6d, r4d
    or           r6d, r5d
    jz ..@%1_h264_chroma_mc8_no_filter_ %+ cpuname   ; mx == my == 0: plain copy/avg
    rv40_get_bias m5
    ; the bilinear code expects bias in m5, the one-dimensional code in m6
    psraw         m6, m5, 3
    ; mx and my cannot both be zero here, so at most one of the two
    ; one-dimensional branches below can be taken
    test         r4d, r4d
    jz ..@%1_h264_chroma_mc8_mx_zero_ %+ cpuname
    test         r5d, r5d
    jz ..@%1_h264_chroma_mc8_my_zero_ %+ cpuname
    jmp ..@%1_h264_chroma_mc8_both_nonzero_ %+ cpuname
%endif
%endmacro
; rv40_chroma_mc4_func op            (op = put or avg)
; Emits rv40_<op>_chroma_mc4 when the RV40 decoder is enabled: loads the RV40
; rounding bias into m5 and continues in <op>_h264_chroma_mc4 right after the
; point where that function loads its own rounding constant. It therefore has
; to be instantiated after that function with the same cpuname.
%macro rv40_chroma_mc4_func 1 ; put vs avg
%if CONFIG_RV40_DECODER
cglobal rv40_%1_chroma_mc4, 6, 7+UNIX64, 8
rv40_get_bias m5
jmp ..@%1_h264_chroma_mc4_after_init_ %+ cpuname
%endif
%endmacro
; SSSE3 instantiations. Each rv40 wrapper follows the h264 function whose
; ..@ entry labels it jumps to.
INIT_XMM ssse3
; put: the averaging hook expands to nothing
%define CHROMAMC_AVG NOTHING
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1, _nornd
rv40_chroma_mc8_func put
chroma_mc4_ssse3_func put, h264
rv40_chroma_mc4_func put
; avg: the result is averaged with the current dst contents before the store
%define CHROMAMC_AVG DIRECT_AVG
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1, _nornd
rv40_chroma_mc8_func avg
chroma_mc4_ssse3_func avg, h264
rv40_chroma_mc4_func avg