mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-18 03:19:31 +02:00
0e52a4e434
Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce complexity, SAD is only calculated on even rows. This is calculated for all video bitdepths, but the values passed to the function are always 16bit (even if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub. Additionally this changes parameters dx and dy from int to intptr_t. This allows dx & dy to be used as pointer offsets without needing to use movsxd. Benchmarks ( AMD 7940HS ) Before: BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 | Chimera_8bit_1080P_1000_frames.vvc | 204.3 | NovosobornayaSquare_1920x1080.bin | 197.3 | RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 | After: BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 | Chimera_8bit_1080P_1000_frames.vvc | 216.0 | NovosobornayaSquare_1920x1080.bin | 204.0| RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 | Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: James Almer <jamrial@gmail.com>
134 lines
3.8 KiB
NASM
134 lines
3.8 KiB
NASM
; /*
|
|
; * Provide SIMD DMVR SAD functions for VVC decoding
|
|
; *
|
|
; * Copyright (c) 2024 Stone Chen
|
|
; *
|
|
; * This file is part of FFmpeg.
|
|
; *
|
|
; * FFmpeg is free software; you can redistribute it and/or
|
|
; * modify it under the terms of the GNU Lesser General Public
|
|
; * License as published by the Free Software Foundation; either
|
|
; * version 2.1 of the License, or (at your option) any later version.
|
|
; *
|
|
; * FFmpeg is distributed in the hope that it will be useful,
|
|
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
; * Lesser General Public License for more details.
|
|
; *
|
|
; * You should have received a copy of the GNU Lesser General Public
|
|
; * License along with FFmpeg; if not, write to the Free Software
|
|
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
; */
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
; Row pitch shared by both source buffers, counted in 16-bit elements
; (byte offsets in the code below multiply it by 2).
%define MAX_PB_SIZE 128

; DMVR SAD is only calculated on even rows to reduce complexity, so two
; consecutive sampled rows lie ROWS buffer rows apart.
%define ROWS        2

SECTION_RODATA

; A pair of word 1s. Broadcast to every lane and used as the pmaddwd
; multiplier, it adds adjacent 16-bit differences into 32-bit sums.
pw_1: dw 1, 1

SECTION .text
|
|
|
|
; MIN_MAX_SAD a, b, tmp
; Per-lane absolute difference of unsigned 16-bit words: a = |a - b|.
;   %1 a   : in/out - first input on entry, absolute difference on exit
;   %2 b   : in     - second input, left unchanged
;   %3 tmp : out    - scratch register, holds min(a, b) on exit
%macro MIN_MAX_SAD 3
    pminuw          %3, %1, %2          ; tmp = min(a, b)
    pmaxuw          %1, %1, %2          ; a   = max(a, b)
    psubusw         %1, %1, %3          ; a   = max - min; max >= min, so no saturation
%endmacro
|
|
|
|
; HORIZ_ADD dst, acc_lo, acc
; Adds up the eight dword lanes of a ymm accumulator.
;   %1 dst    : xmm, out - lane 0 receives the total, the other lanes hold
;                          partial sums
;   %2 acc_lo : xmm, in  - low 128 bits of the accumulator; both call sites
;                          pass the xmm alias of %3. Clobbered.
;   %3 acc    : ymm, in  - accumulator holding dword lanes 7..0
; Lane contents below are listed from lane 3 down to lane 0.
%macro HORIZ_ADD 3
    vextracti128    %1, %3, 1           ; dst = {  7,   6,   5,   4}
    paddd           %1, %1, %2          ; dst = {7+3, 6+2, 5+1, 4+0}
    pshufd          %2, %1, q0032       ; %2  = {  -,   -, 7+3, 6+2}
    paddd           %1, %1, %2          ; dst = {  -,   -, 5+1+7+3, 4+0+6+2}
    pshufd          %2, %1, q0001       ; %2 lane 0 = dst lane 1
    paddd           %1, %1, %2          ; dst lane 0 = sum of lanes 0..7
%endmacro
|
|
|
|
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL

; log2(MAX_PB_SIZE): lets a row count be turned into an element offset with
; a shift. Checked at assembly time so the two constants cannot drift apart.
%define MAX_PB_SIZE_LOG2 7
%if (1 << MAX_PB_SIZE_LOG2) != MAX_PB_SIZE
    %error "MAX_PB_SIZE_LOG2 does not match MAX_PB_SIZE"
%endif

; Distance in bytes between two consecutive sampled rows (16-bit elements).
%define SAD_ROW_BYTES (MAX_PB_SIZE * ROWS * 2)

INIT_YMM avx2

;-----------------------------------------------------------------------------
; vvc_sad(src1, src2, dx, dy, block_w, block_h) -> eax
;
; Sum of absolute differences between two blocks of 16-bit samples, taken on
; every ROWS-th row only. Both buffers use a row pitch of MAX_PB_SIZE elements.
;
; In:
;   src1, src2 : base pointers of the two sample buffers
;   dx, dy     : 32-bit signed displacement, sign-extended on entry. With
;                dx' = dx - 2 and dy' = dy - 2 the block origins, in elements,
;                are
;                    src1 + (2 + dy') * MAX_PB_SIZE + 2 + dx'
;                    src2 + (2 - dy') * MAX_PB_SIZE + 2 - dx'
;                i.e. the two blocks are displaced in opposite directions.
;   block_w    : width in samples. Values below 16 take the 8-wide path;
;                otherwise block_w / 16 full vectors are read per row.
;                NOTE(review): assumes block_w is 8 or a multiple of 16 --
;                confirm against the caller.
;   block_h    : height in rows. The 8-wide path consumes 4 rows per
;                iteration, the wide path 2.
;                NOTE(review): assumes a multiple of 4 on the 8-wide path --
;                confirm against the caller.
; Out:
;   eax        : the accumulated SAD
;
; Vector registers:
;   m0, m1 : samples from src1 / src2 (m1 becomes |src1 - src2|)
;   m2     : scratch for MIN_MAX_SAD
;   m3     : eight 32-bit partial sums
;   m4     : word 1 in every lane (pmaddwd multiplier)
;
; NOTE(review): the differences are computed with unsigned min/max, while
; pmaddwd reads them as signed words, so samples are presumably non-negative
; with differences below 32768 -- confirm.
;-----------------------------------------------------------------------------
cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, chunks
    movsxdifnidn    dxq, dxd
    movsxdifnidn    dyq, dyd

    sub             dxq, 2                      ; dx' = dx - 2
    sub             dyq, 2                      ; dy' = dy - 2

    lea             off1q, [dyq + 2]            ; off1 = 2 + dy'
    mov             off2d, 2
    sub             off2q, dyq                  ; off2 = 2 - dy'

    shl             off1q, MAX_PB_SIZE_LOG2     ; rows -> elements
    shl             off2q, MAX_PB_SIZE_LOG2

    add             off1q, dxq                  ; off1 += dx'
    sub             off2q, dxq                  ; off2 -= dx'

    ; elements -> bytes (* 2), plus the constant 2-element column offset
    lea             src1q, [src1q + off1q * 2 + 2 * 2]
    lea             src2q, [src2q + off2q * 2 + 2 * 2]

    pxor            m3, m3                      ; clear the accumulator
    vpbroadcastd    m4, [pw_1]

    cmp             block_wd, 16
    jge             .w16_128

    ; ---- 8-wide blocks: two sampled rows share one ymm register ----
.w8_loop_height:
    movu            xm0, [src1q]
    vinserti128     m0, [src1q + SAD_ROW_BYTES], 1
    movu            xm1, [src2q]
    vinserti128     m1, [src2q + SAD_ROW_BYTES], 1

    MIN_MAX_SAD     m1, m0, m2                  ; m1 = |src1 - src2|
    pmaddwd         m1, m4                      ; widen: add adjacent word pairs
    paddd           m3, m1

    add             src1q, 2 * SAD_ROW_BYTES    ; step over both sampled rows
    add             src2q, 2 * SAD_ROW_BYTES

    sub             block_hd, 4                 ; 2 sampled rows = 4 block rows
    jg              .w8_loop_height

    HORIZ_ADD       xm0, xm3, m3
    movd            eax, xm0
    RET

    ; ---- blocks 16 or more wide: one ymm (16 samples) per step ----
.w16_128:
    sar             block_wd, 4                 ; block_w = vectors per row
.w16_loop_height:
    mov             off1q, src1q                ; remember the row starts
    mov             off2q, src2q
    mov             chunksd, block_wd

.w16_loop_width:
    movu            m0, [src1q]
    movu            m1, [src2q]
    MIN_MAX_SAD     m1, m0, m2                  ; m1 = |src1 - src2|
    pmaddwd         m1, m4                      ; widen: add adjacent word pairs
    paddd           m3, m1

    add             src1q, mmsize
    add             src2q, mmsize
    dec             chunksd
    jg              .w16_loop_width

    lea             src1q, [off1q + SAD_ROW_BYTES]  ; next sampled row
    lea             src2q, [off2q + SAD_ROW_BYTES]

    sub             block_hd, 2                 ; 1 sampled row = 2 block rows
    jg              .w16_loop_height

    HORIZ_ADD       xm0, xm3, m3
    movd            eax, xm0
    RET

%endif
%endif
|