mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-03 05:10:03 +02:00
cfbdda607d
After this patch, the performance of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads improves by 2fps (45fps-->47fps). Reviewed-by: yinshiyou-hf@loongson.cn Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
163 lines
4.9 KiB
ArmAsm
163 lines
4.9 KiB
ArmAsm
/*
|
|
* Loongson LSX optimized add_residual functions for HEVC decoding
|
|
*
|
|
* Copyright (c) 2023 Loongson Technology Corporation Limited
|
|
* Contributed by jinbo <jinbo@loongson.cn>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "loongson_asm.S"
|
|
|
|
/*
|
|
* void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
|
|
*/
|
|
/*
 * ADD_RES_LSX_4x4_8: add residuals to two 4-pixel rows of 8-bit samples.
 *
 * In:    a0 = address of the first row, a1 = address of 8 int16 residuals
 *        (two rows of four), a2 = byte distance between rows.
 * Out:   4 bytes written at a0 and 4 bytes written at a0 + a2.
 * Clobb: t0, vr0, vr1, vr2.  a0, a1 and a2 are left unchanged.
 */
.macro ADD_RES_LSX_4x4_8
    vldrepl.w     vr0,    a0,    0          /* row 0 pixels, replicated */
    add.d         t0,     a0,    a2         /* t0 = address of row 1 */
    vldrepl.w     vr1,    t0,    0          /* row 1 pixels, replicated */
    vld           vr2,    a1,    0          /* 8 residuals for both rows */

    vilvl.w       vr1,    vr1,   vr0        /* low 64 bits = row 0 | row 1 */
    vsllwil.hu.bu vr1,    vr1,   0          /* zero-extend 8 pixels to 16 bit */
    /* Saturating add: a plain vadd.h would wrap for large positive
     * residuals and the sum would then be clamped to 0 instead of 255. */
    vsadd.h       vr1,    vr1,   vr2
    vssrani.bu.h  vr1,    vr1,   0          /* clamp to 0..255, pack to bytes */

    vstelm.w      vr1,    a0,    0,    0    /* word 0 -> row 0 */
    vstelm.w      vr1,    t0,    0,    1    /* word 1 -> row 1 */
.endm
|
|
|
|
/*
 * 4x4 block, 8-bit samples: a0 = dst, a1 = res, a2 = stride
 * (argument order of the C prototype).  The block is handled as two
 * passes of ADD_RES_LSX_4x4_8, each covering two rows.
 * NOTE(review): the return instruction is presumably emitted by the
 * endfunc macro from loongson_asm.S - confirm there.
 */
function ff_hevc_add_residual4x4_8_lsx
    /* Rows 0 and 1. */
    ADD_RES_LSX_4x4_8

    /* Step to the next row pair: 16 bytes of residuals, two dst rows. */
    addi.d        a1,     a1,    16
    alsl.d        a0,     a2,    a0,   1    /* a0 += 2 * stride */

    /* Rows 2 and 3. */
    ADD_RES_LSX_4x4_8
endfunc
|
|
|
|
/*
|
|
* void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
|
|
*/
|
|
/*
 * ADD_RES_LSX_8x8_8: add residuals to four 8-pixel rows of 8-bit samples.
 *
 * In:    a0 = address of the first row, a1 = address of 32 int16 residuals
 *        (four rows of eight, 64 bytes), a2 = byte distance between rows.
 * Out:   8 bytes written to each of the four rows starting at a0.
 * Clobb: t0, t1, t2, vr0-vr7.  a0, a1 and a2 are left unchanged.
 */
.macro ADD_RES_LSX_8x8_8
    /* Load the four pixel rows; t0/t1/t2 keep the addresses of rows 1-3
     * so they can be reused by the stores below. */
    vldrepl.d     vr0,    a0,    0
    add.d         t0,     a0,    a2
    vldrepl.d     vr1,    t0,    0
    add.d         t1,     t0,    a2
    vldrepl.d     vr2,    t1,    0
    add.d         t2,     t1,    a2
    vldrepl.d     vr3,    t2,    0

    /* Residual rows sit 16 bytes apart; immediate offsets avoid building
     * the constant and an extra base address in scratch registers. */
    vld           vr4,    a1,    0
    vld           vr5,    a1,    16
    vld           vr6,    a1,    32
    vld           vr7,    a1,    48

    /* Zero-extend the low 8 pixels of each row to 16 bit. */
    vsllwil.hu.bu vr0,    vr0,   0
    vsllwil.hu.bu vr1,    vr1,   0
    vsllwil.hu.bu vr2,    vr2,   0
    vsllwil.hu.bu vr3,    vr3,   0

    /* Saturating add: a plain vadd.h would wrap for large positive
     * residuals and the sum would then be clamped to 0 instead of 255. */
    vsadd.h       vr0,    vr0,   vr4
    vsadd.h       vr1,    vr1,   vr5
    vsadd.h       vr2,    vr2,   vr6
    vsadd.h       vr3,    vr3,   vr7

    /* Clamp to 0..255 and pack two rows per register:
     * vr1 = row 0 | row 1, vr3 = row 2 | row 3. */
    vssrani.bu.h  vr1,    vr0,   0
    vssrani.bu.h  vr3,    vr2,   0

    vstelm.d      vr1,    a0,    0,    0    /* row 0 */
    vstelm.d      vr1,    t0,    0,    1    /* row 1 */
    vstelm.d      vr3,    t1,    0,    0    /* row 2 */
    vstelm.d      vr3,    t2,    0,    1    /* row 3 */
.endm
|
|
|
|
/*
 * 8x8 block, 8-bit samples: a0 = dst, a1 = res, a2 = stride
 * (argument order of the C prototype).  The block is handled as two
 * passes of ADD_RES_LSX_8x8_8, each covering four rows.
 * NOTE(review): the return instruction is presumably emitted by the
 * endfunc macro from loongson_asm.S - confirm there.
 */
function ff_hevc_add_residual8x8_8_lsx
    /* Rows 0-3. */
    ADD_RES_LSX_8x8_8

    /* Step to the next four rows: 64 bytes of residuals, four dst rows. */
    addi.d        a1,     a1,    64
    alsl.d        a0,     a2,    a0,   2    /* a0 += 4 * stride */

    /* Rows 4-7. */
    ADD_RES_LSX_8x8_8
endfunc
|
|
|
|
/*
|
|
* void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
|
|
*/
|
|
/*
 * 16x16 block, 8-bit samples: a0 = dst, a1 = res, a2 = stride
 * (argument order of the C prototype).  The body is unrolled 8 times;
 * each copy processes two 16-pixel rows and 64 bytes of residuals.
 * Clobb: a0, a1, vr0-vr7.
 * NOTE(review): the return instruction is presumably emitted by the
 * endfunc macro from loongson_asm.S - confirm there.
 */
function ff_hevc_add_residual16x16_8_lsx
.rept 8
    vld           vr0,    a0,    0          /* row n,     16 pixels */
    vldx          vr2,    a0,    a2         /* row n + 1, 16 pixels */

    /* 32 residuals; immediate offsets avoid rebuilding the constant 16
     * and a second base address in every unrolled copy. */
    vld           vr4,    a1,    0
    vld           vr5,    a1,    16
    vld           vr6,    a1,    32
    vld           vr7,    a1,    48

    /* Zero-extend to 16 bit: vr0/vr1 = low/high half of row n,
     * vr2/vr3 = low/high half of row n + 1. */
    vexth.hu.bu   vr1,    vr0
    vsllwil.hu.bu vr0,    vr0,   0
    vexth.hu.bu   vr3,    vr2
    vsllwil.hu.bu vr2,    vr2,   0

    /* Saturating add: a plain vadd.h would wrap for large positive
     * residuals and the sum would then be clamped to 0 instead of 255. */
    vsadd.h       vr0,    vr0,   vr4
    vsadd.h       vr1,    vr1,   vr5
    vsadd.h       vr2,    vr2,   vr6
    vsadd.h       vr3,    vr3,   vr7

    /* Clamp to 0..255 and repack each row into one 16-byte vector. */
    vssrani.bu.h  vr1,    vr0,   0
    vssrani.bu.h  vr3,    vr2,   0

    vst           vr1,    a0,    0
    vstx          vr3,    a0,    a2

    alsl.d        a0,     a2,    a0,   1    /* a0 += 2 * stride */
    addi.d        a1,     a1,    64
.endr
endfunc
|
|
|
|
/*
|
|
* void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
|
|
*/
|
|
/*
 * 32x32 block, 8-bit samples: a0 = dst, a1 = res, a2 = stride
 * (argument order of the C prototype).  The body is unrolled 32 times;
 * each copy processes one 32-pixel row and 64 bytes of residuals.
 * Clobb: a0, a1, vr0-vr7.
 * NOTE(review): the return instruction is presumably emitted by the
 * endfunc macro from loongson_asm.S - confirm there.
 */
function ff_hevc_add_residual32x32_8_lsx
.rept 32
    /* One row is two vectors; immediate offsets avoid rebuilding the
     * constant 16 and a second base address in every unrolled copy. */
    vld           vr0,    a0,    0          /* pixels  0-15 */
    vld           vr2,    a0,    16         /* pixels 16-31 */

    vld           vr4,    a1,    0
    vld           vr5,    a1,    16
    vld           vr6,    a1,    32
    vld           vr7,    a1,    48

    /* Zero-extend to 16 bit: vr0/vr1 = low/high half of the first
     * vector, vr2/vr3 = low/high half of the second. */
    vexth.hu.bu   vr1,    vr0
    vsllwil.hu.bu vr0,    vr0,   0
    vexth.hu.bu   vr3,    vr2
    vsllwil.hu.bu vr2,    vr2,   0

    /* Saturating add: a plain vadd.h would wrap for large positive
     * residuals and the sum would then be clamped to 0 instead of 255. */
    vsadd.h       vr0,    vr0,   vr4
    vsadd.h       vr1,    vr1,   vr5
    vsadd.h       vr2,    vr2,   vr6
    vsadd.h       vr3,    vr3,   vr7

    /* Clamp to 0..255 and repack into two 16-byte vectors. */
    vssrani.bu.h  vr1,    vr0,   0
    vssrani.bu.h  vr3,    vr2,   0

    vst           vr1,    a0,    0
    vst           vr3,    a0,    16

    add.d         a0,     a0,    a2         /* next dst row */
    addi.d        a1,     a1,    64
.endr
endfunc
|