/*
 * Page metadata captured with this copy of the file:
 *
 * mirror of https://github.com/FFmpeg/FFmpeg.git
 * synced 2025-01-24 13:56:33 +02:00
 * commit e1b6ecd20a
 * 659 lines, 23 KiB, ArmAsm
 *
 * Commit message:
 *
 * loongson_asm.S is LoongArch asm optimization helper.
 *
 * Add functions:
 *   ff_h264_idct_add_8_lsx
 *   ff_h264_idct8_add_8_lsx
 *   ff_h264_idct_dc_add_8_lsx
 *   ff_h264_idct8_dc_add_8_lsx
 *   ff_h264_idct_add16_8_lsx
 *   ff_h264_idct8_add4_8_lsx
 *   ff_h264_idct_add8_8_lsx
 *   ff_h264_idct_add8_422_8_lsx
 *   ff_h264_idct_add16_intra_8_lsx
 *   ff_h264_luma_dc_dequant_idct_8_lsx
 *
 * Replaced functions (LSX is sufficient for these functions):
 *   ff_h264_idct_add_lasx
 *   ff_h264_idct4x4_addblk_dc_lasx
 *   ff_h264_idct_add16_lasx
 *   ff_h264_idct8_add4_lasx
 *   ff_h264_idct_add8_lasx
 *   ff_h264_idct_add8_422_lasx
 *   ff_h264_idct_add16_intra_lasx
 *   ff_h264_deq_idct_luma_dc_lasx
 *
 * Renamed functions:
 *   ff_h264_idct8_addblk_lasx    ==> ff_h264_idct8_add_8_lasx
 *   ff_h264_idct8_dc_addblk_lasx ==> ff_h264_idct8_dc_add_8_lasx
 *
 * ./configure --disable-lasx
 * ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
 * before: 155fps
 * after:  161fps
 *
 * Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
 * Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
 */
/*
 * Loongson LSX/LASX optimized h264idct
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
|
|
|
#include "loongson_asm.S"
|
|
|
|
/*
 * #define FUNC2(a, b, c)  FUNC3(a, b, c)
 * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
 * void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
 * LSX optimization is enough for this function.
 *
 * 4x4 inverse transform of _block, result added to the 4x4 pixels at _dst.
 *
 * In:      a0 = _dst, a1 = _block (16 x int16), a2 = stride
 * Effects: writes 4 rows of 4 bytes at _dst; the 32 bytes at _block are
 *          set to zero.
 * Scratch: t2, t3, vr0-vr7 (f0-f7 alias their low bits)
 */
function ff_h264_idct_add_8_lsx
    fld.d           f0, a1, 0           // four coefficients per 8-byte load
    fld.d           f1, a1, 8
    fld.d           f2, a1, 16
    fld.d           f3, a1, 24
    vxor.v          vr7, vr7, vr7
    add.d           t2, a2, a2          // t2 = 2 * stride
    add.d           t3, t2, a2          // t3 = 3 * stride
    vst             vr7, a1, 0          // clear the coefficient block
    vst             vr7, a1, 16

    // First 1-D pass.
    vadd.h          vr4, vr0, vr2       // in0 + in2
    vsub.h          vr5, vr0, vr2       // in0 - in2
    vsrai.h         vr6, vr1, 1
    vsrai.h         vr7, vr3, 1
    vsub.h          vr6, vr6, vr3       // (in1 >> 1) - in3
    vadd.h          vr7, vr1, vr7       // in1 + (in3 >> 1)
    LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5

    // Second 1-D pass on the transposed data (same arithmetic).
    vadd.h          vr4, vr0, vr2
    vsub.h          vr5, vr0, vr2
    vsrai.h         vr6, vr1, 1
    vsrai.h         vr7, vr3, 1
    vsub.h          vr6, vr6, vr3
    vadd.h          vr7, vr1, vr7
    LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3

    // vr4-vr7 are free again: fetch the four destination rows.
    fld.s           f4, a0, 0
    fldx.s          f5, a0, a2
    fldx.s          f6, a0, t2
    fldx.s          f7, a0, t3

    // Rounding shift right by 6 of the residual.
    vsrari.h        vr0, vr0, 6
    vsrari.h        vr1, vr1, 6
    vsrari.h        vr2, vr2, 6
    vsrari.h        vr3, vr3, 6

    // Zero-extend the pixels to 16 bit and add the residual.
    vsllwil.hu.bu   vr4, vr4, 0
    vsllwil.hu.bu   vr5, vr5, 0
    vsllwil.hu.bu   vr6, vr6, 0
    vsllwil.hu.bu   vr7, vr7, 0
    vadd.h          vr0, vr0, vr4
    vadd.h          vr1, vr1, vr5
    vadd.h          vr2, vr2, vr6
    vadd.h          vr3, vr3, vr7

    // Saturate to unsigned 8 bit and pack two rows per register:
    // low 64 bits come from the second operand, high 64 bits from the first.
    vssrarni.bu.h   vr1, vr0, 0         // vr1 = { row0, row1 }
    vssrarni.bu.h   vr3, vr2, 0         // vr3 = { row2, row3 }

    vbsrl.v         vr0, vr1, 8         // bring row1 down to the low bytes
    vbsrl.v         vr2, vr3, 8         // bring row3 down to the low bytes
    fst.s           f1, a0, 0           // row0
    fstx.s          f0, a0, a2          // row1
    fstx.s          f3, a0, t2          // row2
    fstx.s          f2, a0, t3          // row3
endfunc
|
|
|
|
/*
 * #define FUNC2(a, b, c)  FUNC3(a, b, c)
 * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
 * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
 *
 * 8x8 inverse transform of _block, result added to the 8x8 pixels at _dst.
 * The first pass runs on 16-bit lanes; after the transpose the data is
 * sign-extended and the second pass runs on 32-bit lanes (low and high
 * halves of every row are processed separately).
 *
 * In:      a0 = _dst, a1 = _block (64 x int16), a2 = stride
 * Effects: writes 8 rows of 8 bytes at _dst; the 128 bytes at _block are
 *          set to zero.
 * Scratch: t0, t2-t7, vr0-vr23 (f0-f17 alias the low bits of vr0-vr17)
 */
function ff_h264_idct8_add_8_lsx
    ld.h            t0, a1, 0
    add.d           t2, a2, a2          // t2 = 2 * stride
    add.d           t3, t2, a2          // t3 = 3 * stride
    add.d           t4, t3, a2          // t4 = 4 * stride
    add.d           t5, t4, a2          // t5 = 5 * stride
    add.d           t6, t5, a2          // t6 = 6 * stride
    add.d           t7, t6, a2          // t7 = 7 * stride
    // Fold the +32 rounding term into the DC coefficient so that the
    // final ">> 6" below can be a plain (non-rounding) shift.
    addi.w          t0, t0, 32
    st.h            t0, a1, 0

    vld             vr0, a1, 0
    vld             vr1, a1, 16
    vld             vr2, a1, 32
    vld             vr3, a1, 48
    vld             vr4, a1, 64
    vld             vr5, a1, 80
    vld             vr6, a1, 96
    vld             vr7, a1, 112
    vxor.v          vr8, vr8, vr8
    vst             vr8, a1, 0          // clear the coefficient block
    vst             vr8, a1, 16
    vst             vr8, a1, 32
    vst             vr8, a1, 48
    vst             vr8, a1, 64
    vst             vr8, a1, 80
    vst             vr8, a1, 96
    vst             vr8, a1, 112

    // ---- First pass, 16-bit lanes: even part (inputs 0, 2, 4, 6) ----
    vadd.h          vr18, vr0, vr4
    vsub.h          vr19, vr0, vr4
    vsrai.h         vr20, vr2, 1
    vsrai.h         vr21, vr6, 1
    vsub.h          vr20, vr20, vr6
    vadd.h          vr21, vr21, vr2
    LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16
    // ---- First pass: odd part (inputs 1, 3, 5, 7) ----
    vsrai.h         vr11, vr7, 1
    vsrai.h         vr13, vr3, 1
    vsrai.h         vr15, vr5, 1
    vsrai.h         vr17, vr1, 1
    vsub.h          vr11, vr5, vr11
    vsub.h          vr13, vr7, vr13
    vadd.h          vr15, vr7, vr15
    vadd.h          vr17, vr5, vr17
    vsub.h          vr11, vr11, vr7
    vsub.h          vr13, vr13, vr3
    vadd.h          vr15, vr15, vr5
    vadd.h          vr17, vr17, vr1
    vsub.h          vr11, vr11, vr3     // -in3 + in5 - in7 - (in7 >> 1)
    vadd.h          vr13, vr13, vr1     //  in1 + in7 - in3 - (in3 >> 1)
    vsub.h          vr15, vr15, vr1     // -in1 + in7 + in5 + (in5 >> 1)
    vadd.h          vr17, vr17, vr3     //  in3 + in5 + in1 + (in1 >> 1)
    vsrai.h         vr18, vr11, 2
    vsrai.h         vr19, vr13, 2
    vsrai.h         vr20, vr15, 2
    vsrai.h         vr21, vr17, 2
    vadd.h          vr11, vr11, vr21
    vadd.h          vr13, vr13, vr20
    vsub.h          vr15, vr19, vr15
    vsub.h          vr17, vr17, vr18
    LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
                      vr0,  vr3,  vr1,  vr2,  vr5,  vr6,  vr4,  vr7

    LSX_TRANSPOSE8x8_H vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7,  \
                       vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7,  \
                       vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17

    // Sign-extend to 32 bit: high four elements of row N go to
    // vr20-vr23 / vr8, vr9, vr18, vr19; low four stay in vr0-vr7.
    vexth.w.h       vr20, vr0
    vexth.w.h       vr21, vr1
    vexth.w.h       vr22, vr2
    vexth.w.h       vr23, vr3
    vexth.w.h       vr8, vr4
    vexth.w.h       vr9, vr5
    vexth.w.h       vr18, vr6
    vexth.w.h       vr19, vr7
    vsllwil.w.h     vr0, vr0, 0
    vsllwil.w.h     vr1, vr1, 0
    vsllwil.w.h     vr2, vr2, 0
    vsllwil.w.h     vr3, vr3, 0
    vsllwil.w.h     vr4, vr4, 0
    vsllwil.w.h     vr5, vr5, 0
    vsllwil.w.h     vr6, vr6, 0
    vsllwil.w.h     vr7, vr7, 0

    // ---- Second pass, 32-bit lanes, low halves (vr0-vr7) ----
    vadd.w          vr11, vr0, vr4
    vsub.w          vr13, vr0, vr4
    vsrai.w         vr15, vr2, 1
    vsrai.w         vr17, vr6, 1
    vsub.w          vr15, vr15, vr6
    vadd.w          vr17, vr17, vr2
    LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16
    vsrai.w         vr11, vr7, 1
    vsrai.w         vr13, vr3, 1
    vsrai.w         vr15, vr5, 1
    vsrai.w         vr17, vr1, 1
    vsub.w          vr11, vr5, vr11
    vsub.w          vr13, vr7, vr13
    vadd.w          vr15, vr7, vr15
    vadd.w          vr17, vr5, vr17
    vsub.w          vr11, vr11, vr7
    vsub.w          vr13, vr13, vr3
    vadd.w          vr15, vr15, vr5
    vadd.w          vr17, vr17, vr1
    vsub.w          vr11, vr11, vr3
    vadd.w          vr13, vr13, vr1
    vsub.w          vr15, vr15, vr1
    vadd.w          vr17, vr17, vr3
    vsrai.w         vr0, vr11, 2
    vsrai.w         vr1, vr13, 2
    vsrai.w         vr2, vr15, 2
    vsrai.w         vr3, vr17, 2
    vadd.w          vr11, vr11, vr3
    vadd.w          vr13, vr13, vr2
    vsub.w          vr15, vr1, vr15
    vsub.w          vr17, vr17, vr0
    LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
                      vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7

    // ---- Second pass, high halves (vr20-vr23, vr8, vr9, vr18, vr19) ----
    vadd.w          vr11, vr20, vr8
    vsub.w          vr13, vr20, vr8
    vsrai.w         vr15, vr22, 1
    vsrai.w         vr17, vr18, 1
    vsub.w          vr15, vr15, vr18
    vadd.w          vr17, vr17, vr22
    LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16
    vsrai.w         vr11, vr19, 1
    vsrai.w         vr13, vr23, 1
    vsrai.w         vr15, vr9, 1
    vsrai.w         vr17, vr21, 1
    vsub.w          vr11, vr9, vr11
    vsub.w          vr13, vr19, vr13
    vadd.w          vr15, vr19, vr15
    vadd.w          vr17, vr9, vr17
    vsub.w          vr11, vr11, vr19
    vsub.w          vr13, vr13, vr23
    vadd.w          vr15, vr15, vr9
    vadd.w          vr17, vr17, vr21
    vsub.w          vr11, vr11, vr23
    vadd.w          vr13, vr13, vr21
    vsub.w          vr15, vr15, vr21
    vadd.w          vr17, vr17, vr23
    vsrai.w         vr20, vr11, 2
    vsrai.w         vr21, vr13, 2
    vsrai.w         vr22, vr15, 2
    vsrai.w         vr23, vr17, 2
    vadd.w          vr11, vr11, vr23
    vadd.w          vr13, vr13, vr22
    vsub.w          vr15, vr21, vr15
    vsub.w          vr17, vr17, vr20
    LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
                      vr20, vr21, vr22, vr23, vr8,  vr9,  vr18, vr19

    // Destination rows. Only the low 8 bytes of each register are consumed
    // by vsllwil.hu.bu below, so load exactly 8 bytes per row (as
    // ff_h264_idct8_dc_add_8_lsx does) instead of reading 16.
    fld.d           f10, a0, 0
    fldx.d          f11, a0, a2
    fldx.d          f12, a0, t2
    fldx.d          f13, a0, t3
    fldx.d          f14, a0, t4
    fldx.d          f15, a0, t5
    fldx.d          f16, a0, t6
    fldx.d          f17, a0, t7
    // ">> 6" and narrow back to 16 bit, merging low half (second operand)
    // and high half (first operand) of every row into one register.
    vsrani.h.w      vr20, vr0, 6
    vsrani.h.w      vr21, vr1, 6
    vsrani.h.w      vr22, vr2, 6
    vsrani.h.w      vr23, vr3, 6
    vsrani.h.w      vr8, vr4, 6
    vsrani.h.w      vr9, vr5, 6
    vsrani.h.w      vr18, vr6, 6
    vsrani.h.w      vr19, vr7, 6
    // Zero-extend the pixels to 16 bit.
    vsllwil.hu.bu   vr10, vr10, 0
    vsllwil.hu.bu   vr11, vr11, 0
    vsllwil.hu.bu   vr12, vr12, 0
    vsllwil.hu.bu   vr13, vr13, 0
    vsllwil.hu.bu   vr14, vr14, 0
    vsllwil.hu.bu   vr15, vr15, 0
    vsllwil.hu.bu   vr16, vr16, 0
    vsllwil.hu.bu   vr17, vr17, 0

    vadd.h          vr0, vr20, vr10
    vadd.h          vr1, vr21, vr11
    vadd.h          vr2, vr22, vr12
    vadd.h          vr3, vr23, vr13
    vadd.h          vr4, vr8, vr14
    vadd.h          vr5, vr9, vr15
    vadd.h          vr6, vr18, vr16
    vadd.h          vr7, vr19, vr17
    // Saturate to unsigned 8 bit, two rows per register
    // (even row in the low 8 bytes, odd row in the high 8 bytes).
    vssrarni.bu.h   vr1, vr0, 0
    vssrarni.bu.h   vr3, vr2, 0
    vssrarni.bu.h   vr5, vr4, 0
    vssrarni.bu.h   vr7, vr6, 0
    vbsrl.v         vr0, vr1, 8         // odd rows down to the low bytes
    vbsrl.v         vr2, vr3, 8
    vbsrl.v         vr4, vr5, 8
    vbsrl.v         vr6, vr7, 8
    fst.d           f1, a0, 0           // row0
    fstx.d          f0, a0, a2          // row1
    fstx.d          f3, a0, t2          // row2
    fstx.d          f2, a0, t3          // row3
    fstx.d          f5, a0, t4          // row4
    fstx.d          f4, a0, t5          // row5
    fstx.d          f7, a0, t6          // row6
    fstx.d          f6, a0, t7          // row7
endfunc
|
|
|
|
/*
 * #define FUNC2(a, b, c)  FUNC3(a, b, c)
 * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
 * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
 *
 * LASX variant of the 8x8 inverse transform + add. The first pass uses
 * 128-bit LSX instructions on 16-bit lanes; after the transpose each row
 * is sign-extended to eight 32-bit lanes in one 256-bit register, so the
 * second pass handles a whole row per instruction.
 *
 * In:      a0 = _dst, a1 = _block (64 x int16), a2 = stride
 * Effects: writes 8 rows of 8 bytes at _dst; the 128 bytes at _block are
 *          set to zero.
 * Scratch: t0, t2-t7, xr0-xr8, xr10-xr17, vr18-vr21
 *          (f0-f17 alias the low bits of the vector registers)
 */
function ff_h264_idct8_add_8_lasx
    ld.h            t0, a1, 0
    add.d           t2, a2, a2          // t2 = 2 * stride
    add.d           t3, t2, a2          // t3 = 3 * stride
    add.d           t4, t3, a2          // t4 = 4 * stride
    add.d           t5, t4, a2          // t5 = 5 * stride
    add.d           t6, t5, a2          // t6 = 6 * stride
    add.d           t7, t6, a2          // t7 = 7 * stride
    // Fold the +32 rounding term into the DC coefficient so that the
    // final ">> 6" below can be a plain (non-rounding) shift.
    addi.w          t0, t0, 32
    st.h            t0, a1, 0

    vld             vr0, a1, 0
    vld             vr1, a1, 16
    vld             vr2, a1, 32
    vld             vr3, a1, 48
    vld             vr4, a1, 64
    vld             vr5, a1, 80
    vld             vr6, a1, 96
    vld             vr7, a1, 112
    xvxor.v         xr8, xr8, xr8
    xvst            xr8, a1, 0          // clear the coefficient block
    xvst            xr8, a1, 32
    xvst            xr8, a1, 64
    xvst            xr8, a1, 96

    // ---- First pass, 16-bit lanes: even part (inputs 0, 2, 4, 6) ----
    vadd.h          vr18, vr0, vr4
    vsub.h          vr19, vr0, vr4
    vsrai.h         vr20, vr2, 1
    vsrai.h         vr21, vr6, 1
    vsub.h          vr20, vr20, vr6
    vadd.h          vr21, vr21, vr2
    LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16
    // ---- First pass: odd part (inputs 1, 3, 5, 7) ----
    vsrai.h         vr11, vr7, 1
    vsrai.h         vr13, vr3, 1
    vsrai.h         vr15, vr5, 1
    vsrai.h         vr17, vr1, 1
    vsub.h          vr11, vr5, vr11
    vsub.h          vr13, vr7, vr13
    vadd.h          vr15, vr7, vr15
    vadd.h          vr17, vr5, vr17
    vsub.h          vr11, vr11, vr7
    vsub.h          vr13, vr13, vr3
    vadd.h          vr15, vr15, vr5
    vadd.h          vr17, vr17, vr1
    vsub.h          vr11, vr11, vr3
    vadd.h          vr13, vr13, vr1
    vsub.h          vr15, vr15, vr1
    vadd.h          vr17, vr17, vr3
    vsrai.h         vr18, vr11, 2
    vsrai.h         vr19, vr13, 2
    vsrai.h         vr20, vr15, 2
    vsrai.h         vr21, vr17, 2
    vadd.h          vr11, vr11, vr21
    vadd.h          vr13, vr13, vr20
    vsub.h          vr15, vr19, vr15
    vsub.h          vr17, vr17, vr18
    LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
                      vr0,  vr3,  vr1,  vr2,  vr5,  vr6,  vr4,  vr7

    LSX_TRANSPOSE8x8_H vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7,  \
                       vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7,  \
                       vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
    // Sign-extend the eight 16-bit values of every row to eight 32-bit
    // lanes of the matching 256-bit register.
    vext2xv.w.h     xr0, xr0
    vext2xv.w.h     xr1, xr1
    vext2xv.w.h     xr2, xr2
    vext2xv.w.h     xr3, xr3
    vext2xv.w.h     xr4, xr4
    vext2xv.w.h     xr5, xr5
    vext2xv.w.h     xr6, xr6
    vext2xv.w.h     xr7, xr7

    // ---- Second pass, 32-bit lanes: even part ----
    xvadd.w         xr11, xr0, xr4
    xvsub.w         xr13, xr0, xr4
    xvsrai.w        xr15, xr2, 1
    xvsrai.w        xr17, xr6, 1
    xvsub.w         xr15, xr15, xr6
    xvadd.w         xr17, xr17, xr2
    LASX_BUTTERFLY_4_W xr11, xr13, xr15, xr17, xr10, xr12, xr14, xr16
    // ---- Second pass: odd part ----
    xvsrai.w        xr11, xr7, 1
    xvsrai.w        xr13, xr3, 1
    xvsrai.w        xr15, xr5, 1
    xvsrai.w        xr17, xr1, 1
    xvsub.w         xr11, xr5, xr11
    xvsub.w         xr13, xr7, xr13
    xvadd.w         xr15, xr7, xr15
    xvadd.w         xr17, xr5, xr17
    xvsub.w         xr11, xr11, xr7
    xvsub.w         xr13, xr13, xr3
    xvadd.w         xr15, xr15, xr5
    xvadd.w         xr17, xr17, xr1
    xvsub.w         xr11, xr11, xr3
    xvadd.w         xr13, xr13, xr1
    xvsub.w         xr15, xr15, xr1
    xvadd.w         xr17, xr17, xr3
    xvsrai.w        xr0, xr11, 2
    xvsrai.w        xr1, xr13, 2
    xvsrai.w        xr2, xr15, 2
    xvsrai.w        xr3, xr17, 2
    xvadd.w         xr11, xr11, xr3
    xvadd.w         xr13, xr13, xr2
    xvsub.w         xr15, xr1, xr15
    xvsub.w         xr17, xr17, xr0
    LASX_BUTTERFLY_8_W xr10, xr12, xr14, xr16, xr11, xr13, xr15, xr17, \
                       xr0,  xr1,  xr2,  xr3,  xr4,  xr5,  xr6,  xr7

    // Destination rows. Only the low 8 bytes of each register are consumed
    // by vsllwil.hu.bu below, so load exactly 8 bytes per row (as
    // ff_h264_idct8_dc_add_8_lasx does) instead of reading 16.
    fld.d           f10, a0, 0
    fldx.d          f11, a0, a2
    fldx.d          f12, a0, t2
    fldx.d          f13, a0, t3
    fldx.d          f14, a0, t4
    fldx.d          f15, a0, t5
    fldx.d          f16, a0, t6
    fldx.d          f17, a0, t7
    xvldi           xr8, 0x806          //"xvldi.w xr8 6"
    // ">> 6" and narrow to 16 bit; the results land in the low 64 bits of
    // each 128-bit lane ...
    xvsran.h.w      xr0, xr0, xr8
    xvsran.h.w      xr1, xr1, xr8
    xvsran.h.w      xr2, xr2, xr8
    xvsran.h.w      xr3, xr3, xr8
    xvsran.h.w      xr4, xr4, xr8
    xvsran.h.w      xr5, xr5, xr8
    xvsran.h.w      xr6, xr6, xr8
    xvsran.h.w      xr7, xr7, xr8
    // ... so gather 64-bit elements 0 and 2 into the low 128 bits.
    xvpermi.d       xr0, xr0, 0x08
    xvpermi.d       xr1, xr1, 0x08
    xvpermi.d       xr2, xr2, 0x08
    xvpermi.d       xr3, xr3, 0x08
    xvpermi.d       xr4, xr4, 0x08
    xvpermi.d       xr5, xr5, 0x08
    xvpermi.d       xr6, xr6, 0x08
    xvpermi.d       xr7, xr7, 0x08

    // Zero-extend the pixels to 16 bit.
    vsllwil.hu.bu   vr10, vr10, 0
    vsllwil.hu.bu   vr11, vr11, 0
    vsllwil.hu.bu   vr12, vr12, 0
    vsllwil.hu.bu   vr13, vr13, 0
    vsllwil.hu.bu   vr14, vr14, 0
    vsllwil.hu.bu   vr15, vr15, 0
    vsllwil.hu.bu   vr16, vr16, 0
    vsllwil.hu.bu   vr17, vr17, 0

    vadd.h          vr0, vr0, vr10
    vadd.h          vr1, vr1, vr11
    vadd.h          vr2, vr2, vr12
    vadd.h          vr3, vr3, vr13
    vadd.h          vr4, vr4, vr14
    vadd.h          vr5, vr5, vr15
    vadd.h          vr6, vr6, vr16
    vadd.h          vr7, vr7, vr17
    // Saturate to unsigned 8 bit, two rows per register
    // (even row in the low 8 bytes, odd row in the high 8 bytes).
    vssrarni.bu.h   vr1, vr0, 0
    vssrarni.bu.h   vr3, vr2, 0
    vssrarni.bu.h   vr5, vr4, 0
    vssrarni.bu.h   vr7, vr6, 0
    vbsrl.v         vr0, vr1, 8         // odd rows down to the low bytes
    vbsrl.v         vr2, vr3, 8
    vbsrl.v         vr4, vr5, 8
    vbsrl.v         vr6, vr7, 8
    fst.d           f1, a0, 0           // row0
    fstx.d          f0, a0, a2          // row1
    fstx.d          f3, a0, t2          // row2
    fstx.d          f2, a0, t3          // row3
    fstx.d          f5, a0, t4          // row4
    fstx.d          f4, a0, t5          // row5
    fstx.d          f7, a0, t6          // row6
    fstx.d          f6, a0, t7          // row7
endfunc
|
|
|
|
/*
 * #define FUNC2(a, b, c)  FUNC3(a, b, c)
 * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
 * void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
 * LSX optimization is enough for this function.
 *
 * DC-only 4x4 case: (block[0] + 32) >> 6 is added to each of the 4x4
 * pixels at _dst, with saturation to 8 bit.
 *
 * In:      a0 = _dst, a1 = _block, a2 = stride
 * Effects: writes 4 rows of 4 bytes at _dst; block[0] is set to zero.
 * Scratch: t2, t3, vr0-vr4 (f0-f4 alias their low bits)
 */
function ff_h264_idct_dc_add_8_lsx
    vldrepl.h       vr4, a1, 0          // broadcast block[0] to all lanes
    add.d           t2, a2, a2          // t2 = 2 * stride
    add.d           t3, t2, a2          // t3 = 3 * stride
    fld.s           f0, a0, 0
    fldx.s          f1, a0, a2
    fldx.s          f2, a0, t2
    fldx.s          f3, a0, t3
    st.h            zero, a1, 0         // clear the DC coefficient

    vsrari.h        vr4, vr4, 6         // rounding shift right by 6
    vilvl.w         vr0, vr1, vr0       // vr0 low 8 bytes = row0, row1
    vilvl.w         vr1, vr3, vr2       // vr1 low 8 bytes = row2, row3
    vsllwil.hu.bu   vr0, vr0, 0         // zero-extend pixels to 16 bit
    vsllwil.hu.bu   vr1, vr1, 0
    vadd.h          vr0, vr0, vr4
    vadd.h          vr1, vr1, vr4
    vssrarni.bu.h   vr1, vr0, 0         // saturate; vr1 = row0..row3, 4 bytes each

    vbsrl.v         vr2, vr1, 4         // row1 down to the low bytes
    vbsrl.v         vr3, vr1, 8         // row2
    vbsrl.v         vr4, vr1, 12        // row3
    fst.s           f1, a0, 0
    fstx.s          f2, a0, a2
    fstx.s          f3, a0, t2
    fstx.s          f4, a0, t3
endfunc
|
|
|
|
/*
 * #define FUNC2(a, b, c)  FUNC3(a, b, c)
 * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
 * void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
 *
 * DC-only 8x8 case: (block[0] + 32) >> 6 is added to each of the 8x8
 * pixels at _dst, with saturation to 8 bit.
 *
 * In:      a0 = _dst, a1 = _block, a2 = stride
 * Effects: writes 8 rows of 8 bytes at _dst; block[0] is set to zero.
 * Scratch: t2-t7, vr0-vr8 (f0-f7 alias the low bits of vr0-vr7)
 */
function ff_h264_idct8_dc_add_8_lsx
    vldrepl.h       vr8, a1, 0          // broadcast block[0] to all lanes
    add.d           t2, a2, a2          // t2 = 2 * stride
    add.d           t3, t2, a2          // t3 = 3 * stride
    add.d           t4, t3, a2          // t4 = 4 * stride
    add.d           t5, t4, a2          // t5 = 5 * stride
    add.d           t6, t5, a2          // t6 = 6 * stride
    add.d           t7, t6, a2          // t7 = 7 * stride

    fld.d           f0, a0, 0
    fldx.d          f1, a0, a2
    fldx.d          f2, a0, t2
    fldx.d          f3, a0, t3
    fldx.d          f4, a0, t4
    fldx.d          f5, a0, t5
    fldx.d          f6, a0, t6
    fldx.d          f7, a0, t7
    st.h            zero, a1, 0         // clear the DC coefficient

    vsrari.h        vr8, vr8, 6         // rounding shift right by 6
    // Zero-extend the pixels to 16 bit and add the DC value.
    vsllwil.hu.bu   vr0, vr0, 0
    vsllwil.hu.bu   vr1, vr1, 0
    vsllwil.hu.bu   vr2, vr2, 0
    vsllwil.hu.bu   vr3, vr3, 0
    vsllwil.hu.bu   vr4, vr4, 0
    vsllwil.hu.bu   vr5, vr5, 0
    vsllwil.hu.bu   vr6, vr6, 0
    vsllwil.hu.bu   vr7, vr7, 0
    vadd.h          vr0, vr0, vr8
    vadd.h          vr1, vr1, vr8
    vadd.h          vr2, vr2, vr8
    vadd.h          vr3, vr3, vr8
    vadd.h          vr4, vr4, vr8
    vadd.h          vr5, vr5, vr8
    vadd.h          vr6, vr6, vr8
    vadd.h          vr7, vr7, vr8
    // Saturate to unsigned 8 bit, two rows per register
    // (even row in the low 8 bytes, odd row in the high 8 bytes).
    vssrarni.bu.h   vr1, vr0, 0
    vssrarni.bu.h   vr3, vr2, 0
    vssrarni.bu.h   vr5, vr4, 0
    vssrarni.bu.h   vr7, vr6, 0

    vbsrl.v         vr0, vr1, 8         // odd rows down to the low bytes
    vbsrl.v         vr2, vr3, 8
    vbsrl.v         vr4, vr5, 8
    vbsrl.v         vr6, vr7, 8
    fst.d           f1, a0, 0           // row0
    fstx.d          f0, a0, a2          // row1
    fstx.d          f3, a0, t2          // row2
    fstx.d          f2, a0, t3          // row3
    fstx.d          f5, a0, t4          // row4
    fstx.d          f4, a0, t5          // row5
    fstx.d          f7, a0, t6          // row6
    fstx.d          f6, a0, t7          // row7
endfunc
|
|
/*
 * void ff_h264_idct8_dc_add_8_lasx(uint8_t *_dst, int16_t *_block, int stride)
 *
 * LASX variant of the DC-only 8x8 case: (block[0] + 32) >> 6 is added to
 * each of the 8x8 pixels at _dst, with saturation to 8 bit. Two rows are
 * processed per 256-bit register (one row per 128-bit lane).
 *
 * In:      a0 = _dst, a1 = _block, a2 = stride
 * Effects: writes 8 rows of 8 bytes at _dst; block[0] is set to zero.
 * Scratch: t2-t7, xr0-xr8 (f0-f7 alias the low bits of xr0-xr7)
 */
function ff_h264_idct8_dc_add_8_lasx
    xvldrepl.h      xr8, a1, 0          // broadcast block[0] to all lanes
    add.d           t2, a2, a2          // t2 = 2 * stride
    add.d           t3, t2, a2          // t3 = 3 * stride
    add.d           t4, t3, a2          // t4 = 4 * stride
    add.d           t5, t4, a2          // t5 = 5 * stride
    add.d           t6, t5, a2          // t6 = 6 * stride
    add.d           t7, t6, a2          // t7 = 7 * stride

    fld.d           f0, a0, 0
    fldx.d          f1, a0, a2
    fldx.d          f2, a0, t2
    fldx.d          f3, a0, t3
    fldx.d          f4, a0, t4
    fldx.d          f5, a0, t5
    fldx.d          f6, a0, t6
    fldx.d          f7, a0, t7
    st.h            zero, a1, 0         // clear the DC coefficient

    xvsrari.h       xr8, xr8, 6         // rounding shift right by 6
    // Pair rows: low lane = even row, high lane = odd row.
    xvpermi.q       xr1, xr0, 0x20      // xr1 = { row0 | row1 }
    xvpermi.q       xr3, xr2, 0x20      // xr3 = { row2 | row3 }
    xvpermi.q       xr5, xr4, 0x20      // xr5 = { row4 | row5 }
    xvpermi.q       xr7, xr6, 0x20      // xr7 = { row6 | row7 }
    // Zero-extend the pixels to 16 bit and add the DC value.
    xvsllwil.hu.bu  xr1, xr1, 0
    xvsllwil.hu.bu  xr3, xr3, 0
    xvsllwil.hu.bu  xr5, xr5, 0
    xvsllwil.hu.bu  xr7, xr7, 0
    xvadd.h         xr1, xr1, xr8
    xvadd.h         xr3, xr3, xr8
    xvadd.h         xr5, xr5, xr8
    xvadd.h         xr7, xr7, xr8

    // Saturate to unsigned 8 bit; per 128-bit lane the low 8 bytes come
    // from the second operand and the high 8 bytes from the first:
    //   xr3 = { row0, row2 | row1, row3 },  xr7 = { row4, row6 | row5, row7 }
    xvssrarni.bu.h  xr3, xr1, 0
    xvssrarni.bu.h  xr7, xr5, 0

    // Move every row to the low 8 bytes of some register for the stores.
    xvpermi.q       xr1, xr3, 0x11      // xr1 low lane = { row1, row3 }
    xvpermi.q       xr5, xr7, 0x11      // xr5 low lane = { row5, row7 }
    xvbsrl.v        xr0, xr1, 8         // row3
    xvbsrl.v        xr2, xr3, 8         // row2
    xvbsrl.v        xr4, xr5, 8         // row7
    xvbsrl.v        xr6, xr7, 8         // row6

    fst.d           f3, a0, 0           // row0
    fstx.d          f1, a0, a2          // row1
    fstx.d          f2, a0, t2          // row2
    fstx.d          f0, a0, t3          // row3
    fstx.d          f7, a0, t4          // row4
    fstx.d          f5, a0, t5          // row5
    fstx.d          f6, a0, t6          // row6
    fstx.d          f4, a0, t7          // row7
endfunc
|
|
|
|
/**
 * IDCT transforms the 16 dc values and dequantizes them.
 * @param qmul quantization parameter
 * void FUNCC(ff_h264_luma_dc_dequant_idct)(int16_t *_output, int16_t *_input, int qmul){
 * LSX optimization is enough for this function.
 *
 * In:      a0 = _output, a1 = _input (16 x int16), a2 = qmul
 * Effects: stores 16 int16 results, one at the start of each of 16
 *          consecutive 32-byte slots beginning at _output.
 *          a0 is advanced by 256 inside the function.
 * Scratch: a0, vr0-vr10 (f0-f3 alias the low bits of vr0-vr3)
 */
function ff_h264_luma_dc_dequant_idct_8_lsx
    // One 8-byte row per register. The transpose below only needs the low
    // four halfwords of each input (ff_h264_idct_add_8_lsx feeds the same
    // macro from fld.d loads), so 8-byte loads avoid reading past the
    // 32-byte input that overlapping 16-byte loads would touch.
    fld.d           f0, a1, 0
    fld.d           f1, a1, 8
    fld.d           f2, a1, 16
    fld.d           f3, a1, 24
    vreplgr2vr.w    vr8, a2             // broadcast qmul to 32-bit lanes
    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr9, vr10
    // First 1-D pass: two butterfly stages (adds/subs only).
    LSX_BUTTERFLY_4_H  vr4, vr6, vr7, vr5, vr0, vr3, vr2, vr1
    LSX_BUTTERFLY_4_H  vr0, vr1, vr2, vr3, vr4, vr7, vr6, vr5
    LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3, vr9, vr10
    // Second 1-D pass.
    LSX_BUTTERFLY_4_H  vr0, vr1, vr3, vr2, vr4, vr7, vr6, vr5
    LSX_BUTTERFLY_4_H  vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
    // Sign-extend to 32 bit and dequantize: (x * qmul + 128) >> 8.
    vsllwil.w.h     vr0, vr0, 0
    vsllwil.w.h     vr1, vr1, 0
    vsllwil.w.h     vr2, vr2, 0
    vsllwil.w.h     vr3, vr3, 0
    vmul.w          vr0, vr0, vr8
    vmul.w          vr1, vr1, vr8
    vmul.w          vr2, vr2, vr8
    vmul.w          vr3, vr3, vr8
    // Rounding shift and narrow: low four halfwords from the second
    // operand, high four from the first.
    vsrarni.h.w     vr1, vr0, 8
    vsrarni.h.w     vr3, vr2, 8

    // Scatter the 16 results, one per 32-byte slot.
    vstelm.h        vr1, a0, 0, 0
    vstelm.h        vr1, a0, 32, 4
    vstelm.h        vr1, a0, 64, 1
    vstelm.h        vr1, a0, 96, 5
    vstelm.h        vr3, a0, 128, 0
    vstelm.h        vr3, a0, 160, 4
    vstelm.h        vr3, a0, 192, 1
    vstelm.h        vr3, a0, 224, 5
    // Advance the base for the second group of eight slots
    // (presumably to keep the offsets within vstelm.h's immediate range).
    addi.d          a0, a0, 256
    vstelm.h        vr1, a0, 0, 2
    vstelm.h        vr1, a0, 32, 6
    vstelm.h        vr1, a0, 64, 3
    vstelm.h        vr1, a0, 96, 7
    vstelm.h        vr3, a0, 128, 2
    vstelm.h        vr3, a0, 160, 6
    vstelm.h        vr3, a0, 192, 3
    vstelm.h        vr3, a0, 224, 7
endfunc
|