mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-18 03:19:31 +02:00
8815a7719e
./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 199fps
after: 214fps
Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
300 lines
8.4 KiB
ArmAsm
300 lines
8.4 KiB
ArmAsm
/*
 * Loongson LSX optimized h264intrapred
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
|
|
|
#include "loongson_asm.S"
|
|
|
|
/*
 * Byte selector for the vshuf.b in PRED16X16_PLANE: entries 0..6 pick
 * bytes 6..0 of the source group, i.e. they reverse its first seven bytes.
 *
 * PRED16X16_PLANE reads this table with an 8-byte fld.d, so the eighth
 * entry is written out as 0 to keep that load inside the table instead of
 * running one byte past the declared data.
 */
const shufa
    .byte 6, 5, 4, 3, 2, 1, 0, 0
endconst
|
|
|
|
/*
 * Halfword weights 2..8 (stored low byte first) applied by the vmul.h in
 * PRED16X16_PLANE to the seven outer byte differences of the top row.
 *
 * The table is fetched with a 16-byte vld and all eight halfword lanes are
 * multiplied and summed, so the eighth weight is spelled out as 0: the load
 * stays inside the table and the unused lane contributes nothing, whatever
 * data happens to follow the table in memory.
 */
const mulk
    .byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 0, 0
endconst
|
|
|
|
/*
 * Halfword ramp 0..15, each entry stored as (value, 0), 32 bytes in total.
 * PRED16X16_PLANE_END loads the first eight entries with vld and
 * PRED16X16_PLANE_END_LASX loads all sixteen with xvld; both multiply the
 * ramp by the per-column step held in t3.
 */
const mulh
.irp idx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    .byte \idx, 0
.endr
endconst
|
|
|
|
/*
 * PRED16X16_PLANE - gradient stage shared by every plane predictor in
 * this file.
 *
 * With top[i] = src[i - stride] and left[i] = src[i * stride - 1]:
 *
 *   In:      a0 = src, a1 = stride (neither is modified)
 *   Out:     t3 = sum over k = 1..8 of k * (top[7 + k]  - top[7 - k])
 *            t4 = sum over k = 1..8 of k * (left[7 + k] - left[7 - k])
 *            t1 = &left[15]
 *            t2 = &left[-1] (the pixel above-left of the block)
 *   Scratch: t0, t5, t6, t7, t8, vr0-vr3
 *
 * The k = 1 terms are formed in scalar code. The remaining top-row terms
 * go through one LSX multiply / horizontal-add chain; the remaining
 * left-column terms are gathered one row at a time.
 *
 * NOTE(review): the 8-byte load of top[9..16] covers one byte more than
 * the seven differences need. That eighth lane is weighted by the halfword
 * following the seven mulk entries, which therefore has to read as zero.
 */
.macro PRED16X16_PLANE
    // Anchor pointers
    sub.d         t0,    a0,    a1
    addi.d        t0,    t0,    7             // t0 = &top[7]
    slli.d        t6,    a1,    1             // t6 = 2 * stride
    alsl.d        t1,    a1,    a0,    3      // a0 + 8 * stride
    addi.d        t1,    t1,    -1            // t1 = &left[8]
    sub.d         t2,    t1,    t6            // t2 = &left[6]

    // k = 1 term of each gradient
    ld.bu         t3,    t0,    1
    ld.bu         t4,    t0,    -1
    sub.d         t3,    t3,    t4            // t3 = top[8] - top[6]
    ld.bu         t5,    t1,    0
    ld.bu         t7,    t2,    0
    sub.d         t4,    t5,    t7            // t4 = left[8] - left[6]

    // Top row, k = 2..8: halfword lane i ends up as top[9 + i] - top[5 - i]
    la.local      t5,    mulk
    vld           vr0,   t5,    0             // halfword weights
    la.local      t5,    shufa
    fld.d         f3,    t5,    0             // byte selector
    fld.d         f1,    t0,    2             // top[9..16]
    fld.d         f2,    t0,    -8            // top[-1..6]
    vshuf.b       vr2,   vr2,   vr2,   vr3    // reversed: top[5], top[4], ...
    vilvl.b       vr1,   vr1,   vr2           // pair each right byte with its left byte
    vhsubw.hu.bu  vr1,   vr1,   vr1           // odd byte minus even byte, per halfword
    vmul.h        vr0,   vr0,   vr1           // apply the weights
    vhaddw.w.h    vr1,   vr0,   vr0           // reduce 8 halfwords ...
    vhaddw.d.w    vr0,   vr1,   vr1
    vhaddw.q.d    vr1,   vr0,   vr0           // ... to a single sum
    vpickve2gr.w  t5,    vr1,   0
    add.d         t3,    t3,    t5

    // Left column, k = 2: t5 starts the weighted sum
    sub.d         t2,    t2,    a1            // t2 = &left[5]
    ldx.bu        t7,    t1,    a1            // left[9]
    ld.bu         t8,    t2,    0
    sub.d         t5,    t7,    t8
    slli.d        t5,    t5,    1             // 2 * difference

    // k = 3, 4
    add.d         t1,    t1,    t6            // t1 = &left[10]
    sub.d         t2,    t2,    a1            // t2 = &left[4]
    ld.bu         t7,    t1,    0
    ld.bu         t8,    t2,    0
    sub.d         t7,    t7,    t8
    alsl.d        t7,    t7,    t7,    1      // 3 * difference
    add.d         t5,    t5,    t7
    sub.d         t2,    t2,    a1            // t2 = &left[3]
    ldx.bu        t7,    t1,    a1            // left[11]
    ld.bu         t8,    t2,    0
    sub.d         t7,    t7,    t8
    alsl.d        t5,    t7,    t5,    2      // t5 += 4 * difference

    // k = 5, 6
    add.d         t1,    t1,    t6            // t1 = &left[12]
    sub.d         t2,    t2,    a1            // t2 = &left[2]
    ld.bu         t7,    t1,    0
    ld.bu         t8,    t2,    0
    sub.d         t7,    t7,    t8
    alsl.d        t7,    t7,    t7,    2      // 5 * difference
    add.d         t5,    t5,    t7
    sub.d         t2,    t2,    a1            // t2 = &left[1]
    ldx.bu        t7,    t1,    a1            // left[13]
    ld.bu         t8,    t2,    0
    sub.d         t7,    t7,    t8
    alsl.d        t7,    t7,    t7,    1      // 3 * difference
    alsl.d        t5,    t7,    t5,    1      // t5 += 6 * difference

    // k = 7, 8
    add.d         t1,    t1,    t6            // t1 = &left[14]
    sub.d         t2,    t2,    a1            // t2 = &left[0]
    ld.bu         t7,    t1,    0
    ld.bu         t8,    t2,    0
    sub.d         t7,    t7,    t8
    slli.d        t8,    t7,    3
    sub.d         t7,    t8,    t7            // 7 * difference
    add.d         t5,    t5,    t7
    sub.d         t2,    t2,    a1            // t2 = &left[-1]
    ldx.bu        t7,    t1,    a1            // left[15]
    ld.bu         t8,    t2,    0
    sub.d         t7,    t7,    t8
    slli.d        t7,    t7,    3             // 8 * difference
    add.d         t5,    t5,    t7

    add.d         t4,    t4,    t5            // fold k = 2..8 into the k = 1 term
    add.d         t1,    t1,    a1            // t1 = &left[15]
.endm
|
|
|
|
/*
 * PRED16X16_PLANE_END - LSX output stage: fill the 16x16 block.
 *
 *   In:      a0 = src, a1 = stride
 *            t1, t2 = pointers as left by PRED16X16_PLANE; the bytes at
 *                     t1 and at t2 + 16 are read
 *            t3 = step added per column, t4 = step added per row
 *   Out:     16 rows of 16 bytes stored at a0, a0 + stride, ...;
 *            a0 is advanced by 16 * stride
 *   Scratch: t5, t7, t8, vr3-vr9
 *
 * With base = 16 * (byte[t1] + byte[t2 + 16] + 1) - 7 * (t3 + t4), the
 * pixel at row r, column c is (base + r * t4 + c * t3) >> 5, saturated to
 * an unsigned byte. The per-pixel sums are formed in 16-bit lanes.
 */
.macro PRED16X16_PLANE_END
    add.d         t7,    t3,    t4
    slli.d        t8,    t7,    3
    sub.d         t7,    t8,    t7            // t7 = 7 * (t3 + t4)
    ld.bu         t5,    t1,    0
    ld.bu         t8,    t2,    16
    add.d         t5,    t5,    t8
    addi.d        t5,    t5,    1
    slli.d        t5,    t5,    4             // 16 * (sum of both bytes + 1)
    sub.d         t5,    t5,    t7            // t5 = base of row 0

    la.local      t8,    mulh
    vld           vr3,   t8,    0             // halfwords 0..7
    vreplgr2vr.h  vr4,   t3
    vmul.h        vr5,   vr3,   vr4           // lane c = c * t3
    slli.d        t8,    t3,    3
    vreplgr2vr.h  vr9,   t8                   // 8 * t3 in every lane

.rept 16
    vreplgr2vr.h  vr6,   t5                   // row base in every lane
    add.d         t5,    t5,    t4            // base of the next row
    vadd.h        vr7,   vr6,   vr5           // columns 0..7
    vadd.h        vr8,   vr7,   vr9           // columns 8..15
    vssrani.bu.h  vr8,   vr7,   5             // >> 5, saturate, pack vr7 low / vr8 high
    vst           vr8,   a0,    0
    add.d         a0,    a0,    a1
.endr
.endm
|
|
|
|
/*
 * PRED16X16_PLANE_END_LASX - LASX output stage: fill the 16x16 block two
 * rows per step.
 *
 *   In:      a0 = src, a1 = stride
 *            t1, t2 = pointers as left by PRED16X16_PLANE; the bytes at
 *                     t1 and at t2 + 16 are read
 *            t3 = step added per column, t4 = step added per row
 *   Out:     16 rows of 16 bytes stored; a0 is advanced by 16 * stride
 *   Scratch: t5, t7, t8, xr3-xr9
 *
 * With base = 16 * (byte[t1] + byte[t2 + 16] + 1) - 7 * (t3 + t4), the
 * pixel at row r, column c is (base + r * t4 + c * t3) >> 5, saturated to
 * an unsigned byte. The per-pixel sums are formed in 16-bit lanes.
 *
 * After the narrowing step each 128-bit half of xr9 holds eight bytes of
 * the first row followed by eight bytes of the second row, so 64-bit
 * elements 0 and 2 make up the first row and elements 1 and 3 the second.
 */
.macro PRED16X16_PLANE_END_LASX
    add.d         t7,    t3,    t4
    slli.d        t8,    t7,    3
    sub.d         t7,    t8,    t7            // t7 = 7 * (t3 + t4)
    ld.bu         t5,    t1,    0
    ld.bu         t8,    t2,    16
    add.d         t5,    t5,    t8
    addi.d        t5,    t5,    1
    slli.d        t5,    t5,    4             // 16 * (sum of both bytes + 1)
    sub.d         t5,    t5,    t7            // t5 = base of row 0

    la.local      t8,    mulh
    xvld          xr3,   t8,    0             // halfwords 0..15
    xvreplgr2vr.h xr4,   t3
    xvmul.h       xr5,   xr3,   xr4           // lane c = c * t3

.rept 8
    xvreplgr2vr.h xr6,   t5                   // base of the first row
    add.d         t5,    t5,    t4
    xvreplgr2vr.h xr8,   t5                   // base of the second row
    add.d         t5,    t5,    t4
    xvadd.h       xr7,   xr6,   xr5           // first row, columns 0..15
    xvadd.h       xr9,   xr8,   xr5           // second row, columns 0..15

    xvssrani.bu.h xr9,   xr7,   5             // >> 5, saturate, pack both rows
    vstelm.d      vr9,   a0,    0,     0      // first row, columns 0..7
    xvstelm.d     xr9,   a0,    8,     2      // first row, columns 8..15
    add.d         a0,    a0,    a1
    vstelm.d      vr9,   a0,    0,     1      // second row, columns 0..7
    xvstelm.d     xr9,   a0,    8,     3      // second row, columns 8..15
    add.d         a0,    a0,    a1
.endr
.endm
|
|
|
|
/*
 * void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride)
 *
 * 16x16 plane prediction with H.264 gradient scaling, LSX store path.
 *   a0 = src, a1 = stride
 *
 * PRED16X16_PLANE leaves the two raw gradient sums in t3 and t4. Each one
 * is scaled as (5 * g + 32) >> 6 (arithmetic shift) before
 * PRED16X16_PLANE_END writes the 16 rows.
 */
function ff_h264_pred16x16_plane_h264_8_lsx
    PRED16X16_PLANE

    alsl.d        t3,    t3,    t3,    2      // 5 * t3
    addi.d        t3,    t3,    32
    srai.d        t3,    t3,    6
    alsl.d        t4,    t4,    t4,    2      // 5 * t4
    addi.d        t4,    t4,    32
    srai.d        t4,    t4,    6

    PRED16X16_PLANE_END
endfunc
|
|
|
|
/*
 * void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride)
 *
 * 16x16 plane prediction with RV40 gradient scaling, LSX store path.
 *   a0 = src, a1 = stride
 *
 * PRED16X16_PLANE leaves the two raw gradient sums in t3 and t4. Each one
 * is scaled as (g + (g >> 2)) >> 4 (arithmetic shifts) before
 * PRED16X16_PLANE_END writes the 16 rows.
 */
function ff_h264_pred16x16_plane_rv40_8_lsx
    PRED16X16_PLANE

    srai.d        t7,    t3,    2
    srai.d        t8,    t4,    2
    add.d         t3,    t3,    t7
    add.d         t4,    t4,    t8
    srai.d        t3,    t3,    4
    srai.d        t4,    t4,    4

    PRED16X16_PLANE_END
endfunc
|
|
|
|
/*
 * void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride)
 *
 * 16x16 plane prediction with SVQ3 gradient scaling, LSX store path.
 *   a0 = src, a1 = stride
 *
 * PRED16X16_PLANE leaves the two raw gradient sums in t3 and t4. Each one
 * is scaled as (5 * (g / 4)) / 16 using signed divisions (div.d), and the
 * two results change places: the value derived from t3 is handed to
 * PRED16X16_PLANE_END in t4 and the value derived from t4 in t3.
 */
function ff_h264_pred16x16_plane_svq3_8_lsx
    PRED16X16_PLANE

    li.d          t6,    4
    li.d          t8,    16
    div.d         t0,    t3,    t6            // t3 / 4
    div.d         t5,    t4,    t6            // t4 / 4
    alsl.d        t0,    t0,    t0,    2      // * 5
    alsl.d        t5,    t5,    t5,    2      // * 5
    div.d         t4,    t0,    t8            // scaled t3 sum lands in t4
    div.d         t3,    t5,    t8            // scaled t4 sum lands in t3

    PRED16X16_PLANE_END
endfunc
|
|
|
|
/*
 * void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
 *
 * 16x16 plane prediction with H.264 gradient scaling, LASX store path.
 *   a0 = src, a1 = stride
 *
 * PRED16X16_PLANE leaves the two raw gradient sums in t3 and t4. Each one
 * is scaled as (5 * g + 32) >> 6 (arithmetic shift) before
 * PRED16X16_PLANE_END_LASX writes the 16 rows.
 */
function ff_h264_pred16x16_plane_h264_8_lasx
    PRED16X16_PLANE

    alsl.d        t3,    t3,    t3,    2      // 5 * t3
    addi.d        t3,    t3,    32
    srai.d        t3,    t3,    6
    alsl.d        t4,    t4,    t4,    2      // 5 * t4
    addi.d        t4,    t4,    32
    srai.d        t4,    t4,    6

    PRED16X16_PLANE_END_LASX
endfunc
|
|
|
|
/*
 * void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
 *
 * 16x16 plane prediction with RV40 gradient scaling, LASX store path.
 *   a0 = src, a1 = stride
 *
 * PRED16X16_PLANE leaves the two raw gradient sums in t3 and t4. Each one
 * is scaled as (g + (g >> 2)) >> 4 (arithmetic shifts) before
 * PRED16X16_PLANE_END_LASX writes the 16 rows.
 */
function ff_h264_pred16x16_plane_rv40_8_lasx
    PRED16X16_PLANE

    srai.d        t7,    t3,    2
    srai.d        t8,    t4,    2
    add.d         t3,    t3,    t7
    add.d         t4,    t4,    t8
    srai.d        t3,    t3,    4
    srai.d        t4,    t4,    4

    PRED16X16_PLANE_END_LASX
endfunc
|
|
|
|
/*
 * void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
 *
 * 16x16 plane prediction with SVQ3 gradient scaling, LASX store path.
 *   a0 = src, a1 = stride
 *
 * PRED16X16_PLANE leaves the two raw gradient sums in t3 and t4. Each one
 * is scaled as (5 * (g / 4)) / 16 using signed divisions (div.d), and the
 * two results change places: the value derived from t3 is handed to
 * PRED16X16_PLANE_END_LASX in t4 and the value derived from t4 in t3.
 */
function ff_h264_pred16x16_plane_svq3_8_lasx
    PRED16X16_PLANE

    li.d          t6,    4
    li.d          t8,    16
    div.d         t0,    t3,    t6            // t3 / 4
    div.d         t5,    t4,    t6            // t4 / 4
    alsl.d        t0,    t0,    t0,    2      // * 5
    alsl.d        t5,    t5,    t5,    2      // * 5
    div.d         t4,    t0,    t8            // scaled t3 sum lands in t4
    div.d         t3,    t5,    t8            // scaled t4 sum lands in t3

    PRED16X16_PLANE_END_LASX
endfunc
|