mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-08 13:22:53 +02:00
d521b7280c
C908: vp9_tm_4x4_8bpp_c: 116.5 vp9_tm_4x4_8bpp_rvv_i32: 43.5 vp9_tm_8x8_8bpp_c: 416.2 vp9_tm_8x8_8bpp_rvv_i32: 86.0 vp9_tm_16x16_8bpp_c: 1665.5 vp9_tm_16x16_8bpp_rvv_i32: 187.2 vp9_tm_32x32_8bpp_c: 6974.2 vp9_tm_32x32_8bpp_rvv_i32: 625.7 Signed-off-by: Rémi Denis-Courmont <remi@remlab.net>
294 lines
7.7 KiB
ArmAsm
294 lines
7.7 KiB
ArmAsm
/*
 * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
|
|
|
#include "libavutil/riscv/asm.S"
|
|
|
|
/*
 * avgdc size
 *
 * Finish a rounded average and leave the result in t1.
 *   In:   v8  = 8-bit elements still to be added (caller's vl/vtype apply)
 *         v16 = element 0 holds the 16-bit running sum
 *   Out:  t1  = (sum + (1 << (size - 1))) >> size
 *   Side effect: vl/vtype are left at vl=1, e16, m1.
 *   Clobbers: v16, t1.
 */
.macro avgdc size
        vwredsumu.vs v16, v8, v16               // v16[0] += sum of v8, widened to 16 bits
        vsetivli     zero, 1, e16, m1, ta, ma   // single 16-bit element, to read v16[0]
        vmv.x.s      t1, v16
        addi         t1, t1, 1 << (\size - 1)   // rounding bias: half of 2^size
        srai         t1, t1, \size              // divide by 2^size
.endm
|
|
|
|
/*
 * getdc type size
 *
 * Leave the fill value for a constant-fill predictor in t1.
 *   type = top    rounded average of the bytes loaded from (a3)
 *   type = left   rounded average of the bytes loaded from (a2)
 *   type = dc     rounded average of the bytes from (a2) and (a3) together
 *   other         type is taken as an immediate (li t1, type); size is unused
 *
 * size is the shift handed to avgdc; in every invocation in this file it is
 * log2 of the number of bytes being averaged.
 *
 * The caller must have selected vl and an e8 vtype. For top/left/dc the
 * macro returns with vl=1, e16, m1 (set by avgdc) and clobbers v8 and v16.
 */
.macro getdc type size
    .ifc \type,top
        vmv.v.x      v16, zero                  // running sum = 0
        vle8.v       v8, (a3)
        avgdc        \size
    .else
    .ifc \type,left
        vmv.v.x      v16, zero                  // running sum = 0
        vle8.v       v8, (a2)
        avgdc        \size
    .else
    .ifc \type,dc
        vmv.v.x      v16, zero                  // running sum = 0
        vle8.v       v8, (a2)
        vwredsumu.vs v16, v8, v16               // fold in the (a2) bytes first
        vle8.v       v8, (a3)
        avgdc        \size                      // adds the (a3) bytes, then averages
    .else
        li           t1, \type                  // literal fill value
    .endif
    .endif
    .endif
.endm
|
|
|
|
/*
 * dc_e32 type size n restore
 *
 * Function body (ending in ret) of a size x size constant fill, used here
 * for size = 16 and size = 32.
 *   type, n   forwarded to getdc (source selector or literal, and shift)
 *   restore   1 re-issues the row-wide vsetvli after getdc. The invocations
 *             in this file pass 1 exactly when getdc goes through avgdc,
 *             which leaves vl=1, e16, m1.
 *
 * Registers: a0 = destination row pointer, advanced by a1 after every row;
 *            a1 = row stride; t0 = 32 when size is 32; t1 = fill value.
 * Each row is written by one unit-stride vse8.v of size bytes; the row loop
 * is fully unrolled with .rept.
 */
.macro dc_e32 type size n restore
    .ifc \size,32
        li           t0, 32
        vsetvli      zero, t0, e8, m2, ta, ma
    .else
        vsetivli     zero, 16, e8, m1, ta, ma
    .endif
        getdc        \type \n

    .if \restore == 1 && \size == 32
        vsetvli      zero, t0, e8, m2, ta, ma
    .elseif \restore == 1 && \size == 16
        vsetivli     zero, 16, e8, m1, ta, ma
    .endif
        vmv.v.x      v0, t1                     // splat the fill byte over one row

    .rept \size
        vse8.v       v0, (a0)
        add          a0, a0, a1
    .endr

        ret
.endm
|
|
|
|
/*
 * dc_e64 type size n restore
 *
 * Function body (ending in ret) of an 8x8 constant fill.
 *
 * getdc runs with vl=8, e8, mf2. The fill byte in t1 is then splat into the
 * v0 group under an AVL of 64 at e8/m4, and the whole block is written by a
 * single strided vsse64.v with vl=8: eight 64-bit elements, stride a1, so
 * one element per 8-byte row.
 *
 * size and restore are accepted but not referenced in this macro.
 *
 * NOTE(review): covering all 64 bytes needs VLMAX >= 64 at e8/m4, i.e.
 * VLEN >= 128; presumably guaranteed by the init code that selects these
 * functions -- confirm.
 */
.macro dc_e64 type size n restore
        vsetivli     zero, 8, e8, mf2, ta, ma
        getdc        \type \n

        li           t0, 64
        vsetvli      zero, t0, e8, m4, ta, ma
        vmv.v.x      v0, t1                     // fill byte replicated across v0 group
        vsetivli     zero, 8, e8, mf2, ta, ma   // vl = 8 rows for the strided store
        vsse64.v     v0, (a0), a1               // 8 bytes per row, rows a1 apart

        ret
.endm
|
|
|
|
/*
 * func_dc name size type n restore ext
 *
 * Emit the function ff_<name>_<size>x<size>_rvv; ext is passed as the second
 * argument of the func macro. Size 8 expands dc_e64, every other size
 * expands dc_e32; type, n and restore are forwarded unchanged.
 */
.macro func_dc name size type n restore ext
func ff_\()\name\()_\()\size\()x\size\()_rvv, \ext
    .if \size == 8
        dc_e64 \type \size \n \restore
    .else
        dc_e32 \type \size \n \restore
    .endif
endfunc
.endm
|
|
|
|
/*
 * Instantiations, as: func_dc name size type n restore ext
 *
 * dc_127/dc_128/dc_129 fill with the literal 127/128/129 (n and restore are
 * unused, passed as 0). dc averages the bytes at (a2) and (a3), dc_left only
 * those at (a2), dc_top only those at (a3); n is the averaging shift. The
 * 8x8 variants use zve64x because dc_e64 stores with vsse64.v.
 */
func_dc dc_127  32  127  0  0  zve32x
func_dc dc_127  16  127  0  0  zve32x
func_dc dc_127  8   127  0  0  zve64x
func_dc dc_128  32  128  0  0  zve32x
func_dc dc_128  16  128  0  0  zve32x
func_dc dc_128  8   128  0  0  zve64x
func_dc dc_129  32  129  0  0  zve32x
func_dc dc_129  16  129  0  0  zve32x
func_dc dc_129  8   129  0  0  zve64x
func_dc dc      32  dc   6  1  zve32x
func_dc dc      16  dc   5  1  zve32x
func_dc dc      8   dc   4  0  zve64x
func_dc dc_left 32  left 5  1  zve32x
func_dc dc_left 16  left 4  1  zve32x
func_dc dc_left 8   left 3  0  zve64x
func_dc dc_top  32  top  5  1  zve32x
func_dc dc_top  16  top  4  1  zve32x
func_dc dc_top  8   top  3  0  zve64x
|
|
|
|
/*
 * ff_h_32x32_rvv
 *
 * In:  a0 = destination, a1 = row stride,
 *      a2 = source bytes (the pointer getdc reads for type left)
 *
 * Output row r (0..31) is 32 copies of a2[31 - r]: a2 is first moved to its
 * last byte and then walked downwards.
 * The work is split into two passes of 16 rows: each pass splats 16 bytes
 * into the even-numbered register groups (e8, m2) and then stores them.
 *
 * Clobbers: a0, a2, t0, t1, v0-v31.
 * NOTE(review): presumably called with the VP9 intra-pred prototype
 * (dst, stride, left, top) -- confirm against the C declaration.
 */
func ff_h_32x32_rvv, zve32x
        li           t0, 32
        addi         a2, a2, 31                 // start at the last source byte
        vsetvli      zero, t0, e8, m2, ta, ma

    .rept 2
    .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
        lbu          t1, (a2)
        addi         a2, a2, -1
        vmv.v.x      v\n, t1
    .endr
    .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
        vse8.v       v\n, (a0)
        add          a0, a0, a1
    .endr
    .endr

        ret
endfunc
|
|
|
|
/*
 * ff_h_16x16_rvv
 *
 * In:  a0 = destination, a1 = row stride,
 *      a2 = source bytes (the pointer getdc reads for type left)
 *
 * Output row r (0..15) is 16 copies of a2[15 - r]. All 16 rows are splat
 * into v8-v23 first (e8, m1) and stored afterwards; the last store is kept
 * outside the loop so a0 is not advanced past the final row.
 *
 * Clobbers: a0, a2, t1, v8-v23.
 */
func ff_h_16x16_rvv, zve32x
        addi         a2, a2, 15                 // start at the last source byte
        vsetivli     zero, 16, e8, m1, ta, ma

    .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
        lbu          t1, (a2)
        addi         a2, a2, -1
        vmv.v.x      v\n, t1
    .endr
    .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
        vse8.v       v\n, (a0)
        add          a0, a0, a1
    .endr
        vse8.v       v23, (a0)

        ret
endfunc
|
|
|
|
/*
 * ff_h_8x8_rvv
 *
 * In:  a0 = destination, a1 = row stride,
 *      a2 = source bytes (the pointer getdc reads for type left)
 *
 * Output row r (0..7) is 8 copies of a2[7 - r]. The eight rows are splat
 * into v8-v15 (vl=8, e8, mf2) and then stored; the last store is kept
 * outside the loop so a0 is not advanced past the final row.
 *
 * Clobbers: a0, a2, t1, v8-v15.
 */
func ff_h_8x8_rvv, zve32x
        addi         a2, a2, 7                  // start at the last source byte
        vsetivli     zero, 8, e8, mf2, ta, ma

    .irp n 8, 9, 10, 11, 12, 13, 14, 15
        lbu          t1, (a2)
        addi         a2, a2, -1
        vmv.v.x      v\n, t1
    .endr
    .irp n 8, 9, 10, 11, 12, 13, 14
        vse8.v       v\n, (a0)
        add          a0, a0, a1
    .endr
        vse8.v       v15, (a0)

        ret
endfunc
|
|
|
|
/*
 * tm_sum4 dst1, dst2, dst3, dst4, top, n1
 *
 * Build four rows at once:
 *   dst1 = top + (a2[n1]     - a4)
 *   dst2 = top + (a2[n1 - 1] - a4)
 *   dst3 = top + (a2[n1 - 2] - a4)
 *   dst4 = top + (a2[n1 - 3] - a4)
 * using vector-scalar adds under the vl/vtype chosen by the caller.
 *
 * In:  a2 = byte array base, a4 = scalar subtracted from every loaded byte,
 *      top = vector added to each scalar difference.
 * dst4 is written last, so it may name the same register group as top; the
 * 32x32 and 16x16 callers in this file rely on that.
 * Clobbers: t1-t4.
 */
.macro tm_sum4 dst1, dst2, dst3, dst4, top, n1
        lbu          t1, \n1(a2)
        lbu          t2, (\n1-1)(a2)
        lbu          t3, (\n1-2)(a2)
        lbu          t4, (\n1-3)(a2)
        sub          t1, t1, a4
        sub          t2, t2, a4
        sub          t3, t3, a4
        sub          t4, t4, a4
        vadd.vx      \dst1, \top, t1
        vadd.vx      \dst2, \top, t2
        vadd.vx      \dst3, \top, t3
        vadd.vx      \dst4, \top, t4
.endm
|
|
|
|
/*
 * ff_tm_32x32_rvv
 *
 * In:  a0 = destination, a1 = row stride,
 *      a2 = byte array indexed 31 down to 0 (one byte per output row),
 *      a3 = 32-byte row; the byte at a3[-1] is read as well.
 *
 * Output: dst[r][x] = clamp(a3[x] + a2[31 - r] - a3[-1], 0, 255)
 * for r, x in 0..31. Sums are formed in 16 bits, limited below by a signed
 * max against zero and above by the unsigned saturation of vnclipu.wi.
 *
 * Four passes of eight rows (offset = 31, 23, 15, 7). Every pass re-issues
 * the e16/m4 vsetvli and reloads and re-widens the a3 row, because the
 * previous pass ended in e8/m2 and overwrote v28 (used as both the widened
 * row and the last destination of the second tm_sum4).
 *
 * Clobbers: a0, a4, t1-t5, v0-v31.
 */
func ff_tm_32x32_rvv, zve32x
        lbu          a4, -1(a3)
        li           t5, 32

    .irp offset 31, 23, 15, 7
        vsetvli      zero, t5, e16, m4, ta, ma
        vle8.v       v8, (a3)
        vzext.vf2    v28, v8                    // a3 row widened to 16 bits

        tm_sum4      v0, v4, v8, v12, v28, \offset
        tm_sum4      v16, v20, v24, v28, v28, (\offset-4)

    .irp n 0, 4, 8, 12, 16, 20, 24, 28
        vmax.vx      v\n, v\n, zero             // clamp negatives to 0
    .endr

        vsetvli      zero, zero, e8, m2, ta, ma
    .irp n 0, 4, 8, 12, 16, 20, 24, 28
        vnclipu.wi   v\n, v\n, 0                // narrow to 8 bits, saturating
        vse8.v       v\n, (a0)
        add          a0, a0, a1
    .endr
    .endr

        ret
endfunc
|
|
|
|
/*
 * ff_tm_16x16_rvv
 *
 * In:  a0 = destination, a1 = row stride,
 *      a2 = byte array indexed 15 down to 0 (one byte per output row),
 *      a3 = 16-byte row; the byte at a3[-1] is read as well.
 *
 * Output: dst[r][x] = clamp(a3[x] + a2[15 - r] - a3[-1], 0, 255)
 * for r, x in 0..15. All 16 rows are built at e16/m2 in the even-numbered
 * register groups; v30 holds the widened a3 row and becomes the last row.
 * The final narrow-and-store is kept outside the loop so a0 is not advanced
 * past the last row.
 *
 * Clobbers: a0, a4, t1-t4, v0-v31.
 */
func ff_tm_16x16_rvv, zve32x
        vsetivli     zero, 16, e16, m2, ta, ma
        vle8.v       v8, (a3)
        vzext.vf2    v30, v8                    // a3 row widened to 16 bits
        lbu          a4, -1(a3)

        tm_sum4      v0, v2, v4, v6, v30, 15
        tm_sum4      v8, v10, v12, v14, v30, 11
        tm_sum4      v16, v18, v20, v22, v30, 7
        tm_sum4      v24, v26, v28, v30, v30, 3

    .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
        vmax.vx      v\n, v\n, zero             // clamp negatives to 0
    .endr

        vsetvli      zero, zero, e8, m1, ta, ma
    .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28
        vnclipu.wi   v\n, v\n, 0                // narrow to 8 bits, saturating
        vse8.v       v\n, (a0)
        add          a0, a0, a1
    .endr
        vnclipu.wi   v30, v30, 0
        vse8.v       v30, (a0)

        ret
endfunc
|
|
|
|
/*
 * ff_tm_8x8_rvv
 *
 * In:  a0 = destination, a1 = row stride,
 *      a2 = byte array indexed 7 down to 0 (one byte per output row),
 *      a3 = 8-byte row; the byte at a3[-1] is read as well.
 *
 * Output: dst[r][x] = clamp(a3[x] + a2[7 - r] - a3[-1], 0, 255)
 * for r, x in 0..7. Rows are built at e16/m1 in v16-v23 from the widened
 * a3 row in v28, then narrowed at e8/mf2. The last row is narrowed into v24
 * rather than in place and stored without advancing a0.
 *
 * Clobbers: a0, a4, t1-t4, v8, v16-v24, v28.
 */
func ff_tm_8x8_rvv, zve32x
        vsetivli     zero, 8, e16, m1, ta, ma
        vle8.v       v8, (a3)
        vzext.vf2    v28, v8                    // a3 row widened to 16 bits
        lbu          a4, -1(a3)

        tm_sum4      v16, v17, v18, v19, v28, 7
        tm_sum4      v20, v21, v22, v23, v28, 3

    .irp n 16, 17, 18, 19, 20, 21, 22, 23
        vmax.vx      v\n, v\n, zero             // clamp negatives to 0
    .endr

        vsetvli      zero, zero, e8, mf2, ta, ma
    .irp n 16, 17, 18, 19, 20, 21, 22
        vnclipu.wi   v\n, v\n, 0                // narrow to 8 bits, saturating
        vse8.v       v\n, (a0)
        add          a0, a0, a1
    .endr
        vnclipu.wi   v24, v23, 0
        vse8.v       v24, (a0)

        ret
endfunc
|
|
|
|
/*
 * ff_tm_4x4_rvv
 *
 * In:  a0 = destination, a1 = row stride,
 *      a2 = byte array indexed 3 down to 0 (one byte per output row),
 *      a3 = 4-byte row; the byte at a3[-1] is read as well.
 *
 * Output: dst[r][x] = clamp(a3[x] + a2[3 - r] - a3[-1], 0, 255)
 * for r, x in 0..3. Rows are built at e16/mf2 in v16-v19 from the widened
 * a3 row in v28, then narrowed at e8/mf4. The last row is narrowed into v24
 * rather than in place and stored without advancing a0.
 *
 * Clobbers: a0, a4, t1-t4, v8, v16-v19, v24, v28.
 */
func ff_tm_4x4_rvv, zve32x
        vsetivli     zero, 4, e16, mf2, ta, ma
        vle8.v       v8, (a3)
        vzext.vf2    v28, v8                    // a3 row widened to 16 bits
        lbu          a4, -1(a3)

        tm_sum4      v16, v17, v18, v19, v28, 3

    .irp n 16, 17, 18, 19
        vmax.vx      v\n, v\n, zero             // clamp negatives to 0
    .endr

        vsetvli      zero, zero, e8, mf4, ta, ma
    .irp n 16, 17, 18
        vnclipu.wi   v\n, v\n, 0                // narrow to 8 bits, saturating
        vse8.v       v\n, (a0)
        add          a0, a0, a1
    .endr
        vnclipu.wi   v24, v19, 0
        vse8.v       v24, (a0)

        ret
endfunc
|