1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-11-21 10:55:51 +02:00

lavu/fixed_dsp: R-V V fmul_window_scaled

vector_fmul_window_scaled_fixed_c:       4393.7
vector_fmul_window_scaled_fixed_rvv_i64: 1642.7
This commit is contained in:
Rémi Denis-Courmont 2023-10-04 22:50:47 +03:00
parent e49f41fb27
commit e33ce0d9dd
2 changed files with 54 additions and 1 deletions

View File

@ -25,6 +25,9 @@
#include "libavutil/cpu.h"
#include "libavutil/fixed_dsp.h"
/* RISC-V Vector routine that ff_fixed_dsp_init_riscv() installs as
 * fdsp->vector_fmul_window_scaled when AV_CPU_FLAG_RVV_I32,
 * AV_CPU_FLAG_RVB_ADDR and AV_CPU_FLAG_RVV_I64 are all set.
 * Unlike the _fixed_ variant declared next, dst is int16_t and there is an
 * extra `bits` argument. */
void ff_vector_fmul_window_scaled_rvv(int16_t *dst, const int32_t *src0,
const int32_t *src1, const int32_t *win,
int len, uint8_t bits);
/* RISC-V Vector routine that ff_fixed_dsp_init_riscv() installs as
 * fdsp->vector_fmul_window under the same CPU-flag conditions
 * (RVV_I32 + RVB_ADDR + RVV_I64). dst is int32_t and no `bits` argument is
 * taken. */
void ff_vector_fmul_window_fixed_rvv(int32_t *dst, const int32_t *src0,
const int32_t *src1, const int32_t *win,
int len);
@ -43,8 +46,10 @@ av_cold void ff_fixed_dsp_init_riscv(AVFixedDSPContext *fdsp)
int flags = av_get_cpu_flags();
if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
if (flags & AV_CPU_FLAG_RVV_I64)
if (flags & AV_CPU_FLAG_RVV_I64) {
fdsp->vector_fmul_window_scaled = ff_vector_fmul_window_scaled_rvv;
fdsp->vector_fmul_window = ff_vector_fmul_window_fixed_rvv;
}
fdsp->vector_fmul = ff_vector_fmul_fixed_rvv;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_fixed_rvv;

View File

@ -20,6 +20,54 @@
#include "asm.S"
// Fixed-point windowed multiply with 16-bit output, RISC-V Vector version.
//
// Register use on entry (standard RISC-V calling convention applied to the C
// prototype (dst, src0, src1, win, len, bits)):
//   a0 = dst  (int16_t *), a1 = src0 (int32_t *), a2 = src1 (int32_t *),
//   a3 = win  (int32_t *), a4 = len,              a5 = bits
//
// For every i in [0, len) the loop produces
//   dst[i]             = narrow(src0[i] * win[2*len-1-i] - src1[len-1-i] * win[i])
//   dst[2*len - 1 - i] = narrow(src0[i] * win[i] + src1[len-1-i] * win[2*len-1-i])
// where the products are 64-bit and narrow() is a rounding, saturating right
// shift by 31 down to 32 bits followed by a rounding, saturating right shift
// by `bits` down to 16 bits.
//
// src0, the low half of win and the low half of dst are walked forwards;
// src1, the high half of win and the high half of dst are walked backwards.
//
// NOTE(review): `func`/`endfunc` come from asm.S (not visible here); the
// second macro argument presumably enables the Zve64x extension for this
// function, which the 64-bit element operations below require.
func ff_vector_fmul_window_scaled_rvv, zve64x
// vxrm = 0 selects round-to-nearest-up for the vnclip instructions below.
csrwi vxrm, 0
// Maximum vector length at e16/m1 so that vid.v fills the whole register.
// (t0 is only a scratch destination here; it is overwritten further down.)
vsetvli t0, zero, e16, m1, ta, ma
// a2 = src1 + 4 * len: one past the last 32-bit element of src1.
sh2add a2, a4, a2
// v0 = {0, 1, 2, ...}
vid.v v0
// t3 = win + 8 * len: one past the last of the 2 * len window elements.
sh3add t3, a4, a3
// v0 = {1, 2, 3, ...}; used to build the reversal index vector in the loop.
vadd.vi v0, v0, 1
// t0 = dst + 4 * len: one past the last of the 2 * len 16-bit outputs.
sh2add t0, a4, a0
1:
// t2 = number of elements handled in this iteration (at most len remaining).
vsetvli t2, a4, e16, m1, ta, ma
// t4 = byte size of t2 32-bit elements, t1 = byte size of t2 16-bit elements.
slli t4, t2, 2
slli t1, t2, 1
// v2 = t2 - v0 = {t2-1, t2-2, ..., 0}: 16-bit indices that reverse a chunk.
vrsub.vx v2, v0, t2
// Step the backward window pointer down by one chunk.
sub t3, t3, t4
// Switch to 32-bit elements; SEW/LMUL ratio is unchanged so vl stays t2.
vsetvli zero, zero, e32, m2, ta, ma
// Step the backward src1 pointer down by one chunk.
sub a2, a2, t4
// v8 = chunk from the high end of win (still in memory order).
vle32.v v8, (t3)
// Step the backward dst pointer down by one chunk.
sub t0, t0, t1
// v4 = chunk from the high end of src1 (still in memory order).
vle32.v v4, (a2)
// Remaining element count.
sub a4, a4, t2
// v28 = reversed high-window chunk.
vrgatherei16.vv v28, v8, v2
// v16 = forward src0 chunk.
vle32.v v16, (a1)
add a1, a1, t4
// v20 = reversed src1 chunk.
vrgatherei16.vv v20, v4, v2
// v24 = forward (low-end) window chunk.
vle32.v v24, (a3)
add a3, a3, t4
// 64-bit products: v12 = src0 * win_high_rev, v8 = src0 * win_low.
vwmul.vv v12, v16, v28
vwmul.vv v8, v16, v24
// vwnmsac.vv does _not_ exist so multiply & subtract separately
// v4 = src1_rev * win_low (to be subtracted from v12 below).
vwmul.vv v4, v20, v24
// v8 += src1_rev * win_high_rev  -> sum term destined for the high half of dst.
vwmacc.vv v8, v20, v28
// 64-bit subtraction needs e64/m4 (same vl).
vsetvli zero, zero, e64, m4, ta, ma
// v12 = src0 * win_high_rev - src1_rev * win_low -> low half of dst.
vsub.vx_placeholder_do_not_use
vsetvli zero, zero, e32, m2, ta, ma
// Narrow 64 -> 32 bits: shift right by 31 with rounding and saturation.
vnclip.wi v16, v8, 31
vnclip.wi v20, v12, 31
vsetvli zero, zero, e16, m1, ta, ma
// Narrow 32 -> 16 bits: shift right by `bits` (a5) with rounding and saturation.
vnclip.wx v8, v16, a5
vnclip.wx v12, v20, a5
// The sum term is stored through the backward pointer, so reverse it first.
vrgatherei16.vv v16, v8, v2
// Difference term -> forward half of dst.
vse16.v v12, (a0)
add a0, a0, t1
// Reversed sum term -> backward half of dst.
vse16.v v16, (t0)
// Loop until all len elements have been consumed.
bnez a4, 1b
ret
endfunc
func ff_vector_fmul_window_fixed_rvv, zve64x
csrwi vxrm, 0
vsetvli t0, zero, e16, m1, ta, ma