lavu/fixed_dsp: R-V V vector_fmul_window

2024-12-23 12:43:46 +02:00 · 2023-10-04 19:13:13 +03:00 · 2023-10-04 19:13:13 +03:00 · f39a8790e1
commit f39a8790e1
parent 10eb3b9c9f
2 changed files with 50 additions and 0 deletions
--- a/libavutil/riscv/fixed_dsp_init.c
+++ b/libavutil/riscv/fixed_dsp_init.c
@ -25,6 +25,9 @@
 #include "libavutil/cpu.h"
 #include "libavutil/fixed_dsp.h"
 /* RISC-V Vector implementation of AVFixedDSPContext.vector_fmul_window.
  * The body is written in assembly (fixed_dsp_rvv.S); the init code only
  * installs it when the 64-bit integer vector CPU flag is set. */
 void ff_vector_fmul_window_fixed_rvv(int32_t *dst, const int32_t *src0,
                                     const int32_t *src1, const int32_t *win,
                                     int len);
 void ff_vector_fmul_fixed_rvv(int *dst, const int *src0, const int *src1,
                              int len);
 void ff_vector_fmul_reverse_fixed_rvv(int *dst, const int *src0,
@ -40,6 +43,9 @@ av_cold void ff_fixed_dsp_init_riscv(AVFixedDSPContext *fdsp)
    int flags = av_get_cpu_flags();
    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
        if (flags & AV_CPU_FLAG_RVV_I64)
            fdsp->vector_fmul_window = ff_vector_fmul_window_fixed_rvv;
        fdsp->vector_fmul = ff_vector_fmul_fixed_rvv;
        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_fixed_rvv;
        fdsp->vector_fmul_add = ff_vector_fmul_add_fixed_rvv;
--- a/libavutil/riscv/fixed_dsp_rvv.S
+++ b/libavutil/riscv/fixed_dsp_rvv.S
@ -20,6 +20,50 @@
 #include "asm.S"
 // void ff_vector_fmul_window_fixed_rvv(int32_t *dst, const int32_t *src0,
 //                                      const int32_t *src1, const int32_t *win,
 //                                      int len)
 //
 // Fixed-point windowed multiply of two vectors. For every i in [0, len):
 //     dst[i]           = clip32(rnd((src0[i] * win[2*len-1-i]
 //                                    - src1[len-1-i] * win[i]) >> 31))
 //     dst[2*len-1-i]   = clip32(rnd((src0[i] * win[i]
 //                                    + src1[len-1-i] * win[2*len-1-i]) >> 31))
 // Products and their sum/difference are kept in 64 bits; the final shift
 // rounds to nearest (ties up) and saturates to 32 bits (vnclip).
 // dst and win are accessed over 2*len elements, src0 and src1 over len.
 //
 // Registers (standard RISC-V calling convention):
 //   a0 = dst (low half, ascending)      t0 = dst high half (descending)
 //   a1 = src0 (ascending)               a2 = src1 (walked from its end)
 //   a3 = win (low half, ascending)      t3 = win high half (descending)
 //   a4 = elements remaining             t2 = vl of the current pass
 //   t4 = vl * 4 (bytes per pass)
 //   v0 = {1, 2, 3, ...} (16-bit)        v2 = reversal indices vl-1-k
 //
 // NOTE(review): "zve64x" is an argument to the func macro from asm.S (not
 // visible here); presumably it enables the 64-bit integer vector subset
 // needed by the e64 section below - confirm against asm.S.
 func ff_vector_fmul_window_fixed_rvv, zve64x
        // Fixed-point rounding mode 0 = round-to-nearest-up, used by vnclip.
        csrwi   vxrm, 0
        // Build the constant index vector with the maximum vector length.
        vsetvli t0, zero, e16, m1, ta, ma
        // a2 = &src1[len]: src1 is consumed backwards from its end.
        sh2add  a2, a4, a2
        vid.v   v0
        // t3 = &win[2 * len]: the upper window half is consumed backwards.
        sh3add  t3, a4, a3
        // v0[k] = k + 1, so that (vl - v0[k]) yields vl - 1 - k in the loop.
        vadd.vi v0, v0, 1
        // t0 = &dst[2 * len]: the upper output half is written backwards.
        // (This overwrites the unused vl value returned in t0 above.)
        sh3add  t0, a4, a0
 1:
        // t2 = number of elements handled in this pass.
        vsetvli t2, a4, e16, m1, ta, ma
        slli    t4, t2, 2
        // v2[k] = vl - 1 - k: gather indices that reverse a vl-long vector.
        vrsub.vx v2, v0, t2
        sub     t3, t3, t4
        // Same vl, 32-bit elements (e32/m2 keeps the SEW/LMUL ratio of e16/m1).
        vsetvli zero, zero, e32, m2, ta, ma
        sub     a2, a2, t4
        // v8 = next chunk of the upper window half (still in memory order).
        vle32.v v8, (t3)
        sub     t0, t0, t4
        // v4 = next chunk of src1, taken from the end (still in memory order).
        vle32.v v4, (a2)
        sub     a4, a4, t2
        // v28 = upper window chunk reversed, i.e. win[2*len-1-i].
        vrgatherei16.vv v28, v8, v2
        // v16 = src0[i].
        vle32.v v16, (a1)
        add     a1, a1, t4
        // v20 = src1 chunk reversed, i.e. src1[len-1-i].
        vrgatherei16.vv v20, v4, v2
        // v24 = win[i].
        vle32.v v24, (a3)
        add     a3, a3, t4
        // Widening multiplies: 64-bit results occupy 4-register groups.
        // v12 = src0[i] * win[2*len-1-i]
        vwmul.vv v12, v16, v28
        // v8 = src0[i] * win[i] (v8 is free: its reversed copy is in v28)
        vwmul.vv v8, v16, v24
        // vwnmsac.vv does _not_ exist so multiply & subtract separately
        // v4 = src1[len-1-i] * win[i] (v4 is free: its reversed copy is in v20)
        vwmul.vv v4, v20, v24
        // v8 += src1[len-1-i] * win[2*len-1-i]
        vwmacc.vv v8, v20, v28
        // 64-bit subtract: v12 = src0[i]*win[2*len-1-i] - src1[len-1-i]*win[i]
        vsetvli zero, zero, e64, m4, ta, ma
        vsub.vv v12, v12, v4
        vsetvli zero, zero, e32, m2, ta, ma
        // Narrow 64 -> 32 bits: rounding shift right by 31 with saturation.
        vnclip.wi v16, v8, 31
        vnclip.wi v20, v12, 31
        // The "sum" results belong to the upper half in descending order,
        // so reverse them before the contiguous store at t0.
        vrgatherei16.vv v8, v16, v2
        // Lower half of the output, ascending.
        vse32.v v20, (a0)
        add     a0, a0, t4
        // Upper half of the output.
        vse32.v v8, (t0)
        // Loop until no elements remain.
        bnez    a4, 1b
        ret
 endfunc
 func ff_vector_fmul_fixed_rvv, zve32x
        csrwi   vxrm, 0
 1: