mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-04-08 16:54:03 +02:00
lavc/opusdsp: rewrite R-V V postfilter
This uses a more traditional approach, allowing processing of up to period minus two elements per iteration. This also allows the algorithm to work for any and all vector lengths. As the T-Head C908 device under test can load 16 elements per loop iteration, there is unsurprisingly a small performance drop when the period is minimal and the parallelism is capped at 13 elements.
Before:
postfilter_15_c: 21222.2
postfilter_15_rvv_f32: 22007.7
postfilter_512_c: 20189.7
postfilter_512_rvv_f32: 22004.2
postfilter_1022_c: 20189.7
postfilter_1022_rvv_f32: 22004.2
After:
postfilter_15_c: 20189.5
postfilter_15_rvv_f32: 7057.2
postfilter_512_c: 20189.5
postfilter_512_rvv_f32: 5667.2
postfilter_1022_c: 20192.7
postfilter_1022_rvv_f32: 5667.2
This commit is contained in:
parent
02594c8c01
commit
adc87a5f7c
@ -25,30 +25,15 @@
|
||||
#include "libavutil/riscv/cpu.h"
|
||||
#include "libavcodec/opusdsp.h"
|
||||
|
||||
/*
 * Opus postfilter entry points. All five share one signature:
 * (float *data, int period, float *g, int len).
 *
 * The size-suffixed variants are the ones ff_opus_dsp_init_riscv() picks
 * between in its switch on ff_get_rv_vlenb(); the suffix is eight times the
 * matching case value (16 -> _128, 32 -> _256, 64 -> _512).
 * NOTE(review): _1024 is declared but never assigned by that switch -- confirm
 * whether that is intentional.
 */
void ff_opus_postfilter_rvv_128(float *data, int period, float *g, int len);
|
||||
void ff_opus_postfilter_rvv_256(float *data, int period, float *g, int len);
|
||||
void ff_opus_postfilter_rvv_512(float *data, int period, float *g, int len);
|
||||
void ff_opus_postfilter_rvv_1024(float *data, int period, float *g, int len);
|
||||
/*
 * Unsuffixed variant: assigned by ff_opus_dsp_init_riscv() when the
 * AV_CPU_FLAG_RVV_F32, AV_CPU_FLAG_RVB_ADDR and AV_CPU_FLAG_RVB_BASIC flags
 * are all set.
 */
void ff_opus_postfilter_rvv(float *data, int period, float *g, int len);
|
||||
|
||||
/*
 * Install RISC-V specific implementations into the OpusDSP function table.
 *
 * Only d->postfilter is written here, and only when HAVE_RVV is defined and
 * the runtime CPU flags allow it; otherwise *d is left untouched.
 *
 * NOTE(review): the enclosing hunk header reads "-25,30 +25,15", so this
 * listing appears to interleave removed and added diff lines without +/-
 * markers. Presumably the switch is the removed side and the three-flag check
 * is the added side -- confirm against the real file before relying on the
 * combined control flow shown here.
 */
av_cold void ff_opus_dsp_init_riscv(OpusDSP *d)
|
||||
{
|
||||
#if HAVE_RVV
|
||||
/* Runtime CPU feature bits; tested against AV_CPU_FLAG_* below. */
int flags = av_get_cpu_flags();
|
||||
|
||||
/* With RVV_F32 set, choose a variant from ff_get_rv_vlenb(). */
if (flags & AV_CPU_FLAG_RVV_F32)
|
||||
/* Values other than 16/32/64/128 match no case and assign nothing. */
switch (ff_get_rv_vlenb()) {
|
||||
case 16:
|
||||
d->postfilter = ff_opus_postfilter_rvv_128;
|
||||
break;
|
||||
case 32:
|
||||
d->postfilter = ff_opus_postfilter_rvv_256;
|
||||
break;
|
||||
case 64:
|
||||
d->postfilter = ff_opus_postfilter_rvv_512;
|
||||
break;
|
||||
/* NOTE(review): 128 maps to the _512 variant, not _1024 -- confirm intent. */
case 128:
|
||||
d->postfilter = ff_opus_postfilter_rvv_512;
|
||||
break;
|
||||
}
|
||||
/*
 * The single generic routine needs all three flags: RVV_F32 plus RVB_ADDR
 * and RVB_BASIC. As written, it overrides any assignment made above.
 */
if ((flags & AV_CPU_FLAG_RVV_F32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
|
||||
(flags & AV_CPU_FLAG_RVB_BASIC))
|
||||
d->postfilter = ff_opus_postfilter_rvv;
|
||||
#endif
|
||||
}
|
||||
|
@ -20,56 +20,47 @@
|
||||
|
||||
#include "libavutil/riscv/asm.S"
|
||||
|
||||
// Entry stub for the "_128" variant: sets a5 from an e32/m2 vtype description
// and jumps forward to the next local label 1, where the loop body uses a5 as
// the vtype operand of vsetvl.
// NOTE(review): lvtypei is not defined in this listing; presumably a macro
// from libavutil/riscv/asm.S that loads the encoded vtype -- confirm.
func ff_opus_postfilter_rvv_128, zve32f
|
||||
lvtypei a5, e32, m2, ta, ma
|
||||
j 1f
|
||||
endfunc
|
||||
// Prologue of the unsuffixed postfilter: loads three floats from a2 into
// fa0..fa2 and derives the starting source pointer t0 from a0 and a1.
// Going by the comments on these lines, a0 is data, a1 is period and a2 is g.
// NOTE(review): this func has no endfunc directly after it in this listing;
// its remaining instructions appear interleaved with other lines further
// down -- confirm against the real file.
func ff_opus_postfilter_rvv, zve32f
|
||||
flw fa0, 0(a2) // g0
|
||||
// t1 = a1 * 4, i.e. a1 floats expressed in bytes
slli t1, a1, 2
|
||||
flw fa1, 4(a2) // g1
|
||||
// t0 = a0 - t1
sub t0, a0, t1
|
||||
flw fa2, 8(a2) // g2
|
||||
addi t0, t0, 2 * 4 // data - (period - 2) = initial &x0
|
||||
|
||||
// Entry stub for the "_512" variant: sets a5 from an e32/mf2 vtype description
// and jumps forward to the next local label 1.
// NOTE(review): lvtypei is not defined in this listing; presumably a macro
// from libavutil/riscv/asm.S that loads the encoded vtype -- confirm.
func ff_opus_postfilter_rvv_512, zve32f
|
||||
lvtypei a5, e32, mf2, ta, ma
|
||||
j 1f
|
||||
endfunc
|
||||
|
||||
// Entry stub for the "_1024" variant: sets a5 from an e32/mf4 vtype
// description and jumps forward to the next local label 1.
// NOTE(review): lvtypei is not defined in this listing; presumably a macro
// from libavutil/riscv/asm.S that loads the encoded vtype -- confirm.
func ff_opus_postfilter_rvv_1024, zve32f
|
||||
lvtypei a5, e32, mf4, ta, ma
|
||||
j 1f
|
||||
endfunc
|
||||
|
||||
// NOTE(review): the enclosing hunk header reads "-20,56 +20,47", so this span
// appears to interleave removed and added diff lines without +/- markers.
// Two different loop bodies follow label 1 below: one driven by a4/a5 with
// local labels 2 and 3, and one driven by t1/t3 that branches back to 1.
// The grouping comments below are a best-effort reading -- confirm against
// the real file.
//
// "_256" entry: sets a5 from an e32/m1 vtype description. Unlike the other
// size-suffixed stubs it has no jump before label 1.
func ff_opus_postfilter_rvv_256, zve32f
|
||||
lvtypei a5, e32, m1, ta, ma
|
||||
// Four floats from just below t0 (offsets -16..-4) go into ft4..ft1; they are
// the scalars fed to the vfslide1up.vf chain further down.
flw ft4, -16(t0)
|
||||
addi t3, a1, -2 // maximum parallelism w/o stepping our tail
|
||||
flw ft3, -12(t0)
|
||||
flw ft2, -8(t0)
|
||||
flw ft1, -4(t0)
|
||||
1:
|
||||
// ---- Body using a4/a5 (labels 2 and 3) ----
// a4 = 5 is the vector length used for the multiply/reduce steps below.
li a4, 5
|
||||
// a1 = (a1 + 2) * 4: byte offset subtracted from a0 below.
addi a1, a1, 2
|
||||
slli a1, a1, 2
|
||||
// t1 = raw 32-bit word at a2 + 4
lw t1, 4(a2)
|
||||
// Load three 32-bit elements from a2 into v24.
vsetivli zero, 3, e32, m1, ta, ma
|
||||
vle32.v v24, (a2)
|
||||
sub a1, a0, a1 // a1 = &x4 = &data[-(period + 2)]
|
||||
// Switch to VL = a4 with the vtype held in a5.
vsetvl zero, a4, a5
|
||||
vslide1up.vx v8, v24, t1
|
||||
// t2 = raw 32-bit word at a2 + 8
lw t2, 8(a2)
|
||||
// v16 = the first a4 elements starting at a1.
vle32.v v16, (a1)
|
||||
vslide1up.vx v24, v8, t2 // v24 = { g[2], g[1], g[0], g[1], g[2] }
|
||||
2:
|
||||
// Outer loop: t0 = number of elements granted for a request of a3; load
// that many from a0 into v0 and subtract them from a3.
vsetvl t0, a3, a5
|
||||
vle32.v v0, (a0)
|
||||
sub a3, a3, t0
|
||||
3:
|
||||
// Inner loop, one element per pass: multiply v24 by v16, sum the products
// onto v0[0], store the result at the old a0, then slide both v16 and v0
// down by one.
vsetvl zero, a4, a5
|
||||
// t2 = next 32-bit word to shift into v16 (byte offset 20 = 5 * 4 from a1)
lw t2, 20(a1)
|
||||
vfmul.vv v8, v24, v16
|
||||
addi a0, a0, 4
|
||||
vslide1down.vx v16, v16, t2
|
||||
addi a1, a1, 4
|
||||
vfredusum.vs v0, v8, v0
|
||||
vsetvl zero, t0, a5
|
||||
// Move the bits of v0[0] into t1 for the scalar store below.
vmv.x.s t1, v0
|
||||
addi t0, t0, -1
|
||||
vslide1down.vx v0, v0, zero
|
||||
// a0 was already advanced by 4, hence the -4 offset.
sw t1, -4(a0)
|
||||
bnez t0, 3b
|
||||
|
||||
bnez a3, 2b
|
||||
// ---- Body using t1/t3 (branches back to label 1) ----
// t1 = min(a3, t3) elements requested, with t3 = a1 - 2 set above; vsetvli
// then writes the granted vector length back into t1.
min t1, a3, t3
|
||||
vsetvli t1, t1, e32, m4, ta, ma
|
||||
vle32.v v0, (t0) // x0
|
||||
sub a3, a3, t1
|
||||
// v28 = the elements at a0; used as the accumulator and stored back below.
vle32.v v28, (a0)
|
||||
// t0 += t1 * 4: advance the source pointer by the elements just loaded.
sh2add t0, t1, t0
|
||||
// Build four shifted copies of v0, inserting ft1..ft4 at element 0 in turn:
// v4, v8, v12 and v16 are v0 shifted up by 1, 2, 3 and 4 elements.
vfslide1up.vf v4, v0, ft1
|
||||
addi t2, t1, -4
|
||||
vfslide1up.vf v8, v4, ft2
|
||||
vfslide1up.vf v12, v8, ft3
|
||||
vfslide1up.vf v16, v12, ft4
|
||||
// Pair up the copies that share a multiplier: v20 is scaled by fa1 and v24
// by fa2 in the vfmacc instructions below.
vfadd.vv v20, v4, v12
|
||||
vfadd.vv v24, v0, v16
|
||||
// The slide-downs interleaved below move the last four elements of v0
// (indices t1-4 .. t1-1) to element 0 of v12, v8, v4 and v0 respectively.
vslidedown.vx v12, v0, t2
|
||||
// v28 += fa0 * v8
vfmacc.vf v28, fa0, v8
|
||||
vslidedown.vi v4, v12, 2
|
||||
// v28 += fa1 * v20
vfmacc.vf v28, fa1, v20
|
||||
vslide1down.vx v8, v12, zero
|
||||
// v28 += fa2 * v24
vfmacc.vf v28, fa2, v24
|
||||
vslide1down.vx v0, v4, zero
|
||||
vse32.v v28, (a0)
|
||||
// Extract those four elements into ft4..ft1 for the next pass.
vfmv.f.s ft4, v12
|
||||
// a0 += t1 * 4: advance the destination pointer.
sh2add a0, t1, a0
|
||||
vfmv.f.s ft2, v4
|
||||
vfmv.f.s ft3, v8
|
||||
vfmv.f.s ft1, v0
|
||||
// Repeat while a3 is non-zero.
bnez a3, 1b
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
Loading…
x
Reference in New Issue
Block a user