1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-19 05:49:09 +02:00

lavc/h264dsp: stick R-V V (RISC-V Vector) biweight to 16-bit arithmetic

T-Head C908 (ns):
h264_biweight2_8_c:        2414.5
h264_biweight2_8_rvv_i32:   701.8 (before)
h264_biweight2_8_rvv_i32:   468.5 (after)
h264_biweight4_8_c:        4655.3
h264_biweight4_8_rvv_i32:  1377.5 (before)
h264_biweight4_8_rvv_i32:   931.8 (after)
h264_biweight8_8_c:        9701.5
h264_biweight8_8_rvv_i32:  2896.0 (before)
h264_biweight8_8_rvv_i32:  2070.5 (after)
h264_biweight16_8_c:      18025.0
h264_biweight16_8_rvv_i32: 3460.8 (before)
h264_biweight16_8_rvv_i32: 1978.0 (after)

SpacemiT X60 (ns):
h264_biweight2_8_c:        2415.5
h264_biweight2_8_rvv_i32:   478.2 (before)
h264_biweight2_8_rvv_i32:   362.8 (after)
h264_biweight4_8_c:        4655.3
h264_biweight4_8_rvv_i32:   946.7 (before)
h264_biweight4_8_rvv_i32:   727.3 (after)
h264_biweight8_8_c:        9061.8
h264_biweight8_8_rvv_i32:  2071.7 (before)
h264_biweight8_8_rvv_i32:  1685.8 (after)
h264_biweight16_8_c:      18020.5
h264_biweight16_8_rvv_i32: 3457.2 (before)
h264_biweight16_8_rvv_i32: 1935.8 (after)
This commit is contained in:
Rémi Denis-Courmont 2024-07-29 21:20:37 +03:00
parent 670ff6c7ce
commit afd45c7ff7

View File

@ -56,22 +56,21 @@ func ff_h264_biweight_pixels_simple_8_rvv, zve32x
addi a7, a7, 1 addi a7, a7, 1
ori a7, a7, 1 ori a7, a7, 1
sll a7, a7, a4 sll a7, a7, a4
addi a4, a4, 1
1: 1:
vsetvli zero, t6, e32, m4, ta, ma vsetvli zero, t6, e16, m2, ta, ma
vle8.v v8, (a0) vle8.v v8, (a0)
addi a3, a3, -1 addi a3, a3, -1
vle8.v v12, (a1) vle8.v v12, (a1)
add a1, a1, a2 add a1, a1, a2
vmv.v.x v16, a7 vmv.v.x v16, a7
vsetvli zero, zero, e16, m2, ta, ma vsetvli zero, zero, e8, m1, ta, ma
vzext.vf2 v24, v8 vwmaccsu.vx v16, a5, v8
vzext.vf2 v28, v12 vwmaccsu.vx v16, a6, v12
vwmaccsu.vx v16, a5, v24 vsetvli zero, zero, e16, m2, ta, ma
vwmaccsu.vx v16, a6, v28
vnclip.wx v16, v16, a4
vmax.vx v16, v16, zero vmax.vx v16, v16, zero
vsetvli zero, zero, e8, m1, ta, ma vsetvli zero, zero, e8, m1, ta, ma
vnclipu.wi v8, v16, 1 vnclipu.wx v8, v16, a4
vse8.v v8, (a0) vse8.v v8, (a0)
add a0, a0, a2 add a0, a0, a2
bnez a3, 1b bnez a3, 1b
@ -121,33 +120,29 @@ func ff_h264_biweight_pixels_8_rvv, zve32x
addi a7, a7, 1 addi a7, a7, 1
ori a7, a7, 1 ori a7, a7, 1
sll a7, a7, a4 sll a7, a7, a4
addi a4, a4, 1
1: 1:
mv t0, a0 mv t0, a0
mv t1, a1 mv t1, a1
mv t5, t6 mv t5, t6
2: 2:
vsetvli t2, a3, e32, m8, ta, ma vsetvli t2, a3, e16, m8, ta, ma
vlsseg2e8.v v0, (t0), a2 vlsseg2e8.v v0, (t0), a2
vlsseg2e8.v v4, (t1), a2 vlsseg2e8.v v8, (t1), a2
addi t5, t5, -2 addi t5, t5, -2
vmv.v.x v16, a7 vmv.v.x v16, a7
vmv.v.x v24, a7 vmv.v.x v24, a7
vsetvli zero, zero, e16, m4, ta, ma vsetvli zero, zero, e8, m4, ta, ma
vzext.vf2 v8, v0 vwmaccsu.vx v16, a5, v0
vzext.vf2 v12, v2 vwmaccsu.vx v24, a5, v4
vwmaccsu.vx v16, a5, v8
vwmaccsu.vx v24, a5, v12
vzext.vf2 v8, v4
vzext.vf2 v12, v6
vwmaccsu.vx v16, a6, v8 vwmaccsu.vx v16, a6, v8
vwmaccsu.vx v24, a6, v12 vwmaccsu.vx v24, a6, v12
vnclip.wx v8, v16, a4 vsetvli zero, zero, e16, m8, ta, ma
vnclip.wx v12, v24, a4 vmax.vx v16, v16, zero
vmax.vx v8, v8, zero vmax.vx v24, v24, zero
vmax.vx v12, v12, zero vsetvli zero, zero, e8, m4, ta, ma
vsetvli zero, zero, e8, m2, ta, ma vnclipu.wx v0, v16, a4
vnclipu.wi v0, v8, 1 vnclipu.wx v4, v24, a4
vnclipu.wi v2, v12, 1
vssseg2e8.v v0, (t0), a2 vssseg2e8.v v0, (t0), a2
addi t0, t0, 2 addi t0, t0, 2
addi t1, t1, 2 addi t1, t1, 2