1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-11-21 10:55:51 +02:00

lavc/h264dsp: R-V V loop_filter_chroma

T-Head C908:
h264_v_loop_filter_chroma_8bpp_c:      137.4
h264_v_loop_filter_chroma_8bpp_rvv_i32: 54.2
This commit is contained in:
Rémi Denis-Courmont 2024-08-20 21:59:00 +03:00
parent 3a53656837
commit 7d1dda4892
2 changed files with 99 additions and 15 deletions

View File

@ -40,6 +40,13 @@ void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
/* 8-bit chroma loop filter entry points implemented in RVV assembly.
 * All three share the same argument list; the assembly side reads them
 * as a0 = pix, a1 = stride, a2 = alpha, a3 = beta, a4 = tc0. */
void ff_h264_v_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta,
int8_t *tc0);
#define IDCT_DEPTH(depth) \
void ff_h264_idct_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
@ -101,6 +108,14 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma_mbaff =
ff_h264_h_loop_filter_luma_mbaff_8_rvv;
dsp->h264_v_loop_filter_chroma =
ff_h264_v_loop_filter_chroma_8_rvv;
if (chroma_format_idc <= 1) {
dsp->h264_h_loop_filter_chroma =
ff_h264_h_loop_filter_chroma_8_rvv;
dsp->h264_h_loop_filter_chroma_mbaff =
ff_h264_h_loop_filter_chroma_mbaff_8_rvv;
}
dsp->h264_idct_add = ff_h264_idct_add_8_rvv;
dsp->h264_idct8_add = ff_h264_idct8_add_8_rvv;

View File

@ -191,61 +191,85 @@ const ff_h264_weight_funcs_8_rvv
.endr
endconst
.variant_cc ff_h264_loop_filter_luma_8_rvv
func ff_h264_loop_filter_luma_8_rvv, zve32x
.macro loop_filter type, inners, e8mul, e16mul
.variant_cc ff_h264_loop_filter_\type\()_8_rvv
func ff_h264_loop_filter_\type\()_8_rvv, zve32x
# p2: v8, p1: v9, p0: v10, q0: v11, q1: v12, q2: v13
# alpha: a2, beta: a3, tc_orig: v6
csrwi vxrm, 0
.ifc \type, luma
vaaddu.vv v14, v10, v11 # (p0 + q0 + 1) / 2
.endif
vwsubu.vv v16, v9, v12
.ifc \type, luma
vwaddu.vv v18, v8, v14
vwaddu.vv v20, v13, v14
.endif
vnsra.wi v24, v16, 2 # (p1 - q1) / 4
.ifc \type, luma
vnsrl.wi v14, v18, 1
vnsrl.wi v15, v20, 1
vneg.v v5, v6 # -tc_orig
.endif
vwsubu.vv v22, v11, v10 # q0 - p0
.ifc \type, luma
vwsubu.vv v18, v14, v9
vwsubu.vv v20, v15, v12
.endif
vwadd.wv v16, v22, v24
vmsge.vi v7, v6, 0 # tc_orig >= 0
.ifc \type, luma
vnclip.wi v14, v18, 0
vnclip.wi v15, v20, 0
.endif
vnclip.wi v16, v16, 1 # clip8((q0 - p0 + (p1 - q1) / 4 + 1) >> 1)
.ifc \type, luma
vmin.vv v14, v14, v6
vmin.vv v15, v15, v6
vmax.vv v14, v14, v5 # clip(p2 + ... - p1, +/-tc_orig)
vmax.vv v15, v15, v5 # clip(q2 + ... - q1, +/-tc_orig)
.endif
vwsubu.vv v20, v10, v11
vwsubu.vv v24, v9, v10
vwsubu.vv v26, v10, v9
vwsubu.vv v28, v12, v11
vwsubu.vv v30, v11, v12
.ifc \type, luma
vwsubu.vv v0, v8, v10
vwsubu.vv v2, v10, v8
vwsubu.vv v4, v13, v11
vwsubu.vv v18, v11, v13
vsetvli zero, zero, e16, m2, ta, ma
.endif
vsetvli zero, zero, e16, \e16mul, ta, ma
vmax.vv v20, v20, v22 # abs(p0 - q0)
vmax.vv v24, v24, v26 # abs(p1 - p0)
vmax.vv v28, v28, v30 # abs(q1 - q0)
.ifc \type, luma
vmax.vv v22, v0, v2 # abs(p2 - p0)
vmax.vv v26, v4, v18 # abs(q2 - q0)
.endif
vmslt.vx v1, v20, a2
vmslt.vx v2, v24, a3
vmand.mm v7, v7, v1
vmslt.vx v3, v28, a3
vmand.mm v7, v7, v2
.ifc \type, luma
vmslt.vx v0, v22, a3
vmand.mm v7, v7, v3 # whether to update p0 and q0
vmslt.vx v1, v26, a3
vmand.mm v0, v0, v7
vsetvli zero, zero, e8, m1, ta, mu
.else
vmand.mm v0, v7, v3 # whether to update p0 and q0
.endif
vsetvli zero, zero, e8, \e8mul, ta, mu
.ifc \type, luma
vadd.vi v6, v6, 1, v0.t # tc++
vadd.vv v9, v9, v14, v0.t # p1'
vmand.mm v0, v1, v7
vadd.vi v6, v6, 1, v0.t # tc++
vadd.vv v12, v12, v15, v0.t # q1'
vmmv.m v0, v7
.endif
vneg.v v5, v6 # -tc
vmin.vv v16, v16, v6
vwcvtu.x.x.v v18, v10
@ -253,43 +277,59 @@ func ff_h264_loop_filter_luma_8_rvv, zve32x
vwcvtu.x.x.v v20, v11
vwadd.wv v18, v18, v16
vwsub.wv v20, v20, v16
vmmv.m v0, v7
vsetvli zero, zero, e16, m2, ta, ma
vsetvli zero, zero, e16, \e16mul, ta, ma
vmax.vx v18, v18, zero
vmax.vx v20, v20, zero
vsetvli zero, zero, e8, m1, ta, mu
vsetvli zero, zero, e8, \e8mul, ta, mu
vnclipu.wi v10, v18, 0, v0.t # p0'
vnclipu.wi v11, v20, 0, v0.t # q0'
jr t0
endfunc
func ff_h264_v_loop_filter_luma_8_rvv, zve32x
func ff_h264_v_loop_filter_\type\()_8_rvv, zve32x
lpad 0
.ifc \type, luma
vsetivli zero, 4, e32, m1, ta, ma
vle8.v v4, (a4)
li t0, 0x01010101
vzext.vf4 v6, v4
.else
vsetivli zero, 4, e16, mf2, ta, ma
vle8.v v4, (a4)
li t0, 0x0101
vzext.vf2 v6, v4
.endif
sub t3, a0, a1
vmul.vx v6, v6, t0
vsetivli zero, 16, e8, m1, ta, ma
vsetivli zero, 4 * \inners, e8, \e8mul, ta, ma
vle8.v v11, (a0)
sub t2, t3, a1
vid.v v0
vle8.v v10, (t3)
sub t1, t2, a1
vle8.v v9, (t2)
add t5, a0, a1
vle8.v v8, (t1)
add t6, t5, a1
vle8.v v9, (t2)
.ifc \type, luma
sub t1, t2, a1
.endif
vle8.v v12, (t5)
.ifc \type, luma
add t6, t5, a1
vle8.v v8, (t1)
vle8.v v13, (t6)
jal t0, ff_h264_loop_filter_luma_8_rvv
.endif
jal t0, ff_h264_loop_filter_\type\()_8_rvv
.ifc \type, luma
vse8.v v9, (t2)
vse8.v v12, (t5)
.endif
vse8.v v10, (t3)
vse8.v v11, (a0)
vse8.v v12, (t5)
ret
endfunc
.endm
loop_filter luma, 4, m1, m2
loop_filter chroma, 2, mf2, m1
func ff_h264_h_loop_filter_luma_8_rvv, zve32x
lpad 0
@ -322,3 +362,32 @@ func ff_h264_h_loop_filter_luma_mbaff_8_rvv, zve32x
vssseg4e8.v v9, (a0), a1
ret
endfunc
# Chroma loop filter across a vertical edge, 8-bit samples, 8 rows.
# Inputs: a0 = pix, a1 = stride, a2 = alpha, a3 = beta, a4 = tc0 (4 bytes).
# a2 and a3 are not touched here; they are consumed by the shared
# ff_h264_loop_filter_chroma_8_rvv routine, which expects
# p1/p0/q0/q1 in v9/v10/v11/v12 and the per-row tc values in v6.
# Clobbers t0 (multiplier constant, then link register for the inner call).
func ff_h264_h_loop_filter_chroma_8_rvv, zve32x
lpad 0
vsetivli zero, 4, e16, mf2, ta, ma
vle8.v v4, (a4) # fetch the four tc0 bytes
li t0, 0x0101
vzext.vf2 v6, v4 # zero-extend each tc0 byte to a 16-bit lane
addi a0, a0, -2 # a0 = pix - 2, the p1 column
vmul.vx v6, v6, t0 # copy the byte into both halves of its lane
vsetivli zero, 8, e8, mf2, ta, ma # v6 now reads as 8 bytes, each tc0 twice
vlsseg4e8.v v9, (a0), a1 # per row: 4 bytes -> v9, v10, v11, v12
addi a0, a0, 1 # a0 = pix - 1, the p0 column
jal t0, ff_h264_loop_filter_chroma_8_rvv # callee returns through t0
vssseg2e8.v v10, (a0), a1 # per row: write back v10, v11 (p0, q0)
ret
endfunc
# MBAFF variant of the chroma loop filter across a vertical edge, 8-bit
# samples. Processes 4 rows, using one tc0 byte per row.
# Inputs: a0 = pix, a1 = stride, a2 = alpha, a3 = beta, a4 = tc0 (4 bytes).
# a2 and a3 are not touched here; they are consumed by the shared
# ff_h264_loop_filter_chroma_8_rvv routine, which expects
# p1/p0/q0/q1 in v9/v10/v11/v12 and the per-row tc values in v6.
# Clobbers t0 (link register for the inner call).
func ff_h264_h_loop_filter_chroma_mbaff_8_rvv, zve32x
lpad 0
vsetivli zero, 4, e8, mf4, ta, ma
vle8.v v6, (a4) # four tc0 bytes go straight into v6
addi a0, a0, -2 # a0 = pix - 2, the p1 column
vsetivli zero, 4, e8, mf2, ta, ma # same vtype as the shared routine's e8 phase
vlsseg4e8.v v9, (a0), a1 # per row: 4 bytes -> v9, v10, v11, v12
addi a0, a0, 1 # a0 = pix - 1, the p0 column
jal t0, ff_h264_loop_filter_chroma_8_rvv # callee returns through t0
vssseg2e8.v v10, (a0), a1 # per row: write back v10, v11 (p0, q0)
ret
endfunc