1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-10-30 23:18:11 +02:00

lavc/h264dsp: use saturating add/sub for R-V V 8-bit DC add

T-Head C908 (cycles):
h264_idct4_dc_add_8bpp_c:      109.2
h264_idct4_dc_add_8bpp_rvv_i32: 34.5 (before)
h264_idct4_dc_add_8bpp_rvv_i32: 25.5 (after)
h264_idct8_dc_add_8bpp_c:      418.7
h264_idct8_dc_add_8bpp_rvv_i64: 69.5 (before)
h264_idct8_dc_add_8bpp_rvv_i64: 33.5 (after)
This commit is contained in:
Rémi Denis-Courmont
2024-07-25 20:47:34 +03:00
parent 4713a5cc24
commit b0b3bea10b
2 changed files with 21 additions and 12 deletions

View File

@@ -98,8 +98,8 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_idct_add = ff_h264_idct_add_8_rvv;
dsp->h264_idct8_add = ff_h264_idct8_add_8_rvv;
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
if (flags & AV_CPU_FLAG_RVB) {
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
dsp->h264_idct_add16 = ff_h264_idct_add16_8_rvv;
dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv;
# if __riscv_xlen == 64
@@ -108,7 +108,8 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
}
if (flags & AV_CPU_FLAG_RVV_I64) {
dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_8_rvv;
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
if (flags & AV_CPU_FLAG_RVB)
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
}
dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv;
}

View File

@@ -420,7 +420,7 @@ endfunc
.endr
.macro idct_dc_add8 width
func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, b
lpad 0
.if \width == 8
vsetivli zero, \width, e8, mf2, ta, ma
@@ -428,26 +428,34 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
vsetivli zero, \width, e8, mf4, ta, ma
.endif
lh t0, 0(a1)
li t1, 255
addi t0, t0, 32
srai t0, t0, 6
sh zero, 0(a1)
.if \width == 8
li a6, \width * \width
vlse64.v v24, (a0), a2
vsetvli zero, a6, e16, m8, ta, ma
vsetvli zero, a6, e8, m4, ta, ma
.else
vlse32.v v24, (a0), a2
vsetivli zero, \width * \width, e16, m2, ta, ma
vsetivli zero, \width * \width, e8, m1, ta, ma
.endif
vzext.vf2 v0, v24
vadd.vx v0, v0, t0
vmax.vx v0, v0, zero
.if \width == 8
vsetvli zero, zero, e8, m4, ta, ma
bgez t0, 1f
neg t0, t0
minu t0, t0, t1
vssubu.vx v24, v24, t0
.if \width == 8
vsetivli zero, \width, e8, mf2, ta, ma
vsse64.v v24, (a0), a2
.else
vsetvli zero, zero, e8, m1, ta, ma
vsetivli zero, \width, e8, mf4, ta, ma
vsse32.v v24, (a0), a2
.endif
vnclipu.wi v24, v0, 0
ret
1:
minu t0, t0, t1
vsaddu.vx v24, v24, t0
.if \width == 8
vsetivli zero, \width, e8, mf2, ta, ma
vsse64.v v24, (a0), a2