You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	lavc/h264dsp: use saturating add/sub for R-V V 8-bit DC add
T-Head C908 (cycles):
h264_idct4_dc_add_8bpp_c:       109.2
h264_idct4_dc_add_8bpp_rvv_i32:  34.5 (before)
h264_idct4_dc_add_8bpp_rvv_i32:  25.5 (after)
h264_idct8_dc_add_8bpp_c:       418.7
h264_idct8_dc_add_8bpp_rvv_i64:  69.5 (before)
h264_idct8_dc_add_8bpp_rvv_i64:  33.5 (after)
This commit is contained in:
		| @@ -98,8 +98,8 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, | ||||
|  | ||||
|             dsp->h264_idct_add  = ff_h264_idct_add_8_rvv; | ||||
|             dsp->h264_idct8_add = ff_h264_idct8_add_8_rvv; | ||||
|             dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv; | ||||
|             if (flags & AV_CPU_FLAG_RVB) { | ||||
|                 dsp->h264_idct_dc_add     = ff_h264_idct4_dc_add_8_rvv; | ||||
|                 dsp->h264_idct_add16      = ff_h264_idct_add16_8_rvv; | ||||
|                 dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv; | ||||
| #  if __riscv_xlen == 64 | ||||
| @@ -108,7 +108,8 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, | ||||
|             } | ||||
|             if (flags & AV_CPU_FLAG_RVV_I64) { | ||||
|                 dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_8_rvv; | ||||
|                 dsp->h264_idct8_dc_add      = ff_h264_idct8_dc_add_8_rvv; | ||||
|                 if (flags & AV_CPU_FLAG_RVB) | ||||
|                     dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv; | ||||
|             } | ||||
|             dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv; | ||||
|         } | ||||
|   | ||||
| @@ -420,7 +420,7 @@ endfunc | ||||
| .endr | ||||
|  | ||||
| .macro idct_dc_add8 width | ||||
| func ff_h264_idct\width\()_dc_add_8_rvv, zve64x | ||||
| func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, b | ||||
|         lpad    0 | ||||
| .if \width == 8 | ||||
|         vsetivli        zero, \width, e8, mf2, ta, ma | ||||
| @@ -428,26 +428,34 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x | ||||
|         vsetivli        zero, \width, e8, mf4, ta, ma | ||||
| .endif | ||||
|         lh              t0, 0(a1) | ||||
|         li              t1, 255 | ||||
|         addi            t0, t0, 32 | ||||
|         srai            t0, t0, 6 | ||||
|         sh              zero, 0(a1) | ||||
| .if \width == 8 | ||||
|         li              a6, \width * \width | ||||
|         vlse64.v        v24, (a0), a2 | ||||
|         vsetvli         zero, a6, e16, m8, ta, ma | ||||
|         vsetvli         zero, a6, e8, m4, ta, ma | ||||
| .else | ||||
|         vlse32.v        v24, (a0), a2 | ||||
|         vsetivli        zero, \width * \width, e16, m2, ta, ma | ||||
|         vsetivli        zero, \width * \width, e8, m1, ta, ma | ||||
| .endif | ||||
|         vzext.vf2       v0, v24 | ||||
|         vadd.vx         v0, v0, t0 | ||||
|         vmax.vx         v0, v0, zero | ||||
| .if \width == 8 | ||||
|         vsetvli         zero, zero, e8, m4, ta, ma | ||||
|         bgez            t0, 1f | ||||
|  | ||||
|         neg             t0, t0 | ||||
|         minu            t0, t0, t1 | ||||
|         vssubu.vx       v24, v24, t0 | ||||
|  .if \width == 8 | ||||
|         vsetivli        zero, \width, e8, mf2, ta, ma | ||||
|         vsse64.v        v24, (a0), a2 | ||||
| .else | ||||
|         vsetvli         zero, zero, e8, m1, ta, ma | ||||
|         vsetivli        zero, \width, e8, mf4, ta, ma | ||||
|         vsse32.v        v24, (a0), a2 | ||||
| .endif | ||||
|         vnclipu.wi      v24, v0, 0 | ||||
|         ret | ||||
| 1: | ||||
|         minu            t0, t0, t1 | ||||
|         vsaddu.vx       v24, v24, t0 | ||||
| .if \width == 8 | ||||
|         vsetivli        zero, \width, e8, mf2, ta, ma | ||||
|         vsse64.v        v24, (a0), a2 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user