You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	sws/rgb2rgb: fix unaligned accesses in R-V V YUYV to I422p
In my personal opinion, we should not need to support unaligned YUY2 pixel maps. They should always be aligned to at least 32 bits, and the current code assumes just 16 bits. However, checkasm does test for unaligned input bitmaps. QEMU accepts it, but real hardware does not.

In this particular case, we can at the same time improve performance and handle unaligned inputs, so do just that.

checkasm benchmarks:
uyvytoyuv422_c: 104379.0
uyvytoyuv422_c: 104060.0
uyvytoyuv422_rvv_i32: 25284.0 (before)
uyvytoyuv422_rvv_i32: 19303.2 (after)
This commit is contained in:
		| @@ -55,8 +55,10 @@ av_cold void rgb2rgb_init_riscv(void) | |||||||
|         shuffle_bytes_1230 = ff_shuffle_bytes_1230_rvv; |         shuffle_bytes_1230 = ff_shuffle_bytes_1230_rvv; | ||||||
|         shuffle_bytes_3012 = ff_shuffle_bytes_3012_rvv; |         shuffle_bytes_3012 = ff_shuffle_bytes_3012_rvv; | ||||||
|         interleaveBytes = ff_interleave_bytes_rvv; |         interleaveBytes = ff_interleave_bytes_rvv; | ||||||
|         uyvytoyuv422 = ff_uyvytoyuv422_rvv; |         if (flags & AV_CPU_FLAG_RVB_BASIC) { | ||||||
|         yuyvtoyuv422 = ff_yuyvtoyuv422_rvv; |             uyvytoyuv422 = ff_uyvytoyuv422_rvv; | ||||||
|  |             yuyvtoyuv422 = ff_yuyvtoyuv422_rvv; | ||||||
|  |         } | ||||||
|     } |     } | ||||||
| #endif | #endif | ||||||
| } | } | ||||||
|   | |||||||
| @@ -126,32 +126,35 @@ func ff_deinterleave_bytes_rvv, zve32x | |||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
|  |  | ||||||
| .macro yuy2_to_i422p y_shift | .macro yuy2_to_i422p luma, chroma | ||||||
|         slli    t4, a4, 1 // pixel width -> (source) byte width |         srai    t4, a4, 1 // pixel width -> chroma width | ||||||
|         lw      t6, (sp) |         lw      t6, (sp) | ||||||
|  |         slli    t5, a4, 1 // pixel width -> (source) byte width | ||||||
|         sub     a6, a6, a4 |         sub     a6, a6, a4 | ||||||
|         srai    a4, a4, 1 // pixel width -> chroma width |         sub     a7, a7, t4 | ||||||
|         sub     a7, a7, a4 |         sub     t6, t6, t5 | ||||||
|         sub     t6, t6, t4 |         vsetvli t2, zero, e8, m4, ta, ma | ||||||
| 1: | 1: | ||||||
|         mv      t4, a4 |         mv      t4, a4 | ||||||
|         addi    a5, a5, -1 |         addi    a5, a5, -1 | ||||||
| 2: | 2: | ||||||
|         vsetvli    t5, t4, e8, m2, ta, ma |         min     t0, t2, t4 // ensure even VL on penultimate iteration | ||||||
|         vlseg2e16.v v16, (a3) |         vsetvli t0, t0, e8, m4, ta, ma | ||||||
|         sub        t4, t4, t5 |         vlseg2e8.v v16, (a3) | ||||||
|         vnsrl.wi   v24, v16, \y_shift // Y0 |         srli    t1, t0, 1 | ||||||
|         sh2add     a3, t5, a3 |         vsetvli zero, t1, e8, m2, ta, ma | ||||||
|         vnsrl.wi   v26, v20, \y_shift // Y1 |         vnsrl.wi   v24, \chroma, 0 // U | ||||||
|         vnsrl.wi   v28, v16, 8 - \y_shift // U |         sub     t4, t4, t0 | ||||||
|         vnsrl.wi   v30, v20, 8 - \y_shift // V |         vnsrl.wi   v28, \chroma, 8 // V | ||||||
|         vsseg2e8.v v24, (a0) |         sh1add  a3, t0, a3 | ||||||
|         sh1add     a0, t5, a0 |         vse8.v  v24, (a1) | ||||||
|         vse8.v     v28, (a1) |         add     a1, t1, a1 | ||||||
|         add        a1, t5, a1 |         vse8.v  v28, (a2) | ||||||
|         vse8.v     v30, (a2) |         add     a2, t1, a2 | ||||||
|         add        a2, t5, a2 |         vsetvli zero, t0, e8, m4, ta, ma | ||||||
|         bnez       t4, 2b |         vse8.v  \luma, (a0) | ||||||
|  |         add     a0, t0, a0 | ||||||
|  |         bnez    t4, 2b | ||||||
|  |  | ||||||
|         add     a3, a3, t6 |         add     a3, a3, t6 | ||||||
|         add     a0, a0, a6 |         add     a0, a0, a6 | ||||||
| @@ -163,9 +166,9 @@ endfunc | |||||||
| .endm | .endm | ||||||
|  |  | ||||||
| func ff_uyvytoyuv422_rvv, zve32x | func ff_uyvytoyuv422_rvv, zve32x | ||||||
|         yuy2_to_i422p 8 |         yuy2_to_i422p v20, v16 | ||||||
| endfunc | endfunc | ||||||
|  |  | ||||||
| func ff_yuyvtoyuv422_rvv, zve32x | func ff_yuyvtoyuv422_rvv, zve32x | ||||||
|         yuy2_to_i422p 0 |         yuy2_to_i422p v16, v20 | ||||||
| endfunc | endfunc | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user