You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
This work is sponsored by, and copyright, Google.
Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:
                                     Cortex A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub16_add_neon:   3188.1   2435.4   2499.0   1969.0
vp9_inv_dct_dct_32x32_sub32_add_neon:  18531.7  16582.3  14207.6  12000.3
By skipping individual 4x16 or 4x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:
vp9_inv_dct_dct_16x16_sub1_add_neon:     274.6    189.5    211.7    235.8
vp9_inv_dct_dct_16x16_sub2_add_neon:    2064.0   1534.8   1719.4   1248.7
vp9_inv_dct_dct_16x16_sub4_add_neon:    2135.0   1477.2   1736.3   1249.5
vp9_inv_dct_dct_16x16_sub8_add_neon:    2446.7   1828.7   1993.6   1494.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   2832.4   2118.3   2266.5   1735.1
vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.7   2475.3   2523.5   1983.1
vp9_inv_dct_dct_32x32_sub1_add_neon:     756.2    456.7    862.0    553.9
vp9_inv_dct_dct_32x32_sub2_add_neon:   10682.2   8190.4   8539.2   6762.5
vp9_inv_dct_dct_32x32_sub4_add_neon:   10813.5   8014.9   8518.3   6762.8
vp9_inv_dct_dct_32x32_sub8_add_neon:   11859.6   9313.0   9347.4   7514.5
vp9_inv_dct_dct_32x32_sub12_add_neon:  12946.6  10752.4  10192.2   8280.2
vp9_inv_dct_dct_32x32_sub16_add_neon:  14074.6  11946.5  11001.4   9008.6
vp9_inv_dct_dct_32x32_sub20_add_neon:  15269.9  13662.7  11816.1   9762.6
vp9_inv_dct_dct_32x32_sub24_add_neon:  16327.9  14940.1  12626.7  10516.0
vp9_inv_dct_dct_32x32_sub28_add_neon:  17462.7  15776.1  13446.2  11264.7
vp9_inv_dct_dct_32x32_sub32_add_neon:  18575.5  17157.0  14249.3  12015.1
I.e., in general there is a very minor overhead for the full subpartition
case due to the additional loads and compares, but a significant speedup for
the cases where only a small part of the actual input data needs processing.
In a few inspected clips of common VP9 content, 70-90% of the non-dc-only
16x16 and 32x32 IDCTs have nonzero coefficients only in the upper left
8x8 or 16x16 subpartitions, respectively.
This is cherry-picked from libav commit
9c8bc74c2b.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
			
			
This commit is contained in:
		
				
					committed by
					
						 Michael Niedermayer
						Michael Niedermayer
					
				
			
			
				
	
			
			
			
						parent
						
							ecd343aa1f
						
					
				
				
					commit
					388f6e6715
				
			| @@ -659,9 +659,8 @@ endfunc | |||||||
| @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, | @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, | ||||||
| @ transpose into a horizontal 16x4 slice and store. | @ transpose into a horizontal 16x4 slice and store. | ||||||
| @ r0 = dst (temp buffer) | @ r0 = dst (temp buffer) | ||||||
| @ r1 = unused | @ r1 = slice offset | ||||||
| @ r2 = src | @ r2 = src | ||||||
| @ r3 = slice offset |  | ||||||
| function \txfm\()16_1d_4x16_pass1_neon | function \txfm\()16_1d_4x16_pass1_neon | ||||||
|         mov             r12, #32 |         mov             r12, #32 | ||||||
|         vmov.s16        q2, #0 |         vmov.s16        q2, #0 | ||||||
| @@ -678,14 +677,14 @@ function \txfm\()16_1d_4x16_pass1_neon | |||||||
|         transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 |         transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 | ||||||
|  |  | ||||||
|         @ Store the transposed 4x4 blocks horizontally. |         @ Store the transposed 4x4 blocks horizontally. | ||||||
|         cmp             r3,  #12 |         cmp             r1,  #12 | ||||||
|         beq             1f |         beq             1f | ||||||
| .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 | .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 | ||||||
|         vst1.16         {d\i}, [r0,:64]! |         vst1.16         {d\i}, [r0,:64]! | ||||||
| .endr | .endr | ||||||
|         bx              lr |         bx              lr | ||||||
| 1: | 1: | ||||||
|         @ Special case: For the last input column (r3 == 12), |         @ Special case: For the last input column (r1 == 12), | ||||||
|         @ which would be stored as the last row in the temp buffer, |         @ which would be stored as the last row in the temp buffer, | ||||||
|         @ don't store the first 4x4 block, but keep it in registers |         @ don't store the first 4x4 block, but keep it in registers | ||||||
|         @ for the first slice of the second pass (where it is the |         @ for the first slice of the second pass (where it is the | ||||||
| @@ -781,15 +780,22 @@ endfunc | |||||||
| itxfm16_1d_funcs idct | itxfm16_1d_funcs idct | ||||||
| itxfm16_1d_funcs iadst | itxfm16_1d_funcs iadst | ||||||
|  |  | ||||||
|  | @ This is the minimum eob value for each subpartition, in increments of 4 | ||||||
|  | const min_eob_idct_idct_16, align=4 | ||||||
|  |         .short  0, 10, 38, 89 | ||||||
|  | endconst | ||||||
|  |  | ||||||
| .macro itxfm_func16x16 txfm1, txfm2 | .macro itxfm_func16x16 txfm1, txfm2 | ||||||
| function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 | function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 | ||||||
| .ifc \txfm1\()_\txfm2,idct_idct | .ifc \txfm1\()_\txfm2,idct_idct | ||||||
|         cmp             r3,  #1 |         cmp             r3,  #1 | ||||||
|         beq             idct16x16_dc_add_neon |         beq             idct16x16_dc_add_neon | ||||||
| .endif | .endif | ||||||
|         push            {r4-r7,lr} |         push            {r4-r8,lr} | ||||||
| .ifnc \txfm1\()_\txfm2,idct_idct | .ifnc \txfm1\()_\txfm2,idct_idct | ||||||
|         vpush           {q4-q7} |         vpush           {q4-q7} | ||||||
|  | .else | ||||||
|  |         movrel          r8,  min_eob_idct_idct_16 + 2 | ||||||
| .endif | .endif | ||||||
|  |  | ||||||
|         @ Align the stack, allocate a temp buffer |         @ Align the stack, allocate a temp buffer | ||||||
| @@ -810,10 +816,36 @@ A       and             r7,  sp,  #15 | |||||||
|  |  | ||||||
| .irp i, 0, 4, 8, 12 | .irp i, 0, 4, 8, 12 | ||||||
|         add             r0,  sp,  #(\i*32) |         add             r0,  sp,  #(\i*32) | ||||||
|  | .ifc \txfm1\()_\txfm2,idct_idct | ||||||
|  | .if \i > 0 | ||||||
|  |         ldrh_post       r1,  r8,  #2 | ||||||
|  |         cmp             r3,  r1 | ||||||
|  |         it              le | ||||||
|  |         movle           r1,  #(16 - \i)/4 | ||||||
|  |         ble             1f | ||||||
|  | .endif | ||||||
|  | .endif | ||||||
|  |         mov             r1,  #\i | ||||||
|         add             r2,  r6,  #(\i*2) |         add             r2,  r6,  #(\i*2) | ||||||
|         mov             r3,  #\i |  | ||||||
|         bl              \txfm1\()16_1d_4x16_pass1_neon |         bl              \txfm1\()16_1d_4x16_pass1_neon | ||||||
| .endr | .endr | ||||||
|  |  | ||||||
|  | .ifc \txfm1\()_\txfm2,idct_idct | ||||||
|  |         b               3f | ||||||
|  | 1: | ||||||
|  |         @ For all-zero slices in pass 1, set d28-d31 to zero, for the in-register | ||||||
|  |         @ passthrough of coefficients to pass 2 and clear the end of the temp buffer | ||||||
|  |         vmov.i16        q14, #0 | ||||||
|  |         vmov.i16        q15, #0 | ||||||
|  | 2: | ||||||
|  |         subs            r1,  r1,  #1 | ||||||
|  | .rept 4 | ||||||
|  |         vst1.16         {q14-q15}, [r0,:128]! | ||||||
|  | .endr | ||||||
|  |         bne             2b | ||||||
|  | 3: | ||||||
|  | .endif | ||||||
|  |  | ||||||
| .ifc \txfm1\()_\txfm2,iadst_idct | .ifc \txfm1\()_\txfm2,iadst_idct | ||||||
|         movrel          r12, idct_coeffs |         movrel          r12, idct_coeffs | ||||||
|         vld1.16         {q0-q1}, [r12,:128] |         vld1.16         {q0-q1}, [r12,:128] | ||||||
| @@ -830,7 +862,7 @@ A       and             r7,  sp,  #15 | |||||||
| .ifnc \txfm1\()_\txfm2,idct_idct | .ifnc \txfm1\()_\txfm2,idct_idct | ||||||
|         vpop            {q4-q7} |         vpop            {q4-q7} | ||||||
| .endif | .endif | ||||||
|         pop             {r4-r7,pc} |         pop             {r4-r8,pc} | ||||||
| endfunc | endfunc | ||||||
| .endm | .endm | ||||||
|  |  | ||||||
| @@ -1110,11 +1142,16 @@ function idct32_1d_4x32_pass2_neon | |||||||
|         bx              lr |         bx              lr | ||||||
| endfunc | endfunc | ||||||
|  |  | ||||||
|  | const min_eob_idct_idct_32, align=4 | ||||||
|  |         .short  0, 9, 34, 70, 135, 240, 336, 448 | ||||||
|  | endconst | ||||||
|  |  | ||||||
| function ff_vp9_idct_idct_32x32_add_neon, export=1 | function ff_vp9_idct_idct_32x32_add_neon, export=1 | ||||||
|         cmp             r3,  #1 |         cmp             r3,  #1 | ||||||
|         beq             idct32x32_dc_add_neon |         beq             idct32x32_dc_add_neon | ||||||
|         push            {r4-r7,lr} |         push            {r4-r8,lr} | ||||||
|         vpush           {q4-q7} |         vpush           {q4-q7} | ||||||
|  |         movrel          r8,  min_eob_idct_idct_32 + 2 | ||||||
|  |  | ||||||
|         @ Align the stack, allocate a temp buffer |         @ Align the stack, allocate a temp buffer | ||||||
| T       mov             r7,  sp | T       mov             r7,  sp | ||||||
| @@ -1129,9 +1166,29 @@ A       and             r7,  sp,  #15 | |||||||
|  |  | ||||||
| .irp i, 0, 4, 8, 12, 16, 20, 24, 28 | .irp i, 0, 4, 8, 12, 16, 20, 24, 28 | ||||||
|         add             r0,  sp,  #(\i*64) |         add             r0,  sp,  #(\i*64) | ||||||
|  | .if \i > 0 | ||||||
|  |         ldrh_post       r1,  r8,  #2 | ||||||
|  |         cmp             r3,  r1 | ||||||
|  |         it              le | ||||||
|  |         movle           r1,  #(32 - \i)/2 | ||||||
|  |         ble             1f | ||||||
|  | .endif | ||||||
|         add             r2,  r6,  #(\i*2) |         add             r2,  r6,  #(\i*2) | ||||||
|         bl              idct32_1d_4x32_pass1_neon |         bl              idct32_1d_4x32_pass1_neon | ||||||
| .endr | .endr | ||||||
|  |         b               3f | ||||||
|  |  | ||||||
|  | 1: | ||||||
|  |         @ Write zeros to the temp buffer for pass 2 | ||||||
|  |         vmov.i16        q14, #0 | ||||||
|  |         vmov.i16        q15, #0 | ||||||
|  | 2: | ||||||
|  |         subs            r1,  r1,  #1 | ||||||
|  | .rept 4 | ||||||
|  |         vst1.16         {q14-q15}, [r0,:128]! | ||||||
|  | .endr | ||||||
|  |         bne             2b | ||||||
|  | 3: | ||||||
| .irp i, 0, 4, 8, 12, 16, 20, 24, 28 | .irp i, 0, 4, 8, 12, 16, 20, 24, 28 | ||||||
|         add             r0,  r4,  #(\i) |         add             r0,  r4,  #(\i) | ||||||
|         mov             r1,  r5 |         mov             r1,  r5 | ||||||
| @@ -1141,5 +1198,5 @@ A       and             r7,  sp,  #15 | |||||||
|  |  | ||||||
|         add             sp,  sp,  r7 |         add             sp,  sp,  r7 | ||||||
|         vpop            {q4-q7} |         vpop            {q4-q7} | ||||||
|         pop             {r4-r7,pc} |         pop             {r4-r8,pc} | ||||||
| endfunc | endfunc | ||||||
|   | |||||||
| @@ -334,8 +334,10 @@ static void check_itxfm(void) | |||||||
|                 // skip testing sub-IDCTs for WHT or ADST since they don't |                 // skip testing sub-IDCTs for WHT or ADST since they don't | ||||||
|                 // implement it in any of the SIMD functions. If they do, |                 // implement it in any of the SIMD functions. If they do, | ||||||
|                 // consider changing this to ensure we have complete test |                 // consider changing this to ensure we have complete test | ||||||
|                 // coverage |                 // coverage. Test sub=1 for dc-only, then 2, 4, 8, 12, etc, | ||||||
|                 for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; sub <<= 1) { |                 // since the arm version can distinguish them at that level. | ||||||
|  |                 for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; | ||||||
|  |                      sub < 4 ? (sub <<= 1) : (sub += 4)) { | ||||||
|                     if (check_func(dsp.itxfm_add[tx][txtp], |                     if (check_func(dsp.itxfm_add[tx][txtp], | ||||||
|                                    "vp9_inv_%s_%dx%d_sub%d_add_%d", |                                    "vp9_inv_%s_%dx%d_sub%d_add_%d", | ||||||
|                                    tx == 4 ? "wht_wht" : txtp_types[txtp], |                                    tx == 4 ? "wht_wht" : txtp_types[txtp], | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user