mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
This work is sponsored by, and copyright, Google.
Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:
Cortex A7 A8 A9 A53
vp9_inv_dct_dct_16x16_sub16_add_neon: 3188.1 2435.4 2499.0 1969.0
vp9_inv_dct_dct_32x32_sub32_add_neon: 18531.7 16582.3 14207.6 12000.3
By skipping individual 4x16 or 4x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:
vp9_inv_dct_dct_16x16_sub1_add_neon: 274.6 189.5 211.7 235.8
vp9_inv_dct_dct_16x16_sub2_add_neon: 2064.0 1534.8 1719.4 1248.7
vp9_inv_dct_dct_16x16_sub4_add_neon: 2135.0 1477.2 1736.3 1249.5
vp9_inv_dct_dct_16x16_sub8_add_neon: 2446.7 1828.7 1993.6 1494.7
vp9_inv_dct_dct_16x16_sub12_add_neon: 2832.4 2118.3 2266.5 1735.1
vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.7 2475.3 2523.5 1983.1
vp9_inv_dct_dct_32x32_sub1_add_neon: 756.2 456.7 862.0 553.9
vp9_inv_dct_dct_32x32_sub2_add_neon: 10682.2 8190.4 8539.2 6762.5
vp9_inv_dct_dct_32x32_sub4_add_neon: 10813.5 8014.9 8518.3 6762.8
vp9_inv_dct_dct_32x32_sub8_add_neon: 11859.6 9313.0 9347.4 7514.5
vp9_inv_dct_dct_32x32_sub12_add_neon: 12946.6 10752.4 10192.2 8280.2
vp9_inv_dct_dct_32x32_sub16_add_neon: 14074.6 11946.5 11001.4 9008.6
vp9_inv_dct_dct_32x32_sub20_add_neon: 15269.9 13662.7 11816.1 9762.6
vp9_inv_dct_dct_32x32_sub24_add_neon: 16327.9 14940.1 12626.7 10516.0
vp9_inv_dct_dct_32x32_sub28_add_neon: 17462.7 15776.1 13446.2 11264.7
vp9_inv_dct_dct_32x32_sub32_add_neon: 18575.5 17157.0 14249.3 12015.1
I.e., in general there is a very minor overhead for the full subpartition
case due to the additional loads and compares, but a significant speedup for
the cases when we only need to process a small part of the actual input data.
In common VP9 content in a few inspected clips, 70-90% of the non-dc-only
16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left
8x8 or 16x16 subpartitions respectively.
This is cherry-picked from libav commit 9c8bc74c2b.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
parent
ecd343aa1f
commit
388f6e6715
@ -659,9 +659,8 @@ endfunc
|
|||||||
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
|
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
|
||||||
@ transpose into a horizontal 16x4 slice and store.
|
@ transpose into a horizontal 16x4 slice and store.
|
||||||
@ r0 = dst (temp buffer)
|
@ r0 = dst (temp buffer)
|
||||||
@ r1 = unused
|
@ r1 = slice offset
|
||||||
@ r2 = src
|
@ r2 = src
|
||||||
@ r3 = slice offset
|
|
||||||
function \txfm\()16_1d_4x16_pass1_neon
|
function \txfm\()16_1d_4x16_pass1_neon
|
||||||
mov r12, #32
|
mov r12, #32
|
||||||
vmov.s16 q2, #0
|
vmov.s16 q2, #0
|
||||||
@ -678,14 +677,14 @@ function \txfm\()16_1d_4x16_pass1_neon
|
|||||||
transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
|
transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
|
||||||
|
|
||||||
@ Store the transposed 4x4 blocks horizontally.
|
@ Store the transposed 4x4 blocks horizontally.
|
||||||
cmp r3, #12
|
cmp r1, #12
|
||||||
beq 1f
|
beq 1f
|
||||||
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
|
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
|
||||||
vst1.16 {d\i}, [r0,:64]!
|
vst1.16 {d\i}, [r0,:64]!
|
||||||
.endr
|
.endr
|
||||||
bx lr
|
bx lr
|
||||||
1:
|
1:
|
||||||
@ Special case: For the last input column (r3 == 12),
|
@ Special case: For the last input column (r1 == 12),
|
||||||
@ which would be stored as the last row in the temp buffer,
|
@ which would be stored as the last row in the temp buffer,
|
||||||
@ don't store the first 4x4 block, but keep it in registers
|
@ don't store the first 4x4 block, but keep it in registers
|
||||||
@ for the first slice of the second pass (where it is the
|
@ for the first slice of the second pass (where it is the
|
||||||
@ -781,15 +780,22 @@ endfunc
|
|||||||
itxfm16_1d_funcs idct
|
itxfm16_1d_funcs idct
|
||||||
itxfm16_1d_funcs iadst
|
itxfm16_1d_funcs iadst
|
||||||
|
|
||||||
|
@ This is the minimum eob value for each subpartition, where the subpartitions grow in increments of 4 columns (slice offsets 0, 4, 8, 12)
|
||||||
|
const min_eob_idct_idct_16, align=4
|
||||||
|
.short 0, 10, 38, 89
|
||||||
|
endconst
|
||||||
|
|
||||||
.macro itxfm_func16x16 txfm1, txfm2
|
.macro itxfm_func16x16 txfm1, txfm2
|
||||||
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
|
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
|
||||||
.ifc \txfm1\()_\txfm2,idct_idct
|
.ifc \txfm1\()_\txfm2,idct_idct
|
||||||
cmp r3, #1
|
cmp r3, #1
|
||||||
beq idct16x16_dc_add_neon
|
beq idct16x16_dc_add_neon
|
||||||
.endif
|
.endif
|
||||||
push {r4-r7,lr}
|
push {r4-r8,lr}
|
||||||
.ifnc \txfm1\()_\txfm2,idct_idct
|
.ifnc \txfm1\()_\txfm2,idct_idct
|
||||||
vpush {q4-q7}
|
vpush {q4-q7}
|
||||||
|
.else
|
||||||
|
movrel r8, min_eob_idct_idct_16 + 2
|
||||||
.endif
|
.endif
|
||||||
|
|
||||||
@ Align the stack, allocate a temp buffer
|
@ Align the stack, allocate a temp buffer
|
||||||
@ -810,10 +816,36 @@ A and r7, sp, #15
|
|||||||
|
|
||||||
.irp i, 0, 4, 8, 12
|
.irp i, 0, 4, 8, 12
|
||||||
add r0, sp, #(\i*32)
|
add r0, sp, #(\i*32)
|
||||||
|
.ifc \txfm1\()_\txfm2,idct_idct
|
||||||
|
.if \i > 0
|
||||||
|
ldrh_post r1, r8, #2
|
||||||
|
cmp r3, r1
|
||||||
|
it le
|
||||||
|
movle r1, #(16 - \i)/4
|
||||||
|
ble 1f
|
||||||
|
.endif
|
||||||
|
.endif
|
||||||
|
mov r1, #\i
|
||||||
add r2, r6, #(\i*2)
|
add r2, r6, #(\i*2)
|
||||||
mov r3, #\i
|
|
||||||
bl \txfm1\()16_1d_4x16_pass1_neon
|
bl \txfm1\()16_1d_4x16_pass1_neon
|
||||||
.endr
|
.endr
|
||||||
|
|
||||||
|
.ifc \txfm1\()_\txfm2,idct_idct
|
||||||
|
b 3f
|
||||||
|
1:
|
||||||
|
@ For all-zero slices in pass 1, set d28-d31 to zero, for the in-register
|
||||||
|
@ passthrough of coefficients to pass 2 and clear the end of the temp buffer
|
||||||
|
vmov.i16 q14, #0
|
||||||
|
vmov.i16 q15, #0
|
||||||
|
2:
|
||||||
|
subs r1, r1, #1
|
||||||
|
.rept 4
|
||||||
|
vst1.16 {q14-q15}, [r0,:128]!
|
||||||
|
.endr
|
||||||
|
bne 2b
|
||||||
|
3:
|
||||||
|
.endif
|
||||||
|
|
||||||
.ifc \txfm1\()_\txfm2,iadst_idct
|
.ifc \txfm1\()_\txfm2,iadst_idct
|
||||||
movrel r12, idct_coeffs
|
movrel r12, idct_coeffs
|
||||||
vld1.16 {q0-q1}, [r12,:128]
|
vld1.16 {q0-q1}, [r12,:128]
|
||||||
@ -830,7 +862,7 @@ A and r7, sp, #15
|
|||||||
.ifnc \txfm1\()_\txfm2,idct_idct
|
.ifnc \txfm1\()_\txfm2,idct_idct
|
||||||
vpop {q4-q7}
|
vpop {q4-q7}
|
||||||
.endif
|
.endif
|
||||||
pop {r4-r7,pc}
|
pop {r4-r8,pc}
|
||||||
endfunc
|
endfunc
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
@ -1110,11 +1142,16 @@ function idct32_1d_4x32_pass2_neon
|
|||||||
bx lr
|
bx lr
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
|
@ Minimum eob thresholds for the first pass of the 32x32 idct_idct, one
@ entry per 4-column slice (slice offsets 0, 4, ..., 28). Before processing
@ a slice at offset > 0, the function loads that slice's entry and, if
@ r3 (the eob argument) <= entry, skips that slice and all remaining ones,
@ writing zeros to the rest of the temp buffer instead. The first entry is
@ never read, since the lookup pointer is initialised to this table + 2.
const min_eob_idct_idct_32, align=4
|
||||||
|
.short 0, 9, 34, 70, 135, 240, 336, 448
|
||||||
|
endconst
|
||||||
|
|
||||||
function ff_vp9_idct_idct_32x32_add_neon, export=1
|
function ff_vp9_idct_idct_32x32_add_neon, export=1
|
||||||
cmp r3, #1
|
cmp r3, #1
|
||||||
beq idct32x32_dc_add_neon
|
beq idct32x32_dc_add_neon
|
||||||
push {r4-r7,lr}
|
push {r4-r8,lr}
|
||||||
vpush {q4-q7}
|
vpush {q4-q7}
|
||||||
|
movrel r8, min_eob_idct_idct_32 + 2
|
||||||
|
|
||||||
@ Align the stack, allocate a temp buffer
|
@ Align the stack, allocate a temp buffer
|
||||||
T mov r7, sp
|
T mov r7, sp
|
||||||
@ -1129,9 +1166,29 @@ A and r7, sp, #15
|
|||||||
|
|
||||||
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
|
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
|
||||||
add r0, sp, #(\i*64)
|
add r0, sp, #(\i*64)
|
||||||
|
.if \i > 0
|
||||||
|
ldrh_post r1, r8, #2
|
||||||
|
cmp r3, r1
|
||||||
|
it le
|
||||||
|
movle r1, #(32 - \i)/2
|
||||||
|
ble 1f
|
||||||
|
.endif
|
||||||
add r2, r6, #(\i*2)
|
add r2, r6, #(\i*2)
|
||||||
bl idct32_1d_4x32_pass1_neon
|
bl idct32_1d_4x32_pass1_neon
|
||||||
.endr
|
.endr
|
||||||
|
b 3f
|
||||||
|
|
||||||
|
1:
|
||||||
|
@ Write zeros to the temp buffer for pass 2
|
||||||
|
vmov.i16 q14, #0
|
||||||
|
vmov.i16 q15, #0
|
||||||
|
2:
|
||||||
|
subs r1, r1, #1
|
||||||
|
.rept 4
|
||||||
|
vst1.16 {q14-q15}, [r0,:128]!
|
||||||
|
.endr
|
||||||
|
bne 2b
|
||||||
|
3:
|
||||||
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
|
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
|
||||||
add r0, r4, #(\i)
|
add r0, r4, #(\i)
|
||||||
mov r1, r5
|
mov r1, r5
|
||||||
@ -1141,5 +1198,5 @@ A and r7, sp, #15
|
|||||||
|
|
||||||
add sp, sp, r7
|
add sp, sp, r7
|
||||||
vpop {q4-q7}
|
vpop {q4-q7}
|
||||||
pop {r4-r7,pc}
|
pop {r4-r8,pc}
|
||||||
endfunc
|
endfunc
|
||||||
|
@ -334,8 +334,10 @@ static void check_itxfm(void)
|
|||||||
// skip testing sub-IDCTs for WHT or ADST since they don't
|
// skip testing sub-IDCTs for WHT or ADST since they don't
|
||||||
// implement it in any of the SIMD functions. If they do,
|
// implement it in any of the SIMD functions. If they do,
|
||||||
// consider changing this to ensure we have complete test
|
// consider changing this to ensure we have complete test
|
||||||
// coverage
|
// coverage. Test sub=1 for dc-only, then 2, 4, 8, 12, etc,
|
||||||
for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; sub <<= 1) {
|
// since the arm version can distinguish them at that level.
|
||||||
|
for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;
|
||||||
|
sub < 4 ? (sub <<= 1) : (sub += 4)) {
|
||||||
if (check_func(dsp.itxfm_add[tx][txtp],
|
if (check_func(dsp.itxfm_add[tx][txtp],
|
||||||
"vp9_inv_%s_%dx%d_sub%d_add_%d",
|
"vp9_inv_%s_%dx%d_sub%d_add_%d",
|
||||||
tx == 4 ? "wht_wht" : txtp_types[txtp],
|
tx == 4 ? "wht_wht" : txtp_types[txtp],
|
||||||
|
Loading…
Reference in New Issue
Block a user