mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
arm: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling
This work is sponsored by, and copyright, Google.
Before:                             Cortex A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub1_add_neon:    273.0    189.5    211.7    235.8
vp9_inv_dct_dct_32x32_sub1_add_neon:    752.0    459.2    862.2    553.9
After:
vp9_inv_dct_dct_16x16_sub1_add_neon:    226.5    145.0    225.1    171.8
vp9_inv_dct_dct_32x32_sub1_add_neon:    721.2    415.7    727.6    475.0
This is cherry-picked from libav commit a76bf8cf12.
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
045e33ae3f
commit
758302e4bc
@@ -542,16 +542,23 @@ function idct16x16_dc_add_neon
 
         vrshr.s16 q8, q8, #6
 
+        mov r3, r0
         mov r12, #16
 1:
         @ Loop to add the constant from q8 into all 16x16 outputs
-        vld1.8 {q3}, [r0,:128]
-        vaddw.u8 q10, q8, d6
-        vaddw.u8 q11, q8, d7
-        vqmovun.s16 d6, q10
-        vqmovun.s16 d7, q11
-        vst1.8 {q3}, [r0,:128], r1
-        subs r12, r12, #1
+        subs r12, r12, #2
+        vld1.8 {q2}, [r0,:128], r1
+        vaddw.u8 q10, q8, d4
+        vld1.8 {q3}, [r0,:128], r1
+        vaddw.u8 q11, q8, d5
+        vaddw.u8 q12, q8, d6
+        vaddw.u8 q13, q8, d7
+        vqmovun.s16 d4, q10
+        vqmovun.s16 d5, q11
+        vqmovun.s16 d6, q12
+        vst1.8 {q2}, [r3,:128], r1
+        vqmovun.s16 d7, q13
+        vst1.8 {q3}, [r3,:128], r1
         bne 1b
 
         bx lr
@@ -1147,20 +1154,31 @@ function idct32x32_dc_add_neon
 
         vrshr.s16 q8, q8, #6
 
+        mov r3, r0
         mov r12, #32
 1:
         @ Loop to add the constant from q8 into all 32x32 outputs
-        vld1.8 {q2-q3}, [r0,:128]
-        vaddw.u8 q10, q8, d4
-        vaddw.u8 q11, q8, d5
-        vaddw.u8 q12, q8, d6
-        vaddw.u8 q13, q8, d7
-        vqmovun.s16 d4, q10
-        vqmovun.s16 d5, q11
-        vqmovun.s16 d6, q12
-        vqmovun.s16 d7, q13
-        vst1.8 {q2-q3}, [r0,:128], r1
-        subs r12, r12, #1
+        subs r12, r12, #2
+        vld1.8 {q0-q1}, [r0,:128], r1
+        vaddw.u8 q9, q8, d0
+        vaddw.u8 q10, q8, d1
+        vld1.8 {q2-q3}, [r0,:128], r1
+        vaddw.u8 q11, q8, d2
+        vaddw.u8 q12, q8, d3
+        vaddw.u8 q13, q8, d4
+        vaddw.u8 q14, q8, d5
+        vaddw.u8 q15, q8, d6
+        vqmovun.s16 d0, q9
+        vaddw.u8 q9, q8, d7
+        vqmovun.s16 d1, q10
+        vqmovun.s16 d2, q11
+        vqmovun.s16 d3, q12
+        vqmovun.s16 d4, q13
+        vqmovun.s16 d5, q14
+        vst1.8 {q0-q1}, [r3,:128], r1
+        vqmovun.s16 d6, q15
+        vqmovun.s16 d7, q9
+        vst1.8 {q2-q3}, [r3,:128], r1
         bne 1b
 
         bx lr
|
Loading…
Reference in New Issue
Block a user