mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
aarch64: vp9: use alternative returns in the core loop filter function
Since aarch64 has enough free general-purpose registers, use them to branch to the appropriate storage code. 1-2 cycles faster for the functions using loop_filter 8/16, ... on a Cortex-A53. Mixed results (up to 2 cycles faster/slower) on a Cortex-A57.
This commit is contained in:
parent
e17567a831
commit
d7595de0b2
@ -410,15 +410,19 @@
|
|||||||
.endif
|
.endif
|
||||||
// If no pixels needed flat8in nor flat8out, jump to a
|
// If no pixels needed flat8in nor flat8out, jump to a
|
||||||
// writeout of the inner 4 pixels
|
// writeout of the inner 4 pixels
|
||||||
cbz x5, 7f
|
cbnz x5, 1f
|
||||||
|
br x14
|
||||||
|
1:
|
||||||
mov x5, v7.d[0]
|
mov x5, v7.d[0]
|
||||||
.ifc \sz, .16b
|
.ifc \sz, .16b
|
||||||
mov x6, v7.d[1]
|
mov x6, v7.d[1]
|
||||||
orr x5, x5, x6
|
orr x5, x5, x6
|
||||||
.endif
|
.endif
|
||||||
// If no pixels need flat8out, jump to a writeout of the inner 6 pixels
|
// If no pixels need flat8out, jump to a writeout of the inner 6 pixels
|
||||||
cbz x5, 8f
|
cbnz x5, 1f
|
||||||
|
br x15
|
||||||
|
|
||||||
|
1:
|
||||||
// flat8out
|
// flat8out
|
||||||
// This writes all outputs into v2-v17 (skipping v6 and v16).
|
// This writes all outputs into v2-v17 (skipping v6 and v16).
|
||||||
// If this part is skipped, the output is read from v21-v26 (which is the input
|
// If this part is skipped, the output is read from v21-v26 (which is the input
|
||||||
@ -549,35 +553,24 @@ endfunc
|
|||||||
|
|
||||||
function vp9_loop_filter_8
|
function vp9_loop_filter_8
|
||||||
loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
|
loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
|
||||||
mov x5, #0
|
|
||||||
ret
|
ret
|
||||||
6:
|
6:
|
||||||
mov x5, #6
|
br x13
|
||||||
ret
|
|
||||||
9:
|
9:
|
||||||
br x10
|
br x10
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
function vp9_loop_filter_8_16b_mix
|
function vp9_loop_filter_8_16b_mix
|
||||||
loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31
|
loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31
|
||||||
mov x5, #0
|
|
||||||
ret
|
ret
|
||||||
6:
|
6:
|
||||||
mov x5, #6
|
br x13
|
||||||
ret
|
|
||||||
9:
|
9:
|
||||||
br x10
|
br x10
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
function vp9_loop_filter_16
|
function vp9_loop_filter_16
|
||||||
loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15
|
loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15
|
||||||
mov x5, #0
|
|
||||||
ret
|
|
||||||
7:
|
|
||||||
mov x5, #7
|
|
||||||
ret
|
|
||||||
8:
|
|
||||||
mov x5, #8
|
|
||||||
ret
|
ret
|
||||||
9:
|
9:
|
||||||
ldp d8, d9, [sp], 0x10
|
ldp d8, d9, [sp], 0x10
|
||||||
@ -589,13 +582,6 @@ endfunc
|
|||||||
|
|
||||||
function vp9_loop_filter_16_16b
|
function vp9_loop_filter_16_16b
|
||||||
loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15
|
loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15
|
||||||
mov x5, #0
|
|
||||||
ret
|
|
||||||
7:
|
|
||||||
mov x5, #7
|
|
||||||
ret
|
|
||||||
8:
|
|
||||||
mov x5, #8
|
|
||||||
ret
|
ret
|
||||||
9:
|
9:
|
||||||
ldp d8, d9, [sp], 0x10
|
ldp d8, d9, [sp], 0x10
|
||||||
@ -614,11 +600,14 @@ endfunc
|
|||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro loop_filter_8
|
.macro loop_filter_8
|
||||||
|
// calculate alternative 'return' targets
|
||||||
|
adr x13, 6f
|
||||||
bl vp9_loop_filter_8
|
bl vp9_loop_filter_8
|
||||||
cbnz x5, 6f
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro loop_filter_8_16b_mix mix
|
.macro loop_filter_8_16b_mix mix
|
||||||
|
// calculate alternative 'return' targets
|
||||||
|
adr x13, 6f
|
||||||
.if \mix == 48
|
.if \mix == 48
|
||||||
mov x11, #0xffffffff00000000
|
mov x11, #0xffffffff00000000
|
||||||
.elseif \mix == 84
|
.elseif \mix == 84
|
||||||
@ -627,21 +616,20 @@ endfunc
|
|||||||
mov x11, #0xffffffffffffffff
|
mov x11, #0xffffffffffffffff
|
||||||
.endif
|
.endif
|
||||||
bl vp9_loop_filter_8_16b_mix
|
bl vp9_loop_filter_8_16b_mix
|
||||||
cbnz x5, 6f
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro loop_filter_16
|
.macro loop_filter_16
|
||||||
|
// calculate alternative 'return' targets
|
||||||
|
adr x14, 7f
|
||||||
|
adr x15, 8f
|
||||||
bl vp9_loop_filter_16
|
bl vp9_loop_filter_16
|
||||||
cmp x5, 7
|
|
||||||
b.gt 8f
|
|
||||||
b.eq 7f
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro loop_filter_16_16b
|
.macro loop_filter_16_16b
|
||||||
|
// calculate alternative 'return' targets
|
||||||
|
adr x14, 7f
|
||||||
|
adr x15, 8f
|
||||||
bl vp9_loop_filter_16_16b
|
bl vp9_loop_filter_16_16b
|
||||||
cmp x5, 7
|
|
||||||
b.gt 8f
|
|
||||||
b.eq 7f
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user