aarch64: vp9: use alternative returns in the core loop filter function

Since aarch64 has enough free general purpose registers, use them to branch to the appropriate storage code. 1-2 cycles faster for the functions using loop_filter 8/16, ... on a cortex-a53. Mixed results (up to 2 cycles faster/slower) on a cortex-a57.
2025-08-10 06:10:52 +02:00 · 2016-11-14 00:13:34 +01:00
parent e17567a831
commit d7595de0b2
1 changed files with 18 additions and 30 deletions
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -410,15 +410,19 @@
 .endif
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
-        cbz             x5,  7f
+        cbnz            x5,  1f
+        br              x14
+1:
        mov             x5,  v7.d[0]
 .ifc \sz, .16b
        mov             x6,  v7.d[1]
        orr             x5,  x5,  x6
 .endif
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        cbz             x5,  8f
+        cbnz            x5,  1f
+        br              x15

+1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v6 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
@@ -549,35 +553,24 @@ endfunc

 function vp9_loop_filter_8
        loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
-        mov             x5,  #0
        ret
 6:
-        mov             x5,  #6
-        ret
+        br              x13             // alternative return: the loop_filter_8 macro loads x13 with the address of its label 6
 9:
        br              x10
 endfunc

 function vp9_loop_filter_8_16b_mix
        loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
-        mov             x5,  #0
        ret
 6:
-        mov             x5,  #6
-        ret
+        br              x13             // alternative return: the loop_filter_8_16b_mix macro loads x13 with the address of its label 6
 9:
        br              x10
 endfunc

 function vp9_loop_filter_16
        loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
-        mov             x5,  #0
-        ret
-7:
-        mov             x5,  #7
-        ret
-8:
-        mov             x5,  #8
        ret
 9:
        ldp             d8,  d9,  [sp], 0x10
@@ -589,13 +582,6 @@ endfunc

 function vp9_loop_filter_16_16b
        loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
-        mov             x5,  #0
-        ret
-7:
-        mov             x5,  #7
-        ret
-8:
-        mov             x5,  #8
        ret
 9:
        ldp             d8,  d9,  [sp], 0x10
@@ -614,11 +600,14 @@ endfunc
 .endm

 .macro loop_filter_8
+        // alternative 'return' target: vp9_loop_filter_8 leaves through 'br x13' at its label 6
+        adr             x13, 6f         // x13 = address of the next local label 6
        bl              vp9_loop_filter_8
-        cbnz            x5,  6f
 .endm

 .macro loop_filter_8_16b_mix mix
+        // calculate alternative 'return' targets
+        adr             x13, 6f
 .if \mix == 48
        mov             x11, #0xffffffff00000000
 .elseif \mix == 84
@@ -627,21 +616,20 @@ endfunc
        mov             x11, #0xffffffffffffffff
 .endif
        bl              vp9_loop_filter_8_16b_mix
-        cbnz            x5,  6f
 .endm

 .macro loop_filter_16
+        // alternative 'return' targets, reached through 'br x14' / 'br x15' in the filter code
+        adr             x14, 7f         // taken when neither flat8in nor flat8out is needed (inner 4 pixels)
+        adr             x15, 8f         // taken when flat8out is not needed (inner 6 pixels)
        bl              vp9_loop_filter_16
-        cmp             x5,  7
-        b.gt            8f
-        b.eq            7f
 .endm

 .macro loop_filter_16_16b
+        // alternative 'return' targets, reached through 'br x14' / 'br x15' in the filter code
+        adr             x14, 7f         // taken when neither flat8in nor flat8out is needed (inner 4 pixels)
+        adr             x15, 8f         // taken when flat8out is not needed (inner 6 pixels)
        bl              vp9_loop_filter_16_16b
-        cmp             x5,  7
-        b.gt            8f
-        b.eq            7f
 .endm