From e18c39005ad1dbb178b336f691da1de91afd434e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 10 Jan 2017 16:49:13 +0200
Subject: [PATCH 1/3] arm: vp9lpf: Interleave the start of flat8in into the
 calculation above
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds several extra .if blocks, but speeds the code up by a couple
of cycles by avoiding stalls.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavcodec/arm/vp9lpf_neon.S | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index c2f1c95427..1e161e0c63 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -182,16 +182,20 @@
 
         vmovl.u8        q0,  d22                @ p1
         vmovl.u8        q1,  d25                @ q1
+.if \wd >= 8
+        vmov            r2,  r3,  d6
+.endif
         vaddw.s8        q0,  q0,  \tmp3         @ p1 + f
         vsubw.s8        q1,  q1,  \tmp3         @ q1 - f
+.if \wd >= 8
+        orrs            r2,  r2,  r3
+.endif
         vqmovun.s16     d0,  q0                 @ out p1
         vqmovun.s16     d2,  q1                 @ out q1
         vbit            d22, d0,  d5            @ if (!hev && fm && !flat8in)
         vbit            d25, d2,  d5
 
 .if \wd >= 8
-        vmov            r2,  r3,  d6
-        orrs            r2,  r2,  r3
         @ If no pixels need flat8in, jump to flat8out
         @ (or to a writeout of the inner 4 pixels, for wd=8)
         beq             6f

From b0806088d3b27044145b20421da8d39089ae0c6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 10 Jan 2017 22:08:50 +0200
Subject: [PATCH 2/3] aarch64: vp9lpf: Interleave the start of flat8in into the
 calculation above
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds several extra .if blocks, but speeds the code up by a couple
of cycles by avoiding stalls.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavcodec/aarch64/vp9lpf_neon.S | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 3b8e6ebc99..2b8a478383 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -338,20 +338,28 @@
 
         uxtl_sz         v0.8h,  v1.8h,  v22, \sz    // p1
         uxtl_sz         v2.8h,  v3.8h,  v25, \sz    // q1
+.if \wd >= 8
+        mov             x5,  v6.d[0]
+.ifc \sz, .16b
+        mov             x6,  v6.d[1]
+.endif
+.endif
         saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
         ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
         sqxtun_sz       v0,  v0.8h,  v1.8h, \sz     // out p1
         sqxtun_sz       v2,  v2.8h,  v3.8h, \sz     // out q1
+.if \wd >= 8
+.ifc \sz, .16b
+        adds            x5,  x5,  x6
+.endif
+.endif
         bit             v22\sz, v0\sz,  v5\sz       // if (!hev && fm && !flat8in)
         bit             v25\sz, v2\sz,  v5\sz
 
         // If no pixels need flat8in, jump to flat8out
         // (or to a writeout of the inner 4 pixels, for wd=8)
 .if \wd >= 8
-        mov             x5,  v6.d[0]
 .ifc \sz, .16b
-        mov             x6,  v6.d[1]
-        adds            x5,  x5,  x6
         b.eq            6f
 .else
         cbz             x5,  6f

From 07b5136c481d394992c7e951967df0cfbb346c0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 11 Jan 2017 11:58:02 +0200
Subject: [PATCH 3/3] aarch64: vp9lpf: Fix broken indentation/vertical
 alignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavcodec/aarch64/vp9lpf_neon.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 2b8a478383..5fafc7ad5c 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -417,7 +417,7 @@
         mov             x5,  v2.d[0]
 .ifc \sz, .16b
         mov             x6,  v2.d[1]
-        adds             x5,  x5,  x6
+        adds            x5,  x5,  x6
         b.ne            1f
 .else
         cbnz            x5,  1f
@@ -430,7 +430,7 @@
         mov             x5,  v7.d[0]
 .ifc \sz, .16b
         mov             x6,  v7.d[1]
-        adds             x5,  x5,  x6
+        adds            x5,  x5,  x6
         b.ne            1f
 .else
         cbnz            x5,  1f