From c582cb8537367721bb399a5d01b652c20142b756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 14 Jan 2017 20:49:19 +0200 Subject: [PATCH 1/9] arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The theoretical maximum value of E is 193, so we can just saturate the addition to 255. Before: Cortex A7 A8 A9 A53 A53/AArch64 vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.8 88.0 87.7 vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0 136.7 vp9_loop_filter_v_16_8_neon: 497.0 419.5 379.7 293.0 275.7 vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0 452.0 After: vp9_loop_filter_v_4_8_neon: 136.0 125.7 112.6 84.0 83.0 vp9_loop_filter_v_8_8_neon: 234.0 195.5 171.5 136.0 133.7 vp9_loop_filter_v_16_8_neon: 490.0 417.5 377.7 289.0 271.0 vp9_loop_filter_v_16_16_neon: 951.2 814.7 732.3 571.0 446.7 Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9lpf_neon.S | 40 +++++++------------------------- libavcodec/arm/vp9lpf_neon.S | 11 ++++----- 2 files changed, 14 insertions(+), 37 deletions(-) diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index 5fafc7ad5c..48cac4cac6 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -51,13 +51,6 @@ // see the arm version instead. -.macro uabdl_sz dst1, dst2, in1, in2, sz - uabdl \dst1, \in1\().8b, \in2\().8b -.ifc \sz, .16b - uabdl2 \dst2, \in1\().16b, \in2\().16b -.endif -.endm - .macro add_sz dst1, dst2, in1, in2, in3, in4, sz add \dst1, \in1, \in3 .ifc \sz, .16b @@ -86,20 +79,6 @@ .endif .endm -.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz - cmhs \dst1, \in1, \in3 -.ifc \sz, .16b - cmhs \dst2, \in2, \in4 -.endif -.endm - -.macro xtn_sz dst, in1, in2, sz - xtn \dst\().8b, \in1 -.ifc \sz, .16b - xtn2 \dst\().16b, \in2 -.endif -.endm - .macro usubl_sz dst1, dst2, in1, in2, sz usubl \dst1, \in1\().8b, \in2\().8b .ifc \sz, .16b @@ -179,20 +158,20 @@ // tmpq2 == tmp3 + tmp4, etc. 
.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8 .if \mix == 0 - dup v0.8h, w2 // E - dup v1.8h, w2 // E + dup v0\sz, w2 // E dup v2\sz, w3 // I dup v3\sz, w4 // H .else - dup v0.8h, w2 // E + dup v0.8b, w2 // E dup v2.8b, w3 // I dup v3.8b, w4 // H + lsr w5, w2, #8 lsr w6, w3, #8 lsr w7, w4, #8 - ushr v1.8h, v0.8h, #8 // E + dup v1.8b, w5 // E dup v4.8b, w6 // I - bic v0.8h, #255, lsl 8 // E dup v5.8b, w7 // H + trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d .endif @@ -206,16 +185,15 @@ umax v4\sz, v4\sz, v5\sz umax v5\sz, v6\sz, v7\sz umax \tmp1\sz, \tmp1\sz, \tmp2\sz - uabdl_sz v6.8h, v7.8h, v23, v24, \sz // abs(p0 - q0) + uabd v6\sz, v23\sz, v24\sz // abs(p0 - q0) umax v4\sz, v4\sz, v5\sz - add_sz v6.8h, v7.8h, v6.8h, v7.8h, v6.8h, v7.8h, \sz // abs(p0 - q0) * 2 + uqadd v6\sz, v6\sz, v6\sz // abs(p0 - q0) * 2 uabd v5\sz, v22\sz, v25\sz // abs(p1 - q1) umax v4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), ..., abs(q2 - q3)) ushr v5\sz, v5\sz, #1 cmhs v4\sz, v2\sz, v4\sz // max(abs()) <= I - uaddw_sz v6.8h, v7.8h, v6.8h, v7.8h, v5, \sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 - cmhs_sz v6.8h, v7.8h, v0.8h, v1.8h, v6.8h, v7.8h, \sz - xtn_sz v5, v6.8h, v7.8h, \sz + uqadd v6\sz, v6\sz, v5\sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + cmhs v5\sz, v0\sz, v6\sz and v4\sz, v4\sz, v5\sz // fm // If no pixels need filtering, just exit as soon as possible diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index 1e161e0c63..e31c807cc0 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -51,7 +51,7 @@ @ and d28-d31 as temp registers, or d8-d15. @ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4 .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4 - vdup.u16 q0, r2 @ E + vdup.u8 d0, r2 @ E vdup.u8 d2, r3 @ I ldr r3, [sp] @@ -64,16 +64,15 @@ vmax.u8 d4, d4, d5 vmax.u8 d5, d6, d7 vmax.u8 \tmp1, \tmp1, \tmp2 - vabdl.u8 q3, d23, d24 @ abs(p0 - q0) + vabd.u8 d6, d23, d24 @ abs(p0 - q0) vmax.u8 d4, d4, d5 - vadd.u16 q3, q3, q3 @ abs(p0 - q0) * 2 + vqadd.u8 d6, d6, d6 @ abs(p0 - q0) * 2 vabd.u8 d5, d22, d25 @ abs(p1 - q1) vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3)) vshr.u8 d5, d5, #1 vcle.u8 d4, d4, d2 @ max(abs()) <= I - vaddw.u8 q3, q3, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 - vcle.u16 q3, q3, q0 - vmovn.u16 d5, q3 + vqadd.u8 d6, d6, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + vcle.u8 d5, d6, d0 vand d4, d4, d5 @ fm vdup.u8 d3, r3 @ H From 3bf9c48320f25f3d5557485b0202f22ae60748b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 23 Feb 2017 23:33:58 +0200 Subject: [PATCH 2/9] aarch64: vp9lpf: Use dup+rev16+uzp1 instead of dup+lsr+dup+trn1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is one cycle faster in total, and three instructions fewer. 
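Spelled out per byte lane, the new sequence ends up with the same layout as the old
dup+lsr+dup+trn1 chain (w2 packs the two per-half 8-bit E values, E0 in the low byte
and E1 in the high byte; the comments below are only illustrative):

        dup             v0.8h,  w2              // E0, E1, E0, E1, ...
        rev16           v1.16b, v0.16b          // E1, E0, E1, E0, ...
        uzp1            v0.16b, v0.16b, v1.16b  // E0 x8 | E1 x8

uzp1 picks the even-numbered byte lanes of its two sources, so the low half of the
result holds E0 in every lane and the high half holds E1, which is the split the
mix2 loop filter needs for its two 8-pixel halves; I and H are handled the same way.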
Before: vp9_loop_filter_mix2_v_44_16_neon: 123.2 After: vp9_loop_filter_mix2_v_44_16_neon: 122.2 Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9lpf_neon.S | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index 48cac4cac6..e9c497096b 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -162,18 +162,15 @@ dup v2\sz, w3 // I dup v3\sz, w4 // H .else - dup v0.8b, w2 // E - dup v2.8b, w3 // I - dup v3.8b, w4 // H - lsr w5, w2, #8 - lsr w6, w3, #8 - lsr w7, w4, #8 - dup v1.8b, w5 // E - dup v4.8b, w6 // I - dup v5.8b, w7 // H - trn1 v0.2d, v0.2d, v1.2d - trn1 v2.2d, v2.2d, v4.2d - trn1 v3.2d, v3.2d, v5.2d + dup v0.8h, w2 // E + dup v2.8h, w3 // I + dup v3.8h, w4 // H + rev16 v1.16b, v0.16b // E + rev16 v4.16b, v2.16b // I + rev16 v5.16b, v3.16b // H + uzp1 v0.16b, v0.16b, v1.16b + uzp1 v2.16b, v2.16b, v4.16b + uzp1 v3.16b, v3.16b, v5.16b .endif uabd v4\sz, v20\sz, v21\sz // abs(p3 - p2) From 575e31e931e4178e9f1e24407503c9b4ec0ef9ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 14 Jan 2017 13:22:30 +0200 Subject: [PATCH 3/9] arm: vp9lpf: Implement the mix2_44 function with one single filter pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For this case, with 8 inputs but only changing 4 of them, we can fit all 16 input pixels into a q register, and still have enough temporary registers for doing the loop filter. The wd=8 filters would require too many temporary registers for processing all 16 pixels at once though. Before: Cortex A7 A8 A9 A53 vp9_loop_filter_mix2_v_44_16_neon: 289.7 256.2 237.5 181.2 After: vp9_loop_filter_mix2_v_44_16_neon: 221.2 150.5 177.7 138.0 Signed-off-by: Martin Storsjö --- libavcodec/arm/vp9dsp_init_arm.c | 7 +- libavcodec/arm/vp9lpf_neon.S | 191 +++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+), 3 deletions(-) diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c index e99d931674..1ede1708e7 100644 --- a/libavcodec/arm/vp9dsp_init_arm.c +++ b/libavcodec/arm/vp9dsp_init_arm.c @@ -194,6 +194,8 @@ define_loop_filters(8, 8); define_loop_filters(16, 8); define_loop_filters(16, 16); +define_loop_filters(44, 16); + #define lf_mix_fn(dir, wd1, wd2, stridea) \ static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, \ ptrdiff_t stride, \ @@ -207,7 +209,6 @@ static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, lf_mix_fn(h, wd1, wd2, stride) \ lf_mix_fn(v, wd1, wd2, sizeof(uint8_t)) -lf_mix_fns(4, 4) lf_mix_fns(4, 8) lf_mix_fns(8, 4) lf_mix_fns(8, 8) @@ -227,8 +228,8 @@ static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp) dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon; dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon; - dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_neon; - dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_neon; + dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon; + dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon; dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon; dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon; dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon; diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index e31c807cc0..12984a900c 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -44,6 +44,109 @@ vtrn.8 \r2, \r3 
.endm +@ The input to and output from this macro is in the registers q8-q15, +@ and q0-q7 are used as scratch registers. +@ p3 = q8, p0 = q11, q0 = q12, q3 = q15 +.macro loop_filter_q + vdup.u8 d0, r2 @ E + lsr r2, r2, #8 + vdup.u8 d2, r3 @ I + lsr r3, r3, #8 + vdup.u8 d1, r2 @ E + vdup.u8 d3, r3 @ I + + vabd.u8 q2, q8, q9 @ abs(p3 - p2) + vabd.u8 q3, q9, q10 @ abs(p2 - p1) + vabd.u8 q4, q10, q11 @ abs(p1 - p0) + vabd.u8 q5, q12, q13 @ abs(q0 - q1) + vabd.u8 q6, q13, q14 @ abs(q1 - q2) + vabd.u8 q7, q14, q15 @ abs(q2 - q3) + vmax.u8 q2, q2, q3 + vmax.u8 q3, q4, q5 + vmax.u8 q4, q6, q7 + vabd.u8 q5, q11, q12 @ abs(p0 - q0) + vmax.u8 q2, q2, q3 + vqadd.u8 q5, q5, q5 @ abs(p0 - q0) * 2 + vabd.u8 q7, q10, q13 @ abs(p1 - q1) + vmax.u8 q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - q3)) + vshr.u8 q7, q7, #1 + vcle.u8 q2, q2, q1 @ max(abs()) <= I + vqadd.u8 q5, q5, q7 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + vcle.u8 q5, q5, q0 + vand q2, q2, q5 @ fm + + vshrn.u16 d10, q2, #4 + vmov r2, r3, d10 + orrs r2, r2, r3 + @ If no pixels need filtering, just exit as soon as possible + beq 9f + + @ Calculate the normal inner loop filter for 2 or 4 pixels + ldr r3, [sp, #64] + vabd.u8 q3, q10, q11 @ abs(p1 - p0) + vabd.u8 q4, q13, q12 @ abs(q1 - q0) + + vsubl.u8 q5, d20, d26 @ p1 - q1 + vsubl.u8 q6, d21, d27 @ p1 - q1 + vmax.u8 q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0)) + vqmovn.s16 d10, q5 @ av_clip_int8p(p1 - q1) + vqmovn.s16 d11, q6 @ av_clip_int8p(p1 - q1) + vdup.u8 d8, r3 @ H + lsr r3, r3, #8 + vdup.u8 d9, r3 @ H + vsubl.u8 q6, d24, d22 @ q0 - p0 + vsubl.u8 q7, d25, d23 @ q0 - p0 + vcle.u8 q3, q3, q4 @ hev + vmov.s16 q0, #3 + vand q3, q3, q2 @ !hev && fm && !flat8in + + vmul.s16 q6, q6, q0 @ 3 * (q0 - p0) + vmul.s16 q7, q7, q0 @ 3 * (q0 - p0) + vbic q5, q5, q3 @ if (!hev) av_clip_int8 = 0 + vaddw.s8 q6, q6, d10 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)] + vaddw.s8 q7, q7, d11 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)] + vmov.s8 q5, #4 + vqmovn.s16 d12, q6 + vqmovn.s16 d13, q7 @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f + vmov.s8 q0, #3 + + vqadd.s8 q5, q6, q5 @ FFMIN(f + 4, 127) + vqadd.s8 q0, q6, q0 @ FFMIN(f + 3, 127) + vmovl.u8 q6, d22 @ p0 + vmovl.u8 q7, d23 @ p0 + vshr.s8 q5, q5, #3 @ f1 + vshr.s8 q0, q0, #3 @ f2 + + vaddw.s8 q6, q6, d0 @ p0 + f2 + vaddw.s8 q7, q7, d1 @ p0 + f2 + vqmovun.s16 d0, q6 @ out p0 + vmovl.u8 q6, d24 @ q0 + vqmovun.s16 d1, q7 @ out p0 + vmovl.u8 q7, d25 @ q0 + vsubw.s8 q6, q6, d10 @ q0 - f1 + vsubw.s8 q7, q7, d11 @ q0 - f1 + vqmovun.s16 d12, q6 @ out q0 + vqmovun.s16 d13, q7 @ out q0 + vrshr.s8 q5, q5, #1 @ f = (f1 + 1) >> 1 + vbit q11, q0, q2 @ if (fm && !flat8in) + vbit q12, q6, q2 + + vmovl.u8 q0, d20 @ p1 + vmovl.u8 q2, d21 @ p1 + vmovl.u8 q6, d26 @ q1 + vmovl.u8 q7, d27 @ q1 + vaddw.s8 q0, q0, d10 @ p1 + f + vaddw.s8 q2, q2, d11 @ p1 + f + vsubw.s8 q6, q6, d10 @ q1 - f + vsubw.s8 q7, q7, d11 @ q1 - f + vqmovun.s16 d0, q0 @ out p1 + vqmovun.s16 d1, q2 @ out p1 + vqmovun.s16 d12, q6 @ out q1 + vqmovun.s16 d13, q7 @ out q1 + vbit q10, q0, q3 @ if (!hev && fm && !flat8in) + vbit q13, q6, q3 +.endm + @ The input to and output from this macro is in the registers d16-d31, @ and d0-d7 are used as scratch registers. @ p7 = d16 .. 
p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31 @@ -455,6 +558,94 @@ function ff_vp9_loop_filter_h_4_8_neon, export=1 bx lr endfunc +function ff_vp9_loop_filter_v_44_16_neon, export=1 + vpush {q4-q7} + sub r12, r0, r1, lsl #2 + vld1.8 {q8}, [r12,:128], r1 @ p3 + vld1.8 {q12}, [r0, :128], r1 @ q0 + vld1.8 {q9}, [r12,:128], r1 @ p2 + vld1.8 {q13}, [r0, :128], r1 @ q1 + vld1.8 {q10}, [r12,:128], r1 @ p1 + vld1.8 {q14}, [r0, :128], r1 @ q2 + vld1.8 {q11}, [r12,:128], r1 @ p0 + vld1.8 {q15}, [r0, :128], r1 @ q3 + sub r0, r0, r1, lsl #2 + sub r12, r12, r1, lsl #1 + + loop_filter_q + + vst1.8 {q10}, [r12,:128], r1 + vst1.8 {q12}, [r0, :128], r1 + vst1.8 {q11}, [r12,:128], r1 + vst1.8 {q13}, [r0, :128], r1 +9: + vpop {q4-q7} + bx lr +endfunc + +function ff_vp9_loop_filter_h_44_16_neon, export=1 + vpush {q4-q7} + sub r12, r0, #4 + add r0, r12, r1, lsl #2 + vld1.8 {d16}, [r12], r1 + vld1.8 {d24}, [r0], r1 + vld1.8 {d18}, [r12], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d20}, [r12], r1 + vld1.8 {d28}, [r0], r1 + vld1.8 {d22}, [r12], r1 + vld1.8 {d30}, [r0], r1 + mov r12, r0 + add r0, r0, r1, lsl #2 + vld1.8 {d17}, [r12], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d19}, [r12], r1 + vld1.8 {d27}, [r0], r1 + vld1.8 {d21}, [r12], r1 + vld1.8 {d29}, [r0], r1 + vld1.8 {d23}, [r12], r1 + vld1.8 {d31}, [r0], r1 + + @ Transpose the 16x8 pixels, as two 8x8 parts + transpose_8x8 q8, q9, q10, q11, q12, q13, q14, q15 + + loop_filter_q + + sub r12, r0, r1, lsl #4 + add r0, r12, r1, lsl #3 + @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the + @ outermost 2 pixels since they aren't changed. + add r12, r12, #2 + add r0, r0, #2 + + @ We only will write the mid 4 pixels back; after the loop filter, + @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels). + @ We need to transpose them to columns, done with a 4x4 transpose + @ (which in practice is four 4x4 transposes of the 4x4 blocks of + @ the 16x4 pixels; into 4x16 pixels). + transpose_4x4 q10, q11, q12, q13 + + vst1.32 {d20[0]}, [r12], r1 + vst1.32 {d21[0]}, [r0], r1 + vst1.32 {d22[0]}, [r12], r1 + vst1.32 {d23[0]}, [r0], r1 + vst1.32 {d24[0]}, [r12], r1 + vst1.32 {d25[0]}, [r0], r1 + vst1.32 {d26[0]}, [r12], r1 + vst1.32 {d27[0]}, [r0], r1 + vst1.32 {d20[1]}, [r12], r1 + vst1.32 {d21[1]}, [r0], r1 + vst1.32 {d22[1]}, [r12], r1 + vst1.32 {d23[1]}, [r0], r1 + vst1.32 {d24[1]}, [r12], r1 + vst1.32 {d25[1]}, [r0], r1 + vst1.32 {d26[1]}, [r12], r1 + vst1.32 {d27[1]}, [r0], r1 +9: + vpop {q4-q7} + bx lr +endfunc + function ff_vp9_loop_filter_v_8_8_neon, export=1 sub r12, r0, r1, lsl #2 vld1.8 {d20}, [r12,:64], r1 @ p3 From 402546a17233a8815307df9e14ff88cd70424537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 2 Jan 2017 22:50:38 +0200 Subject: [PATCH 4/9] arm: vp9itxfm: Avoid reloading the idct32 coefficients MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idct32x32 function actually pushed q4-q7 onto the stack even though it didn't clobber them; there are plenty of registers that can be used to allow keeping all the idct coefficients in registers without having to reload different subsets of them at different stages in the transform. Since the idct16 core transform avoids clobbering q4-q7 (but clobbers q2-q3 instead, to avoid needing to back up and restore q4-q7 at all in the idct16 function), and the lanewise vmul needs a register in the q0-q3 range, we move the stored coefficients from q2-q3 into q4-q5 while doing idct16. 
While keeping these coefficients in registers, we still can skip pushing q7. Before: Cortex A7 A8 A9 A53 vp9_inv_dct_dct_32x32_sub32_add_neon: 18553.8 17182.7 14303.3 12089.7 After: vp9_inv_dct_dct_32x32_sub32_add_neon: 18470.3 16717.7 14173.6 11860.8 Signed-off-by: Martin Storsjö --- libavcodec/arm/vp9itxfm_neon.S | 240 ++++++++++++++++----------------- 1 file changed, 117 insertions(+), 123 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 8dc4bbfa55..bed502eba2 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -1185,58 +1185,51 @@ function idct32x32_dc_add_neon endfunc .macro idct32_end - butterfly d16, d5, d4, d5 @ d16 = t16a, d5 = t19a + butterfly d16, d9, d8, d9 @ d16 = t16a, d9 = t19a butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18 - butterfly d18, d6, d7, d6 @ d18 = t23a, d6 = t20a + butterfly d18, d10, d11, d10 @ d18 = t23a, d10 = t20a butterfly d19, d21, d22, d21 @ d19 = t22, d21 = t21 - butterfly d4, d28, d28, d30 @ d4 = t24a, d28 = t27a + butterfly d8, d28, d28, d30 @ d8 = t24a, d28 = t27a butterfly d23, d26, d25, d26 @ d23 = t25, d26 = t26 - butterfly d7, d29, d29, d31 @ d7 = t31a, d29 = t28a + butterfly d11, d29, d29, d31 @ d11 = t31a, d29 = t28a butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29 mbutterfly d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = t29a - mbutterfly d29, d5, d0[1], d0[2], q12, q15 @ d29 = t19, d5 = t28 - mbutterfly d28, d6, d0[1], d0[2], q12, q15, neg=1 @ d28 = t27, d6 = t20 + mbutterfly d29, d9, d0[1], d0[2], q12, q15 @ d29 = t19, d9 = t28 + mbutterfly d28, d10, d0[1], d0[2], q12, q15, neg=1 @ d28 = t27, d10 = t20 mbutterfly d26, d21, d0[1], d0[2], q12, q15, neg=1 @ d26 = t26a, d21 = t21a - butterfly d31, d24, d7, d4 @ d31 = t31, d24 = t24 + butterfly d31, d24, d11, d8 @ d31 = t31, d24 = t24 butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a butterfly_r d23, d16, d16, d18 @ d23 = t23, d16 = t16 butterfly_r d22, d17, d17, d19 @ d22 = t22a, d17 = t17a butterfly d18, d21, d27, d21 @ d18 = t18, d21 = t21 - butterfly_r d27, d28, d5, d28 @ d27 = t27a, d28 = t28a - butterfly d4, d26, d20, d26 @ d4 = t29, d26 = t26 - butterfly d19, d20, d29, d6 @ d19 = t19a, d20 = t20 - vmov d29, d4 @ d29 = t29 + butterfly_r d27, d28, d9, d28 @ d27 = t27a, d28 = t28a + butterfly d8, d26, d20, d26 @ d8 = t29, d26 = t26 + butterfly d19, d20, d29, d10 @ d19 = t19a, d20 = t20 + vmov d29, d8 @ d29 = t29 - mbutterfly0 d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27, d20 = t20 - mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a - mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 - mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a + mbutterfly0 d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27, d20 = t20 + mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a + mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22 + mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a bx lr .endm function idct32_odd - movrel r12, idct_coeffs - add r12, r12, #32 - vld1.16 {q0-q1}, [r12,:128] + mbutterfly d16, d31, d4[0], d4[1], q4, q5 @ d16 = t16a, d31 = t31a + mbutterfly d24, d23, d4[2], d4[3], q4, q5 @ d24 = t17a, d23 = t30a + mbutterfly d20, d27, d5[0], d5[1], q4, q5 @ d20 = t18a, d27 = t29a + mbutterfly d28, d19, d5[2], d5[3], q4, q5 @ d28 = t19a, d19 = t28a + mbutterfly d18, d29, d6[0], d6[1], q4, q5 @ d18 = t20a, d29 = t27a + mbutterfly d26, d21, d6[2], d6[3], q4, q5 @ d26 = t21a, d21 = 
t26a + mbutterfly d22, d25, d7[0], d7[1], q4, q5 @ d22 = t22a, d25 = t25a + mbutterfly d30, d17, d7[2], d7[3], q4, q5 @ d30 = t23a, d17 = t24a - mbutterfly d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a - mbutterfly d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a - mbutterfly d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a - mbutterfly d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a - mbutterfly d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a - mbutterfly d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a - mbutterfly d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a - mbutterfly d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a - - sub r12, r12, #32 - vld1.16 {q0}, [r12,:128] - - butterfly d4, d24, d16, d24 @ d4 = t16, d24 = t17 - butterfly d5, d20, d28, d20 @ d5 = t19, d20 = t18 - butterfly d6, d26, d18, d26 @ d6 = t20, d26 = t21 - butterfly d7, d22, d30, d22 @ d7 = t23, d22 = t22 + butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17 + butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18 + butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21 + butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22 butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25 butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26 butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 @@ -1250,26 +1243,19 @@ function idct32_odd endfunc function idct32_odd_half - movrel r12, idct_coeffs - add r12, r12, #32 - vld1.16 {q0-q1}, [r12,:128] + mbutterfly_h1 d16, d31, d4[0], d4[1], q4, q5 @ d16 = t16a, d31 = t31a + mbutterfly_h2 d24, d23, d4[2], d4[3], q4, q5 @ d24 = t17a, d23 = t30a + mbutterfly_h1 d20, d27, d5[0], d5[1], q4, q5 @ d20 = t18a, d27 = t29a + mbutterfly_h2 d28, d19, d5[2], d5[3], q4, q5 @ d28 = t19a, d19 = t28a + mbutterfly_h1 d18, d29, d6[0], d6[1], q4, q5 @ d18 = t20a, d29 = t27a + mbutterfly_h2 d26, d21, d6[2], d6[3], q4, q5 @ d26 = t21a, d21 = t26a + mbutterfly_h1 d22, d25, d7[0], d7[1], q4, q5 @ d22 = t22a, d25 = t25a + mbutterfly_h2 d30, d17, d7[2], d7[3], q4, q5 @ d30 = t23a, d17 = t24a - mbutterfly_h1 d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a - mbutterfly_h2 d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a - mbutterfly_h1 d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a - mbutterfly_h2 d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a - mbutterfly_h1 d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a - mbutterfly_h2 d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a - mbutterfly_h1 d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a - mbutterfly_h2 d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a - - sub r12, r12, #32 - vld1.16 {q0}, [r12,:128] - - butterfly d4, d24, d16, d24 @ d4 = t16, d24 = t17 - butterfly d5, d20, d28, d20 @ d5 = t19, d20 = t18 - butterfly d6, d26, d18, d26 @ d6 = t20, d26 = t21 - butterfly d7, d22, d30, d22 @ d7 = t23, d22 = t22 + butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17 + butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18 + butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21 + butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22 butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25 butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26 butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 @@ -1284,45 +1270,38 @@ function idct32_odd_half endfunc function idct32_odd_quarter - movrel r12, idct_coeffs - add r12, r12, #32 - vld1.16 {q0-q1}, [r12,:128] - - vmull.s16 q2, d16, d0[0] - vmull.s16 q14, d19, d1[3] - vmull.s16 q15, d16, d0[1] - vmull.s16 q11, d17, d3[2] - vmull.s16 q3, d17, d3[3] 
- vmull.s16 q13, d19, d1[2] - vmull.s16 q10, d18, d2[0] - vmull.s16 q12, d18, d2[1] - - sub r12, r12, #32 - vld1.16 {q0}, [r12,:128] + vmull.s16 q4, d16, d4[0] + vmull.s16 q14, d19, d5[3] + vmull.s16 q15, d16, d4[1] + vmull.s16 q11, d17, d7[2] + vmull.s16 q5, d17, d7[3] + vmull.s16 q13, d19, d5[2] + vmull.s16 q10, d18, d6[0] + vmull.s16 q12, d18, d6[1] vneg.s32 q14, q14 - vneg.s32 q3, q3 + vneg.s32 q5, q5 - vrshrn.s32 d4, q2, #14 - vrshrn.s32 d5, q14, #14 + vrshrn.s32 d8, q4, #14 + vrshrn.s32 d9, q14, #14 vrshrn.s32 d29, q15, #14 vrshrn.s32 d28, q11, #14 - vrshrn.s32 d7, q3, #14 + vrshrn.s32 d11, q5, #14 vrshrn.s32 d31, q13, #14 - vrshrn.s32 d6, q10, #14 + vrshrn.s32 d10, q10, #14 vrshrn.s32 d30, q12, #14 - mbutterfly_l q8, q9, d29, d4, d0[3], d1[0] - mbutterfly_l q13, q10, d31, d5, d0[3], d1[0] + mbutterfly_l q8, q9, d29, d8, d0[3], d1[0] + mbutterfly_l q13, q10, d31, d9, d0[3], d1[0] vrshrn.s32 d23, q8, #14 vrshrn.s32 d24, q9, #14 vneg.s32 q10, q10 vrshrn.s32 d27, q13, #14 vrshrn.s32 d20, q10, #14 - mbutterfly_l q8, q9, d30, d6, d1[1], d1[2] + mbutterfly_l q8, q9, d30, d10, d1[1], d1[2] vrshrn.s32 d21, q8, #14 vrshrn.s32 d26, q9, #14 - mbutterfly_l q8, q9, d28, d7, d1[1], d1[2] + mbutterfly_l q8, q9, d28, d11, d1[1], d1[2] vrshrn.s32 d25, q8, #14 vneg.s32 q9, q9 vrshrn.s32 d22, q9, #14 @@ -1343,8 +1322,11 @@ endfunc function idct32_1d_4x32_pass1\suffix\()_neon push {lr} - movrel r12, idct_coeffs - vld1.16 {q0-q1}, [r12,:128] + @ idct16 clobbers q2-q3 (since it doesn't clobber q4-q7 at all + @ when doing the normal 16x16 idct), so move the idct32_odd coeffs + @ to q4-q5 + vmov q4, q2 + vmov q5, q3 @ Double stride of the input, since we only read every other line mov r12, #128 @@ -1372,6 +1354,11 @@ function idct32_1d_4x32_pass1\suffix\()_neon bl idct16\suffix + @ Move the idct32_odd coeffs back into q2-q3 for idct32_odd; + @ the constants for a vmul with a lane must be in q0-q3. + vmov q2, q4 + vmov q3, q5 + @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @ contain the transposed 4x4 blocks. @@ -1407,24 +1394,24 @@ function idct32_1d_4x32_pass1\suffix\()_neon .endif add r2, r2, #64 - vmov.s16 d4, #0 + vmov.s16 d8, #0 @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31) .ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64] - vst1.16 {d4}, [r2,:64], r12 + vst1.16 {d8}, [r2,:64], r12 .endr .endif .ifc \suffix,_quarter .irp i, 16, 17, 18, 19 vld1.16 {d\i}, [r2,:64] - vst1.16 {d4}, [r2,:64], r12 + vst1.16 {d8}, [r2,:64], r12 .endr .endif .ifc \suffix,_half .irp i, 16, 17, 18, 19, 20, 21, 22, 23 vld1.16 {d\i}, [r2,:64] - vst1.16 {d4}, [r2,:64], r12 + vst1.16 {d8}, [r2,:64], r12 .endr .endif @@ -1437,15 +1424,15 @@ function idct32_1d_4x32_pass1\suffix\()_neon @ from the output. .macro store_rev a, b, c, d .irp i, \a, \b, \c, \d - vld1.16 {d4}, [r0,:64] - vadd.s16 d4, d4, d\i - vst1.16 {d4}, [r0,:64]! + vld1.16 {d8}, [r0,:64] + vadd.s16 d8, d8, d\i + vst1.16 {d8}, [r0,:64]! vrev64.16 d\i, d\i .endr .irp i, \d, \c, \b, \a - vld1.16 {d4}, [r0,:64] - vsub.s16 d4, d4, d\i - vst1.16 {d4}, [r0,:64]! + vld1.16 {d8}, [r0,:64] + vsub.s16 d8, d8, d\i + vst1.16 {d8}, [r0,:64]! .endr .endm @@ -1466,8 +1453,8 @@ endfunc @ r2 = src (temp buffer) function idct32_1d_4x32_pass2\suffix\()_neon push {lr} - movrel r12, idct_coeffs - vld1.16 {q0-q1}, [r12,:128] + vmov q4, q2 + vmov q5, q3 mov r12, #128 @ d16 = IN(0), d17 = IN(2) ... 
d31 = IN(30) @@ -1492,6 +1479,9 @@ function idct32_1d_4x32_pass2\suffix\()_neon bl idct16\suffix + vmov q2, q4 + vmov q3, q5 + .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vst1.16 {d\i}, [r2,:64], r12 .endr @@ -1524,38 +1514,38 @@ function idct32_1d_4x32_pass2\suffix\()_neon mov r12, #128 .macro load_acc_store a, b, c, d, neg=0 - vld1.16 {d4}, [r2,:64], r12 - vld1.16 {d5}, [r2,:64], r12 + vld1.16 {d8}, [r2,:64], r12 + vld1.16 {d9}, [r2,:64], r12 .if \neg == 0 - vadd.s16 d4, d4, d\a - vld1.16 {d6}, [r2,:64], r12 - vadd.s16 d5, d5, d\b - vld1.16 {d7}, [r2,:64], r12 - vadd.s16 d6, d6, d\c - vadd.s16 d7, d7, d\d + vadd.s16 d8, d8, d\a + vld1.16 {d10}, [r2,:64], r12 + vadd.s16 d9, d9, d\b + vld1.16 {d11}, [r2,:64], r12 + vadd.s16 d10, d10, d\c + vadd.s16 d11, d11, d\d .else - vsub.s16 d4, d4, d\a - vld1.16 {d6}, [r2,:64], r12 - vsub.s16 d5, d5, d\b - vld1.16 {d7}, [r2,:64], r12 - vsub.s16 d6, d6, d\c - vsub.s16 d7, d7, d\d + vsub.s16 d8, d8, d\a + vld1.16 {d10}, [r2,:64], r12 + vsub.s16 d9, d9, d\b + vld1.16 {d11}, [r2,:64], r12 + vsub.s16 d10, d10, d\c + vsub.s16 d11, d11, d\d .endif - vld1.32 {d2[]}, [r0,:32], r1 - vld1.32 {d2[1]}, [r0,:32], r1 - vrshr.s16 q2, q2, #6 - vld1.32 {d3[]}, [r0,:32], r1 - vrshr.s16 q3, q3, #6 - vld1.32 {d3[1]}, [r0,:32], r1 + vld1.32 {d12[]}, [r0,:32], r1 + vld1.32 {d12[1]}, [r0,:32], r1 + vrshr.s16 q4, q4, #6 + vld1.32 {d13[]}, [r0,:32], r1 + vrshr.s16 q5, q5, #6 + vld1.32 {d13[1]}, [r0,:32], r1 sub r0, r0, r1, lsl #2 - vaddw.u8 q2, q2, d2 - vaddw.u8 q3, q3, d3 - vqmovun.s16 d4, q2 - vqmovun.s16 d5, q3 - vst1.32 {d4[0]}, [r0,:32], r1 - vst1.32 {d4[1]}, [r0,:32], r1 - vst1.32 {d5[0]}, [r0,:32], r1 - vst1.32 {d5[1]}, [r0,:32], r1 + vaddw.u8 q4, q4, d12 + vaddw.u8 q5, q5, d13 + vqmovun.s16 d8, q4 + vqmovun.s16 d9, q5 + vst1.32 {d8[0]}, [r0,:32], r1 + vst1.32 {d8[1]}, [r0,:32], r1 + vst1.32 {d9[0]}, [r0,:32], r1 + vst1.32 {d9[1]}, [r0,:32], r1 .endm load_acc_store 31, 30, 29, 28 load_acc_store 27, 26, 25, 24 @@ -1584,7 +1574,7 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 cmp r3, #1 beq idct32x32_dc_add_neon push {r4-r8,lr} - vpush {q4-q7} + vpush {q4-q6} movrel r8, min_eob_idct_idct_32 + 2 @ Align the stack, allocate a temp buffer @@ -1598,6 +1588,10 @@ A and r7, sp, #15 mov r5, r1 mov r6, r2 + movrel r12, idct_coeffs + vld1.16 {q0-q1}, [r12,:128]! + vld1.16 {q2-q3}, [r12,:128] + cmp r3, #34 ble idct32x32_quarter_add_neon cmp r3, #135 @@ -1636,7 +1630,7 @@ A and r7, sp, #15 .endr add sp, sp, r7 - vpop {q4-q7} + vpop {q4-q6} pop {r4-r8,pc} endfunc @@ -1668,7 +1662,7 @@ function idct32x32_quarter_add_neon .endr add sp, sp, r7 - vpop {q4-q7} + vpop {q4-q6} pop {r4-r8,pc} endfunc @@ -1706,6 +1700,6 @@ function idct32x32_half_add_neon .endr add sp, sp, r7 - vpop {q4-q7} + vpop {q4-q6} pop {r4-r8,pc} endfunc From 65aa002d54433154a6924dc13e498bec98451ad0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 2 Jan 2017 22:08:41 +0200 Subject: [PATCH 5/9] aarch64: vp9itxfm: Avoid reloading the idct32 coefficients MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idct32x32 function actually pushed d8-d15 onto the stack even though it didn't clobber them; there are plenty of registers that can be used to allow keeping all the idct coefficients in registers without having to reload different subsets of them at different stages in the transform. After this, we still can skip pushing d12-d15. 
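The pushes are only needed at all because the AAPCS requires d8-d15 (the low 64 bits
of v8-v15) to be preserved across calls; with the coefficients kept in v8-v9 and
v12-v15 left untouched, the save/restore that remains is just:

        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
        ...
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10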
Before: vp9_inv_dct_dct_32x32_sub32_add_neon: 8128.3 After: vp9_inv_dct_dct_32x32_sub32_add_neon: 8053.3 Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9itxfm_neon.S | 110 +++++++++++------------------ 1 file changed, 43 insertions(+), 67 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index d35f103a79..b6c2575236 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -1123,18 +1123,14 @@ endfunc .endm function idct32_odd - ld1 {v0.8h,v1.8h}, [x11] - - dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a - dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a - dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a - dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a - dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a - dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a - dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a - dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a - - ld1 {v0.8h}, [x10] + dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 @@ -1153,18 +1149,14 @@ function idct32_odd endfunc function idct32_odd_half - ld1 {v0.8h,v1.8h}, [x11] - - dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a - dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a - dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a - dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a - dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a - dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a - dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a - dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a - - ld1 {v0.8h}, [x10] + dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a butterfly_8h v4, v24, v16, v24 // v4 = 
t16, v24 = t17 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 @@ -1183,18 +1175,14 @@ function idct32_odd_half endfunc function idct32_odd_quarter - ld1 {v0.8h,v1.8h}, [x11] - - dsmull_h v4, v5, v16, v0.h[0] - dsmull_h v28, v29, v19, v0.h[7] - dsmull_h v30, v31, v16, v0.h[1] - dsmull_h v22, v23, v17, v1.h[6] - dsmull_h v7, v6, v17, v1.h[7] - dsmull_h v26, v27, v19, v0.h[6] - dsmull_h v20, v21, v18, v1.h[0] - dsmull_h v24, v25, v18, v1.h[1] - - ld1 {v0.8h}, [x10] + dsmull_h v4, v5, v16, v8.h[0] + dsmull_h v28, v29, v19, v8.h[7] + dsmull_h v30, v31, v16, v8.h[1] + dsmull_h v22, v23, v17, v9.h[6] + dsmull_h v7, v6, v17, v9.h[7] + dsmull_h v26, v27, v19, v8.h[6] + dsmull_h v20, v21, v18, v9.h[0] + dsmull_h v24, v25, v18, v9.h[1] neg v28.4s, v28.4s neg v29.4s, v29.4s @@ -1240,12 +1228,8 @@ endfunc // x1 = unused // x2 = src // x9 = double input stride -// x10 = idct_coeffs -// x11 = idct_coeffs + 32 function idct32_1d_8x32_pass1\suffix\()_neon mov x14, x30 - ld1 {v0.8h,v1.8h}, [x10] - movi v2.8h, #0 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) @@ -1278,14 +1262,14 @@ function idct32_1d_8x32_pass1\suffix\()_neon .macro store_rev a, b // There's no rev128 instruction, but we reverse each 64 bit // half, and then flip them using an ext with 8 bytes offset. - rev64 v1.8h, \b + rev64 v3.8h, \b st1 {\a}, [x0], #16 - rev64 v0.8h, \a - ext v1.16b, v1.16b, v1.16b, #8 + rev64 v2.8h, \a + ext v3.16b, v3.16b, v3.16b, #8 st1 {\b}, [x0], #16 - ext v0.16b, v0.16b, v0.16b, #8 - st1 {v1.8h}, [x0], #16 - st1 {v0.8h}, [x0], #16 + ext v2.16b, v2.16b, v2.16b, #8 + st1 {v3.8h}, [x0], #16 + st1 {v2.8h}, [x0], #16 .endm store_rev v16.8h, v24.8h store_rev v17.8h, v25.8h @@ -1339,20 +1323,20 @@ function idct32_1d_8x32_pass1\suffix\()_neon // subtracted from the output. .macro store_rev a, b ld1 {v4.8h}, [x0] - rev64 v1.8h, \b + rev64 v3.8h, \b add v4.8h, v4.8h, \a - rev64 v0.8h, \a + rev64 v2.8h, \a st1 {v4.8h}, [x0], #16 - ext v1.16b, v1.16b, v1.16b, #8 + ext v3.16b, v3.16b, v3.16b, #8 ld1 {v5.8h}, [x0] - ext v0.16b, v0.16b, v0.16b, #8 + ext v2.16b, v2.16b, v2.16b, #8 add v5.8h, v5.8h, \b st1 {v5.8h}, [x0], #16 ld1 {v6.8h}, [x0] - sub v6.8h, v6.8h, v1.8h + sub v6.8h, v6.8h, v3.8h st1 {v6.8h}, [x0], #16 ld1 {v7.8h}, [x0] - sub v7.8h, v7.8h, v0.8h + sub v7.8h, v7.8h, v2.8h st1 {v7.8h}, [x0], #16 .endm @@ -1376,12 +1360,8 @@ endfunc // x2 = src (temp buffer) // x7 = negative double temp buffer stride // x9 = double temp buffer stride -// x10 = idct_coeffs -// x11 = idct_coeffs + 32 function idct32_1d_8x32_pass2\suffix\()_neon mov x14, x30 - ld1 {v0.8h,v1.8h}, [x10] - // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) .ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1454,15 +1434,15 @@ function idct32_1d_8x32_pass2\suffix\()_neon sub v6.8h, v6.8h, \c sub v7.8h, v7.8h, \d .endif - ld1 {v0.8b}, [x0], x1 - ld1 {v1.8b}, [x0], x1 + ld1 {v10.8b}, [x0], x1 + ld1 {v11.8b}, [x0], x1 srshr v4.8h, v4.8h, #6 ld1 {v2.8b}, [x0], x1 srshr v5.8h, v5.8h, #6 - uaddw v4.8h, v4.8h, v0.8b + uaddw v4.8h, v4.8h, v10.8b ld1 {v3.8b}, [x0], x1 srshr v6.8h, v6.8h, #6 - uaddw v5.8h, v5.8h, v1.8b + uaddw v5.8h, v5.8h, v11.8b srshr v7.8h, v7.8h, #6 sub x0, x0, x1, lsl #2 uaddw v6.8h, v6.8h, v2.8b @@ -1503,13 +1483,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 b.eq idct32x32_dc_add_neon movrel x10, idct_coeffs - add x11, x10, #32 movrel x12, min_eob_idct_idct_32, 2 mov x15, x30 - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! stp d10, d11, [sp, #-0x10]! stp d8, d9, [sp, #-0x10]! 
@@ -1523,6 +1500,9 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 mov x9, #128 neg x7, x9 + ld1 {v0.8h,v1.8h}, [x10], #32 + ld1 {v8.8h,v9.8h}, [x10] + cmp w3, #34 b.le idct32x32_quarter_add_neon cmp w3, #135 @@ -1565,8 +1545,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 ldp d8, d9, [sp], 0x10 ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 br x15 endfunc @@ -1592,8 +1570,6 @@ function idct32x32_\size\()_add_neon ldp d8, d9, [sp], 0x10 ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 br x15 endfunc From de06bdfe6c8abd8266d5c6f5c68e4df0060b61fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 31 Dec 2016 14:05:44 +0200 Subject: [PATCH 6/9] arm: vp9itxfm: Reorder the idct coefficients for better pairing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All elements are used pairwise, except for the first one. Previously, the 16th element was unused. Move the unused element to the second slot, to make the later element pairs not split across registers. This simplifies loading only parts of the coefficients, reducing the difference to the 16 bpp version. Signed-off-by: Martin Storsjö --- libavcodec/arm/vp9itxfm_neon.S | 124 ++++++++++++++++----------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index bed502eba2..1d4d6a7910 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -22,7 +22,7 @@ #include "neon.S" const itxfm4_coeffs, align=4 - .short 11585, 6270, 15137, 0 + .short 11585, 0, 6270, 15137 iadst4_coeffs: .short 5283, 15212, 9929, 13377 endconst @@ -30,8 +30,8 @@ endconst const iadst8_coeffs, align=4 .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679 idct_coeffs: - .short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606 - .short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0 + .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102 + .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756 .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520 .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404 endconst @@ -224,14 +224,14 @@ endconst .endm .macro idct4 c0, c1, c2, c3 - vmull.s16 q13, \c1, d0[2] - vmull.s16 q11, \c1, d0[1] + vmull.s16 q13, \c1, d0[3] + vmull.s16 q11, \c1, d0[2] vadd.i16 d16, \c0, \c2 vsub.i16 d17, \c0, \c2 - vmlal.s16 q13, \c3, d0[1] + vmlal.s16 q13, \c3, d0[2] vmull.s16 q9, d16, d0[0] vmull.s16 q10, d17, d0[0] - vmlsl.s16 q11, \c3, d0[2] + vmlsl.s16 q11, \c3, d0[3] vrshrn.s32 d26, q13, #14 vrshrn.s32 d18, q9, #14 vrshrn.s32 d20, q10, #14 @@ -350,9 +350,9 @@ itxfm_func4x4 iwht, iwht .macro idct8 dmbutterfly0 d16, d17, d24, d25, q8, q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a - dmbutterfly d20, d21, d28, d29, d0[1], d0[2], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a - dmbutterfly d18, d19, d30, d31, d0[3], d1[0], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a - dmbutterfly d26, d27, d22, d23, d1[1], d1[2], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a + dmbutterfly d20, d21, d28, d29, d0[2], d0[3], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a + dmbutterfly d18, d19, d30, d31, d1[0], d1[1], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a + dmbutterfly d26, d27, d22, d23, d1[2], d1[3], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a butterfly q2, q14, q8, q14 @ q2 = t0, q14 = t3 butterfly q3, q10, q12, q10 @ q3 = t1, q10 = t2 @@ -386,8 +386,8 @@ itxfm_func4x4 iwht, iwht vneg.s16 q15, q15 @ q15 = out[7] butterfly q8, q9, 
q11, q9 @ q8 = out[0], q9 = t2 - dmbutterfly_l q10, q11, q5, q7, d4, d5, d6, d7, d0[1], d0[2] @ q10,q11 = t5a, q5,q7 = t4a - dmbutterfly_l q2, q3, q13, q14, d12, d13, d8, d9, d0[2], d0[1] @ q2,q3 = t6a, q13,q14 = t7a + dmbutterfly_l q10, q11, q5, q7, d4, d5, d6, d7, d0[2], d0[3] @ q10,q11 = t5a, q5,q7 = t4a + dmbutterfly_l q2, q3, q13, q14, d12, d13, d8, d9, d0[3], d0[2] @ q2,q3 = t6a, q13,q14 = t7a dbutterfly_n d28, d29, d8, d9, q10, q11, q13, q14, q4, q6, q10, q11 @ q14 = out[6], q4 = t7 @@ -594,13 +594,13 @@ endfunc function idct16 mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a - mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a - mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a - mbutterfly d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 = t6a - mbutterfly d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 = t15a - mbutterfly d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 = t14a - mbutterfly d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 = t13a - mbutterfly d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 = t12a + mbutterfly d20, d28, d0[2], d0[3], q2, q3 @ d20 = t2a, d28 = t3a + mbutterfly d18, d30, d1[0], d1[1], q2, q3 @ d18 = t4a, d30 = t7a + mbutterfly d26, d22, d1[2], d1[3], q2, q3 @ d26 = t5a, d22 = t6a + mbutterfly d17, d31, d2[0], d2[1], q2, q3 @ d17 = t8a, d31 = t15a + mbutterfly d25, d23, d2[2], d2[3], q2, q3 @ d25 = t9a, d23 = t14a + mbutterfly d21, d27, d3[0], d3[1], q2, q3 @ d21 = t10a, d27 = t13a + mbutterfly d29, d19, d3[2], d3[3], q2, q3 @ d29 = t11a, d19 = t12a butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3 butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2 @@ -612,20 +612,20 @@ function idct16 butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a - mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a, d25 = t14a - mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a + mbutterfly d23, d25, d0[2], d0[3], q9, q15 @ d23 = t9a, d25 = t14a + mbutterfly d27, d21, d0[2], d0[3], q9, q15, neg=1 @ d27 = t13a, d21 = t10a idct16_end endfunc function idct16_half mbutterfly0_h d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a - mbutterfly_h1 d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a - mbutterfly_h1 d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a - mbutterfly_h2 d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 = t6a - mbutterfly_h1 d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 = t15a - mbutterfly_h2 d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 = t14a - mbutterfly_h1 d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 = t13a - mbutterfly_h2 d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 = t12a + mbutterfly_h1 d20, d28, d0[2], d0[3], q2, q3 @ d20 = t2a, d28 = t3a + mbutterfly_h1 d18, d30, d1[0], d1[1], q2, q3 @ d18 = t4a, d30 = t7a + mbutterfly_h2 d26, d22, d1[2], d1[3], q2, q3 @ d26 = t5a, d22 = t6a + mbutterfly_h1 d17, d31, d2[0], d2[1], q2, q3 @ d17 = t8a, d31 = t15a + mbutterfly_h2 d25, d23, d2[2], d2[3], q2, q3 @ d25 = t9a, d23 = t14a + mbutterfly_h1 d21, d27, d3[0], d3[1], q2, q3 @ d21 = t10a, d27 = t13a + mbutterfly_h2 d29, d19, d3[2], d3[3], q2, q3 @ d29 = t11a, d19 = t12a butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3 butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2 @@ -637,19 +637,19 @@ function idct16_half butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a - mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a, d25 = t14a - 
mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a + mbutterfly d23, d25, d0[2], d0[3], q9, q15 @ d23 = t9a, d25 = t14a + mbutterfly d27, d21, d0[2], d0[3], q9, q15, neg=1 @ d27 = t13a, d21 = t10a idct16_end endfunc function idct16_quarter - vmull.s16 q12, d19, d3[2] - vmull.s16 q2, d17, d1[3] - vmull.s16 q3, d18, d1[0] - vmull.s16 q15, d18, d0[3] + vmull.s16 q12, d19, d3[3] + vmull.s16 q2, d17, d2[0] + vmull.s16 q3, d18, d1[1] + vmull.s16 q15, d18, d1[0] vneg.s32 q12, q12 - vmull.s16 q14, d17, d2[0] - vmull.s16 q13, d19, d3[1] + vmull.s16 q14, d17, d2[1] + vmull.s16 q13, d19, d3[2] vmull.s16 q11, d16, d0[0] vrshrn.s32 d24, q12, #14 vrshrn.s32 d16, q2, #14 @@ -659,8 +659,8 @@ function idct16_quarter vrshrn.s32 d17, q13, #14 vrshrn.s32 d28, q11, #14 - mbutterfly_l q10, q11, d17, d24, d0[1], d0[2] - mbutterfly_l q9, q15, d29, d16, d0[1], d0[2] + mbutterfly_l q10, q11, d17, d24, d0[2], d0[3] + mbutterfly_l q9, q15, d29, d16, d0[2], d0[3] vneg.s32 q11, q11 vrshrn.s32 d27, q10, #14 vrshrn.s32 d21, q11, #14 @@ -697,16 +697,16 @@ function iadst16 movrel r12, idct_coeffs vld1.16 {q0}, [r12,:128] butterfly_n d22, d30, q3, q5, q6, q5 @ d22 = t7a, d30 = t15a - mbutterfly_l q7, q6, d23, d24, d0[3], d1[0] @ q7 = t9, q6 = t8 + mbutterfly_l q7, q6, d23, d24, d1[0], d1[1] @ q7 = t9, q6 = t8 butterfly_n d25, d17, q2, q4, q3, q4 @ d25 = t6a, d17 = t14a - mbutterfly_l q2, q3, d28, d19, d1[0], d0[3] @ q2 = t12, q3 = t13 + mbutterfly_l q2, q3, d28, d19, d1[1], d1[0] @ q2 = t12, q3 = t13 butterfly_n d23, d19, q6, q2, q4, q2 @ d23 = t8a, d19 = t12a - mbutterfly_l q5, q4, d21, d26, d1[1], d1[2] @ q5 = t11, q4 = t10 + mbutterfly_l q5, q4, d21, d26, d1[2], d1[3] @ q5 = t11, q4 = t10 butterfly_r d4, d27, d16, d27 @ d4 = t4, d27 = t0 butterfly_n d24, d28, q7, q3, q6, q3 @ d24 = t9a, d28 = t13a - mbutterfly_l q6, q7, d30, d17, d1[2], d1[1] @ q6 = t14, q7 = t15 + mbutterfly_l q6, q7, d30, d17, d1[3], d1[2] @ q6 = t14, q7 = t15 butterfly_r d5, d20, d31, d20 @ d5 = t5, d20 = t1 butterfly_n d21, d17, q4, q6, q3, q6 @ d21 = t10a, d17 = t14a butterfly_n d26, d30, q5, q7, q4, q7 @ d26 = t11a, d30 = t15a @@ -714,15 +714,15 @@ function iadst16 butterfly_r d6, d25, d18, d25 @ d6 = t6, d25 = t2 butterfly_r d7, d22, d29, d22 @ d7 = t7, d22 = t3 - mbutterfly_l q5, q4, d19, d28, d0[1], d0[2] @ q5 = t13, q4 = t12 - mbutterfly_l q6, q7, d30, d17, d0[2], d0[1] @ q6 = t14, q7 = t15 + mbutterfly_l q5, q4, d19, d28, d0[2], d0[3] @ q5 = t13, q4 = t12 + mbutterfly_l q6, q7, d30, d17, d0[3], d0[2] @ q6 = t14, q7 = t15 butterfly_n d18, d30, q4, q6, q8, q6 @ d18 = out[2], d30 = t14a butterfly_n d29, d17, q5, q7, q6, q7 @ d29 = -out[13], d17 = t15a vneg.s16 d29, d29 @ d29 = out[13] - mbutterfly_l q5, q4, d4, d5, d0[1], d0[2] @ q5 = t5a, q4 = t4a - mbutterfly_l q6, q7, d7, d6, d0[2], d0[1] @ q6 = t6a, q7 = t7a + mbutterfly_l q5, q4, d4, d5, d0[2], d0[3] @ q5 = t5a, q4 = t4a + mbutterfly_l q6, q7, d7, d6, d0[3], d0[2] @ q6 = t6a, q7 = t7a butterfly d2, d6, d27, d25 @ d2 = out[0], d6 = t2a butterfly d3, d7, d23, d21 @ d3 =-out[1], d7 = t10 @@ -1194,10 +1194,10 @@ endfunc butterfly d11, d29, d29, d31 @ d11 = t31a, d29 = t28a butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29 - mbutterfly d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = t29a - mbutterfly d29, d9, d0[1], d0[2], q12, q15 @ d29 = t19, d9 = t28 - mbutterfly d28, d10, d0[1], d0[2], q12, q15, neg=1 @ d28 = t27, d10 = t20 - mbutterfly d26, d21, d0[1], d0[2], q12, q15, neg=1 @ d26 = t26a, d21 = t21a + mbutterfly d27, d20, d0[2], d0[3], q12, q15 @ d27 = t18a, 
d20 = t29a + mbutterfly d29, d9, d0[2], d0[3], q12, q15 @ d29 = t19, d5 = t28 + mbutterfly d28, d10, d0[2], d0[3], q12, q15, neg=1 @ d28 = t27, d6 = t20 + mbutterfly d26, d21, d0[2], d0[3], q12, q15, neg=1 @ d26 = t26a, d21 = t21a butterfly d31, d24, d11, d8 @ d31 = t31, d24 = t24 butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a @@ -1235,10 +1235,10 @@ function idct32_odd butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29 - mbutterfly d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a - mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a - mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a - mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a + mbutterfly d23, d24, d1[0], d1[1], q8, q9 @ d23 = t17a, d24 = t30a + mbutterfly d27, d20, d1[0], d1[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a + mbutterfly d21, d26, d1[2], d1[3], q8, q9 @ d21 = t21a, d26 = t26a + mbutterfly d25, d22, d1[2], d1[3], q8, q9, neg=1 @ d25 = t25a, d22 = t22a idct32_end endfunc @@ -1261,10 +1261,10 @@ function idct32_odd_half butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29 - mbutterfly d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a - mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a - mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a - mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a + mbutterfly d23, d24, d1[0], d1[1], q8, q9 @ d23 = t17a, d24 = t30a + mbutterfly d27, d20, d1[0], d1[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a + mbutterfly d21, d26, d1[2], d1[3], q8, q9 @ d21 = t21a, d26 = t26a + mbutterfly d25, d22, d1[2], d1[3], q8, q9, neg=1 @ d25 = t25a, d22 = t22a idct32_end endfunc @@ -1291,17 +1291,17 @@ function idct32_odd_quarter vrshrn.s32 d10, q10, #14 vrshrn.s32 d30, q12, #14 - mbutterfly_l q8, q9, d29, d8, d0[3], d1[0] - mbutterfly_l q13, q10, d31, d9, d0[3], d1[0] + mbutterfly_l q8, q9, d29, d8, d1[0], d1[1] + mbutterfly_l q13, q10, d31, d9, d1[0], d1[1] vrshrn.s32 d23, q8, #14 vrshrn.s32 d24, q9, #14 vneg.s32 q10, q10 vrshrn.s32 d27, q13, #14 vrshrn.s32 d20, q10, #14 - mbutterfly_l q8, q9, d30, d10, d1[1], d1[2] + mbutterfly_l q8, q9, d30, d10, d1[2], d1[3] vrshrn.s32 d21, q8, #14 vrshrn.s32 d26, q9, #14 - mbutterfly_l q8, q9, d28, d11, d1[1], d1[2] + mbutterfly_l q8, q9, d28, d11, d1[2], d1[3] vrshrn.s32 d25, q8, #14 vneg.s32 q9, q9 vrshrn.s32 d22, q9, #14 From 09eb88a12e008d10a3f7a6be75d18ad98b368e68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 31 Dec 2016 14:18:31 +0200 Subject: [PATCH 7/9] aarch64: vp9itxfm: Reorder the idct coefficients for better pairing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All elements are used pairwise, except for the first one. Previously, the 16th element was unused. Move the unused element to the second slot, to make the later element pairs not split across registers. This simplifies loading only parts of the coefficients, reducing the difference to the 16 bpp version. 
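Concretely, only the first two rows of idct_coeffs change:

        // before:
        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
        // after:
        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756

With the unused slot placed next to 11585 (the one coefficient that isn't used as part
of a pair), every pair now starts at an even lane; e.g. the (1606, 16305) pair for the
t8a/t15a butterfly is read as v1.h[0]/v1.h[1] instead of straddling v0.h[7] and v1.h[0].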
Signed-off-by: Martin Storsjö
---
 libavcodec/aarch64/vp9itxfm_neon.S | 124 ++++++++++++++---------------
 1 file changed, 62 insertions(+), 62 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index b6c2575236..d4fc2163aa 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -22,7 +22,7 @@
 #include "neon.S"
 
 const itxfm4_coeffs, align=4
-        .short  11585, 6270, 15137, 0
+        .short  11585, 0, 6270, 15137
 iadst4_coeffs:
         .short  5283, 15212, 9929, 13377
 endconst
@@ -30,8 +30,8 @@ endconst
 const iadst8_coeffs, align=4
         .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
 idct_coeffs:
-        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
-        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
+        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
         .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
         .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
 endconst
@@ -192,14 +192,14 @@ endconst
 .endm
 
 .macro idct4 c0, c1, c2, c3
-        smull           v22.4s, \c1\().4h, v0.h[2]
-        smull           v20.4s, \c1\().4h, v0.h[1]
+        smull           v22.4s, \c1\().4h, v0.h[3]
+        smull           v20.4s, \c1\().4h, v0.h[2]
         add             v16.4h, \c0\().4h, \c2\().4h
         sub             v17.4h, \c0\().4h, \c2\().4h
-        smlal           v22.4s, \c3\().4h, v0.h[1]
+        smlal           v22.4s, \c3\().4h, v0.h[2]
         smull           v18.4s, v16.4h, v0.h[0]
         smull           v19.4s, v17.4h, v0.h[0]
-        smlsl           v20.4s, \c3\().4h, v0.h[2]
+        smlsl           v20.4s, \c3\().4h, v0.h[3]
         rshrn           v22.4h, v22.4s, #14
         rshrn           v18.4h, v18.4s, #14
         rshrn           v19.4h, v19.4s, #14
@@ -326,9 +326,9 @@ itxfm_func4x4 iwht, iwht
 
 .macro idct8
         dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
-        dmbutterfly     v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
-        dmbutterfly     v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
-        dmbutterfly     v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
+        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
+        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
+        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
 
         butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
         butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
@@ -361,8 +361,8 @@ itxfm_func4x4 iwht, iwht
         dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
         neg             v19.8h, v19.8h // v19 = out[3]
 
-        dmbutterfly_l   v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2] // v26,v27 = t5a, v28,v29 = t4a
-        dmbutterfly_l   v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1] // v2,v3 = t6a, v4,v5 = t7a
+        dmbutterfly_l   v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a
+        dmbutterfly_l   v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a
 
         dbutterfly_n    v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
         dbutterfly_n    v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
@@ -543,13 +543,13 @@ endfunc
 
 function idct16
         dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
-        dmbutterfly     v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
-        dmbutterfly     v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
-        dmbutterfly     v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
-        dmbutterfly     v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
-        dmbutterfly     v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
-        dmbutterfly     v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
-        dmbutterfly     v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
+        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
+        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
+        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
+        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
+        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
 
         butterfly_8h    v4, v28, v16, v28 // v4 = t0, v28 = t3
         butterfly_8h    v5, v20, v24, v20 // v5 = t1, v20 = t2
@@ -561,20 +561,20 @@ function idct16
         butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14
 
         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
-        dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
-        dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
         idct16_end
 endfunc
 
 function idct16_half
         dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
-        dmbutterfly_h1  v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
-        dmbutterfly_h1  v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
-        dmbutterfly_h2  v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
-        dmbutterfly_h1  v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
-        dmbutterfly_h2  v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
-        dmbutterfly_h1  v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
-        dmbutterfly_h2  v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
+        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
+        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
+        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
+        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
+        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
 
         butterfly_8h    v4, v28, v16, v28 // v4 = t0, v28 = t3
         butterfly_8h    v5, v20, v24, v20 // v5 = t1, v20 = t2
@@ -586,20 +586,20 @@ function idct16_half
         butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14
 
         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
-        dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
-        dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
         idct16_end
 endfunc
 
 function idct16_quarter
-        dsmull_h        v24, v25, v19, v1.h[6]
-        dsmull_h        v4,  v5,  v17, v0.h[7]
-        dsmull_h        v7,  v6,  v18, v0.h[4]
-        dsmull_h        v30, v31, v18, v0.h[3]
+        dsmull_h        v24, v25, v19, v1.h[7]
+        dsmull_h        v4,  v5,  v17, v1.h[0]
+        dsmull_h        v7,  v6,  v18, v0.h[5]
+        dsmull_h        v30, v31, v18, v0.h[4]
         neg             v24.4s, v24.4s
         neg             v25.4s, v25.4s
-        dsmull_h        v29, v28, v17, v1.h[0]
-        dsmull_h        v26, v27, v19, v1.h[5]
+        dsmull_h        v29, v28, v17, v1.h[1]
+        dsmull_h        v26, v27, v19, v1.h[6]
         dsmull_h        v22, v23, v16, v0.h[0]
         drshrn_h        v24, v24, v25, #14
         drshrn_h        v16, v4, v5, #14
@@ -609,8 +609,8 @@ function idct16_quarter
         drshrn_h        v17, v26, v27, #14
         drshrn_h        v28, v22, v23, #14
 
-        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
-        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
+        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
+        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
         neg             v22.4s, v22.4s
         neg             v23.4s, v23.4s
         drshrn_h        v27, v20, v21, #14
@@ -646,16 +646,16 @@ function iadst16
         dmbutterfly_l   v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
         ld1             {v0.8h}, [x10]
         dbutterfly_n    v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
-        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4] // v14,v15 = t9, v12,v13 = t8
+        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9, v12,v13 = t8
         dbutterfly_n    v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
 
-        dmbutterfly_l   v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3] // v4,v5 = t12, v6,v7 = t13
+        dmbutterfly_l   v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4] // v4,v5 = t12, v6,v7 = t13
         dbutterfly_n    v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
-        dmbutterfly_l   v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6] // v10,v11 = t11, v8,v9 = t10
+        dmbutterfly_l   v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9 = t10
         butterfly_8h_r  v4, v27, v16, v27 // v4 = t4, v27 = t0
         dbutterfly_n    v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
-        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5] // v12,v13 = t14, v14,v15 = t15
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15
         butterfly_8h_r  v5, v20, v31, v20 // v5 = t5, v20 = t1
         dbutterfly_n    v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
         dbutterfly_n    v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
@@ -663,15 +663,15 @@ function iadst16
         butterfly_8h_r  v6, v25, v18, v25 // v6 = t6, v25 = t2
         butterfly_8h_r  v7, v22, v29, v22 // v7 = t7, v22 = t3
 
-        dmbutterfly_l   v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2] // v10,v11 = t13, v8,v9 = t12
-        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1] // v12,v13 = t14, v14,v15 = t15
+        dmbutterfly_l   v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9 = t12
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15
 
         dbutterfly_n    v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
         dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
         neg             v29.8h, v29.8h // v29 = out[13]
 
-        dmbutterfly_l   v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2] // v10,v11 = t5a, v8,v9 = t4a
-        dmbutterfly_l   v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1] // v12,v13 = t6a, v14,v15 = t7a
+        dmbutterfly_l   v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9 = t4a
+        dmbutterfly_l   v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a
 
         butterfly_8h    v2, v6, v27, v25 // v2 = out[0], v6 = t2a
         butterfly_8h    v3, v7, v23, v21 // v3 =-out[1], v7 = t10
@@ -1101,10 +1101,10 @@ endfunc
         butterfly_8h    v7, v3, v29, v31 // v7 = t31a, v3 = t28a
         butterfly_8h    v22, v27, v24, v27 // v22 = t30, v27 = t29
 
-        dmbutterfly     v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
-        dmbutterfly     v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31 // v3 = t19, v5 = t28
-        dmbutterfly     v28, v6, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
-        dmbutterfly     v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
+        dmbutterfly     v3, v5, v0.h[2], v0.h[3], v24, v25, v30, v31 // v3 = t19, v5 = t28
+        dmbutterfly     v28, v6, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
+        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
 
         butterfly_8h    v31, v24, v7, v4 // v31 = t31, v24 = t24
         butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
@@ -1141,10 +1141,10 @@ function idct32_odd
         butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
         butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
 
-        dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
-        dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
-        dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
-        dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
         idct32_end
 endfunc
 
@@ -1167,10 +1167,10 @@ function idct32_odd_half
         butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
         butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
 
-        dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
-        dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
-        dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
-        dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
         idct32_end
 endfunc
 
@@ -1198,18 +1198,18 @@ function idct32_odd_quarter
         drshrn_h        v6, v20, v21, #14
         drshrn_h        v30, v24, v25, #14
 
-        dmbutterfly_l   v16, v17, v18, v19, v29, v4, v0.h[3], v0.h[4]
-        dmbutterfly_l   v27, v26, v20, v21, v31, v5, v0.h[3], v0.h[4]
+        dmbutterfly_l   v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5]
+        dmbutterfly_l   v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5]
         drshrn_h        v23, v16, v17, #14
         drshrn_h        v24, v18, v19, #14
         neg             v20.4s, v20.4s
         neg             v21.4s, v21.4s
         drshrn_h        v27, v27, v26, #14
         drshrn_h        v20, v20, v21, #14
-        dmbutterfly_l   v16, v17, v18, v19, v30, v6, v0.h[5], v0.h[6]
+        dmbutterfly_l   v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7]
         drshrn_h        v21, v16, v17, #14
         drshrn_h        v26, v18, v19, #14
-        dmbutterfly_l   v16, v17, v18, v19, v28, v7, v0.h[5], v0.h[6]
+        dmbutterfly_l   v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7]
         drshrn_h        v25, v16, v17, #14
         neg             v18.4s, v18.4s
         neg             v19.4s, v19.4s

From 08074c092d8c97d71c5986e5325e97ffc956119d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Sat, 31 Dec 2016 22:27:13 +0200
Subject: [PATCH 8/9] arm: vp9itxfm: Reorder iadst16 coeffs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This matches the order they are in the 16 bpp version. There they are
in this order, to make sure we access them in the same order they are
declared, easing loading only half of the coefficients at a time.

This makes the 8 bpp version match the 16 bpp version better.

Signed-off-by: Martin Storsjö
---
 libavcodec/arm/vp9itxfm_neon.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 1d4d6a7910..a612b25f4f 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -37,8 +37,8 @@ idct_coeffs:
 endconst
 
 const iadst16_coeffs, align=4
-        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
-        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
 endconst
 
 @ Do four 4x4 transposes, using q registers for the subtransposes that don't
@@ -678,19 +678,19 @@ function iadst16
         vld1.16         {q0-q1}, [r12,:128]
 
         mbutterfly_l    q3, q2, d31, d16, d0[1], d0[0] @ q3 = t1, q2 = t0
-        mbutterfly_l    q5, q4, d23, d24, d2[1], d2[0] @ q5 = t9, q4 = t8
+        mbutterfly_l    q5, q4, d23, d24, d1[1], d1[0] @ q5 = t9, q4 = t8
         butterfly_n     d31, d24, q3, q5, q6, q5 @ d31 = t1a, d24 = t9a
         mbutterfly_l    q7, q6, d29, d18, d0[3], d0[2] @ q7 = t3, q6 = t2
         butterfly_n     d16, d23, q2, q4, q3, q4 @ d16 = t0a, d23 = t8a
-        mbutterfly_l    q3, q2, d21, d26, d2[3], d2[2] @ q3 = t11, q2 = t10
+        mbutterfly_l    q3, q2, d21, d26, d1[3], d1[2] @ q3 = t11, q2 = t10
         butterfly_n     d29, d26, q7, q3, q4, q3 @ d29 = t3a, d26 = t11a
-        mbutterfly_l    q5, q4, d27, d20, d1[1], d1[0] @ q5 = t5, q4 = t4
+        mbutterfly_l    q5, q4, d27, d20, d2[1], d2[0] @ q5 = t5, q4 = t4
         butterfly_n     d18, d21, q6, q2, q3, q2 @ d18 = t2a, d21 = t10a
         mbutterfly_l    q7, q6, d19, d28, d3[1], d3[0] @ q7 = t13, q6 = t12
         butterfly_n     d20, d28, q5, q7, q2, q7 @ d20 = t5a, d28 = t13a
-        mbutterfly_l    q3, q2, d25, d22, d1[3], d1[2] @ q3 = t7, q2 = t6
+        mbutterfly_l    q3, q2, d25, d22, d2[3], d2[2] @ q3 = t7, q2 = t6
         butterfly_n     d27, d19, q4, q6, q5, q6 @ d27 = t4a, d19 = t12a
         mbutterfly_l    q5, q4, d17, d30, d3[3], d3[2] @ q5 = t15, q4 = t14

From b8f66c0838b4c645227f23a35b4d54373da4c60a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Sat, 31 Dec 2016 22:27:13 +0200
Subject: [PATCH 9/9] aarch64: vp9itxfm: Reorder iadst16 coeffs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This matches the order they are in the 16 bpp version. There they are
in this order, to make sure we access them in the same order they are
declared, easing loading only half of the coefficients at a time.

This makes the 8 bpp version match the 16 bpp version better.

Signed-off-by: Martin Storsjö
---
 libavcodec/aarch64/vp9itxfm_neon.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index d4fc2163aa..93dc736f01 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -37,8 +37,8 @@ idct_coeffs:
 endconst
 
 const iadst16_coeffs, align=4
-        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
-        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
 endconst
 
 // out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
@@ -628,19 +628,19 @@ function iadst16
         ld1             {v0.8h,v1.8h}, [x11]
 
         dmbutterfly_l   v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0
-        dmbutterfly_l   v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0] // v10,v11 = t9, v8,v9 = t8
+        dmbutterfly_l   v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4] // v10,v11 = t9, v8,v9 = t8
         dbutterfly_n    v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
         dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2
         dbutterfly_n    v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a
-        dmbutterfly_l   v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2] // v6,v7 = t11, v4,v5 = t10
+        dmbutterfly_l   v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6] // v6,v7 = t11, v4,v5 = t10
         dbutterfly_n    v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
-        dmbutterfly_l   v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4] // v10,v11 = t5, v8,v9 = t4
+        dmbutterfly_l   v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0] // v10,v11 = t5, v8,v9 = t4
         dbutterfly_n    v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a
         dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
         dbutterfly_n    v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
-        dmbutterfly_l   v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6] // v6,v7 = t7, v4,v5 = t6
+        dmbutterfly_l   v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2] // v6,v7 = t7, v4,v5 = t6
         dbutterfly_n    v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a
         dmbutterfly_l   v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14