From 0331c3f5e8cb6e6b53fab7893e91d1be1bfa979c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 23 Nov 2016 10:56:12 +0200 Subject: [PATCH 01/12] arm: vp9itxfm: Make the larger core transforms standalone functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is sponsored by, and copyright, Google. This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from 15324 to 12388 bytes. This gives a small slowdown of a couple tens of cycles, up to around 150 cycles for the full case of the largest transform, but makes it more feasible to add more optimized versions of these transforms. Before: Cortex A7 A8 A9 A53 vp9_inv_dct_dct_16x16_sub4_add_neon: 2063.4 1516.0 1719.5 1245.1 vp9_inv_dct_dct_16x16_sub16_add_neon: 3279.3 2454.5 2525.2 1982.3 vp9_inv_dct_dct_32x32_sub4_add_neon: 10750.0 7955.4 8525.6 6754.2 vp9_inv_dct_dct_32x32_sub32_add_neon: 18574.0 17108.4 14216.7 12010.2 After: vp9_inv_dct_dct_16x16_sub4_add_neon: 2060.8 1608.5 1735.7 1262.0 vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 Signed-off-by: Martin Storsjö --- libavcodec/arm/vp9itxfm_neon.S | 43 ++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 49b993ffe3..fd53a20a73 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -534,7 +534,7 @@ function idct16x16_dc_add_neon endfunc .ltorg -.macro idct16 +function idct16 mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a @@ -580,9 +580,10 @@ endfunc vmov d4, d21 @ d4 = t10a butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10] -.endm + bx lr +endfunc -.macro iadst16 +function iadst16 movrel r12, iadst16_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -653,7 +654,8 @@ endfunc vmov d16, d2 vmov d30, d4 -.endm + bx lr +endfunc .macro itxfm16_1d_funcs txfm @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -662,6 +664,8 @@ endfunc @ r1 = slice offset @ r2 = src function \txfm\()16_1d_4x16_pass1_neon + push {lr} + mov r12, #32 vmov.s16 q2, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -669,7 +673,7 @@ function \txfm\()16_1d_4x16_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr - \txfm\()16 + bl \txfm\()16 @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @@ -682,7 +686,7 @@ function \txfm\()16_1d_4x16_pass1_neon .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 vst1.16 {d\i}, [r0,:64]! 
.endr - bx lr + pop {pc} 1: @ Special case: For the last input column (r1 == 12), @ which would be stored as the last row in the temp buffer, @@ -709,7 +713,7 @@ function \txfm\()16_1d_4x16_pass1_neon vmov d29, d17 vmov d30, d18 vmov d31, d19 - bx lr + pop {pc} endfunc @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -719,6 +723,7 @@ endfunc @ r2 = src (temp buffer) @ r3 = slice offset function \txfm\()16_1d_4x16_pass2_neon + push {lr} mov r12, #32 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 vld1.16 {d\i}, [r2,:64], r12 @@ -732,7 +737,7 @@ function \txfm\()16_1d_4x16_pass2_neon add r3, r0, r1 lsl r1, r1, #1 - \txfm\()16 + bl \txfm\()16 .macro load_add_store coef0, coef1, coef2, coef3 vrshr.s16 \coef0, \coef0, #6 @@ -773,7 +778,7 @@ function \txfm\()16_1d_4x16_pass2_neon load_add_store q12, q13, q14, q15 .purgem load_add_store - bx lr + pop {pc} endfunc .endm @@ -908,7 +913,7 @@ function idct32x32_dc_add_neon bx lr endfunc -.macro idct32_odd +function idct32_odd movrel r12, idct_coeffs add r12, r12, #32 vld1.16 {q0-q1}, [r12,:128] @@ -967,7 +972,8 @@ endfunc mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a -.endm + bx lr +endfunc @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. @ We don't have register space to do a single pass IDCT of 4x32 though, @@ -979,6 +985,8 @@ endfunc @ r1 = unused @ r2 = src function idct32_1d_4x32_pass1_neon + push {lr} + movrel r12, idct_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -992,7 +1000,7 @@ function idct32_1d_4x32_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr - idct16 + bl idct16 @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @@ -1028,7 +1036,7 @@ function idct32_1d_4x32_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr - idct32_odd + bl idct32_odd transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 @@ -1054,7 +1062,7 @@ function idct32_1d_4x32_pass1_neon store_rev 29, 25, 21, 17 store_rev 28, 24, 20, 16 .purgem store_rev - bx lr + pop {pc} endfunc .ltorg @@ -1065,6 +1073,7 @@ endfunc @ r1 = dst stride @ r2 = src (temp buffer) function idct32_1d_4x32_pass2_neon + push {lr} movrel r12, idct_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -1075,7 +1084,7 @@ function idct32_1d_4x32_pass2_neon .endr sub r2, r2, r12, lsl #4 - idct16 + bl idct16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vst1.16 {d\i}, [r2,:64], r12 @@ -1091,7 +1100,7 @@ function idct32_1d_4x32_pass2_neon sub r2, r2, r12, lsl #4 sub r2, r2, #64 - idct32_odd + bl idct32_odd mov r12, #128 .macro load_acc_store a, b, c, d, neg=0 @@ -1139,7 +1148,7 @@ function idct32_1d_4x32_pass2_neon load_acc_store 24, 25, 26, 27, 1 load_acc_store 28, 29, 30, 31, 1 .purgem load_acc_store - bx lr + pop {pc} endfunc const min_eob_idct_idct_32, align=4 From 115476018d2c97df7e9b4445fe8f6cc7420ab91f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 23 Nov 2016 14:03:05 +0200 Subject: [PATCH 02/12] aarch64: vp9itxfm: Make the larger core transforms standalone functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is sponsored by, and copyright, Google. This reduces the code size of libavcodec/aarch64/vp9itxfm_neon.o from 19496 to 14740 bytes. 
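The size saving comes from how the assembler treats the two constructs: a .macro
body is expanded again at every use site, whereas a function entered with bl is
emitted only once and shared by all callers. The cost is the bl/ret pair, plus
having to preserve the return address in the pass functions (push {lr} ... pop {pc}
in the arm version, mov x14, x30 ... br x14 here), since they now make calls of
their own. As a rough illustration only (hypothetical names, a trivial placeholder
body, nothing taken from the real sources), the new calling structure is analogous
to this C:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for the shared core transform: previously its body was expanded
     * as a macro inside every pass function, now one copy is reached via a
     * branch-and-link from each of them. */
    static void idct16_core(int16_t col[16])
    {
        for (int i = 0; i < 8; i++) {
            int16_t a = col[i], b = col[15 - i];
            col[i]      = a + b;
            col[15 - i] = a - b;
        }
    }

    /* Both passes share the single copy above instead of carrying their own. */
    static void idct16_pass1(int16_t col[16]) { idct16_core(col); }
    static void idct16_pass2(int16_t col[16]) { idct16_core(col); }

    int main(void)
    {
        int16_t col[16];
        for (int i = 0; i < 16; i++)
            col[i] = (int16_t)(i + 1);
        idct16_pass1(col);
        idct16_pass2(col);
        printf("col[0]=%d col[15]=%d\n", col[0], col[15]);
        return 0;
    }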
This gives a small slowdown of a couple of tens of cycles, but makes it more feasible to add more optimized versions of these transforms. Before: vp9_inv_dct_dct_16x16_sub4_add_neon: 1036.7 vp9_inv_dct_dct_16x16_sub16_add_neon: 1372.2 vp9_inv_dct_dct_32x32_sub4_add_neon: 5180.0 vp9_inv_dct_dct_32x32_sub32_add_neon: 8095.7 After: vp9_inv_dct_dct_16x16_sub4_add_neon: 1051.0 vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.1 vp9_inv_dct_dct_32x32_sub4_add_neon: 5199.9 vp9_inv_dct_dct_32x32_sub32_add_neon: 8125.8 Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9itxfm_neon.S | 42 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index c14c5f9ded..729860ca1f 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -463,7 +463,7 @@ function idct16x16_dc_add_neon ret endfunc -.macro idct16 +function idct16 dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a @@ -506,9 +506,10 @@ endfunc butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12] butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11] butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10] -.endm + ret +endfunc -.macro iadst16 +function iadst16 ld1 {v0.8h,v1.8h}, [x11] dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0 @@ -577,7 +578,8 @@ endfunc mov v16.16b, v2.16b mov v30.16b, v4.16b -.endm + ret +endfunc // Helper macros; we can't use these expressions directly within // e.g. .irp due to the extra concatenation \(). Therefore wrap @@ -604,12 +606,14 @@ endfunc // x9 = input stride .macro itxfm16_1d_funcs txfm function \txfm\()16_1d_8x16_pass1_neon + mov x14, x30 + movi v2.8h, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 load_clear \i, x2, x9 .endr - \txfm\()16 + bl \txfm\()16 // Do two 8x8 transposes. Originally, v16-v31 contain the // 16 rows. 
Afterwards, v16-v23 and v24-v31 contain the two @@ -623,7 +627,7 @@ function \txfm\()16_1d_8x16_pass1_neon .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 store \i, x0, #16 .endr - ret + br x14 1: // Special case: For the last input column (x1 == 8), // which would be stored as the last row in the temp buffer, @@ -642,7 +646,7 @@ function \txfm\()16_1d_8x16_pass1_neon mov v29.16b, v21.16b mov v30.16b, v22.16b mov v31.16b, v23.16b - ret + br x14 endfunc // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, @@ -653,6 +657,7 @@ endfunc // x3 = slice offset // x9 = temp buffer stride function \txfm\()16_1d_8x16_pass2_neon + mov x14, x30 .irp i, 16, 17, 18, 19, 20, 21, 22, 23 load \i, x2, x9 .endr @@ -664,7 +669,7 @@ function \txfm\()16_1d_8x16_pass2_neon add x3, x0, x1 lsl x1, x1, #1 - \txfm\()16 + bl \txfm\()16 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 srshr \coef0, \coef0, #6 @@ -714,7 +719,7 @@ function \txfm\()16_1d_8x16_pass2_neon load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b .purgem load_add_store - ret + br x14 endfunc .endm @@ -843,7 +848,7 @@ function idct32x32_dc_add_neon ret endfunc -.macro idct32_odd +function idct32_odd ld1 {v0.8h,v1.8h}, [x11] dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a @@ -898,7 +903,8 @@ endfunc dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22 dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a -.endm + ret +endfunc // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix. // The 32-point IDCT can be decomposed into two 16-point IDCTs; @@ -912,6 +918,7 @@ endfunc // x10 = idct_coeffs // x11 = idct_coeffs + 32 function idct32_1d_8x32_pass1_neon + mov x14, x30 ld1 {v0.8h,v1.8h}, [x10] movi v4.8h, #0 @@ -922,7 +929,7 @@ function idct32_1d_8x32_pass1_neon st1 {v4.8h}, [x2], x9 .endr - idct16 + bl idct16 // Do two 8x8 transposes. Originally, v16-v31 contain the // 16 rows. Afterwards, v16-v23 and v24-v31 contain the @@ -967,7 +974,7 @@ function idct32_1d_8x32_pass1_neon st1 {v4.8h}, [x2], x9 .endr - idct32_odd + bl idct32_odd transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3 transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3 @@ -1003,7 +1010,7 @@ function idct32_1d_8x32_pass1_neon store_rev v25.8h, v17.8h store_rev v24.8h, v16.8h .purgem store_rev - ret + br x14 endfunc // This is mostly the same as 8x32_pass1, but without the transpose, @@ -1017,6 +1024,7 @@ endfunc // x10 = idct_coeffs // x11 = idct_coeffs + 32 function idct32_1d_8x32_pass2_neon + mov x14, x30 ld1 {v0.8h,v1.8h}, [x10] // v16 = IN(0), v17 = IN(2) ... 
v31 = IN(30) @@ -1025,7 +1033,7 @@ function idct32_1d_8x32_pass2_neon .endr sub x2, x2, x9, lsl #4 - idct16 + bl idct16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().8h}, [x2], x9 @@ -1041,7 +1049,7 @@ function idct32_1d_8x32_pass2_neon sub x2, x2, x9, lsl #4 sub x2, x2, #64 - idct32_odd + bl idct32_odd .macro load_acc_store a, b, c, d, neg=0 .if \neg == 0 @@ -1095,7 +1103,7 @@ function idct32_1d_8x32_pass2_neon load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1 load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1 .purgem load_acc_store - ret + br x14 endfunc const min_eob_idct_idct_32, align=4 From 47b3c2c18d1897f3c753ba0cec4b2d7aa24526af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sun, 5 Feb 2017 22:55:20 +0200 Subject: [PATCH 03/12] arm: vp9itxfm: Move the load_add_store macro out from the itxfm16 pass2 function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows reusing the macro for a separate implementation of the pass2 function. Signed-off-by: Martin Storsjö --- libavcodec/arm/vp9itxfm_neon.S | 72 +++++++++++++++++----------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index fd53a20a73..b3188bc711 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -657,6 +657,42 @@ function iadst16 bx lr endfunc +.macro load_add_store coef0, coef1, coef2, coef3 + vrshr.s16 \coef0, \coef0, #6 + vrshr.s16 \coef1, \coef1, #6 + + vld1.32 {d4[]}, [r0,:32], r1 + vld1.32 {d4[1]}, [r3,:32], r1 + vrshr.s16 \coef2, \coef2, #6 + vrshr.s16 \coef3, \coef3, #6 + vld1.32 {d5[]}, [r0,:32], r1 + vld1.32 {d5[1]}, [r3,:32], r1 + vaddw.u8 \coef0, \coef0, d4 + vld1.32 {d6[]}, [r0,:32], r1 + vld1.32 {d6[1]}, [r3,:32], r1 + vaddw.u8 \coef1, \coef1, d5 + vld1.32 {d7[]}, [r0,:32], r1 + vld1.32 {d7[1]}, [r3,:32], r1 + + vqmovun.s16 d4, \coef0 + vqmovun.s16 d5, \coef1 + sub r0, r0, r1, lsl #2 + sub r3, r3, r1, lsl #2 + vaddw.u8 \coef2, \coef2, d6 + vaddw.u8 \coef3, \coef3, d7 + vst1.32 {d4[0]}, [r0,:32], r1 + vst1.32 {d4[1]}, [r3,:32], r1 + vqmovun.s16 d6, \coef2 + vst1.32 {d5[0]}, [r0,:32], r1 + vst1.32 {d5[1]}, [r3,:32], r1 + vqmovun.s16 d7, \coef3 + + vst1.32 {d6[0]}, [r0,:32], r1 + vst1.32 {d6[1]}, [r3,:32], r1 + vst1.32 {d7[0]}, [r0,:32], r1 + vst1.32 {d7[1]}, [r3,:32], r1 +.endm + .macro itxfm16_1d_funcs txfm @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @ transpose into a horizontal 16x4 slice and store. 
@@ -739,44 +775,8 @@ function \txfm\()16_1d_4x16_pass2_neon lsl r1, r1, #1 bl \txfm\()16 -.macro load_add_store coef0, coef1, coef2, coef3 - vrshr.s16 \coef0, \coef0, #6 - vrshr.s16 \coef1, \coef1, #6 - - vld1.32 {d4[]}, [r0,:32], r1 - vld1.32 {d4[1]}, [r3,:32], r1 - vrshr.s16 \coef2, \coef2, #6 - vrshr.s16 \coef3, \coef3, #6 - vld1.32 {d5[]}, [r0,:32], r1 - vld1.32 {d5[1]}, [r3,:32], r1 - vaddw.u8 \coef0, \coef0, d4 - vld1.32 {d6[]}, [r0,:32], r1 - vld1.32 {d6[1]}, [r3,:32], r1 - vaddw.u8 \coef1, \coef1, d5 - vld1.32 {d7[]}, [r0,:32], r1 - vld1.32 {d7[1]}, [r3,:32], r1 - - vqmovun.s16 d4, \coef0 - vqmovun.s16 d5, \coef1 - sub r0, r0, r1, lsl #2 - sub r3, r3, r1, lsl #2 - vaddw.u8 \coef2, \coef2, d6 - vaddw.u8 \coef3, \coef3, d7 - vst1.32 {d4[0]}, [r0,:32], r1 - vst1.32 {d4[1]}, [r3,:32], r1 - vqmovun.s16 d6, \coef2 - vst1.32 {d5[0]}, [r0,:32], r1 - vst1.32 {d5[1]}, [r3,:32], r1 - vqmovun.s16 d7, \coef3 - - vst1.32 {d6[0]}, [r0,:32], r1 - vst1.32 {d6[1]}, [r3,:32], r1 - vst1.32 {d7[0]}, [r0,:32], r1 - vst1.32 {d7[1]}, [r3,:32], r1 -.endm load_add_store q8, q9, q10, q11 load_add_store q12, q13, q14, q15 -.purgem load_add_store pop {pc} endfunc From 79d332ebbde8c0a3e9da094dcfd10abd33ba7378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sun, 5 Feb 2017 22:53:55 +0200 Subject: [PATCH 04/12] aarch64: vp9itxfm: Move the load_add_store macro out from the itxfm16 pass2 function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows reusing the macro for a separate implementation of the pass2 function. Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9itxfm_neon.S | 90 +++++++++++++++--------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 729860ca1f..b6cadfb66b 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -598,6 +598,51 @@ endfunc st1 {v2.8h}, [\src], \inc .endm +.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 + srshr \coef0, \coef0, #6 + ld1 {v2.8b}, [x0], x1 + srshr \coef1, \coef1, #6 + ld1 {v3.8b}, [x3], x1 + srshr \coef2, \coef2, #6 + ld1 {v4.8b}, [x0], x1 + srshr \coef3, \coef3, #6 + uaddw \coef0, \coef0, v2.8b + ld1 {v5.8b}, [x3], x1 + uaddw \coef1, \coef1, v3.8b + srshr \coef4, \coef4, #6 + ld1 {v6.8b}, [x0], x1 + srshr \coef5, \coef5, #6 + ld1 {v7.8b}, [x3], x1 + sqxtun v2.8b, \coef0 + srshr \coef6, \coef6, #6 + sqxtun v3.8b, \coef1 + srshr \coef7, \coef7, #6 + uaddw \coef2, \coef2, v4.8b + ld1 {\tmp1}, [x0], x1 + uaddw \coef3, \coef3, v5.8b + ld1 {\tmp2}, [x3], x1 + sqxtun v4.8b, \coef2 + sub x0, x0, x1, lsl #2 + sub x3, x3, x1, lsl #2 + sqxtun v5.8b, \coef3 + uaddw \coef4, \coef4, v6.8b + st1 {v2.8b}, [x0], x1 + uaddw \coef5, \coef5, v7.8b + st1 {v3.8b}, [x3], x1 + sqxtun v6.8b, \coef4 + st1 {v4.8b}, [x0], x1 + sqxtun v7.8b, \coef5 + st1 {v5.8b}, [x3], x1 + uaddw \coef6, \coef6, \tmp1 + st1 {v6.8b}, [x0], x1 + uaddw \coef7, \coef7, \tmp2 + st1 {v7.8b}, [x3], x1 + sqxtun \tmp1, \coef6 + sqxtun \tmp2, \coef7 + st1 {\tmp1}, [x0], x1 + st1 {\tmp2}, [x3], x1 +.endm + // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, // transpose into a horizontal 16x8 slice and store. 
// x0 = dst (temp buffer) @@ -671,53 +716,8 @@ function \txfm\()16_1d_8x16_pass2_neon lsl x1, x1, #1 bl \txfm\()16 -.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 - srshr \coef0, \coef0, #6 - ld1 {v2.8b}, [x0], x1 - srshr \coef1, \coef1, #6 - ld1 {v3.8b}, [x3], x1 - srshr \coef2, \coef2, #6 - ld1 {v4.8b}, [x0], x1 - srshr \coef3, \coef3, #6 - uaddw \coef0, \coef0, v2.8b - ld1 {v5.8b}, [x3], x1 - uaddw \coef1, \coef1, v3.8b - srshr \coef4, \coef4, #6 - ld1 {v6.8b}, [x0], x1 - srshr \coef5, \coef5, #6 - ld1 {v7.8b}, [x3], x1 - sqxtun v2.8b, \coef0 - srshr \coef6, \coef6, #6 - sqxtun v3.8b, \coef1 - srshr \coef7, \coef7, #6 - uaddw \coef2, \coef2, v4.8b - ld1 {\tmp1}, [x0], x1 - uaddw \coef3, \coef3, v5.8b - ld1 {\tmp2}, [x3], x1 - sqxtun v4.8b, \coef2 - sub x0, x0, x1, lsl #2 - sub x3, x3, x1, lsl #2 - sqxtun v5.8b, \coef3 - uaddw \coef4, \coef4, v6.8b - st1 {v2.8b}, [x0], x1 - uaddw \coef5, \coef5, v7.8b - st1 {v3.8b}, [x3], x1 - sqxtun v6.8b, \coef4 - st1 {v4.8b}, [x0], x1 - sqxtun v7.8b, \coef5 - st1 {v5.8b}, [x3], x1 - uaddw \coef6, \coef6, \tmp1 - st1 {v6.8b}, [x0], x1 - uaddw \coef7, \coef7, \tmp2 - st1 {v7.8b}, [x3], x1 - sqxtun \tmp1, \coef6 - sqxtun \tmp2, \coef7 - st1 {\tmp1}, [x0], x1 - st1 {\tmp2}, [x3], x1 -.endm load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b -.purgem load_add_store br x14 endfunc From 5eb5aec475aabc884d083566f902876ecbc072cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 22 Nov 2016 11:07:38 +0200 Subject: [PATCH 05/12] arm: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is sponsored by, and copyright, Google. This avoids loading and calculating coefficients that we know will be zero, and avoids filling the temp buffer with zeros in places where we know the second pass won't read. This gives a pretty substantial speedup for the smaller subpartitions. The code size increases from 12388 bytes to 19784 bytes. The idct16/32_end macros are moved above the individual functions; the instructions themselves are unchanged, but since new functions are added at the same place where the code is moved from, the diff looks rather messy. 
Before: Cortex A7 A8 A9 A53 vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.5 212.0 235.8 vp9_inv_dct_dct_16x16_sub2_add_neon: 2102.1 1521.7 1736.2 1265.8 vp9_inv_dct_dct_16x16_sub4_add_neon: 2104.5 1533.0 1736.6 1265.5 vp9_inv_dct_dct_16x16_sub8_add_neon: 2484.8 1828.7 2014.4 1506.5 vp9_inv_dct_dct_16x16_sub12_add_neon: 2851.2 2117.8 2294.8 1753.2 vp9_inv_dct_dct_16x16_sub16_add_neon: 3239.4 2408.3 2543.5 1994.9 vp9_inv_dct_dct_32x32_sub1_add_neon: 758.3 456.7 864.5 553.9 vp9_inv_dct_dct_32x32_sub2_add_neon: 10776.7 7949.8 8567.7 6819.7 vp9_inv_dct_dct_32x32_sub4_add_neon: 10865.6 8131.5 8589.6 6816.3 vp9_inv_dct_dct_32x32_sub8_add_neon: 12053.9 9271.3 9387.7 7564.0 vp9_inv_dct_dct_32x32_sub12_add_neon: 13328.3 10463.2 10217.0 8321.3 vp9_inv_dct_dct_32x32_sub16_add_neon: 14176.4 11509.5 11018.7 9062.3 vp9_inv_dct_dct_32x32_sub20_add_neon: 15301.5 12999.9 11855.1 9828.2 vp9_inv_dct_dct_32x32_sub24_add_neon: 16482.7 14931.5 12650.1 10575.0 vp9_inv_dct_dct_32x32_sub28_add_neon: 17589.5 15811.9 13482.8 11333.4 vp9_inv_dct_dct_32x32_sub32_add_neon: 18696.2 17049.2 14355.6 12089.7 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.5 211.7 235.8 vp9_inv_dct_dct_16x16_sub2_add_neon: 1203.5 998.2 1035.3 763.0 vp9_inv_dct_dct_16x16_sub4_add_neon: 1203.5 998.1 1035.5 760.8 vp9_inv_dct_dct_16x16_sub8_add_neon: 1926.1 1610.6 1722.1 1271.7 vp9_inv_dct_dct_16x16_sub12_add_neon: 2873.2 2129.7 2285.1 1757.3 vp9_inv_dct_dct_16x16_sub16_add_neon: 3221.4 2520.3 2557.6 2002.1 vp9_inv_dct_dct_32x32_sub1_add_neon: 753.0 457.5 866.6 554.6 vp9_inv_dct_dct_32x32_sub2_add_neon: 7554.6 5652.4 6048.4 4920.2 vp9_inv_dct_dct_32x32_sub4_add_neon: 7549.9 5685.0 6046.9 4925.7 vp9_inv_dct_dct_32x32_sub8_add_neon: 8336.9 6704.5 6604.0 5478.0 vp9_inv_dct_dct_32x32_sub12_add_neon: 10914.0 9777.2 9240.4 7416.9 vp9_inv_dct_dct_32x32_sub16_add_neon: 11859.2 11223.3 9966.3 8095.1 vp9_inv_dct_dct_32x32_sub20_add_neon: 15237.1 13029.4 11838.3 9829.4 vp9_inv_dct_dct_32x32_sub24_add_neon: 16293.2 14379.8 12644.9 10572.0 vp9_inv_dct_dct_32x32_sub28_add_neon: 17424.3 15734.7 13473.0 11326.9 vp9_inv_dct_dct_32x32_sub32_add_neon: 18531.3 17457.0 14298.6 12080.0 Signed-off-by: Martin Storsjö --- libavcodec/arm/vp9itxfm_neon.S | 601 +++++++++++++++++++++++++++++---- 1 file changed, 542 insertions(+), 59 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index b3188bc711..167d5178e4 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -74,6 +74,14 @@ endconst vrshrn.s32 \out2, \tmpq4, #14 .endm +@ Same as mbutterfly0 above, but treating the input in in2 as zero, +@ writing the same output into both out1 and out2. 
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4 + vmull.s16 \tmpq3, \in1, d0[0] + vrshrn.s32 \out1, \tmpq3, #14 + vrshrn.s32 \out2, \tmpq3, #14 +.endm + @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 @ Same as mbutterfly0, but with input being 2 q registers, output @@ -137,6 +145,23 @@ endconst vrshrn.s32 \inout2, \tmp2, #14 .endm +@ Same as mbutterfly above, but treating the input in inout2 as zero +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2 + vmull.s16 \tmp1, \inout1, \coef1 + vmull.s16 \tmp2, \inout1, \coef2 + vrshrn.s32 \inout1, \tmp1, #14 + vrshrn.s32 \inout2, \tmp2, #14 +.endm + +@ Same as mbutterfly above, but treating the input in inout1 as zero +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2 + vmull.s16 \tmp1, \inout2, \coef2 + vmull.s16 \tmp2, \inout2, \coef1 + vneg.s32 \tmp1, \tmp1 + vrshrn.s32 \inout2, \tmp2, #14 + vrshrn.s32 \inout1, \tmp1, #14 +.endm + @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14 @ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14 @ inout are 4 d registers, tmp are 4 q registers @@ -534,29 +559,7 @@ function idct16x16_dc_add_neon endfunc .ltorg -function idct16 - mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a - mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a - mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a - mbutterfly d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 = t6a - mbutterfly d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 = t15a - mbutterfly d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 = t14a - mbutterfly d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 = t13a - mbutterfly d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 = t12a - - butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3 - butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2 - butterfly d6, d26, d18, d26 @ d6 = t4, d26 = t5 - butterfly d7, d22, d30, d22 @ d7 = t7, d22 = t6 - butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9 - butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10 - butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13 - butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 - - mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a - mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a, d25 = t14a - mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a - +.macro idct16_end butterfly d18, d7, d4, d7 @ d18 = t0a, d7 = t7a butterfly d19, d22, d5, d22 @ d19 = t1a, d22 = t6 butterfly d4, d26, d20, d26 @ d4 = t2a, d26 = t5 @@ -581,6 +584,87 @@ function idct16 butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10] bx lr +.endm + +function idct16 + mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a + mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a + mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a + mbutterfly d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 = t6a + mbutterfly d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 = t15a + mbutterfly d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 = t14a + mbutterfly d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 = t13a + mbutterfly d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 = t12a + + butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3 + butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2 + butterfly d6, d26, d18, d26 @ d6 = t4, d26 = t5 + butterfly d7, 
d22, d30, d22 @ d7 = t7, d22 = t6 + butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9 + butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10 + butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13 + butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 + + mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a + mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a, d25 = t14a + mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a + idct16_end +endfunc + +function idct16_half + mbutterfly0_h d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a + mbutterfly_h1 d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a + mbutterfly_h1 d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a + mbutterfly_h2 d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 = t6a + mbutterfly_h1 d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 = t15a + mbutterfly_h2 d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 = t14a + mbutterfly_h1 d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 = t13a + mbutterfly_h2 d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 = t12a + + butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3 + butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2 + butterfly d6, d26, d18, d26 @ d6 = t4, d26 = t5 + butterfly d7, d22, d30, d22 @ d7 = t7, d22 = t6 + butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9 + butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10 + butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13 + butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 + + mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a + mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a, d25 = t14a + mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a + idct16_end +endfunc + +function idct16_quarter + vmull.s16 q12, d19, d3[2] + vmull.s16 q2, d17, d1[3] + vmull.s16 q3, d18, d1[0] + vmull.s16 q15, d18, d0[3] + vneg.s32 q12, q12 + vmull.s16 q14, d17, d2[0] + vmull.s16 q13, d19, d3[1] + vmull.s16 q11, d16, d0[0] + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d16, q2, #14 + vrshrn.s32 d7, q3, #14 + vrshrn.s32 d6, q15, #14 + vrshrn.s32 d29, q14, #14 + vrshrn.s32 d17, q13, #14 + vrshrn.s32 d28, q11, #14 + + mbutterfly_l q10, q11, d17, d24, d0[1], d0[2] + mbutterfly_l q9, q15, d29, d16, d0[1], d0[2] + vneg.s32 q11, q11 + vrshrn.s32 d27, q10, #14 + vrshrn.s32 d21, q11, #14 + vrshrn.s32 d23, q9, #14 + vrshrn.s32 d25, q15, #14 + vmov d4, d28 + vmov d5, d28 + mbutterfly0 d22, d26, d7, d6, d18, d30, q9, q15 + vmov d20, d28 + idct16_end endfunc function iadst16 @@ -819,6 +903,13 @@ A and r7, sp, #15 vld1.16 {q0-q1}, [r12,:128] .endif +.ifc \txfm1\()_\txfm2,idct_idct + cmp r3, #10 + ble idct16x16_quarter_add_neon + cmp r3, #38 + ble idct16x16_half_add_neon +.endif + .irp i, 0, 4, 8, 12 add r0, sp, #(\i*32) .ifc \txfm1\()_\txfm2,idct_idct @@ -877,6 +968,169 @@ itxfm_func16x16 idct, iadst itxfm_func16x16 iadst, iadst .ltorg +function idct16_1d_4x16_pass1_quarter_neon + push {lr} + mov r12, #32 + vmov.s16 q2, #0 +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr + + bl idct16_quarter + + @ Do four 4x4 transposes. Originally, d16-d31 contain the + @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 + @ contain the transposed 4x4 blocks. + transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + + @ Store the transposed 4x4 blocks horizontally. + @ The first 4x4 block is kept in registers for the second pass, + @ store the rest in the temp buffer. 
+ add r0, r0, #8 + vst1.16 {d20}, [r0,:64]! + vst1.16 {d24}, [r0,:64]! + vst1.16 {d28}, [r0,:64]! + add r0, r0, #8 + vst1.16 {d21}, [r0,:64]! + vst1.16 {d25}, [r0,:64]! + vst1.16 {d29}, [r0,:64]! + add r0, r0, #8 + vst1.16 {d22}, [r0,:64]! + vst1.16 {d26}, [r0,:64]! + vst1.16 {d30}, [r0,:64]! + add r0, r0, #8 + vst1.16 {d23}, [r0,:64]! + vst1.16 {d27}, [r0,:64]! + vst1.16 {d31}, [r0,:64]! + pop {pc} +endfunc + +function idct16_1d_4x16_pass2_quarter_neon + push {lr} + @ Only load the top 4 lines, and only do it for the later slices. + @ For the first slice, d16-d19 is kept in registers from the first pass. + cmp r3, #0 + beq 1f + mov r12, #32 +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64], r12 +.endr +1: + + add r3, r0, r1 + lsl r1, r1, #1 + bl idct16_quarter + + load_add_store q8, q9, q10, q11 + load_add_store q12, q13, q14, q15 + + pop {pc} +endfunc + +function idct16_1d_4x16_pass1_half_neon + push {lr} + mov r12, #32 + vmov.s16 q2, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr + + bl idct16_half + + @ Do four 4x4 transposes. Originally, d16-d31 contain the + @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 + @ contain the transposed 4x4 blocks. + transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + + @ Store the transposed 4x4 blocks horizontally. + cmp r1, #4 + beq 1f +.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 + vst1.16 {d\i}, [r0,:64]! +.endr + pop {pc} +1: + @ Special case: For the second input column (r1 == 4), + @ which would be stored as the second row in the temp buffer, + @ don't store the first 4x4 block, but keep it in registers + @ for the first slice of the second pass (where it is the + @ second 4x4 block). + add r0, r0, #8 + vst1.16 {d20}, [r0,:64]! + vst1.16 {d24}, [r0,:64]! + vst1.16 {d28}, [r0,:64]! + add r0, r0, #8 + vst1.16 {d21}, [r0,:64]! + vst1.16 {d25}, [r0,:64]! + vst1.16 {d29}, [r0,:64]! + add r0, r0, #8 + vst1.16 {d22}, [r0,:64]! + vst1.16 {d26}, [r0,:64]! + vst1.16 {d30}, [r0,:64]! + add r0, r0, #8 + vst1.16 {d23}, [r0,:64]! + vst1.16 {d27}, [r0,:64]! + vst1.16 {d31}, [r0,:64]! 
+ vmov d20, d16 + vmov d21, d17 + vmov d22, d18 + vmov d23, d19 + pop {pc} +endfunc + +function idct16_1d_4x16_pass2_half_neon + push {lr} + mov r12, #32 + cmp r3, #0 +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64], r12 +.endr + beq 1f +.irp i, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64], r12 +.endr +1: + + add r3, r0, r1 + lsl r1, r1, #1 + bl idct16_half + + load_add_store q8, q9, q10, q11 + load_add_store q12, q13, q14, q15 + + pop {pc} +endfunc +.purgem load_add_store + +.macro idct16_partial size +function idct16x16_\size\()_add_neon + add r0, sp, #(0*32) + mov r1, #0 + add r2, r6, #(0*2) + bl idct16_1d_4x16_pass1_\size\()_neon +.ifc \size,half + add r0, sp, #(4*32) + mov r1, #4 + add r2, r6, #(4*2) + bl idct16_1d_4x16_pass1_\size\()_neon +.endif +.irp i, 0, 4, 8, 12 + add r0, r4, #(\i) + mov r1, r5 + add r2, sp, #(\i*2) + mov r3, #\i + bl idct16_1d_4x16_pass2_\size\()_neon +.endr + + add sp, sp, r7 + pop {r4-r8,pc} +endfunc +.endm + +idct16_partial quarter +idct16_partial half function idct32x32_dc_add_neon movrel r12, idct_coeffs @@ -913,6 +1167,38 @@ function idct32x32_dc_add_neon bx lr endfunc +.macro idct32_end + butterfly d16, d5, d4, d5 @ d16 = t16a, d5 = t19a + butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18 + butterfly d18, d6, d7, d6 @ d18 = t23a, d6 = t20a + butterfly d19, d21, d22, d21 @ d19 = t22, d21 = t21 + butterfly d4, d28, d28, d30 @ d4 = t24a, d28 = t27a + butterfly d23, d26, d25, d26 @ d23 = t25, d26 = t26 + butterfly d7, d29, d29, d31 @ d7 = t31a, d29 = t28a + butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29 + + mbutterfly d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = t29a + mbutterfly d29, d5, d0[1], d0[2], q12, q15 @ d29 = t19, d5 = t28 + mbutterfly d28, d6, d0[1], d0[2], q12, q15, neg=1 @ d28 = t27, d6 = t20 + mbutterfly d26, d21, d0[1], d0[2], q12, q15, neg=1 @ d26 = t26a, d21 = t21a + + butterfly d31, d24, d7, d4 @ d31 = t31, d24 = t24 + butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a + butterfly_r d23, d16, d16, d18 @ d23 = t23, d16 = t16 + butterfly_r d22, d17, d17, d19 @ d22 = t22a, d17 = t17a + butterfly d18, d21, d27, d21 @ d18 = t18, d21 = t21 + butterfly_r d27, d28, d5, d28 @ d27 = t27a, d28 = t28a + butterfly d4, d26, d20, d26 @ d4 = t29, d26 = t26 + butterfly d19, d20, d29, d6 @ d19 = t19a, d20 = t20 + vmov d29, d4 @ d29 = t29 + + mbutterfly0 d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27, d20 = t20 + mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a + mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 + mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a + bx lr +.endm + function idct32_odd movrel r12, idct_coeffs add r12, r12, #32 @@ -943,38 +1229,91 @@ function idct32_odd mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a - - butterfly d16, d5, d4, d5 @ d16 = t16a, d5 = t19a - butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18 - butterfly d18, d6, d7, d6 @ d18 = t23a, d6 = t20a - butterfly d19, d21, d22, d21 @ d19 = t22, d21 = t21 - butterfly d4, d28, d28, d30 @ d4 = t24a, d28 = t27a - butterfly d23, d26, d25, d26 @ d23 = t25, d26 = t26 - butterfly d7, d29, d29, d31 @ d7 = t31a, d29 = t28a - butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29 - - mbutterfly d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = t29a - mbutterfly d29, d5, d0[1], d0[2], q12, q15 @ d29 = t19, d5 = t28 - mbutterfly d28, d6, d0[1], 
d0[2], q12, q15, neg=1 @ d28 = t27, d6 = t20 - mbutterfly d26, d21, d0[1], d0[2], q12, q15, neg=1 @ d26 = t26a, d21 = t21a - - butterfly d31, d24, d7, d4 @ d31 = t31, d24 = t24 - butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a - butterfly_r d23, d16, d16, d18 @ d23 = t23, d16 = t16 - butterfly_r d22, d17, d17, d19 @ d22 = t22a, d17 = t17a - butterfly d18, d21, d27, d21 @ d18 = t18, d21 = t21 - butterfly_r d27, d28, d5, d28 @ d27 = t27a, d28 = t28a - butterfly d4, d26, d20, d26 @ d4 = t29, d26 = t26 - butterfly d19, d20, d29, d6 @ d19 = t19a, d20 = t20 - vmov d29, d4 @ d29 = t29 - - mbutterfly0 d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27, d20 = t20 - mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a - mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 - mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a - bx lr + idct32_end endfunc +function idct32_odd_half + movrel r12, idct_coeffs + add r12, r12, #32 + vld1.16 {q0-q1}, [r12,:128] + + mbutterfly_h1 d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a + mbutterfly_h2 d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a + mbutterfly_h1 d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a + mbutterfly_h2 d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a + mbutterfly_h1 d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a + mbutterfly_h2 d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a + mbutterfly_h1 d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a + mbutterfly_h2 d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a + + sub r12, r12, #32 + vld1.16 {q0}, [r12,:128] + + butterfly d4, d24, d16, d24 @ d4 = t16, d24 = t17 + butterfly d5, d20, d28, d20 @ d5 = t19, d20 = t18 + butterfly d6, d26, d18, d26 @ d6 = t20, d26 = t21 + butterfly d7, d22, d30, d22 @ d7 = t23, d22 = t22 + butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25 + butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26 + butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 + butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29 + + mbutterfly d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a + mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a + mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a + mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a + + idct32_end +endfunc + +function idct32_odd_quarter + movrel r12, idct_coeffs + add r12, r12, #32 + vld1.16 {q0-q1}, [r12,:128] + + vmull.s16 q2, d16, d0[0] + vmull.s16 q14, d19, d1[3] + vmull.s16 q15, d16, d0[1] + vmull.s16 q11, d17, d3[2] + vmull.s16 q3, d17, d3[3] + vmull.s16 q13, d19, d1[2] + vmull.s16 q10, d18, d2[0] + vmull.s16 q12, d18, d2[1] + + sub r12, r12, #32 + vld1.16 {q0}, [r12,:128] + + vneg.s32 q14, q14 + vneg.s32 q3, q3 + + vrshrn.s32 d4, q2, #14 + vrshrn.s32 d5, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q11, #14 + vrshrn.s32 d7, q3, #14 + vrshrn.s32 d31, q13, #14 + vrshrn.s32 d6, q10, #14 + vrshrn.s32 d30, q12, #14 + + mbutterfly_l q8, q9, d29, d4, d0[3], d1[0] + mbutterfly_l q13, q10, d31, d5, d0[3], d1[0] + vrshrn.s32 d23, q8, #14 + vrshrn.s32 d24, q9, #14 + vneg.s32 q10, q10 + vrshrn.s32 d27, q13, #14 + vrshrn.s32 d20, q10, #14 + mbutterfly_l q8, q9, d30, d6, d1[1], d1[2] + vrshrn.s32 d21, q8, #14 + vrshrn.s32 d26, q9, #14 + mbutterfly_l q8, q9, d28, d7, d1[1], d1[2] + vrshrn.s32 d25, q8, #14 + vneg.s32 q9, q9 + vrshrn.s32 d22, q9, #14 + + idct32_end +endfunc + +.macro idct32_funcs suffix @ Do an 32-point IDCT of a 4x32 slice out of a 
32x32 matrix. @ We don't have register space to do a single pass IDCT of 4x32 though, @ but the 32-point IDCT can be decomposed into two 16-point IDCTs; @@ -984,7 +1323,7 @@ endfunc @ r0 = dst (temp buffer) @ r1 = unused @ r2 = src -function idct32_1d_4x32_pass1_neon +function idct32_1d_4x32_pass1\suffix\()_neon push {lr} movrel r12, idct_coeffs @@ -995,12 +1334,26 @@ function idct32_1d_4x32_pass1_neon vmov.s16 d4, #0 @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64] vst1.16 {d4}, [r2,:64], r12 .endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr +.endif - bl idct16 + bl idct16\suffix @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @@ -1026,17 +1379,39 @@ function idct32_1d_4x32_pass1_neon @ Move r2 back to the start of the input, and move @ to the first odd row +.ifb \suffix sub r2, r2, r12, lsl #4 +.endif +.ifc \suffix,_quarter + sub r2, r2, r12, lsl #2 +.endif +.ifc \suffix,_half + sub r2, r2, r12, lsl #3 +.endif add r2, r2, #64 vmov.s16 d4, #0 @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64] vst1.16 {d4}, [r2,:64], r12 .endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr +.endif - bl idct32_odd + bl idct32_odd\suffix transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 @@ -1072,19 +1447,33 @@ endfunc @ r0 = dst @ r1 = dst stride @ r2 = src (temp buffer) -function idct32_1d_4x32_pass2_neon +function idct32_1d_4x32_pass2\suffix\()_neon push {lr} movrel r12, idct_coeffs vld1.16 {q0-q1}, [r12,:128] mov r12, #128 @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64], r12 .endr sub r2, r2, r12, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #3 +.endif - bl idct16 + bl idct16\suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vst1.16 {d\i}, [r2,:64], r12 @@ -1094,13 +1483,27 @@ function idct32_1d_4x32_pass2_neon add r2, r2, #64 @ d16 = IN(1), d17 = IN(3) ... 
d31 = IN(31) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64], r12 .endr sub r2, r2, r12, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #3 +.endif sub r2, r2, #64 - bl idct32_odd + bl idct32_odd\suffix mov r12, #128 .macro load_acc_store a, b, c, d, neg=0 @@ -1150,6 +1553,11 @@ function idct32_1d_4x32_pass2_neon .purgem load_acc_store pop {pc} endfunc +.endm + +idct32_funcs +idct32_funcs _quarter +idct32_funcs _half const min_eob_idct_idct_32, align=4 .short 0, 9, 34, 70, 135, 240, 336, 448 @@ -1173,6 +1581,11 @@ A and r7, sp, #15 mov r5, r1 mov r6, r2 + cmp r3, #34 + ble idct32x32_quarter_add_neon + cmp r3, #135 + ble idct32x32_half_add_neon + .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r0, sp, #(\i*64) .if \i > 0 @@ -1209,3 +1622,73 @@ A and r7, sp, #15 vpop {q4-q7} pop {r4-r8,pc} endfunc + +function idct32x32_quarter_add_neon +.irp i, 0, 4 + add r0, sp, #(\i*64) +.if \i == 4 + cmp r3, #9 + ble 1f +.endif + add r2, r6, #(\i*2) + bl idct32_1d_4x32_pass1_quarter_neon +.endr + b 3f + +1: + @ Write zeros to the temp buffer for pass 2 + vmov.i16 q14, #0 + vmov.i16 q15, #0 +.rept 8 + vst1.16 {q14-q15}, [r0,:128]! +.endr +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r0, r4, #(\i) + mov r1, r5 + add r2, sp, #(\i*2) + bl idct32_1d_4x32_pass2_quarter_neon +.endr + + add sp, sp, r7 + vpop {q4-q7} + pop {r4-r8,pc} +endfunc + +function idct32x32_half_add_neon +.irp i, 0, 4, 8, 12 + add r0, sp, #(\i*64) +.if \i > 0 + ldrh_post r1, r8, #2 + cmp r3, r1 + it le + movle r1, #(16 - \i)/2 + ble 1f +.endif + add r2, r6, #(\i*2) + bl idct32_1d_4x32_pass1_half_neon +.endr + b 3f + +1: + @ Write zeros to the temp buffer for pass 2 + vmov.i16 q14, #0 + vmov.i16 q15, #0 +2: + subs r1, r1, #1 +.rept 4 + vst1.16 {q14-q15}, [r0,:128]! +.endr + bne 2b +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r0, r4, #(\i) + mov r1, r5 + add r2, sp, #(\i*2) + bl idct32_1d_4x32_pass2_half_neon +.endr + + add sp, sp, r7 + vpop {q4-q7} + pop {r4-r8,pc} +endfunc From a63da4511d0fee66695ff4afd264ba1dbf1e812d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 22 Nov 2016 22:58:35 +0200 Subject: [PATCH 06/12] aarch64: vp9itxfm: Do separate functions for half/quarter idct16 and idct32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is sponsored by, and copyright, Google. This avoids loading and calculating coefficients that we know will be zero, and avoids filling the temp buffer with zeros in places where we know the second pass won't read. This gives a pretty substantial speedup for the smaller subpartitions. The code size increases from 14740 bytes to 24292 bytes. The idct16/32_end macros are moved above the individual functions; the instructions themselves are unchanged, but since new functions are added at the same place where the code is moved from, the diff looks rather messy. 
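Most of the new helpers are _h1/_h2 variants of the butterfly macros for the case
where one of the two inputs is statically known to be zero: the rotation then
needs only plain multiplies of the remaining nonzero input (plus a negation in
the _h2 case), with the multiply-accumulates for the zero input dropped, and the
_quarter transforms go one step further and use bare smull/rshrn pairs. A small,
hedged C model of that arithmetic, following the formula documented on the
existing butterfly macros (sample input values only):

    #include <stdint.h>
    #include <stdio.h>

    /* Full butterfly rotation, as documented on the existing macros:
     *   out1 = (in1*c1 - in2*c2 + (1 << 13)) >> 14
     *   out2 = (in1*c2 + in2*c1 + (1 << 13)) >> 14 */
    static void butterfly(int16_t *out1, int16_t *out2,
                          int in1, int in2, int c1, int c2)
    {
        *out1 = (int16_t)((in1 * c1 - in2 * c2 + (1 << 13)) >> 14);
        *out2 = (int16_t)((in1 * c2 + in2 * c1 + (1 << 13)) >> 14);
    }

    /* dmbutterfly_h1-style shortcut when in2 == 0: half the multiplies, no
     * accumulation, identical results. */
    static void butterfly_h1(int16_t *out1, int16_t *out2,
                             int in1, int c1, int c2)
    {
        *out1 = (int16_t)((in1 * c1 + (1 << 13)) >> 14);
        *out2 = (int16_t)((in1 * c2 + (1 << 13)) >> 14);
    }

    int main(void)
    {
        int16_t a, b, c, d;
        butterfly(&a, &b, 1234, 0, 11585, 6270);   /* second input known zero */
        butterfly_h1(&c, &d, 1234, 11585, 6270);
        printf("%d %d == %d %d\n", a, b, c, d);    /* prints identical pairs */
        return 0;
    }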
Before: vp9_inv_dct_dct_16x16_sub1_add_neon: 236.7 vp9_inv_dct_dct_16x16_sub2_add_neon: 1051.0 vp9_inv_dct_dct_16x16_sub4_add_neon: 1051.0 vp9_inv_dct_dct_16x16_sub8_add_neon: 1051.0 vp9_inv_dct_dct_16x16_sub12_add_neon: 1387.4 vp9_inv_dct_dct_16x16_sub16_add_neon: 1387.6 vp9_inv_dct_dct_32x32_sub1_add_neon: 554.1 vp9_inv_dct_dct_32x32_sub2_add_neon: 5198.5 vp9_inv_dct_dct_32x32_sub4_add_neon: 5198.6 vp9_inv_dct_dct_32x32_sub8_add_neon: 5196.3 vp9_inv_dct_dct_32x32_sub12_add_neon: 6183.4 vp9_inv_dct_dct_32x32_sub16_add_neon: 6174.3 vp9_inv_dct_dct_32x32_sub20_add_neon: 7151.4 vp9_inv_dct_dct_32x32_sub24_add_neon: 7145.3 vp9_inv_dct_dct_32x32_sub28_add_neon: 8119.3 vp9_inv_dct_dct_32x32_sub32_add_neon: 8118.7 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 236.7 vp9_inv_dct_dct_16x16_sub2_add_neon: 640.8 vp9_inv_dct_dct_16x16_sub4_add_neon: 639.0 vp9_inv_dct_dct_16x16_sub8_add_neon: 842.0 vp9_inv_dct_dct_16x16_sub12_add_neon: 1388.3 vp9_inv_dct_dct_16x16_sub16_add_neon: 1389.3 vp9_inv_dct_dct_32x32_sub1_add_neon: 554.1 vp9_inv_dct_dct_32x32_sub2_add_neon: 3685.5 vp9_inv_dct_dct_32x32_sub4_add_neon: 3685.1 vp9_inv_dct_dct_32x32_sub8_add_neon: 3684.4 vp9_inv_dct_dct_32x32_sub12_add_neon: 5312.2 vp9_inv_dct_dct_32x32_sub16_add_neon: 5315.4 vp9_inv_dct_dct_32x32_sub20_add_neon: 7154.9 vp9_inv_dct_dct_32x32_sub24_add_neon: 7154.5 vp9_inv_dct_dct_32x32_sub28_add_neon: 8126.6 vp9_inv_dct_dct_32x32_sub32_add_neon: 8127.2 Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9itxfm_neon.S | 531 +++++++++++++++++++++++++---- 1 file changed, 469 insertions(+), 62 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index b6cadfb66b..c954d1a5e1 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -75,6 +75,17 @@ endconst .endif .endm +// Same as dmbutterfly0 above, but treating the input in in2 as zero, +// writing the same output into both out1 and out2. 
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6 + smull \tmp1\().4s, \in1\().4h, v0.h[0] + smull2 \tmp2\().4s, \in1\().8h, v0.h[0] + rshrn \out1\().4h, \tmp1\().4s, #14 + rshrn2 \out1\().8h, \tmp2\().4s, #14 + rshrn \out2\().4h, \tmp1\().4s, #14 + rshrn2 \out2\().8h, \tmp2\().4s, #14 +.endm + // out1,out2 = in1 * coef1 - in2 * coef2 // out3,out4 = in1 * coef2 + in2 * coef1 // out are 4 x .4s registers, in are 2 x .8h registers @@ -104,6 +115,43 @@ endconst rshrn2 \inout2\().8h, \tmp4\().4s, #14 .endm +// Same as dmbutterfly above, but treating the input in inout2 as zero +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().4s, \inout1\().4h, \coef1 + smull2 \tmp2\().4s, \inout1\().8h, \coef1 + smull \tmp3\().4s, \inout1\().4h, \coef2 + smull2 \tmp4\().4s, \inout1\().8h, \coef2 + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 + rshrn \inout2\().4h, \tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout1 as zero +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().4s, \inout2\().4h, \coef2 + smull2 \tmp2\().4s, \inout2\().8h, \coef2 + smull \tmp3\().4s, \inout2\().4h, \coef1 + smull2 \tmp4\().4s, \inout2\().8h, \coef1 + neg \tmp1\().4s, \tmp1\().4s + neg \tmp2\().4s, \tmp2\().4s + rshrn \inout2\().4h, \tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 +.endm + +.macro dsmull_h out1, out2, in, coef + smull \out1\().4s, \in\().4h, \coef + smull2 \out2\().4s, \in\().8h, \coef +.endm + +.macro drshrn_h out, in1, in2, shift + rshrn \out\().4h, \in1\().4s, \shift + rshrn2 \out\().8h, \in2\().4s, \shift +.endm + + // out1 = in1 + in2 // out2 = in1 - in2 .macro butterfly_8h out1, out2, in1, in2 @@ -463,6 +511,30 @@ function idct16x16_dc_add_neon ret endfunc +.macro idct16_end + butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a + butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6 + butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5 + butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4 + butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a + butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10 + butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13 + butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a + + dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a + dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11 + + butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15] + butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14] + butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6] + butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8] + butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13] + butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12] + butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11] + butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10] + ret +.endm + function idct16 dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a @@ -485,28 +557,65 @@ function idct16 dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a dmbutterfly 
v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc - butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a - butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6 - butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5 - butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4 - butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a - butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10 - butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13 - butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a +function idct16_half + dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a + dmbutterfly_h1 v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a + dmbutterfly_h1 v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a + dmbutterfly_h2 v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a + dmbutterfly_h1 v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a + dmbutterfly_h2 v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a + dmbutterfly_h1 v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a + dmbutterfly_h2 v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a - dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a - dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11 + butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14 - butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15] - butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14] - butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6] - butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8] - butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13] - butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12] - butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11] - butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10] - ret + dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_quarter + dsmull_h v24, v25, v19, v1.h[6] + dsmull_h v4, v5, v17, v0.h[7] + dsmull_h v7, v6, v18, v0.h[4] + dsmull_h v30, v31, v18, v0.h[3] + neg v24.4s, v24.4s + neg v25.4s, v25.4s + dsmull_h v29, v28, v17, v1.h[0] + dsmull_h v26, v27, v19, v1.h[5] + dsmull_h v22, v23, v16, v0.h[0] + drshrn_h v24, v24, v25, #14 + drshrn_h v16, v4, v5, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v6, v30, v31, #14 + drshrn_h v29, v29, v28, #14 + drshrn_h v17, v26, v27, #14 + drshrn_h v28, v22, v23, #14 + + dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2] + dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2] + neg v22.4s, v22.4s + neg v23.4s, v23.4s + drshrn_h v27, v20, v21, #14 + drshrn_h v21, v22, v23, #14 + drshrn_h v23, v18, v19, #14 + drshrn_h v25, v30, v31, #14 + mov v4.16b, v28.16b + mov v5.16b, v28.16b + dmbutterfly0 v22, v26, v7, v6, 
v18, v19, v30, v31 + mov v20.16b, v28.16b + idct16_end endfunc function iadst16 @@ -756,6 +865,13 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .endif mov x9, #32 +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #10 + b.le idct16x16_quarter_add_neon + cmp w3, #38 + b.le idct16x16_half_add_neon +.endif + .irp i, 0, 8 add x0, sp, #(\i*32) .ifc \txfm1\()_\txfm2,idct_idct @@ -812,6 +928,116 @@ itxfm_func16x16 iadst, idct itxfm_func16x16 idct, iadst itxfm_func16x16 iadst, iadst +function idct16_1d_8x16_pass1_quarter_neon + mov x14, x30 + movi v2.8h, #0 +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr + + bl idct16_quarter + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + // The first 8x8 block is kept in registers for the second pass, + // store the rest in the temp buffer. + // Since only a 4x4 part of the input was nonzero, this means that + // only 4 rows are nonzero after transposing, and the second pass + // only reads the topmost 4 rows. Therefore only store the topmost + // 4 rows. + add x0, x0, #16 +.irp i, 24, 25, 26, 27 + store \i, x0, x9 +.endr + br x14 +endfunc + +function idct16_1d_8x16_pass2_quarter_neon + mov x14, x30 + cbz x3, 1f +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_quarter + + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b + + br x14 +endfunc + +function idct16_1d_8x16_pass1_half_neon + mov x14, x30 + movi v2.8h, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr + + bl idct16_half + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + // The first 8x8 block is kept in registers for the second pass, + // store the rest in the temp buffer. 
+ add x0, x0, #16 +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + store \i, x0, x9 +.endr + br x14 +endfunc + +function idct16_1d_8x16_pass2_half_neon + mov x14, x30 + cbz x3, 1f +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_half + + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b + + br x14 +endfunc + +.macro idct16_partial size +function idct16x16_\size\()_add_neon + add x0, sp, #(0*32) + add x2, x6, #(0*2) + bl idct16_1d_8x16_pass1_\size\()_neon +.irp i, 0, 8 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + mov x3, #\i + bl idct16_1d_8x16_pass2_\size\()_neon +.endr + + add sp, sp, #512 + br x15 +endfunc +.endm + +idct16_partial quarter +idct16_partial half function idct32x32_dc_add_neon movrel x4, idct_coeffs @@ -848,34 +1074,7 @@ function idct32x32_dc_add_neon ret endfunc -function idct32_odd - ld1 {v0.8h,v1.8h}, [x11] - - dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a - dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a - dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a - dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a - dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a - dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a - dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a - dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a - - ld1 {v0.8h}, [x10] - - butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 - butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 - butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21 - butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22 - butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25 - butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26 - butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30 - butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29 - - dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a - dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a - dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a - dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a - +.macro idct32_end butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a butterfly_8h v17, v20, v23, v20 // v17 = t17, v20 = t18 butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a @@ -904,8 +1103,117 @@ function idct32_odd dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22 dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a ret +.endm + +function idct32_odd + ld1 {v0.8h,v1.8h}, [x11] + + dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly v22, v25, v1.h[4], v1.h[5], 
v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + ld1 {v0.8h}, [x10] + + butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22 + butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end endfunc +function idct32_odd_half + ld1 {v0.8h,v1.8h}, [x11] + + dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + ld1 {v0.8h}, [x10] + + butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22 + butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end +endfunc + +function idct32_odd_quarter + ld1 {v0.8h,v1.8h}, [x11] + + dsmull_h v4, v5, v16, v0.h[0] + dsmull_h v28, v29, v19, v0.h[7] + dsmull_h v30, v31, v16, v0.h[1] + dsmull_h v22, v23, v17, v1.h[6] + dsmull_h v7, v6, v17, v1.h[7] + dsmull_h v26, v27, v19, v0.h[6] + dsmull_h v20, v21, v18, v1.h[0] + dsmull_h v24, v25, v18, v1.h[1] + + ld1 {v0.8h}, [x10] + + neg v28.4s, v28.4s + neg v29.4s, v29.4s + neg v7.4s, v7.4s + neg v6.4s, v6.4s + + drshrn_h v4, v4, v5, #14 + drshrn_h v5, v28, v29, #14 + drshrn_h v29, v30, v31, #14 + drshrn_h v28, v22, v23, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v31, v26, v27, #14 + drshrn_h v6, v20, v21, #14 + drshrn_h v30, v24, v25, #14 + + dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[3], v0.h[4] + dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[3], v0.h[4] + drshrn_h v23, v16, v17, #14 + drshrn_h v24, v18, v19, #14 + neg v20.4s, v20.4s + neg v21.4s, v21.4s + drshrn_h v27, v27, v26, #14 + drshrn_h v20, v20, v21, #14 
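+ // In the quarter case only v16-v19 (IN(1), IN(3), IN(5), IN(7)) are nonzero,
+ // so the generic first-stage dmbutterflies of idct32_odd degenerate into the
+ // single dsmull_h + drshrn_h multiplications above, the intermediate
+ // butterflies reduce to plain copies, and only the v0.h[3]-v0.h[6] rotations
+ // remain before falling into idct32_end.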
+ dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[5], v0.h[6] + drshrn_h v21, v16, v17, #14 + drshrn_h v26, v18, v19, #14 + dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[5], v0.h[6] + drshrn_h v25, v16, v17, #14 + neg v18.4s, v18.4s + neg v19.4s, v19.4s + drshrn_h v22, v18, v19, #14 + + idct32_end +endfunc + +.macro idct32_funcs suffix // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix. // The 32-point IDCT can be decomposed into two 16-point IDCTs; // a normal IDCT16 with every other input component (the even ones, with @@ -917,19 +1225,30 @@ endfunc // x9 = double input stride // x10 = idct_coeffs // x11 = idct_coeffs + 32 -function idct32_1d_8x32_pass1_neon +function idct32_1d_8x32_pass1\suffix\()_neon mov x14, x30 ld1 {v0.8h,v1.8h}, [x10] - movi v4.8h, #0 + movi v2.8h, #0 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x2] - st1 {v4.8h}, [x2], x9 + load_clear \i, x2, x9 .endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif - bl idct16 + bl idct16\suffix // Do two 8x8 transposes. Originally, v16-v31 contain the // 16 rows. Afterwards, v16-v23 and v24-v31 contain the @@ -964,17 +1283,36 @@ function idct32_1d_8x32_pass1_neon // Move x2 back to the start of the input, and move // to the first odd row +.ifb \suffix sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half + sub x2, x2, x9, lsl #3 +.endif add x2, x2, #64 - movi v4.8h, #0 + movi v2.8h, #0 // v16 = IN(1), v17 = IN(3) ... v31 = IN(31) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x2] - st1 {v4.8h}, [x2], x9 + load_clear \i, x2, x9 .endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif - bl idct32_odd + bl idct32_odd\suffix transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3 transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3 @@ -1023,33 +1361,61 @@ endfunc // x9 = double temp buffer stride // x10 = idct_coeffs // x11 = idct_coeffs + 32 -function idct32_1d_8x32_pass2_neon +function idct32_1d_8x32_pass2\suffix\()_neon mov x14, x30 ld1 {v0.8h,v1.8h}, [x10] // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x2], x9 + load \i, x2, x9 .endr sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif - bl idct16 + bl idct16\suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - st1 {v\i\().8h}, [x2], x9 + store \i, x2, x9 .endr sub x2, x2, x9, lsl #4 add x2, x2, #64 // v16 = IN(1), v17 = IN(3) ... 
v31 = IN(31) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x2], x9 + load \i, x2, x9 .endr sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif sub x2, x2, #64 - bl idct32_odd + bl idct32_odd\suffix .macro load_acc_store a, b, c, d, neg=0 .if \neg == 0 @@ -1105,6 +1471,11 @@ function idct32_1d_8x32_pass2_neon .purgem load_acc_store br x14 endfunc +.endm + +idct32_funcs +idct32_funcs _quarter +idct32_funcs _half const min_eob_idct_idct_32, align=4 .short 0, 34, 135, 336 @@ -1135,6 +1506,11 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 mov x9, #128 neg x7, x9 + cmp w3, #34 + b.le idct32x32_quarter_add_neon + cmp w3, #135 + b.le idct32x32_half_add_neon + .irp i, 0, 8, 16, 24 add x0, sp, #(\i*64) .if \i > 0 @@ -1177,3 +1553,34 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 br x15 endfunc + +.macro idct32_partial size +function idct32x32_\size\()_add_neon + add x0, sp, #(0*64) + add x2, x6, #(0*2) + bl idct32_1d_8x32_pass1_\size\()_neon +.ifc \size,half + add x0, sp, #(8*64) + add x2, x6, #(8*2) + bl idct32_1d_8x32_pass1_\size\()_neon +.endif +.irp i, 0, 8, 16, 24 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + bl idct32_1d_8x32_pass2_\size\()_neon +.endr + + add sp, sp, #2048 + + ldp d8, d9, [sp], 0x10 + ldp d10, d11, [sp], 0x10 + ldp d12, d13, [sp], 0x10 + ldp d14, d15, [sp], 0x10 + + br x15 +endfunc +.endm + +idct32_partial quarter +idct32_partial half From 3933b86bb93aca47f29fbd493075b0f110c1e3f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 3 Jan 2017 16:38:56 +0200 Subject: [PATCH 07/12] arm: vp9itxfm: Share instructions for loading idct coeffs in the 8x8 function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Martin Storsjö --- libavcodec/arm/vp9itxfm_neon.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 167d5178e4..3d0b0fab2e 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -412,13 +412,12 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 .ifc \txfm1\()_\txfm2,idct_idct movrel r12, idct_coeffs vpush {q4-q5} - vld1.16 {q0}, [r12,:128] .else movrel r12, iadst8_coeffs vld1.16 {q1}, [r12,:128]! vpush {q4-q7} - vld1.16 {q0}, [r12,:128] .endif + vld1.16 {q0}, [r12,:128] vmov.i16 q2, #0 vmov.i16 q3, #0 From 4da4b2b87f08a1331650c7e36eb7d4029a160776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 3 Jan 2017 16:39:41 +0200 Subject: [PATCH 08/12] aarch64: vp9itxfm: Share instructions for loading idct coeffs in the 8x8 function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9itxfm_neon.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index c954d1a5e1..a9c7626e65 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -379,12 +379,11 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 // idct, so those always need to be loaded. 
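+ // In the iadst case the first ld1 post-increments x4 past the iadst8
+ // coefficients, so the single ld1 {v0.8h}, [x4] shared after the .endif
+ // loads the idct coefficients in both branches.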
.ifc \txfm1\()_\txfm2,idct_idct movrel x4, idct_coeffs - ld1 {v0.8h}, [x4] .else movrel x4, iadst8_coeffs ld1 {v1.8h}, [x4], #16 - ld1 {v0.8h}, [x4] .endif + ld1 {v0.8h}, [x4] movi v2.16b, #0 movi v3.16b, #0 From ed8d293306e12c9b79022d37d39f48825ce7f2fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 3 Jan 2017 14:55:46 +0200 Subject: [PATCH 09/12] aarch64: vp9itxfm: Use a single lane ld1 instead of ld1r where possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ld1r is a leftover from the arm version, where this trick is beneficial on some cores. Use a single-lane load where we don't need the semantics of ld1r. Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9itxfm_neon.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index a9c7626e65..e7b88364f1 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -255,7 +255,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 cmp w3, #1 b.ne 1f // DC-only for idct/idct - ld1r {v2.4h}, [x2] + ld1 {v2.h}[0], [x2] smull v2.4s, v2.4h, v0.h[0] rshrn v2.4h, v2.4s, #14 smull v2.4s, v2.4h, v0.h[0] @@ -287,8 +287,8 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 \txfm2\()4 v4, v5, v6, v7 2: - ld1r {v0.2s}, [x0], x1 - ld1r {v1.2s}, [x0], x1 + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[0], [x0], x1 .ifnc \txfm1,iwht srshr v4.4h, v4.4h, #4 srshr v5.4h, v5.4h, #4 @@ -297,8 +297,8 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 .endif uaddw v4.8h, v4.8h, v0.8b uaddw v5.8h, v5.8h, v1.8b - ld1r {v2.2s}, [x0], x1 - ld1r {v3.2s}, [x0], x1 + ld1 {v2.s}[0], [x0], x1 + ld1 {v3.s}[0], [x0], x1 sqxtun v0.8b, v4.8h sqxtun v1.8b, v5.8h sub x0, x0, x1, lsl #2 @@ -394,7 +394,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 cmp w3, #1 b.ne 1f // DC-only for idct/idct - ld1r {v2.4h}, [x2] + ld1 {v2.h}[0], [x2] smull v2.4s, v2.4h, v0.h[0] rshrn v2.4h, v2.4s, #14 smull v2.4s, v2.4h, v0.h[0] @@ -485,7 +485,7 @@ function idct16x16_dc_add_neon movi v1.4h, #0 - ld1r {v2.4h}, [x2] + ld1 {v2.h}[0], [x2] smull v2.4s, v2.4h, v0.h[0] rshrn v2.4h, v2.4s, #14 smull v2.4s, v2.4h, v0.h[0] @@ -1044,7 +1044,7 @@ function idct32x32_dc_add_neon movi v1.4h, #0 - ld1r {v2.4h}, [x2] + ld1 {v2.h}[0], [x2] smull v2.4s, v2.4h, v0.h[0] rshrn v2.4h, v2.4s, #14 smull v2.4s, v2.4h, v0.h[0] From 3dd7827258ddaa2e51085d0c677d6f3b1be3572f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 3 Jan 2017 16:46:17 +0200 Subject: [PATCH 10/12] aarch64: vp9itxfm: Use the right lane sizes in 8x8 for improved readability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9itxfm_neon.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index e7b88364f1..7582081bd8 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -385,10 +385,10 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 .endif ld1 {v0.8h}, [x4] - movi v2.16b, #0 - movi v3.16b, #0 - movi v4.16b, #0 - movi v5.16b, #0 + movi v2.8h, #0 + movi v3.8h, #0 + movi v4.8h, #0 + movi v5.8h, #0 .ifc \txfm1\()_\txfm2,idct_idct cmp w3, #1 @@ -411,11 +411,11 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 b 2f .endif 1: - 
ld1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x2], #64 - ld1 {v20.16b,v21.16b,v22.16b,v23.16b}, [x2], #64 + ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64 + ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64 sub x2, x2, #128 - st1 {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64 - st1 {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64 + st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64 + st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64 \txfm1\()8 From 8476eb0d3ab1f7a52317b23346646389c08fb57a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 3 Jan 2017 23:11:51 +0200 Subject: [PATCH 11/12] aarch64: vp9itxfm: Update a comment to refer to a register with a different name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9itxfm_neon.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 7582081bd8..81027202f8 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -41,8 +41,8 @@ const iadst16_coeffs, align=4 .short 11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207 endconst -// out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 -// out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 +// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14 +// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14 // in/out are .8h registers; this can do with 4 temp registers, but is // more efficient if 6 temp registers are available. .macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0 From 0c0b87f12d48d4e7f0d3d13f9345e828a3a5ea32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 3 Jan 2017 16:11:56 +0200 Subject: [PATCH 12/12] aarch64: vp9itxfm: Fix incorrect vertical alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9itxfm_neon.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 81027202f8..a199e9c603 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -225,7 +225,7 @@ endconst add v21.4s, v17.4s, v19.4s rshrn \c0\().4h, v20.4s, #14 add v16.4s, v16.4s, v17.4s - rshrn \c1\().4h, v21.4s, #14 + rshrn \c1\().4h, v21.4s, #14 sub v16.4s, v16.4s, v19.4s rshrn \c2\().4h, v18.4s, #14 rshrn \c3\().4h, v16.4s, #14 @@ -1313,8 +1313,8 @@ function idct32_1d_8x32_pass1\suffix\()_neon bl idct32_odd\suffix - transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3 - transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3 + transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3 + transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3 // Store the registers a, b horizontally, // adding into the output first, and the mirrored,