diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index be65eb7919..dd9fde15d7 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -1123,18 +1123,14 @@ endfunc
 .endm
 
 function idct32_odd
-        ld1             {v0.8h,v1.8h}, [x11]
-
-        dmbutterfly     v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
-        dmbutterfly     v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
-        dmbutterfly     v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
-        dmbutterfly     v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
-        dmbutterfly     v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
-        dmbutterfly     v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
-        dmbutterfly     v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
-        dmbutterfly     v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
-
-        ld1             {v0.8h}, [x10]
+        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
 
         butterfly_8h    v4, v24, v16, v24 // v4 = t16, v24 = t17
         butterfly_8h    v5, v20, v28, v20 // v5 = t19, v20 = t18
@@ -1153,18 +1149,14 @@ function idct32_odd
 endfunc
 
 function idct32_odd_half
-        ld1             {v0.8h,v1.8h}, [x11]
-
-        dmbutterfly_h1  v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
-        dmbutterfly_h2  v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
-        dmbutterfly_h1  v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
-        dmbutterfly_h2  v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
-        dmbutterfly_h1  v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
-        dmbutterfly_h2  v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
-        dmbutterfly_h1  v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
-        dmbutterfly_h2  v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
-
-        ld1             {v0.8h}, [x10]
+        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
 
         butterfly_8h    v4, v24, v16, v24 // v4 = t16, v24 = t17
         butterfly_8h    v5, v20, v28, v20 // v5 = t19, v20 = t18
@@ -1183,18 +1175,14 @@ function idct32_odd_half
 endfunc
 
 function idct32_odd_quarter
-        ld1             {v0.8h,v1.8h}, [x11]
-
-        dsmull_h        v4, v5, v16, v0.h[0]
-        dsmull_h        v28, v29, v19, v0.h[7]
-        dsmull_h        v30, v31, v16, v0.h[1]
-        dsmull_h        v22, v23, v17, v1.h[6]
-        dsmull_h        v7, v6, v17, v1.h[7]
-        dsmull_h        v26, v27, v19, v0.h[6]
-        dsmull_h        v20, v21, v18, v1.h[0]
-        dsmull_h        v24, v25, v18, v1.h[1]
-
-        ld1             {v0.8h}, [x10]
+        dsmull_h        v4, v5, v16, v8.h[0]
+        dsmull_h        v28, v29, v19, v8.h[7]
+        dsmull_h        v30, v31, v16, v8.h[1]
+        dsmull_h        v22, v23, v17, v9.h[6]
+        dsmull_h        v7, v6, v17, v9.h[7]
+        dsmull_h        v26, v27, v19, v8.h[6]
+        dsmull_h        v20, v21, v18, v9.h[0]
+        dsmull_h        v24, v25, v18, v9.h[1]
 
         neg             v28.4s, v28.4s
         neg             v29.4s, v29.4s
@@ -1240,12 +1228,8 @@ endfunc
 // x1 = unused
 // x2 = src
 // x9 = double input stride
-// x10 = idct_coeffs
-// x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass1\suffix\()_neon
         mov             x14, x30
-        ld1             {v0.8h,v1.8h}, [x10]
-
         movi            v2.8h, #0
 
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@@ -1278,14 +1262,14 @@ function idct32_1d_8x32_pass1\suffix\()_neon
 .macro store_rev a, b
         // There's no rev128 instruction, but we reverse each 64 bit
         // half, and then flip them using an ext with 8 bytes offset.
-        rev64           v1.8h, \b
+        rev64           v3.8h, \b
         st1             {\a}, [x0], #16
-        rev64           v0.8h, \a
-        ext             v1.16b, v1.16b, v1.16b, #8
+        rev64           v2.8h, \a
+        ext             v3.16b, v3.16b, v3.16b, #8
         st1             {\b}, [x0], #16
-        ext             v0.16b, v0.16b, v0.16b, #8
-        st1             {v1.8h}, [x0], #16
-        st1             {v0.8h}, [x0], #16
+        ext             v2.16b, v2.16b, v2.16b, #8
+        st1             {v3.8h}, [x0], #16
+        st1             {v2.8h}, [x0], #16
 .endm
         store_rev       v16.8h, v24.8h
         store_rev       v17.8h, v25.8h
@@ -1339,20 +1323,20 @@ function idct32_1d_8x32_pass1\suffix\()_neon
         // subtracted from the output.
 .macro store_rev a, b
         ld1             {v4.8h}, [x0]
-        rev64           v1.8h, \b
+        rev64           v3.8h, \b
         add             v4.8h, v4.8h, \a
-        rev64           v0.8h, \a
+        rev64           v2.8h, \a
         st1             {v4.8h}, [x0], #16
-        ext             v1.16b, v1.16b, v1.16b, #8
+        ext             v3.16b, v3.16b, v3.16b, #8
         ld1             {v5.8h}, [x0]
-        ext             v0.16b, v0.16b, v0.16b, #8
+        ext             v2.16b, v2.16b, v2.16b, #8
         add             v5.8h, v5.8h, \b
         st1             {v5.8h}, [x0], #16
         ld1             {v6.8h}, [x0]
-        sub             v6.8h, v6.8h, v1.8h
+        sub             v6.8h, v6.8h, v3.8h
         st1             {v6.8h}, [x0], #16
         ld1             {v7.8h}, [x0]
-        sub             v7.8h, v7.8h, v0.8h
+        sub             v7.8h, v7.8h, v2.8h
         st1             {v7.8h}, [x0], #16
 .endm
 
@@ -1376,12 +1360,8 @@ endfunc
 // x2 = src (temp buffer)
 // x7 = negative double temp buffer stride
 // x9 = double temp buffer stride
-// x10 = idct_coeffs
-// x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass2\suffix\()_neon
         mov             x14, x30
-        ld1             {v0.8h,v1.8h}, [x10]
-
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1454,15 +1434,15 @@ function idct32_1d_8x32_pass2\suffix\()_neon
         sub             v6.8h, v6.8h, \c
         sub             v7.8h, v7.8h, \d
 .endif
-        ld1             {v0.8b}, [x0], x1
-        ld1             {v1.8b}, [x0], x1
+        ld1             {v10.8b}, [x0], x1
+        ld1             {v11.8b}, [x0], x1
         srshr           v4.8h, v4.8h, #6
         ld1             {v2.8b}, [x0], x1
         srshr           v5.8h, v5.8h, #6
-        uaddw           v4.8h, v4.8h, v0.8b
+        uaddw           v4.8h, v4.8h, v10.8b
         ld1             {v3.8b}, [x0], x1
         srshr           v6.8h, v6.8h, #6
-        uaddw           v5.8h, v5.8h, v1.8b
+        uaddw           v5.8h, v5.8h, v11.8b
         srshr           v7.8h, v7.8h, #6
         sub             x0, x0, x1, lsl #2
         uaddw           v6.8h, v6.8h, v2.8b
@@ -1503,13 +1483,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
         b.eq            idct32x32_dc_add_neon
 
         movrel          x10, idct_coeffs
-        add             x11, x10, #32
         movrel          x12, min_eob_idct_idct_32, 2
 
         mov             x15, x30
 
-        stp             d14, d15, [sp, #-0x10]!
-        stp             d12, d13, [sp, #-0x10]!
         stp             d10, d11, [sp, #-0x10]!
         stp             d8, d9, [sp, #-0x10]!
 
@@ -1523,6 +1500,9 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
         mov             x9, #128
         neg             x7, x9
 
+        ld1             {v0.8h,v1.8h}, [x10], #32
+        ld1             {v8.8h,v9.8h}, [x10]
+
         cmp             w3, #34
         b.le            idct32x32_quarter_add_neon
         cmp             w3, #135
@@ -1565,8 +1545,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
 
         ldp             d8, d9, [sp], 0x10
         ldp             d10, d11, [sp], 0x10
-        ldp             d12, d13, [sp], 0x10
-        ldp             d14, d15, [sp], 0x10
 
         br              x15
 endfunc
@@ -1592,8 +1570,6 @@ function idct32x32_\size\()_add_neon
 
         ldp             d8, d9, [sp], 0x10
         ldp             d10, d11, [sp], 0x10
-        ldp             d12, d13, [sp], 0x10
-        ldp             d14, d15, [sp], 0x10
 
         br              x15
 endfunc
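Note (not part of the patch itself): the idct32 coefficients are now loaded once in ff_vp9_idct_idct_32x32_add_neon, into v0-v1 and v8-v9, and kept there for both passes instead of being reloaded through x10/x11 inside idct32_odd, idct32_odd_half, idct32_odd_quarter and the pass functions; pass2 additionally uses v10-v11 as scratch for the destination rows. Under AAPCS64 only d8-d15 (the low 64 bits of v8-v15) must be preserved by the callee, so with v8-v11 now in use the d8-d11 spills stay, while the d12-d15 spill pairs are dropped from the prologue and epilogue. A minimal sketch of that save/restore pattern, assuming a simple function that clobbers only v8-v11 and returns with ret (register choice and structure are illustrative, not taken from the patch):

        // Sketch: spill only the callee-saved low halves (d8-d11) of the
        // vector registers this function actually overwrites, then restore
        // them in reverse order before returning.
        stp             d8, d9, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        // ... body clobbering v8-v11 ...
        ldp             d10, d11, [sp], 0x10
        ldp             d8, d9, [sp], 0x10
        ret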