diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index 186208b10a..451fd8af24 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -58,6 +58,24 @@
   .endif
 .endm
 
+// 6-tap filter across 8-byte vectors r0-r6, two outputs d0 and d1; clobbers v0-v2 and v4 (v3 is not written); reads v6.H[0] and v6.H[1] as multipliers (presumably 5 and 20, set up by the caller - TODO confirm)
+.macro  lowpass_8_v     r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1,  narrow=1
+        uaddl           v2.8H,      \r2\().8B, \r3\().8B  // v2 = r2 + r3, widened to 16 bit (centre pair for d0)
+        uaddl           v0.8H,      \r3\().8B, \r4\().8B  // v0 = r3 + r4 (centre pair for d1)
+        uaddl           v4.8H,      \r1\().8B, \r4\().8B  // v4 = r1 + r4 (inner pair for d0)
+        uaddl           v1.8H,      \r2\().8B, \r5\().8B  // v1 = r2 + r5 (inner pair for d1)
+        uaddl           \d0\().8H,  \r0\().8B, \r5\().8B  // d0 = r0 + r5 (outer pair); callers pass d0 == r0, which is not read after this point
+        uaddl           \d1\().8H,  \r1\().8B, \r6\().8B  // d1 = r1 + r6 (outer pair); callers pass d1 == r1, whose last read is here
+        mla             \d0\().8H,  v2.8H,     v6.H[1]  // d0 += v2 * v6.H[1]
+        mls             \d0\().8H,  v4.8H,     v6.H[0]  // d0 -= v4 * v6.H[0]
+        mla             \d1\().8H,  v0.8H,     v6.H[1]  // d1 += v0 * v6.H[1]
+        mls             \d1\().8H,  v1.8H,     v6.H[0]  // d1 -= v1 * v6.H[0]
+  .if \narrow
+        sqrshrun        \d0\().8B,  \d0\().8H, #5  // rounding shift right by 5, saturated to unsigned 8 bit; with narrow=0 the 16-bit sums are left in d0
+        sqrshrun        \d1\().8B,  \d1\().8H, #5  // same narrowing for d1
+  .endif
+.endm
+
 //trashes v0-v5, v7, v30-v31
 .macro  lowpass_8H      r0,  r1
         ext             v0.16B,     \r0\().16B, \r0\().16B, #2
@@ -100,18 +118,13 @@
 .endm
 
 // trashed v0-v7
-.macro  lowpass_8.16    r0,  r1,  r2
-        ext             v1.16B,     \r0\().16B, \r1\().16B, #4
-        ext             v0.16B,     \r0\().16B, \r1\().16B, #6
-        saddl           v5.4S,      v1.4H,      v0.4H
-        ext             v2.16B,     \r0\().16B, \r1\().16B, #2
-        saddl2          v1.4S,      v1.8H,      v0.8H
-        ext             v3.16B,     \r0\().16B, \r1\().16B, #8
-        saddl           v6.4S,      v2.4H,      v3.4H
-        ext             \r1\().16B, \r0\().16B, \r1\().16B, #10
-        saddl2          v2.4S,      v2.8H,      v3.8H
-        saddl           v0.4S,      \r0\().4H,  \r1\().4H
-        saddl2          v4.4S,      \r0\().8H,  \r1\().8H
+.macro  lowpass_8.16    r0,  r1,  r2,  r3,  r4,  r5
+        saddl           v5.4S,      \r2\().4H,  \r3\().4H
+        saddl2          v1.4S,      \r2\().8H,  \r3\().8H
+        saddl           v6.4S,      \r1\().4H,  \r4\().4H
+        saddl2          v2.4S,      \r1\().8H,  \r4\().8H
+        saddl           v0.4S,      \r0\().4H,  \r5\().4H
+        saddl2          v4.4S,      \r0\().8H,  \r5\().8H
 
         shl             v3.4S,  v5.4S,  #4
         shl             v5.4S,  v5.4S,  #2
@@ -134,7 +147,7 @@
         rshrn           v5.4H,  v5.4S,  #10
         rshrn2          v5.8H,  v1.4S,  #10
 
-        sqxtun          \r2\().8B,  v5.8H
+        sqxtun          \r0\().8B,  v5.8H
 .endm
 
 function put_h264_qpel16_h_lowpass_neon_packed
@@ -258,27 +271,23 @@ endfunc
 
 function \type\()_h264_qpel8_v_lowpass_neon
         ld1             {v16.8B}, [x1], x3
-        ld1             {v18.8B}, [x1], x3
-        ld1             {v20.8B}, [x1], x3
-        ld1             {v22.8B}, [x1], x3
-        ld1             {v24.8B}, [x1], x3
-        ld1             {v26.8B}, [x1], x3
-        ld1             {v28.8B}, [x1], x3
-        ld1             {v30.8B}, [x1], x3
         ld1             {v17.8B}, [x1], x3
+        ld1             {v18.8B}, [x1], x3
         ld1             {v19.8B}, [x1], x3
+        ld1             {v20.8B}, [x1], x3
         ld1             {v21.8B}, [x1], x3
+        ld1             {v22.8B}, [x1], x3
         ld1             {v23.8B}, [x1], x3
-        ld1             {v25.8B}, [x1]
-
-        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
-        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
-        lowpass_8       v16, v17, v18, v19, v16, v17
-        lowpass_8       v20, v21, v22, v23, v18, v19
-        lowpass_8       v24, v25, v26, v27, v20, v21
-        lowpass_8       v28, v29, v30, v31, v22, v23
-        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+        ld1             {v24.8B}, [x1], x3
+        ld1             {v25.8B}, [x1], x3
+        ld1             {v26.8B}, [x1], x3
+        ld1             {v27.8B}, [x1], x3
+        ld1             {v28.8B}, [x1]
 
+        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
+        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
+        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
+        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
   .ifc \type,avg
         ld1             {v24.8B},  [x0], x2
         ld1             {v25.8B}, [x0], x2
@@ -335,26 +344,23 @@ endfunc
 
 function \type\()_h264_qpel8_v_lowpass_l2_neon
         ld1             {v16.8B}, [x1], x3
-        ld1             {v18.8B}, [x1], x3
-        ld1             {v20.8B}, [x1], x3
-        ld1             {v22.8B}, [x1], x3
-        ld1             {v24.8B}, [x1], x3
-        ld1             {v26.8B}, [x1], x3
-        ld1             {v28.8B}, [x1], x3
-        ld1             {v30.8B}, [x1], x3
         ld1             {v17.8B}, [x1], x3
+        ld1             {v18.8B}, [x1], x3
         ld1             {v19.8B}, [x1], x3
+        ld1             {v20.8B}, [x1], x3
         ld1             {v21.8B}, [x1], x3
+        ld1             {v22.8B}, [x1], x3
         ld1             {v23.8B}, [x1], x3
-        ld1             {v25.8B}, [x1]
+        ld1             {v24.8B}, [x1], x3
+        ld1             {v25.8B}, [x1], x3
+        ld1             {v26.8B}, [x1], x3
+        ld1             {v27.8B}, [x1], x3
+        ld1             {v28.8B}, [x1]
 
-        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
-        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
-        lowpass_8       v16, v17, v18, v19, v16, v17
-        lowpass_8       v20, v21, v22, v23, v18, v19
-        lowpass_8       v24, v25, v26, v27, v20, v21
-        lowpass_8       v28, v29, v30, v31, v22, v23
-        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
+        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
+        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
+        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
 
         ld1             {v24.8B},  [x12], x2
         ld1             {v25.8B},  [x12], x2
@@ -432,22 +438,17 @@ function put_h264_qpel8_hv_lowpass_neon_top
         lowpass_8H      v26, v27
         lowpass_8H      v28, v29
 
-        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
-        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1
+        lowpass_8.16    v16, v17, v18, v19, v20, v21
+        lowpass_8.16    v17, v18, v19, v20, v21, v22
 
-        lowpass_8.16    v16, v24, v16
-        lowpass_8.16    v17, v25, v17
+        lowpass_8.16    v18, v19, v20, v21, v22, v23
+        lowpass_8.16    v19, v20, v21, v22, v23, v24
 
-        lowpass_8.16    v18, v26, v18
-        lowpass_8.16    v19, v27, v19
+        lowpass_8.16    v20, v21, v22, v23, v24, v25
+        lowpass_8.16    v21, v22, v23, v24, v25, v26
 
-        lowpass_8.16    v20, v28, v20
-        lowpass_8.16    v21, v29, v21
-
-        lowpass_8.16    v22, v30, v22
-        lowpass_8.16    v23, v31, v23
-
-        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+        lowpass_8.16    v22, v23, v24, v25, v26, v27
+        lowpass_8.16    v23, v24, v25, v26, v27, v28
 
         ret
 endfunc