aarch64: Make the indentation more consistent

Some functions have slightly different indentation styles; try to match the surrounding code.

libavcodec/aarch64/vc1dsp_neon.S is skipped here, as it intentionally uses a layered indentation style to visually show how different unrolled/interleaved phases fit together.

Signed-off-by: Martin Storsjö <martin@martin.st>
2025-01-24 13:56:33 +02:00 · 2023-10-17 13:47:27 +03:00 · 2023-10-17 13:47:27 +03:00 · 7f905f3672
commit 7f905f3672
parent 93cda5a9c2
7 changed files with 304 additions and 304 deletions
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -526,7 +526,7 @@ function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

-        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

@@ -554,7 +554,7 @@ h_loop_filter_chroma420_intra:
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4], x1

-        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

@@ -1017,7 +1017,7 @@ function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
        ld1             {v16.8h}, [x4], x1
        ld1             {v19.8h}, [x9], x1

-        transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
+        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra_10

@@ -1045,7 +1045,7 @@ h_loop_filter_chroma420_intra_10:
        ld1             {v19.4h},   [x4], x1
        ld1             {v19.d}[1], [x9], x1

-        transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
+        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra_10

--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -580,8 +580,8 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon
 endfunc
 .endm

-        h264_qpel16_hv put
-        h264_qpel16_hv avg
+        h264_qpel16_hv  put
+        h264_qpel16_hv  avg

 .macro  h264_qpel8      type
 function ff_\type\()_h264_qpel8_mc10_neon, export=1
@@ -759,8 +759,8 @@ function ff_\type\()_h264_qpel8_mc33_neon, export=1
 endfunc
 .endm

-        h264_qpel8 put
-        h264_qpel8 avg
+        h264_qpel8      put
+        h264_qpel8      avg

 .macro  h264_qpel16     type
 function ff_\type\()_h264_qpel16_mc10_neon, export=1
@@ -931,5 +931,5 @@ function ff_\type\()_h264_qpel16_mc33_neon, export=1
 endfunc
 .endm

-        h264_qpel16 put
-        h264_qpel16 avg
+        h264_qpel16     put
+        h264_qpel16     avg
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -239,23 +239,23 @@ function hevc_add_residual_32x32_16_neon, export=0
 endfunc

 .macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift
-         sshll          v20.4s, \in0, #6
-         sshll          v21.4s, \in0, #6
-         smull          v22.4s, \in1, v4.h[1]
-         smull          v23.4s, \in1, v4.h[3]
-         smlal          v20.4s, \in2, v4.h[0] //e0
-         smlsl          v21.4s, \in2, v4.h[0] //e1
-         smlal          v22.4s, \in3, v4.h[3] //o0
-         smlsl          v23.4s, \in3, v4.h[1] //o1
+        sshll           v20.4s, \in0, #6
+        sshll           v21.4s, \in0, #6
+        smull           v22.4s, \in1, v4.h[1]
+        smull           v23.4s, \in1, v4.h[3]
+        smlal           v20.4s, \in2, v4.h[0] //e0
+        smlsl           v21.4s, \in2, v4.h[0] //e1
+        smlal           v22.4s, \in3, v4.h[3] //o0
+        smlsl           v23.4s, \in3, v4.h[1] //o1

-         add            v24.4s, v20.4s, v22.4s
-         sub            v20.4s, v20.4s, v22.4s
-         add            v22.4s, v21.4s, v23.4s
-         sub            v21.4s, v21.4s, v23.4s
-         sqrshrn        \out0, v24.4s, #\shift
-         sqrshrn        \out3, v20.4s, #\shift
-         sqrshrn        \out1, v22.4s, #\shift
-         sqrshrn        \out2, v21.4s, #\shift
+        add             v24.4s, v20.4s, v22.4s
+        sub             v20.4s, v20.4s, v22.4s
+        add             v22.4s, v21.4s, v23.4s
+        sub             v21.4s, v21.4s, v23.4s
+        sqrshrn         \out0, v24.4s, #\shift
+        sqrshrn         \out3, v20.4s, #\shift
+        sqrshrn         \out1, v22.4s, #\shift
+        sqrshrn         \out2, v21.4s, #\shift
 .endm

 .macro idct_4x4 bitdepth
@@ -294,19 +294,19 @@ endfunc

 // uses and clobbers v28-v31 as temp registers
 .macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
-         sshll\p1       v28.4s, \in0, #6
-         mov            v29.16b, v28.16b
-         smull\p1       v30.4s, \in1, v0.h[1]
-         smull\p1       v31.4s, \in1, v0.h[3]
-         smlal\p2       v28.4s, \in2, v0.h[0] //e0
-         smlsl\p2       v29.4s, \in2, v0.h[0] //e1
-         smlal\p2       v30.4s, \in3, v0.h[3] //o0
-         smlsl\p2       v31.4s, \in3, v0.h[1] //o1
+        sshll\p1        v28.4s, \in0, #6
+        mov             v29.16b, v28.16b
+        smull\p1        v30.4s, \in1, v0.h[1]
+        smull\p1        v31.4s, \in1, v0.h[3]
+        smlal\p2        v28.4s, \in2, v0.h[0] //e0
+        smlsl\p2        v29.4s, \in2, v0.h[0] //e1
+        smlal\p2        v30.4s, \in3, v0.h[3] //o0
+        smlsl\p2        v31.4s, \in3, v0.h[1] //o1

-         add            \out0, v28.4s, v30.4s
-         add            \out1, v29.4s, v31.4s
-         sub            \out2, v29.4s, v31.4s
-         sub            \out3, v28.4s, v30.4s
+        add             \out0, v28.4s, v30.4s
+        add             \out1, v29.4s, v31.4s
+        sub             \out2, v29.4s, v31.4s
+        sub             \out3, v28.4s, v30.4s
 .endm

 .macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
@@ -362,11 +362,11 @@ endfunc
 .macro idct_8x8 bitdepth
 function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
 //x0 - coeffs
-        mov              x1,  x0
+        mov             x1,  x0
        ld1             {v16.8h-v19.8h}, [x1], #64
        ld1             {v20.8h-v23.8h}, [x1]

-        movrel           x1, trans
+        movrel          x1, trans
        ld1             {v0.8h}, [x1]

        tr_8x4          7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
@@ -379,7 +379,7 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1

        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23

-        mov              x1,  x0
+        mov             x1,  x0
        st1             {v16.8h-v19.8h}, [x1], #64
        st1             {v20.8h-v23.8h}, [x1]

@@ -388,8 +388,8 @@ endfunc
 .endm

 .macro butterfly e, o, tmp_p, tmp_m
-        add        \tmp_p, \e, \o
-        sub        \tmp_m, \e, \o
+        add             \tmp_p, \e, \o
+        sub             \tmp_m, \e, \o
 .endm

 .macro tr16_8x4 in0, in1, in2, in3, offset
@@ -418,7 +418,7 @@ endfunc
        butterfly       v25.4s, v29.4s, v17.4s, v22.4s
        butterfly       v26.4s, v30.4s, v18.4s, v21.4s
        butterfly       v27.4s, v31.4s, v19.4s, v20.4s
-        add              x4,  sp,  #\offset
+        add             x4,  sp,  #\offset
        st1             {v16.4s-v19.4s}, [x4], #64
        st1             {v20.4s-v23.4s}, [x4]
 .endm
@@ -435,14 +435,14 @@ endfunc
 .endm

 .macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
-        sum_sub v21.4s, \in, \t0, \op0, \p
-        sum_sub v22.4s, \in, \t1, \op1, \p
-        sum_sub v23.4s, \in, \t2, \op2, \p
-        sum_sub v24.4s, \in, \t3, \op3, \p
-        sum_sub v25.4s, \in, \t4, \op4, \p
-        sum_sub v26.4s, \in, \t5, \op5, \p
-        sum_sub v27.4s, \in, \t6, \op6, \p
-        sum_sub v28.4s, \in, \t7, \op7, \p
+        sum_sub         v21.4s, \in, \t0, \op0, \p
+        sum_sub         v22.4s, \in, \t1, \op1, \p
+        sum_sub         v23.4s, \in, \t2, \op2, \p
+        sum_sub         v24.4s, \in, \t3, \op3, \p
+        sum_sub         v25.4s, \in, \t4, \op4, \p
+        sum_sub         v26.4s, \in, \t5, \op5, \p
+        sum_sub         v27.4s, \in, \t6, \op6, \p
+        sum_sub         v28.4s, \in, \t7, \op7, \p
 .endm

 .macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
@@ -528,20 +528,20 @@ endfunc

 .macro tr_16x4 name, shift, offset, step
 function func_tr_16x4_\name
-        mov              x1,  x5
-        add              x3,  x5, #(\step * 64)
-        mov              x2,  #(\step * 128)
+        mov             x1,  x5
+        add             x3,  x5, #(\step * 64)
+        mov             x2,  #(\step * 128)
        load16          v16.d, v17.d, v18.d, v19.d
-        movrel           x1,  trans
+        movrel          x1,  trans
        ld1             {v0.8h}, [x1]

        tr16_8x4        v16, v17, v18, v19, \offset

-        add              x1,  x5, #(\step * 32)
-        add              x3,  x5, #(\step * 3 *32)
-        mov              x2,  #(\step * 128)
+        add             x1,  x5, #(\step * 32)
+        add             x3,  x5, #(\step * 3 *32)
+        mov             x2,  #(\step * 128)
        load16          v20.d, v17.d, v18.d, v19.d
-        movrel           x1, trans, 16
+        movrel          x1, trans, 16
        ld1             {v1.8h}, [x1]
        smull           v21.4s, v20.4h, v1.h[0]
        smull           v22.4s, v20.4h, v1.h[1]
@@ -560,19 +560,19 @@ function func_tr_16x4_\name
        add_member      v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
        add_member      v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2

-        add              x4, sp, #\offset
+        add             x4, sp, #\offset
        ld1             {v16.4s-v19.4s}, [x4], #64
        butterfly16     v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
    .if \shift > 0
        scale           v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v24, v2, v3, v4, v5, v6, v7
-        mov              x1,  x6
-        add              x3,  x6, #(24 +3*32)
-        mov              x2, #32
-        mov              x4, #-32
+        mov             x1,  x6
+        add             x3,  x6, #(24 +3*32)
+        mov             x2, #32
+        mov             x4, #-32
        store16         v29.d, v30.d, v31.d, v24.d, x4
    .else
-       store_to_stack  \offset, (\offset + 240), v20.4s, v21.4s, v22.4s, v23.4s, v19.4s, v18.4s, v17.4s, v16.4s
+        store_to_stack  \offset, (\offset + 240), v20.4s, v21.4s, v22.4s, v23.4s, v19.4s, v18.4s, v17.4s, v16.4s
    .endif

        add             x4, sp, #(\offset + 64)
@@ -582,13 +582,13 @@ function func_tr_16x4_\name
        scale           v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v20, v2, v3, v4, v5, v6, v7

-        add              x1,  x6, #8
-        add              x3,  x6, #(16 + 3 * 32)
-        mov              x2, #32
-        mov              x4, #-32
+        add             x1,  x6, #8
+        add             x3,  x6, #(16 + 3 * 32)
+        mov             x2, #32
+        mov             x4, #-32
        store16         v29.d, v30.d, v31.d, v20.d, x4
   .else
-       store_to_stack (\offset + 64), (\offset + 176), v20.4s, v25.4s, v26.4s, v27.4s, v19.4s, v18.4s, v17.4s, v16.4s
+        store_to_stack  (\offset + 64), (\offset + 176), v20.4s, v25.4s, v26.4s, v27.4s, v19.4s, v18.4s, v17.4s, v16.4s
   .endif

        ret
@@ -601,21 +601,21 @@ function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
        mov             x15, x30

        // allocate a temp buffer
-        sub              sp,  sp,  #640
+        sub             sp,  sp,  #640

 .irp i, 0, 1, 2, 3
-        add              x5,  x0, #(8 * \i)
-        add              x6,  sp, #(8 * \i * 16)
+        add             x5,  x0, #(8 * \i)
+        add             x6,  sp, #(8 * \i * 16)
        bl              func_tr_16x4_firstpass
 .endr

 .irp i, 0, 1, 2, 3
-        add              x5,  sp, #(8 * \i)
-        add              x6,  x0, #(8 * \i * 16)
+        add             x5,  sp, #(8 * \i)
+        add             x6,  x0, #(8 * \i * 16)
        bl              func_tr_16x4_secondpass_\bitdepth
 .endr

-        add              sp,  sp,  #640
+        add             sp,  sp,  #640

        ret             x15
 endfunc
@@ -644,10 +644,10 @@ endfunc
 .endm

 .macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
-        sum_sub v24.4s, \in, \t0, \op0, \p
-        sum_sub v25.4s, \in, \t1, \op1, \p
-        sum_sub v26.4s, \in, \t2, \op2, \p
-        sum_sub v27.4s, \in, \t3, \op3, \p
+        sum_sub         v24.4s, \in, \t0, \op0, \p
+        sum_sub         v25.4s, \in, \t1, \op1, \p
+        sum_sub         v26.4s, \in, \t2, \op2, \p
+        sum_sub         v27.4s, \in, \t3, \op3, \p
 .endm

 .macro butterfly32 in0, in1, in2, in3, out
@@ -841,85 +841,85 @@ idct_32x32 8
 idct_32x32 10

 .macro tr4_luma_shift r0, r1, r2, r3, shift
-        saddl       v0.4s, \r0, \r2         // c0 = src0 + src2
-        saddl       v1.4s, \r2, \r3         // c1 = src2 + src3
-        ssubl       v2.4s, \r0, \r3         // c2 = src0 - src3
-        smull       v3.4s, \r1, v21.4h      // c3 = 74 * src1
+        saddl           v0.4s, \r0, \r2         // c0 = src0 + src2
+        saddl           v1.4s, \r2, \r3         // c1 = src2 + src3
+        ssubl           v2.4s, \r0, \r3         // c2 = src0 - src3
+        smull           v3.4s, \r1, v21.4h      // c3 = 74 * src1

-        saddl       v7.4s, \r0, \r3         // src0 + src3
-        ssubw       v7.4s, v7.4s, \r2       // src0 - src2 + src3
-        mul         v7.4s, v7.4s, v18.4s    // dst2 = 74 * (src0 - src2 + src3)
+        saddl           v7.4s, \r0, \r3         // src0 + src3
+        ssubw           v7.4s, v7.4s, \r2       // src0 - src2 + src3
+        mul             v7.4s, v7.4s, v18.4s    // dst2 = 74 * (src0 - src2 + src3)

-        mul         v5.4s, v0.4s, v19.4s    // 29 * c0
-        mul         v6.4s, v1.4s, v20.4s    // 55 * c1
-        add         v5.4s, v5.4s, v6.4s     // 29 * c0 + 55 * c1
-        add         v5.4s, v5.4s, v3.4s     // dst0 = 29 * c0 + 55 * c1 + c3
+        mul             v5.4s, v0.4s, v19.4s    // 29 * c0
+        mul             v6.4s, v1.4s, v20.4s    // 55 * c1
+        add             v5.4s, v5.4s, v6.4s     // 29 * c0 + 55 * c1
+        add             v5.4s, v5.4s, v3.4s     // dst0 = 29 * c0 + 55 * c1 + c3

-        mul         v1.4s, v1.4s, v19.4s    // 29 * c1
-        mul         v6.4s, v2.4s, v20.4s    // 55 * c2
-        sub         v6.4s, v6.4s, v1.4s     // 55 * c2 - 29 * c1
-        add         v6.4s, v6.4s, v3.4s     // dst1 = 55 * c2 - 29 * c1 + c3
+        mul             v1.4s, v1.4s, v19.4s    // 29 * c1
+        mul             v6.4s, v2.4s, v20.4s    // 55 * c2
+        sub             v6.4s, v6.4s, v1.4s     // 55 * c2 - 29 * c1
+        add             v6.4s, v6.4s, v3.4s     // dst1 = 55 * c2 - 29 * c1 + c3

-        mul         v0.4s, v0.4s, v20.4s    // 55 * c0
-        mul         v2.4s, v2.4s, v19.4s    // 29 * c2
-        add         v0.4s, v0.4s, v2.4s     // 55 * c0 + 29 * c2
-        sub         v0.4s, v0.4s, v3.4s     // dst3 = 55 * c0 + 29 * c2 - c3
+        mul             v0.4s, v0.4s, v20.4s    // 55 * c0
+        mul             v2.4s, v2.4s, v19.4s    // 29 * c2
+        add             v0.4s, v0.4s, v2.4s     // 55 * c0 + 29 * c2
+        sub             v0.4s, v0.4s, v3.4s     // dst3 = 55 * c0 + 29 * c2 - c3

-        sqrshrn     \r0, v5.4s, \shift
-        sqrshrn     \r1, v6.4s, \shift
-        sqrshrn     \r2, v7.4s, \shift
-        sqrshrn     \r3, v0.4s, \shift
+        sqrshrn         \r0, v5.4s, \shift
+        sqrshrn         \r1, v6.4s, \shift
+        sqrshrn         \r2, v7.4s, \shift
+        sqrshrn         \r3, v0.4s, \shift
 .endm

 function ff_hevc_transform_luma_4x4_neon_8, export=1
-        ld1            {v28.4h-v31.4h}, [x0]
-        movi           v18.4s, #74
-        movi           v19.4s, #29
-        movi           v20.4s, #55
-        movi           v21.4h, #74
+        ld1             {v28.4h-v31.4h}, [x0]
+        movi            v18.4s, #74
+        movi            v19.4s, #29
+        movi            v20.4s, #55
+        movi            v21.4h, #74

-        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
-        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+        tr4_luma_shift  v28.4h, v29.4h, v30.4h, v31.4h, #7
+        transpose_4x4H  v28, v29, v30, v31, v22, v23, v24, v25

-        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
-        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+        tr4_luma_shift  v28.4h, v29.4h, v30.4h, v31.4h, #12
+        transpose_4x4H  v28, v29, v30, v31, v22, v23, v24, v25

-        st1            {v28.4h-v31.4h}, [x0]
+        st1             {v28.4h-v31.4h}, [x0]
        ret
 endfunc

 // void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
 .macro idct_dc size, bitdepth
 function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
-        ld1r         {v4.8h}, [x0]
-        srshr         v4.8h,  v4.8h,  #1
-        srshr         v0.8h,  v4.8h,  #(14 - \bitdepth)
-        srshr         v1.8h,  v4.8h,  #(14 - \bitdepth)
+        ld1r            {v4.8h}, [x0]
+        srshr           v4.8h,  v4.8h,  #1
+        srshr           v0.8h,  v4.8h,  #(14 - \bitdepth)
+        srshr           v1.8h,  v4.8h,  #(14 - \bitdepth)
 .if \size > 4
-        srshr         v2.8h,  v4.8h,  #(14 - \bitdepth)
-        srshr         v3.8h,  v4.8h,  #(14 - \bitdepth)
+        srshr           v2.8h,  v4.8h,  #(14 - \bitdepth)
+        srshr           v3.8h,  v4.8h,  #(14 - \bitdepth)
 .if \size > 16 /* dc 32x32 */
-        mov              x2,  #4
+        mov             x2,  #4
 1:
-        subs             x2,  x2, #1
+        subs            x2,  x2, #1
 .endif
        add             x12,  x0, #64
        mov             x13,  #128
 .if \size > 8 /* dc 16x16 */
-        st1            {v0.8h-v3.8h},  [x0], x13
-        st1            {v0.8h-v3.8h}, [x12], x13
-        st1            {v0.8h-v3.8h},  [x0], x13
-        st1            {v0.8h-v3.8h}, [x12], x13
-        st1            {v0.8h-v3.8h},  [x0], x13
-        st1            {v0.8h-v3.8h}, [x12], x13
+        st1             {v0.8h-v3.8h},  [x0], x13
+        st1             {v0.8h-v3.8h}, [x12], x13
+        st1             {v0.8h-v3.8h},  [x0], x13
+        st1             {v0.8h-v3.8h}, [x12], x13
+        st1             {v0.8h-v3.8h},  [x0], x13
+        st1             {v0.8h-v3.8h}, [x12], x13
 .endif /* dc 8x8 */
-        st1            {v0.8h-v3.8h},  [x0], x13
-        st1            {v0.8h-v3.8h}, [x12], x13
+        st1             {v0.8h-v3.8h},  [x0], x13
+        st1             {v0.8h-v3.8h}, [x12], x13
 .if \size > 16 /* dc 32x32 */
        bne             1b
 .endif
 .else /* dc 4x4 */
-        st1            {v0.8h-v1.8h},  [x0]
+        st1             {v0.8h-v1.8h},  [x0]
 .endif
        ret
 endfunc
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -840,19 +840,19 @@ function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
 endfunc

 function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
-        b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
+        b               X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
 endfunc

 function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
-        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
 endfunc

 function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
-        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
 endfunc

 function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
-        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
 endfunc

 function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
@@ -1560,21 +1560,21 @@ endfunc
 #if HAVE_I8MM

 .macro calc_all2
-        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+        calc            v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
        b.eq            2f
-        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+        calc            v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
        b.eq            2f
-        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+        calc            v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
        b.eq            2f
-        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+        calc            v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
        b.eq            2f
-        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+        calc            v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
        b.eq            2f
-        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+        calc            v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
        b.eq            2f
-        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+        calc            v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
        b.eq            2f
-        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+        calc            v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
        b.hi            1b
 .endm

--- a/libavcodec/aarch64/opusdsp_neon.S
+++ b/libavcodec/aarch64/opusdsp_neon.S
@@ -34,13 +34,13 @@ endconst

 function ff_opus_deemphasis_neon, export=1
        movrel  x4, tab_st
-        ld1    {v4.4s}, [x4]
+        ld1     {v4.4s}, [x4]
        movrel  x4, tab_x0
-        ld1    {v5.4s}, [x4]
+        ld1     {v5.4s}, [x4]
        movrel  x4, tab_x1
-        ld1    {v6.4s}, [x4]
+        ld1     {v6.4s}, [x4]
        movrel  x4, tab_x2
-        ld1    {v7.4s}, [x4]
+        ld1     {v7.4s}, [x4]

        fmul v0.4s, v4.4s, v0.s[0]

--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -330,32 +330,32 @@ endfunc
        //   v17: hev

        // convert to signed value:
-        eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
-        eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80
+        eor             v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
+        eor             v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80

-        movi           v20.8h, #3
-        ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
-        ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
-        eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
-        eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
-        mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
-        mul            v19.8h, v19.8h, v20.8h
+        movi            v20.8h, #3
+        ssubl           v18.8h, v4.8b,  v3.8b             // QS0 - PS0
+        ssubl2          v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
+        eor             v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
+        eor             v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
+        mul             v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
+        mul             v19.8h, v19.8h, v20.8h

-        sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
-        movi           v22.16b, #4
-        movi           v23.16b, #3
+        sqsub           v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
+        movi            v22.16b, #4
+        movi            v23.16b, #3
    .if \inner
-        and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
+        and             v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
    .endif
-        saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
-        saddw2         v19.8h,  v19.8h, v20.16b
-        sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
-        sqxtn2         v18.16b, v19.8h
+        saddw           v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
+        saddw2          v19.8h,  v19.8h, v20.16b
+        sqxtn           v18.8b,  v18.8h                   // narrow result back into v18
+        sqxtn2          v18.16b, v19.8h
    .if !\inner && !\simple
-        eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
-        eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
+        eor             v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
+        eor             v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
    .endif
-        and            v18.16b, v18.16b, v16.16b         // w &= normal_limit
+        and             v18.16b, v18.16b, v16.16b         // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
@@ -375,44 +375,44 @@ endfunc
        //   P0 = s2u(PS0 + c2);

    .if \simple
-        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
-        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
-        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
-        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
-        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
-        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
-        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
-        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
-        bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
-        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
-        srshr          v19.16b, v19.16b, #1                // c3 >>= 1
-        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
-        sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
-        sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
-        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
-        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        bic             v19.16b, v19.16b, v17.16b           // c1 & ~hev
+        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        srshr           v19.16b, v19.16b, #1                // c3 >>= 1
+        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        sqsub           v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
+        sqadd           v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
+        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .else
-        and            v20.16b, v18.16b, v17.16b           // w & hev
-        sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
-        bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
-        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        and             v20.16b, v18.16b, v17.16b           // w & hev
+        sqadd           v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd           v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
+        bic             v18.16b, v18.16b, v17.16b           // w &= ~hev
+        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
@ -424,35 +424,35 @@ endfunc
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
-        movi           v17.8h,  #63
-        sshll          v22.8h,  v18.8b, #3
-        sshll2         v23.8h,  v18.16b, #3
-        saddw          v22.8h,  v22.8h, v18.8b
-        saddw2         v23.8h,  v23.8h, v18.16b
-        add            v16.8h,  v17.8h, v22.8h
-        add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
-        add            v19.8h,  v16.8h, v22.8h
-        add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
-        add            v22.8h,  v19.8h, v22.8h
-        add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
-        sqshrn         v16.8b,  v16.8h,  #7
-        sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
-        sqshrn         v19.8b,  v19.8h, #7
-        sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
-        sqshrn         v22.8b,  v22.8h, #7
-        sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
-        sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
-        sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
-        sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
-        sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
-        sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
-        sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
-        eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
-        eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
-        eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
-        eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
-        eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
-        eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
+        movi            v17.8h,  #63
+        sshll           v22.8h,  v18.8b, #3
+        sshll2          v23.8h,  v18.16b, #3
+        saddw           v22.8h,  v22.8h, v18.8b
+        saddw2          v23.8h,  v23.8h, v18.16b
+        add             v16.8h,  v17.8h, v22.8h
+        add             v17.8h,  v17.8h, v23.8h           //  9*w + 63
+        add             v19.8h,  v16.8h, v22.8h
+        add             v20.8h,  v17.8h, v23.8h           // 18*w + 63
+        add             v22.8h,  v19.8h, v22.8h
+        add             v23.8h,  v20.8h, v23.8h           // 27*w + 63
+        sqshrn          v16.8b,  v16.8h,  #7
+        sqshrn2         v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
+        sqshrn          v19.8b,  v19.8h, #7
+        sqshrn2         v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
+        sqshrn          v22.8b,  v22.8h, #7
+        sqshrn2         v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
+        sqadd           v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
+        sqsub           v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
+        sqadd           v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
+        sqsub           v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
+        sqadd           v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
+        sqsub           v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
+        eor             v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
+        eor             v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
+        eor             v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
+        eor             v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
+        eor             v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
+        eor             v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
    .endif
 .endm

@ -507,48 +507,48 @@ function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
-        ld1          {v0.d}[0],     [x0], x2  // P3
-        ld1          {v0.d}[1],     [x1], x2  // P3
-        ld1          {v1.d}[0],     [x0], x2  // P2
-        ld1          {v1.d}[1],     [x1], x2  // P2
-        ld1          {v2.d}[0],     [x0], x2  // P1
-        ld1          {v2.d}[1],     [x1], x2  // P1
-        ld1          {v3.d}[0],     [x0], x2  // P0
-        ld1          {v3.d}[1],     [x1], x2  // P0
-        ld1          {v4.d}[0],     [x0], x2  // Q0
-        ld1          {v4.d}[1],     [x1], x2  // Q0
-        ld1          {v5.d}[0],     [x0], x2  // Q1
-        ld1          {v5.d}[1],     [x1], x2  // Q1
-        ld1          {v6.d}[0],     [x0], x2  // Q2
-        ld1          {v6.d}[1],     [x1], x2  // Q2
-        ld1          {v7.d}[0],     [x0]      // Q3
-        ld1          {v7.d}[1],     [x1]      // Q3
+        ld1             {v0.d}[0],     [x0], x2  // P3
+        ld1             {v0.d}[1],     [x1], x2  // P3
+        ld1             {v1.d}[0],     [x0], x2  // P2
+        ld1             {v1.d}[1],     [x1], x2  // P2
+        ld1             {v2.d}[0],     [x0], x2  // P1
+        ld1             {v2.d}[1],     [x1], x2  // P1
+        ld1             {v3.d}[0],     [x0], x2  // P0
+        ld1             {v3.d}[1],     [x1], x2  // P0
+        ld1             {v4.d}[0],     [x0], x2  // Q0
+        ld1             {v4.d}[1],     [x1], x2  // Q0
+        ld1             {v5.d}[0],     [x0], x2  // Q1
+        ld1             {v5.d}[1],     [x1], x2  // Q1
+        ld1             {v6.d}[0],     [x0], x2  // Q2
+        ld1             {v6.d}[1],     [x1], x2  // Q2
+        ld1             {v7.d}[0],     [x0]      // Q3
+        ld1             {v7.d}[1],     [x1]      // Q3

-        dup          v22.16b, w3                 // flim_E
-        dup          v23.16b, w4                 // flim_I
+        dup             v22.16b, w3                 // flim_E
+        dup             v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
-        sub          x0,  x0,  x2,  lsl #2
-        sub          x1,  x1,  x2,  lsl #2
-        sub          x0,  x0,  x2,  lsl #1
-        sub          x1,  x1,  x2,  lsl #1
+        sub             x0,  x0,  x2,  lsl #2
+        sub             x1,  x1,  x2,  lsl #2
+        sub             x0,  x0,  x2,  lsl #1
+        sub             x1,  x1,  x2,  lsl #1

        // Store pixels:

-        st1          {v1.d}[0],     [x0], x2  // P2
-        st1          {v1.d}[1],     [x1], x2  // P2
-        st1          {v2.d}[0],     [x0], x2  // P1
-        st1          {v2.d}[1],     [x1], x2  // P1
-        st1          {v3.d}[0],     [x0], x2  // P0
-        st1          {v3.d}[1],     [x1], x2  // P0
-        st1          {v4.d}[0],     [x0], x2  // Q0
-        st1          {v4.d}[1],     [x1], x2  // Q0
-        st1          {v5.d}[0],     [x0], x2  // Q1
-        st1          {v5.d}[1],     [x1], x2  // Q1
-        st1          {v6.d}[0],     [x0]      // Q2
-        st1          {v6.d}[1],     [x1]      // Q2
+        st1             {v1.d}[0],     [x0], x2  // P2
+        st1             {v1.d}[1],     [x1], x2  // P2
+        st1             {v2.d}[0],     [x0], x2  // P1
+        st1             {v2.d}[1],     [x1], x2  // P1
+        st1             {v3.d}[0],     [x0], x2  // P0
+        st1             {v3.d}[1],     [x1], x2  // P0
+        st1             {v4.d}[0],     [x0], x2  // Q0
+        st1             {v4.d}[1],     [x1], x2  // Q0
+        st1             {v5.d}[0],     [x0], x2  // Q1
+        st1             {v5.d}[1],     [x1], x2  // Q1
+        st1             {v6.d}[0],     [x0]      // Q2
+        st1             {v6.d}[1],     [x1]      // Q2

        ret
 endfunc
@ -579,7 +579,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w2                 // flim_E
    .if !\simple
@ -590,7 +590,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0,  x0,  x1, lsl #4    // backup 16 rows

-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
@ -624,24 +624,24 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x1,  x1,  #4

        // Load pixels:
-        ld1          {v0.d}[0],     [x0], x2 // load u
-        ld1          {v0.d}[1],     [x1], x2 // load v
-        ld1          {v1.d}[0],     [x0], x2
-        ld1          {v1.d}[1],     [x1], x2
-        ld1          {v2.d}[0],     [x0], x2
-        ld1          {v2.d}[1],     [x1], x2
-        ld1          {v3.d}[0],     [x0], x2
-        ld1          {v3.d}[1],     [x1], x2
-        ld1          {v4.d}[0],     [x0], x2
-        ld1          {v4.d}[1],     [x1], x2
-        ld1          {v5.d}[0],     [x0], x2
-        ld1          {v5.d}[1],     [x1], x2
-        ld1          {v6.d}[0],     [x0], x2
-        ld1          {v6.d}[1],     [x1], x2
-        ld1          {v7.d}[0],     [x0], x2
-        ld1          {v7.d}[1],     [x1], x2
+        ld1             {v0.d}[0],     [x0], x2 // load u
+        ld1             {v0.d}[1],     [x1], x2 // load v
+        ld1             {v1.d}[0],     [x0], x2
+        ld1             {v1.d}[1],     [x1], x2
+        ld1             {v2.d}[0],     [x0], x2
+        ld1             {v2.d}[1],     [x1], x2
+        ld1             {v3.d}[0],     [x0], x2
+        ld1             {v3.d}[1],     [x1], x2
+        ld1             {v4.d}[0],     [x0], x2
+        ld1             {v4.d}[1],     [x1], x2
+        ld1             {v5.d}[0],     [x0], x2
+        ld1             {v5.d}[1],     [x1], x2
+        ld1             {v6.d}[0],     [x0], x2
+        ld1             {v6.d}[1],     [x1], x2
+        ld1             {v7.d}[0],     [x0], x2
+        ld1             {v7.d}[1],     [x1], x2

-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w3                 // flim_E
        dup             v23.16b, w4                 // flim_I
@ -651,25 +651,25 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
        sub             x1,  x1,  x2, lsl #3    // backup v 8 rows

-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
-        st1          {v0.d}[0],     [x0], x2 // load u
-        st1          {v0.d}[1],     [x1], x2 // load v
-        st1          {v1.d}[0],     [x0], x2
-        st1          {v1.d}[1],     [x1], x2
-        st1          {v2.d}[0],     [x0], x2
-        st1          {v2.d}[1],     [x1], x2
-        st1          {v3.d}[0],     [x0], x2
-        st1          {v3.d}[1],     [x1], x2
-        st1          {v4.d}[0],     [x0], x2
-        st1          {v4.d}[1],     [x1], x2
-        st1          {v5.d}[0],     [x0], x2
-        st1          {v5.d}[1],     [x1], x2
-        st1          {v6.d}[0],     [x0], x2
-        st1          {v6.d}[1],     [x1], x2
-        st1          {v7.d}[0],     [x0]
-        st1          {v7.d}[1],     [x1]
+        st1             {v0.d}[0],     [x0], x2 // store u
+        st1             {v0.d}[1],     [x1], x2 // store v
+        st1             {v1.d}[0],     [x0], x2
+        st1             {v1.d}[1],     [x1], x2
+        st1             {v2.d}[0],     [x0], x2
+        st1             {v2.d}[1],     [x1], x2
+        st1             {v3.d}[0],     [x0], x2
+        st1             {v3.d}[1],     [x1], x2
+        st1             {v4.d}[0],     [x0], x2
+        st1             {v4.d}[1],     [x1], x2
+        st1             {v5.d}[0],     [x0], x2
+        st1             {v5.d}[1],     [x1], x2
+        st1             {v6.d}[0],     [x0], x2
+        st1             {v6.d}[1],     [x1], x2
+        st1             {v7.d}[0],     [x0]
+        st1             {v7.d}[1],     [x1]

        ret

--- a/libavutil/aarch64/tx_float_neon.S
+++ b/libavutil/aarch64/tx_float_neon.S
@ -729,9 +729,9 @@ FFT16_FN ns_float, 1
 .endm

 .macro SR_COMBINE_4 len, part, off
-        add              x10, x1, x21
-        add              x11, x1, x21, lsl #1
-        add              x12, x1, x22
+        add             x10, x1, x21
+        add             x11, x1, x21, lsl #1
+        add             x12, x1, x22

        ldp              q0,  q1, [x1,  #((0 + \part)*32 + \off)]
        ldp              q4,  q5, [x1,  #((2 + \part)*32 + \off)]
@ -759,9 +759,9 @@ FFT16_FN ns_float, 1
 .endm

 .macro SR_COMBINE_FULL len, off=0
-        add              x10, x1, x21
-        add              x11, x1, x21, lsl #1
-        add              x12, x1, x22
+        add             x10, x1, x21
+        add             x11, x1, x21, lsl #1
+        add             x12, x1, x22

        SR_COMBINE_4    \len, 0, \off
        SR_COMBINE_4    \len, 1, \off