avcodec/aarch64/vvc: Optimize apply_bdof

Before this patch, prof_grad_filter calculated gh[0], gh[1], gv[0] and gv[1]
and saved them to the stack. derive_bdof_vx_vy loaded them from the stack and
calculated gh[0] + gh[1] and gv[0] + gv[1]. apply_bdof_min_block loaded them
from the stack and calculated gh[0] - gh[1] and gv[0] - gv[1].

This patch adds bdof_grad_filter, which calculates gh[0] + gh[1],
gh[0] - gh[1], gv[0] + gv[1] and gv[0] - gv[1], and saves them to the stack,
so derive_bdof_vx_vy and apply_bdof_min_block can use the results directly.
prof_grad_filter is kept for reuse by other functions in the future.

Benchmark on rpi5 with gcc 12:

                             Before              After
--------------------------------------------------------------------
apply_bdof_8_8x16_c:      |  7431.4 ( 1.00x) |  7371.7 ( 1.00x)
apply_bdof_8_8x16_neon:   |  1175.4 ( 6.32x) |  1036.3 ( 7.11x)
apply_bdof_8_16x8_c:      |  7182.2 ( 1.00x) |  7201.1 ( 1.00x)
apply_bdof_8_16x8_neon:   |  1021.7 ( 7.03x) |   879.9 ( 8.18x)
apply_bdof_8_16x16_c:     | 14577.1 ( 1.00x) | 14589.3 ( 1.00x)
apply_bdof_8_16x16_neon:  |  2012.8 ( 7.24x) |  1743.3 ( 8.37x)
apply_bdof_10_8x16_c:     |  7292.4 ( 1.00x) |  7308.5 ( 1.00x)
apply_bdof_10_8x16_neon:  |  1156.3 ( 6.31x) |  1045.3 ( 6.99x)
apply_bdof_10_16x8_c:     |  7112.4 ( 1.00x) |  7214.4 ( 1.00x)
apply_bdof_10_16x8_neon:  |  1007.6 ( 7.06x) |   904.8 ( 7.97x)
apply_bdof_10_16x16_c:    | 14363.3 ( 1.00x) | 14476.4 ( 1.00x)
apply_bdof_10_16x16_neon: |  1986.9 ( 7.23x) |  1783.1 ( 8.12x)
apply_bdof_12_8x16_c:     |  7433.3 ( 1.00x) |  7374.7 ( 1.00x)
apply_bdof_12_8x16_neon:  |  1155.9 ( 6.43x) |  1040.8 ( 7.09x)
apply_bdof_12_16x8_c:     |  7171.1 ( 1.00x) |  7376.3 ( 1.00x)
apply_bdof_12_16x8_neon:  |  1010.8 ( 7.09x) |   899.4 ( 8.20x)
apply_bdof_12_16x16_c:    | 14515.5 ( 1.00x) | 14731.5 ( 1.00x)
apply_bdof_12_16x16_neon: |  1988.4 ( 7.30x) |  1785.2 ( 8.25x)
2025-11-23 21:54:53 +02:00 · 2025-08-14 12:42:38 +08:00
parent 2e92417603
commit 6ce02bcc3a
3 changed files with 370 additions and 138 deletions
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -30,38 +30,16 @@
 #define BDOF_BLOCK_SIZE         16
 #define BDOF_MIN_BLOCK_SIZE     4

-void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
-                                     int16_t *gradient_v,
-                                     ptrdiff_t gradient_stride,
-                                     const int16_t *_src,
-                                     ptrdiff_t src_stride,
-                                     int width, int height);
-
-void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
-                                      const int16_t *_src1,
-                                      int16_t *const gradient_h[2],
-                                      int16_t *const gradient_v[2],
-                                      int16_t vx[16], int16_t vy[16],
-                                      int block_h);
-void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
-                                       const int16_t *_src1,
-                                       int16_t *const gradient_h[2],
-                                       int16_t *const gradient_v[2],
-                                       int16_t vx[16], int16_t vy[16],
-                                       int block_h);
 #define BIT_DEPTH 8
 #include "alf_template.c"
-#include "of_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 10
 #include "alf_template.c"
-#include "of_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 12
 #include "alf_template.c"
-#include "of_template.c"
 #undef BIT_DEPTH

 int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
@@ -121,6 +99,15 @@ DMVR_FUN(hv_, 8)
 DMVR_FUN(hv_, 10)
 DMVR_FUN(hv_, 12)

+#define APPLY_BDOF_FUNC(bd) /* prototype of the apply_bdof entry point for bit depth bd */ \
+    void ff_vvc_apply_bdof_ ## bd ## _neon(uint8_t *_dst, ptrdiff_t _dst_stride, \
+        const int16_t *_src0, const int16_t *_src1, \
+        int block_w, int block_h);
+
+APPLY_BDOF_FUNC(8)  /* declares ff_vvc_apply_bdof_8_neon */
+APPLY_BDOF_FUNC(10) /* declares ff_vvc_apply_bdof_10_neon */
+APPLY_BDOF_FUNC(12) /* declares ff_vvc_apply_bdof_12_neon */
+
 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 {
    int cpu_flags = av_get_cpu_flags();
@@ -202,7 +189,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
        c->inter.w_avg = vvc_w_avg_8;
        c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
-        c->inter.apply_bdof = apply_bdof_8;
+        c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;

        c->sao.band_filter[0] = ff_h26x_sao_band_filter_8x8_8_neon;
        for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
@@ -246,7 +233,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
        c->inter.avg = ff_vvc_avg_10_neon;
        c->inter.w_avg = vvc_w_avg_10;
        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
-        c->inter.apply_bdof = apply_bdof_10;
+        c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;

        c->alf.filter[LUMA] = alf_filter_luma_10_neon;
        c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
@@ -255,7 +242,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
        c->inter.w_avg = vvc_w_avg_12;
        c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
-        c->inter.apply_bdof = apply_bdof_12;
+        c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;

        c->alf.filter[LUMA] = alf_filter_luma_12_neon;
        c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -716,7 +716,93 @@ function ff_vvc_prof_grad_filter_8x_neon, export=1
 .unreq height
 endfunc

-.macro vvc_apply_bdof_block bit_depth
+function vvc_bdof_grad_filter_8x_neon, export=0        // gradients of src0 and src1, stored as half-sums and differences
+        gh0             .req x0                         // out: (gradient_h0 + gradient_h1) >> 1
+        gh1             .req x1                         // out: gradient_h0 - gradient_h1
+        gv0             .req x2                         // out: (gradient_v0 + gradient_v1) >> 1
+        gv1             .req x3                         // out: gradient_v0 - gradient_v1
+        src0            .req x4                         // in: first source block
+        src1            .req x5                         // in: second source block
+        width           .req w6                         // in: columns; consumed 8 at a time, loop exits only at exactly 0
+        height          .req w7                         // in: rows; loop exits only at exactly 0
+
+1:                                                      // row loop
+        mov             x10, src0                       // x10: src0 read cursor for this row
+        mov             w11, width                      // w11: columns left in this row
+        mov             x12, gh0                        // x12: write cursor, horizontal half-sum
+        mov             x13, gv0                        // x13: write cursor, vertical half-sum
+        mov             x14, src1                       // x14: src1 read cursor for this row
+        mov             x15, gh1                        // x15: write cursor, horizontal difference
+        mov             x16, gv1                        // x16: write cursor, vertical difference
+2:                                                      // 8-column loop
+        ldur            q0, [x10, #2]                   // src0, one sample to the right
+        ldur            q1, [x10, #-2]                  // src0, one sample to the left
+        ldr             q2, [x10, #(VVC_MAX_PB_SIZE << 1)]      // src0, one row below
+        ldr             q3, [x10, #-(VVC_MAX_PB_SIZE << 1)]     // src0, one row above
+        sshr            v0.8h, v0.8h, #6                // every neighbour is shifted right by 6 before subtracting
+        sshr            v1.8h, v1.8h, #6
+        ldur            q4, [x14, #2]                   // src1, one sample to the right
+        ldur            q5, [x14, #-2]                  // src1, one sample to the left
+        sshr            v2.8h, v2.8h, #6
+        sshr            v3.8h, v3.8h, #6
+        ldr             q6, [x14, #(VVC_MAX_PB_SIZE << 1)]      // src1, one row below
+        ldr             q7, [x14, #-(VVC_MAX_PB_SIZE << 1)]     // src1, one row above
+        // results of gradient_h0
+        sub             v0.8h, v0.8h, v1.8h
+        // results of gradient_v0
+        sub             v2.8h, v2.8h, v3.8h
+
+        sshr            v4.8h, v4.8h, #6
+        sshr            v5.8h, v5.8h, #6
+        sshr            v6.8h, v6.8h, #6
+        sshr            v7.8h, v7.8h, #6
+        // results of gradient_h1
+        sub             v4.8h, v4.8h, v5.8h
+        // results of gradient_v1
+        sub             v6.8h, v6.8h, v7.8h
+
+        add             x10, x10, #16                   // advance src0 cursor by 8 samples
+        add             x14, x14, #16                   // advance src1 cursor by 8 samples
+
+        // (gradient_h0 + gradient_h1) >> 1
+        shadd           v1.8h, v0.8h, v4.8h
+        // gradient_h0 - gradient_h1
+        sub             v5.8h, v0.8h, v4.8h
+
+        subs            w11, w11, #8                    // flags are consumed by b.ne 2b; nothing in between sets flags
+
+        // (gradient_v0 + gradient_v1) >> 1
+        shadd           v3.8h, v2.8h, v6.8h
+        // gradient_v0 - gradient_v1
+        sub             v7.8h, v2.8h, v6.8h
+
+        st1             {v1.8h}, [x12], #16             // horizontal half-sum -> gh0 buffer
+        st1             {v5.8h}, [x15], #16             // horizontal difference -> gh1 buffer
+        st1             {v3.8h}, [x13], #16             // vertical half-sum -> gv0 buffer
+        st1             {v7.8h}, [x16], #16             // vertical difference -> gv1 buffer
+        b.ne            2b
+
+        subs            height, height, #1              // flags are consumed by b.ne 1b; the adds below do not set flags
+        add             gh0, gh0, #(BDOF_BLOCK_SIZE << 1)       // output rows are BDOF_BLOCK_SIZE samples apart
+        add             gv0, gv0, #(BDOF_BLOCK_SIZE << 1)
+        add             src0, src0, #(VVC_MAX_PB_SIZE << 1)     // source rows are VVC_MAX_PB_SIZE samples apart
+        add             gh1, gh1, #(BDOF_BLOCK_SIZE << 1)
+        add             gv1, gv1, #(BDOF_BLOCK_SIZE << 1)
+        add             src1, src1, #(VVC_MAX_PB_SIZE << 1)
+        b.ne            1b
+        ret                                             // x0-x5, w7, x10-x16 and v0-v7 have been modified
+
+.unreq gh0
+.unreq gh1
+.unreq gv0
+.unreq gv1
+.unreq src0
+.unreq src1
+.unreq width
+.unreq height
+endfunc
+
+.macro vvc_apply_bdof_block_8x bit_depth
        dst             .req x0
        dst_stride      .req x1
        src0            .req x2
@@ -726,33 +812,28 @@ endfunc
        vx              .req x6
        vy              .req x7

-        ld1r            {v0.8h}, [vx], #2
-        ld1r            {v1.8h}, [vy], #2
-        ld1r            {v2.8h}, [vx]
-        ld1r            {v3.8h}, [vy]
-        ins             v0.d[1], v2.d[1]
-        ins             v1.d[1], v3.d[1]
-
+        ldr             w8, [sp]
        movi            v7.4s, #(1 << (14 - \bit_depth))
-        ldp             x8, x9, [gh]
-        ldp             x10, x11, [gv]
        mov             x12, #(BDOF_BLOCK_SIZE * 2)
-        mov             w13, #(BDOF_MIN_BLOCK_SIZE)
        mov             x14, #(VVC_MAX_PB_SIZE * 2)
 .if \bit_depth >= 10
        // clip pixel
        mov             w15, #((1 << \bit_depth) - 1)
        movi            v18.8h, #0
-        lsl             dst_stride, dst_stride, #1
        dup             v19.8h, w15
 .endif
+
+0:
+        ld1r            {v0.8h}, [vx], #2
+        ld1r            {v1.8h}, [vy], #2
+        ld1r            {v2.8h}, [vx]
+        ld1r            {v3.8h}, [vy]
+        mov             w13, #(BDOF_MIN_BLOCK_SIZE)
+        ins             v0.d[1], v2.d[1]
+        ins             v1.d[1], v3.d[1]
 1:
-        ld1             {v2.8h}, [x8], x12
-        ld1             {v3.8h}, [x9], x12
-        ld1             {v4.8h}, [x10], x12
-        ld1             {v5.8h}, [x11], x12
-        sub             v2.8h, v2.8h, v3.8h
-        sub             v4.8h, v4.8h, v5.8h
+        ld1             {v2.8h}, [gh], x12
+        ld1             {v4.8h}, [gv], x12
        smull           v3.4s, v0.4h, v2.4h
        smull2          v16.4s, v0.8h, v2.8h
        smlal           v3.4s, v1.4h, v4.4h
@@ -780,6 +861,11 @@ endfunc
        st1             {v5.8h}, [dst], dst_stride
 .endif
        b.ne            1b
+
+        subs            w8, w8, #(BDOF_MIN_BLOCK_SIZE)
+        add             vx, vx, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
+        add             vy, vy, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
+        b.ne            0b
        ret

 .unreq dst
@@ -792,16 +878,128 @@ endfunc
 .unreq vy
 .endm

-function ff_vvc_apply_bdof_block_8_neon, export=1
-        vvc_apply_bdof_block 8
+function vvc_apply_bdof_block_8x_8_neon, export=0
+        vvc_apply_bdof_block_8x 8
 endfunc

-function ff_vvc_apply_bdof_block_10_neon, export=1
-        vvc_apply_bdof_block 10
+function vvc_apply_bdof_block_8x_10_neon, export=0
+        vvc_apply_bdof_block_8x 10
 endfunc

-function ff_vvc_apply_bdof_block_12_neon, export=1
-        vvc_apply_bdof_block 12
+function vvc_apply_bdof_block_8x_12_neon, export=0
+        vvc_apply_bdof_block_8x 12
+endfunc
+
+.macro vvc_apply_bdof_block_16x bit_depth              // body of a 16-pixel-wide BDOF apply routine for one bit depth
+        dst             .req x0                         // output pixels
+        dst_stride      .req x1                         // added to dst as-is after every row, i.e. a byte stride
+        src0            .req x2                         // first prediction, advanced VVC_MAX_PB_SIZE * 2 bytes per row
+        src1            .req x3                         // second prediction, same row pitch
+        gh              .req x4                         // horizontal gradient term (the caller in this file passes the gh0 - gh1 buffer)
+        gv              .req x5                         // vertical gradient term (the caller in this file passes the gv0 - gv1 buffer)
+        vx              .req x6                         // four int16 vx values are read per group of rows
+        vy              .req x7                         // four int16 vy values are read per group of rows
+
+        ldr             w8, [sp]                        // row count left by the caller at [sp] (block_h in ff_vvc_apply_bdof_8_neon)
+        movi            v7.4s, #(1 << (14 - \bit_depth))        // rounding offset for the final shift by 15 - bit_depth
+.if \bit_depth >= 10
+        // clip pixel
+        mov             w15, #((1 << \bit_depth) - 1)
+        movi            v18.8h, #0                      // lower clip bound
+        dup             v19.8h, w15                     // upper clip bound
+.endif
+
+0:                                                      // one pass per BDOF_MIN_BLOCK_SIZE rows
+        ld1r            {v0.8h}, [vx], #2               // vx[0] replicated to all lanes
+        ld1r            {v1.8h}, [vy], #2               // vy[0] replicated to all lanes
+        ld1r            {v2.8h}, [vx], #2               // vx[1]
+        ld1r            {v3.8h}, [vy], #2               // vy[1]
+
+        mov             w13, #(BDOF_MIN_BLOCK_SIZE)     // rows sharing this set of vx/vy values
+
+        ld1r            {v20.8h}, [vx], #2              // vx[2]
+        ld1r            {v21.8h}, [vy], #2              // vy[2]
+        ld1r            {v22.8h}, [vx], #2              // vx[3]
+        ld1r            {v23.8h}, [vy], #2              // vy[3]
+
+        ins             v0.d[1], v2.d[1]                // v0  = vx[0] x4 | vx[1] x4  (columns 0-7)
+        ins             v1.d[1], v3.d[1]                // v1  = vy[0] x4 | vy[1] x4
+        ins             v20.d[1], v22.d[1]              // v20 = vx[2] x4 | vx[3] x4  (columns 8-15)
+        ins             v21.d[1], v23.d[1]              // v21 = vy[2] x4 | vy[3] x4
+1:                                                      // row loop
+        ldp             q2, q22, [gh], #(BDOF_BLOCK_SIZE * 2)   // gh row: q2 = columns 0-7, q22 = columns 8-15
+        ldp             q4, q24, [gv], #(BDOF_BLOCK_SIZE * 2)   // gv row, same split
+        smull           v3.4s, v0.4h, v2.4h             // vx * gh, columns 0-3
+        smull2          v16.4s, v0.8h, v2.8h            // vx * gh, columns 4-7
+        smlal           v3.4s, v1.4h, v4.4h             // += vy * gv
+        smlal2          v16.4s, v1.8h, v4.8h
+
+        ldp             q5, q25, [src0], #(VVC_MAX_PB_SIZE * 2) // src0 row, columns 0-7 and 8-15
+        ldp             q6, q26, [src1], #(VVC_MAX_PB_SIZE * 2) // src1 row, columns 0-7 and 8-15
+
+        smull           v23.4s, v20.4h, v22.4h          // vx * gh, columns 8-11
+        smull2          v27.4s, v20.8h, v22.8h          // vx * gh, columns 12-15
+        smlal           v23.4s, v21.4h, v24.4h          // += vy * gv
+        smlal2          v27.4s, v21.8h, v24.8h
+
+        saddl           v2.4s, v5.4h, v6.4h             // src0 + src1, widened to 32 bits
+        add             v2.4s, v2.4s, v7.4s             // + rounding offset
+        add             v2.4s, v2.4s, v3.4s             // + vx * gh + vy * gv
+        saddl2          v4.4s, v5.8h, v6.8h
+        add             v4.4s, v4.4s, v7.4s
+        add             v4.4s, v4.4s, v16.4s
+
+        saddl           v22.4s, v25.4h, v26.4h          // same computation for columns 8-15
+        add             v22.4s, v22.4s, v7.4s
+        add             v22.4s, v22.4s, v23.4s
+        saddl2          v24.4s, v25.8h, v26.8h
+        add             v24.4s, v24.4s, v7.4s
+        add             v24.4s, v24.4s, v27.4s
+
+        sqshrn          v5.4h, v2.4s, #(15 - \bit_depth)        // saturating narrow back to 16 bits
+        sqshrn2         v5.8h, v4.4s, #(15 - \bit_depth)
+        sqshrn          v25.4h, v22.4s, #(15 - \bit_depth)
+        sqshrn2         v25.8h, v24.4s, #(15 - \bit_depth)
+
+        subs            w13, w13, #1                    // flags are consumed by b.ne 1b; nothing in between sets flags
+.if \bit_depth == 8
+        sqxtun          v5.8b, v5.8h                    // saturate to unsigned 8-bit pixels
+        sqxtun2         v5.16b, v25.8h
+        str             q5, [dst]                       // 16 one-byte pixels
+.else
+        smin            v5.8h, v5.8h, v19.8h            // clamp to [0, (1 << bit_depth) - 1]
+        smax            v5.8h, v5.8h, v18.8h
+        smin            v25.8h, v25.8h, v19.8h
+        smax            v25.8h, v25.8h, v18.8h
+        stp             q5, q25, [dst]                  // 16 two-byte pixels
+.endif
+        add             dst, dst, dst_stride
+        b.ne            1b
+
+        subs            w8, w8, #(BDOF_MIN_BLOCK_SIZE)  // loop exits only at exactly 0, so the row count must be a multiple of BDOF_MIN_BLOCK_SIZE
+        b.ne            0b
+        ret
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq gh
+.unreq gv
+.unreq vx
+.unreq vy
+.endm
+
+function vvc_apply_bdof_block_16x_8_neon, export=0
+        vvc_apply_bdof_block_16x 8
+endfunc
+
+function vvc_apply_bdof_block_16x_10_neon, export=0
+        vvc_apply_bdof_block_16x 10
+endfunc
+
+function vvc_apply_bdof_block_16x_12_neon, export=0
+        vvc_apply_bdof_block_16x 12
 endfunc

 const bdof_vx_vy_8x_tbl
@@ -919,18 +1117,16 @@ endconst
 * ----------------------------------------------------------------------
 * x0: const int16_t *_src0,
 * x1: const int16_t *_src1,
- * x2: int16_t *const gradient_h[2],
- * x3: int16_t *const gradient_v[2],
+ * x2: const int16_t *gradient_h,
+ * x3: const int16_t *gradient_v,
 * x4: int16_t vx[16],
 * x5: int16_t vy[16],
 * w6: int block_h
 */
-function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
+function vvc_derive_bdof_vx_vy_8x_neon, export=0
        stp             d11, d10, [sp, #-0x20]!
        stp             d9, d8, [sp, #0x10]

-        ldp             x14, x13, [x2]                      // gh0, gh1
-        ldp             x10, x9, [x3]                       // gv0, gv1
        movrel          x11, bdof_vx_vy_8x_tbl
        ldr             q0, [x11]                           // table
        mvni            v2.4s, #30                          // -31, for log2
@@ -998,17 +1194,13 @@ function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
 9:
        ldr             q28, [x0]                                   // src0
        ldr             q29, [x1]                                   // src1
-        ldr             q30, [x14], #(BDOF_BLOCK_SIZE * 2)          // gh0
-        ldr             q31, [x13], #(BDOF_BLOCK_SIZE * 2)          // gh1
-        ldr             q8, [x10], #(BDOF_BLOCK_SIZE * 2)           // gv0
-        ldr             q9, [x9], #(BDOF_BLOCK_SIZE * 2)            // gv1
+        ldr             q30, [x2], #(BDOF_BLOCK_SIZE * 2)           // (gh0 + gh1) >> 1
+        ldr             q31, [x3], #(BDOF_BLOCK_SIZE * 2)           // (gv0 + gv1) >> 1
        add             x0, x0, #(VVC_MAX_PB_SIZE * 2)
        add             x1, x1, #(VVC_MAX_PB_SIZE * 2)

        sshr            v28.8h, v28.8h, #0x4
        sshr            v29.8h, v29.8h, #0x4
-        shadd           v30.8h, v30.8h, v31.8h                      // tmph
-        shadd           v31.8h, v8.8h, v9.8h                        // tmpv
        sub             v8.8h, v28.8h, v29.8h                       // diff

        abs             v28.8h, v30.8h
@@ -1067,20 +1259,18 @@ endfunc
 /*
 * x0: const int16_t *_src0,
 * x1: const int16_t *_src1,
- * x2: int16_t *const gradient_h[2],
- * x3: int16_t *const gradient_v[2],
+ * x2: const int16_t *gradient_h,
+ * x3: const int16_t *gradient_v,
 * x4: int16_t vx[16],
 * x5: int16_t vy[16],
 * w6: int block_h
 */
-function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
+function vvc_derive_bdof_vx_vy_16x_neon, export=0
        stp             d15, d14, [sp, #-0x40]!
        stp             d13, d12, [sp, #0x10]
        stp             d11, d10, [sp, #0x20]
        stp             d9, d8,   [sp, #0x30]

-        ldp             x8, x9, [x2]                        // gh0, gh1
-        ldp             x10, x11, [x3]                      // gv0, gv1
        movrel          x12, bdof_vx_vy_16x_tbl
        ldp             q0, q1, [x12]                       // table
        mov             w13, w6                             // y = block_h
@@ -1142,17 +1332,11 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
        sshr            v31.8h, v29.8h, #0x4
        ld1             {v8.8h, v9.8h}, [x1]                // src1
        sshr            v10.8h, v8.8h, #0x4
-        ld1             {v11.8h, v12.8h}, [x8], #32         // gh0
+        ldp             q13, q8, [x2], #32                  // (gh0 + gh1) >> 1
        sshr            v29.8h, v30.8h, #0x4
        sshr            v30.8h, v9.8h, #0x4
-        ld1             {v8.8h, v9.8h}, [x9], #32           // gh1
-        shadd           v13.8h, v11.8h, v8.8h               // (gh0 + gh1) >> 1, left half
-        ld1             {v14.8h, v15.8h}, [x10], #32        // gv0
-        ld1             {v3.8h, v4.8h}, [x11], #32          // gv1
-        shadd           v5.8h, v14.8h, v3.8h                // (gv0 + gv1) >> 1, left half
+        ldp             q5, q3, [x3], #32                   // (gv0 + gv1) >> 1
        sub             v31.8h, v31.8h, v10.8h              // diff, left half
-        shadd           v8.8h, v12.8h, v9.8h                // (gh0 + gh1) >> 1, right half
-        shadd           v3.8h, v15.8h, v4.8h                // (gv0 + gv1) >> 1, right half
        sub             v4.8h, v29.8h, v30.8h               // diff, right half

        abs             v29.8h, v13.8h
@@ -1219,3 +1403,129 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
        ldp             d15, d14, [sp], #0x40
        ret
 endfunc
+
+function ff_vvc_apply_bdof_10_neon, export=1
+        mov             w6, #10                         // bit depth, handed to the shared body in w6
+        b               0f                              // forward to the next 0: label (the shared body below)
+endfunc
+
+function ff_vvc_apply_bdof_12_neon, export=1
+        mov             w6, #12                         // bit depth, handed to the shared body in w6
+        b               0f                              // forward to the next 0: label (the shared body below)
+endfunc
+
+// int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
+// int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
+// int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
+#define APPLY_BDOF_STACK_SIZE   ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 4)
+#define GRADIENT_H0_OFFSET      2
+#define GRADIENT_H1_OFFSET      ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 2 + 2)
+#define GRADIENT_V0_OFFSET      ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 4 + 2)
+#define GRADIENT_V1_OFFSET      ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 6 + 2)
+#define VX_OFFSET               ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8)
+#define VY_OFFSET               ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 2)
+function ff_vvc_apply_bdof_8_neon, export=1
+        mov             w6, #8                          // bit depth for the 8-bit entry point
+0:                                                      // shared body; the 10/12-bit entry points branch here with w6 set
+        stp             x19, x20, [sp, #-0x40]!         // save the registers used to hold the arguments, plus the return address
+        stp             x21, x22, [sp, #0x10]
+        stp             x23, x24, [sp, #0x20]
+        stp             x25, x30, [sp, #0x30]
+
+        sub             sp, sp, #APPLY_BDOF_STACK_SIZE  // scratch area laid out by the *_OFFSET macros above
+        mov             w19, w6                         // bit_depth
+        mov             x20, x0                         // dst
+        mov             x21, x1                         // dst_stride
+        mov             x22, x2                         // src0
+        mov             x23, x3                         // src1
+        mov             w24, w4                         // block_w
+        mov             w25, w5                         // block_h
+
+        // int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
+        add             x0, sp, #GRADIENT_H0_OFFSET     // receives (gh0 + gh1) >> 1
+        add             x1, sp, #GRADIENT_H1_OFFSET     // receives gh0 - gh1
+        add             x2, sp, #GRADIENT_V0_OFFSET     // receives (gv0 + gv1) >> 1
+        add             x3, sp, #GRADIENT_V1_OFFSET     // receives gv0 - gv1
+        mov             x4, x22
+        mov             x5, x23
+        mov             w6, w24
+        mov             w7, w25
+        bl              vvc_bdof_grad_filter_8x_neon
+
+        cmp             w24, #8                         // flags are consumed by b.gt 16f; the moves/adds below do not set flags
+        mov             x0, x22                         // src0
+        mov             x1, x23                         // src1
+        add             x2, sp, #GRADIENT_H0_OFFSET     // gh0
+        add             x3, sp, #GRADIENT_V0_OFFSET     // gv0
+        add             x4, sp, #VX_OFFSET              // vx
+        add             x5, sp, #VY_OFFSET              // vy
+        mov             w6, w25                         // block_h
+
+        b.gt            16f                             // block_w > 8 takes the 16-wide path
+
+        bl              vvc_derive_bdof_vx_vy_8x_neon
+        cmp             w19, #10                        // check bitdepth
+        mov             x0, x20                         // dst
+        mov             x1, x21                         // dst_stride
+        mov             x2, x22                         // src0
+        mov             x3, x23                         // src1
+        add             x4, sp, #GRADIENT_H1_OFFSET     // gh1
+        add             x5, sp, #GRADIENT_V1_OFFSET     // gv1
+        add             x6, sp, #VX_OFFSET
+        add             x7, sp, #VY_OFFSET
+        str             w25, [sp]                       // block_h for the block routine; overwrites the start of the H0 area, which is not passed on
+        b.eq            1f
+        b.gt            2f
+        // 8bit
+0:
+        bl              vvc_apply_bdof_block_8x_8_neon
+        b               32f
+1:
+        // 10bit
+        bl              vvc_apply_bdof_block_8x_10_neon
+        b               32f
+2:
+        // 12bit
+        bl              vvc_apply_bdof_block_8x_12_neon
+        b               32f
+16:
+        bl              vvc_derive_bdof_vx_vy_16x_neon
+
+        cmp             w19, #10                        // check bitdepth
+        mov             x0, x20                         // dst
+        mov             x1, x21                         // dst_stride
+        mov             x2, x22                         // src0
+        mov             x3, x23                         // src1
+        add             x4, sp, #GRADIENT_H1_OFFSET     // gh1
+        add             x5, sp, #GRADIENT_V1_OFFSET     // gv1
+        add             x6, sp, #VX_OFFSET
+        add             x7, sp, #VY_OFFSET
+        str             w25, [sp]                       // block_h, read back by the block routine with ldr w8, [sp]
+        b.eq            17f
+        b.gt            18f
+        // 8bit
+        bl              vvc_apply_bdof_block_16x_8_neon
+        b               32f
+17:
+        // 10bit
+        bl              vvc_apply_bdof_block_16x_10_neon
+        b               32f
+18:
+        // 12bit
+        bl              vvc_apply_bdof_block_16x_12_neon
+32:                                                     // common epilogue
+        add             sp, sp, #APPLY_BDOF_STACK_SIZE  // release the scratch area
+        ldp             x25, x30, [sp, #0x30]           // restore saved registers and the return address
+        ldp             x23, x24, [sp, #0x20]
+        ldp             x21, x22, [sp, #0x10]
+        ldp             x19, x20, [sp], #0x40
+        ret
+endfunc
+
+#undef APPLY_BDOF_STACK_SIZE
+#undef GRADIENT_H0_OFFSET
+#undef GRADIENT_H1_OFFSET
+#undef GRADIENT_V0_OFFSET
+#undef GRADIENT_V1_OFFSET
+#undef VX_OFFSET
+#undef VY_OFFSET
--- a/libavcodec/aarch64/vvc/of_template.c
+++ b/libavcodec/aarch64/vvc/of_template.c
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/bit_depth_template.c"
-
-void FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(pixel* dst,
-        ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
-        const int16_t **gh, const int16_t **gv, int16_t *vx, int16_t *vy);
-
-static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
-                             const int16_t *_src0, const int16_t *_src1,
-                             int block_w, int block_h) {
-    // +2 for pad left and right
-    int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
-    int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
-    int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
-    int16_t *gradient_v[2] = {&gradient_buf_v[0][1], &gradient_buf_v[1][1]};
-    ptrdiff_t dst_stride = _dst_stride / sizeof(pixel);
-    pixel *dst = (pixel *) _dst;
-
-    ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0],
-                                    BDOF_BLOCK_SIZE,
-                                    _src0, MAX_PB_SIZE, block_w, block_h);
-    ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
-                                    BDOF_BLOCK_SIZE,
-                                    _src1, MAX_PB_SIZE, block_w, block_h);
-    int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
-    if (block_w == 8)
-        ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
-    else
-        ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
-
-    for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
-        for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
-            const int16_t *src0 = _src0 + y * MAX_PB_SIZE + x;
-            const int16_t *src1 = _src1 + y * MAX_PB_SIZE + x;
-            pixel *d = dst + x;
-            int idx = BDOF_BLOCK_SIZE * y + x;
-            const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
-            const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
-            int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
-            FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
-                                                             src0, src1, gh, gv,
-                                                             vx + idx1, vy + idx1);
-        }
-        dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
-    }
-}