avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12}

This patch replaces integer widening with halving addition, and
multi-step "emulated" rounding shift with a single asm instruction
doing exactly that.

Benchmarks before and after:

A78
avg_8_64x64_neon:        2686.2 ( 6.12x)
avg_8_128x128_neon:     10734.2 ( 5.88x)
avg_10_64x64_neon:       2536.8 ( 5.40x)
avg_10_128x128_neon:    10079.0 ( 5.22x)
avg_12_64x64_neon:       2548.2 ( 5.38x)
avg_12_128x128_neon:    10133.8 ( 5.19x)

avg_8_64x64_neon:         897.8 (18.26x)
avg_8_128x128_neon:      3608.5 (17.37x)
avg_10_32x32_neon:        444.2 ( 8.51x)
avg_10_64x64_neon:       1711.8 ( 8.00x)
avg_12_64x64_neon:       1706.2 ( 8.02x)
avg_12_128x128_neon:     7010.0 ( 7.46x)

A72
avg_8_64x64_neon:        5823.4 ( 3.88x)
avg_8_128x128_neon:     17430.5 ( 4.73x)
avg_10_64x64_neon:       5228.1 ( 3.71x)
avg_10_128x128_neon:    16722.2 ( 4.17x)
avg_12_64x64_neon:       5379.1 ( 3.51x)
avg_12_128x128_neon:    16715.7 ( 4.17x)

avg_8_64x64_neon:        2006.5 (10.61x)
avg_8_128x128_neon:      9158.7 ( 8.96x)
avg_10_64x64_neon:       3357.7 ( 5.60x)
avg_10_128x128_neon:    12411.7 ( 5.56x)
avg_12_64x64_neon:       3317.5 ( 5.67x)
avg_12_128x128_neon:    12358.5 ( 5.58x)

A53
avg_8_64x64_neon:        8327.8 ( 5.18x)
avg_8_128x128_neon:     31631.3 ( 5.34x)
avg_10_64x64_neon:       8783.5 ( 4.98x)
avg_10_128x128_neon:    32617.0 ( 5.25x)
avg_12_64x64_neon:       8686.0 ( 5.06x)
avg_12_128x128_neon:    32487.5 ( 5.25x)

avg_8_64x64_neon:        6032.3 ( 7.17x)
avg_8_128x128_neon:     22008.5 ( 7.69x)
avg_10_64x64_neon:       7738.0 ( 5.68x)
avg_10_128x128_neon:    27813.8 ( 6.14x)
avg_12_64x64_neon:       7844.5 ( 5.60x)
avg_12_128x128_neon:    26999.5 ( 6.34x)

Signed-off-by: Martin Storsjö <martin@martin.st>
2025-08-04 22:03:09 +02:00 · 2025-03-03 22:18:23 +01:00
parent 7225e307be
commit f9b8f30680
1 changed files with 130 additions and 47 deletions
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -24,9 +24,9 @@
 #define BDOF_BLOCK_SIZE 16
 #define BDOF_MIN_BLOCK_SIZE 4

-.macro vvc_avg type, bit_depth
+.macro vvc_w_avg bit_depth

-.macro vvc_\type\()_\bit_depth\()_2_4 tap
+.macro vvc_w_avg_\bit_depth\()_2_4 tap
 .if \tap == 2
        ldr             s0, [src0]
        ldr             s2, [src1]
@@ -34,18 +34,11 @@
        ldr             d0, [src0]
        ldr             d2, [src1]
 .endif
-
-.ifc \type, avg
-        saddl           v4.4s, v0.4h, v2.4h
-        add             v4.4s, v4.4s, v16.4s
-        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
-.else
        mov             v4.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        sqshl           v4.4s, v4.4s, v22.4s
        sqxtun          v4.4h, v4.4s
-.endif

 .if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
@@ -68,7 +61,7 @@
        add             dst, dst, dst_stride
 .endm

-function ff_vvc_\type\()_\bit_depth\()_neon, export=1
+function ff_vvc_w_avg_\bit_depth\()_neon, export=1
        dst             .req x0
        dst_stride      .req x1
        src0            .req x2
@@ -78,9 +71,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1

        mov             x10, #(VVC_MAX_PB_SIZE * 2)
        cmp             width, #8
-.ifc \type, avg
-        movi            v16.4s, #(1 << (14 - \bit_depth))
-.else
        lsr             x11, x6, #32        // weight0
        mov             w12, w6             // weight1
        lsr             x13, x7, #32        // offset
@@ -91,9 +81,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        dup             v20.8h, w12
        dup             v16.4s, w13
        dup             v22.4s, w14
-.endif // avg

- .if \bit_depth >= 10
+.if \bit_depth >= 10
        // clip pixel
        mov             w6, #((1 << \bit_depth) - 1)
        dup             v17.8h, w6
@@ -105,25 +94,17 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        b.eq            4f
 2:      // width == 2
        subs            height, height, #1
-        vvc_\type\()_\bit_depth\()_2_4 2
+        vvc_w_avg_\bit_depth\()_2_4 2
        b.ne            2b
        b               32f
 4:      // width == 4
        subs            height, height, #1
-        vvc_\type\()_\bit_depth\()_2_4 4
+        vvc_w_avg_\bit_depth\()_2_4 4
        b.ne            4b
        b               32f
 8:      // width == 8
        ld1             {v0.8h}, [src0], x10
        ld1             {v2.8h}, [src1], x10
-.ifc \type, avg
-        saddl           v4.4s, v0.4h, v2.4h
-        saddl2          v5.4s, v0.8h, v2.8h
-        add             v4.4s, v4.4s, v16.4s
-        add             v5.4s, v5.4s, v16.4s
-        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
-        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
-.else
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
@@ -134,7 +115,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        sqshl           v5.4s, v5.4s, v22.4s
        sqxtun          v4.4h, v4.4s
        sqxtun2         v4.8h, v5.4s
-.endif
        subs            height, height, #1
 .if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
@@ -153,20 +133,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
 17:
        ldp             q0, q1, [x7], #32
        ldp             q2, q3, [x8], #32
-.ifc \type, avg
-        saddl           v4.4s, v0.4h, v2.4h
-        saddl2          v5.4s, v0.8h, v2.8h
-        saddl           v6.4s, v1.4h, v3.4h
-        saddl2          v7.4s, v1.8h, v3.8h
-        add             v4.4s, v4.4s, v16.4s
-        add             v5.4s, v5.4s, v16.4s
-        add             v6.4s, v6.4s, v16.4s
-        add             v7.4s, v7.4s, v16.4s
-        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
-        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
-        sqshrun         v6.4h, v6.4s, #(15 - \bit_depth)
-        sqshrun2        v6.8h, v7.4s, #(15 - \bit_depth)
-.else   // avg
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
        mov             v6.16b, v16.16b
@@ -187,7 +153,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        sqxtun          v6.4h, v6.4s
        sqxtun2         v4.8h, v5.4s
        sqxtun2         v6.8h, v7.4s
-.endif  // w_avg
        subs            w6, w6, #16
 .if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
@@ -217,12 +182,130 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
 endfunc
 .endm

-vvc_avg avg, 8
-vvc_avg avg, 10
-vvc_avg avg, 12
-vvc_avg w_avg, 8
-vvc_avg w_avg, 10
-vvc_avg w_avg, 12
+vvc_w_avg 8
+vvc_w_avg 10
+vvc_w_avg 12
+
+.macro vvc_avg bit_depth
+function ff_vvc_avg_\bit_depth\()_neon, export=1
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)     // x10 = byte step between source rows; only the width >= 16 path reads it
+        movi            v16.8h, #0                      // v16 = 0: lower clamp bound for the smax in the non-8-bit branches
+        movi            v17.16b, #255                   // all-ones halfwords, shifted down on the next line
+        ushr            v17.8h, v17.8h, #(16 - \bit_depth) // v17 = (1 << bit_depth) - 1 per halfword: upper clamp bound for smin
+
+        cmp             w4, #8                          // w4 = width: pick one of the per-width row loops
+        b.gt            16f
+        b.eq            8f
+        cmp             w4, #4
+        b.eq            4f                              // any other width falls through into the width == 2 loop
+
+2: // width == 2
+        ldr             s0, [x2]                        // two 16-bit samples from the first source (x2)
+        subs            w5, w5, #1                      // w5 = rows left; flags stay live until the b.ne closing this loop (at least one row always runs)
+        ldr             s1, [x3]                        // matching two samples from the second source (x3)
+.if \bit_depth == 8
+        shadd           v0.4h, v0.4h, v1.4h             // signed halving add: (a + b) >> 1, result stays in 16 bits
+        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth) // rounding right shift, saturate to 0..255, narrow to bytes
+        str             h0, [x0]                        // write 2 bytes (two 8-bit results) to the destination (x0)
+.else // bit_depth != 8 (instantiated below for 10 and 12): 16-bit results, clamped explicitly
+        shadd           v0.4h, v0.4h, v1.4h             // signed halving add: (a + b) >> 1, result stays in 16 bits
+        srshr           v0.4h, v0.4h, #(15 - 1 - \bit_depth) // rounding right shift; the extra -1 accounts for the bit shadd already dropped
+        smax            v0.4h, v0.4h, v16.4h            // clamp below at 0
+        smin            v0.4h, v0.4h, v17.4h            // clamp above at (1 << bit_depth) - 1
+        str             s0, [x0]                        // write 4 bytes (two 16-bit results)
+.endif
+        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)  // step both sources to the next row
+        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
+        add             x0, x0, x1                      // step the destination by x1 bytes (add leaves the flags untouched)
+        b.ne            2b                              // repeat while the subs above left w5 non-zero
+        ret
+
+4: // width == 4
+        ldr             d0, [x2]                        // four 16-bit samples from the first source
+        subs            w5, w5, #1                      // rows left; flags consumed by the b.ne closing this loop
+        ldr             d1, [x3]                        // four 16-bit samples from the second source
+.if \bit_depth == 8
+        shadd           v0.4h, v0.4h, v1.4h             // (a + b) >> 1
+        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth) // rounding shift, saturate to 0..255, narrow to bytes
+        str             s0, [x0]                        // write 4 bytes (four 8-bit results)
+.else // bit_depth != 8: 16-bit results, clamped explicitly
+        shadd           v0.4h, v0.4h, v1.4h             // (a + b) >> 1
+        srshr           v0.4h, v0.4h, #(15 - 1 - \bit_depth) // rounding shift by the remaining amount
+        smax            v0.4h, v0.4h, v16.4h            // clamp below at 0
+        smin            v0.4h, v0.4h, v17.4h            // clamp above at (1 << bit_depth) - 1
+        str             d0, [x0]                        // write 8 bytes (four 16-bit results)
+.endif
+        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)  // next source rows
+        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
+        add             x0, x0, x1                      // next destination row
+        b.ne            4b
+        ret
+
+8: // width == 8
+        ldr             q0, [x2]                        // eight 16-bit samples from the first source
+        subs            w5, w5, #1                      // rows left; flags consumed by the b.ne closing this loop
+        ldr             q1, [x3]                        // eight 16-bit samples from the second source
+.if \bit_depth == 8
+        shadd           v0.8h, v0.8h, v1.8h             // (a + b) >> 1
+        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth) // rounding shift, saturate to 0..255, narrow to bytes
+        str             d0, [x0]                        // write 8 bytes (eight 8-bit results)
+.else // bit_depth != 8: 16-bit results, clamped explicitly
+        shadd           v0.8h, v0.8h, v1.8h             // (a + b) >> 1
+        srshr           v0.8h, v0.8h, #(15 - 1 - \bit_depth) // rounding shift by the remaining amount
+        smax            v0.8h, v0.8h, v16.8h            // clamp below at 0
+        smin            v0.8h, v0.8h, v17.8h            // clamp above at (1 << bit_depth) - 1
+        str             q0, [x0]                        // write 16 bytes (eight 16-bit results)
+.endif
+        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)  // next source rows
+        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
+        add             x0, x0, x1                      // next destination row
+        b.ne            8b
+        ret
+
+16: // width >= 16
+.if \bit_depth == 8
+        sub             x1, x1, w4, sxtw                // x1 -= width: the st1 post-increments already move x0 by width bytes per row
+.else
+        sub             x1, x1, w4, sxtw #1             // x1 -= 2 * width: the stp post-increments already move x0 by 2 * width bytes per row
+.endif
+        sub             x10, x10, w4, sxtw #1           // x10 -= 2 * width: bytes the ldp post-increments consume from each source per row
+3: // start of one output row
+        mov             w6, w4 // w6 = columns left in this row, reset to width
+1: // 16 pixels per iteration
+        ldp             q0, q1, [x2], #32               // 16 samples from the first source, post-increment by 32 bytes
+        subs            w6, w6, #16                     // flags consumed by the b.ne 1b below; NOTE(review): loop ends only when w6 hits exactly 0, so width is presumably a multiple of 16 here - confirm with callers
+        ldp             q2, q3, [x3], #32               // 16 samples from the second source, post-increment by 32 bytes
+.if \bit_depth == 8
+        shadd           v4.8h, v0.8h, v2.8h             // (a + b) >> 1, pixels 0..7
+        shadd           v5.8h, v1.8h, v3.8h             // (a + b) >> 1, pixels 8..15
+        sqrshrun        v0.8b, v4.8h, #6                // same amount as 15 - 1 - bit_depth for bit_depth 8; fills the low 8 bytes
+        sqrshrun2       v0.16b, v5.8h, #6               // fills the high 8 bytes of v0
+        st1             {v0.16b}, [x0], #16             // store 16 bytes, post-increment x0
+.else // bit_depth != 8: 16-bit results, clamped explicitly
+        shadd           v4.8h, v0.8h, v2.8h             // (a + b) >> 1, pixels 0..7
+        shadd           v5.8h, v1.8h, v3.8h             // (a + b) >> 1, pixels 8..15
+        srshr           v0.8h, v4.8h, #(15 - 1 - \bit_depth) // rounding shift by the remaining amount
+        srshr           v1.8h, v5.8h, #(15 - 1 - \bit_depth)
+        smax            v0.8h, v0.8h, v16.8h            // clamp below at 0
+        smax            v1.8h, v1.8h, v16.8h
+        smin            v0.8h, v0.8h, v17.8h            // clamp above at (1 << bit_depth) - 1
+        smin            v1.8h, v1.8h, v17.8h
+        stp             q0, q1, [x0], #32               // store 32 bytes, post-increment x0
+.endif
+        b.ne            1b                              // more columns left in this row
+
+        subs            w5, w5, #1                      // one row finished; flags consumed by the b.ne 3b below
+        add             x2, x2, x10                     // skip the remainder of the source rows
+        add             x3, x3, x10
+        add             x0, x0, x1                      // skip the remainder of the destination row
+        b.ne            3b                              // the plain adds above do not modify the flags set by subs
+        ret
+endfunc
+.endm
+
+vvc_avg 8
+vvc_avg 10
+vvc_avg 12

 /* x0: int16_t *dst
 * x1: const uint8_t *_src