arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:

                              Cortex        A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub16_add_neon:   3188.1   2435.4   2499.0   1969.0
vp9_inv_dct_dct_32x32_sub32_add_neon:  18531.7  16582.3  14207.6  12000.3

By skipping individual 4x16 or 4x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon:     274.6    189.5    211.7    235.8
vp9_inv_dct_dct_16x16_sub2_add_neon:    2064.0   1534.8   1719.4   1248.7
vp9_inv_dct_dct_16x16_sub4_add_neon:    2135.0   1477.2   1736.3   1249.5
vp9_inv_dct_dct_16x16_sub8_add_neon:    2446.7   1828.7   1993.6   1494.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   2832.4   2118.3   2266.5   1735.1
vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.7   2475.3   2523.5   1983.1
vp9_inv_dct_dct_32x32_sub1_add_neon:     756.2    456.7    862.0    553.9
vp9_inv_dct_dct_32x32_sub2_add_neon:   10682.2   8190.4   8539.2   6762.5
vp9_inv_dct_dct_32x32_sub4_add_neon:   10813.5   8014.9   8518.3   6762.8
vp9_inv_dct_dct_32x32_sub8_add_neon:   11859.6   9313.0   9347.4   7514.5
vp9_inv_dct_dct_32x32_sub12_add_neon:  12946.6  10752.4  10192.2   8280.2
vp9_inv_dct_dct_32x32_sub16_add_neon:  14074.6  11946.5  11001.4   9008.6
vp9_inv_dct_dct_32x32_sub20_add_neon:  15269.9  13662.7  11816.1   9762.6
vp9_inv_dct_dct_32x32_sub24_add_neon:  16327.9  14940.1  12626.7  10516.0
vp9_inv_dct_dct_32x32_sub28_add_neon:  17462.7  15776.1  13446.2  11264.7
vp9_inv_dct_dct_32x32_sub32_add_neon:  18575.5  17157.0  14249.3  12015.1

I.e. in general a very minor overhead for the full subpartition case due
to the additional loads and cmps, but a significant speedup for the cases
when we only need to process a small part of the actual input data.

In common VP9 content in a few inspected clips, 70-90% of the non-dc-only
16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left
8x8 or 16x16 subpartitions respectively.

This is cherrypicked from libav commit 9c8bc74c2b.

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2025-08-10 06:10:52 +02:00 · 2017-01-10 00:15:15 +02:00
parent ecd343aa1f
commit 388f6e6715
2 changed files with 70 additions and 11 deletions
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -659,9 +659,8 @@ endfunc
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@ transpose into a horizontal 16x4 slice and store.
@ r0 = dst (temp buffer)
-@ r1 = unused
+@ r1 = slice offset
@ r2 = src
-@ r3 = slice offset
 function \txfm\()16_1d_4x16_pass1_neon
        mov             r12, #32
        vmov.s16        q2, #0
@@ -678,14 +677,14 @@ function \txfm\()16_1d_4x16_pass1_neon
        transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        @ Store the transposed 4x4 blocks horizontally.
-        cmp             r3,  #12
+        cmp             r1,  #12
        beq             1f
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        vst1.16         {d\i}, [r0,:64]!
 .endr
        bx              lr
 1:
-        @ Special case: For the last input column (r3 == 12),
+        @ Special case: For the last input column (r1 == 12),
        @ which would be stored as the last row in the temp buffer,
        @ don't store the first 4x4 block, but keep it in registers
        @ for the first slice of the second pass (where it is the
@@ -781,15 +780,22 @@ endfunc
 itxfm16_1d_funcs idct
 itxfm16_1d_funcs iadst

+@ This is the minimum eob value for each subpartition, in increments of 4
+@ (one 16-bit entry per 4-column slice of the 16x16 first pass).
+@ The idct_idct 16x16 function points r8 at the second entry (table + 2) and,
+@ for each slice after the first, loads the next entry and compares the eob
+@ in r3 against it: if eob <= entry, that slice and all following slices are
+@ not transformed and the rest of the temp buffer is filled with zeros.
+@ The leading 0 entry is not read by the code shown here.
+const min_eob_idct_idct_16, align=4
+        .short  0, 10, 38, 89
+endconst
+
 .macro itxfm_func16x16 txfm1, txfm2
 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
        cmp             r3,  #1
        beq             idct16x16_dc_add_neon
 .endif
-        push            {r4-r7,lr}
+        push            {r4-r8,lr}
 .ifnc \txfm1\()_\txfm2,idct_idct
        vpush           {q4-q7}
+.else
+        movrel          r8,  min_eob_idct_idct_16 + 2
 .endif

        @ Align the stack, allocate a temp buffer
@@ -810,10 +816,36 @@ A       and             r7,  sp,  #15

 .irp i, 0, 4, 8, 12
        add             r0,  sp,  #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+        ldrh_post       r1,  r8,  #2
+        cmp             r3,  r1
+        it              le
+        movle           r1,  #(16 - \i)/4
+        ble             1f
+.endif
+.endif
+        mov             r1,  #\i
        add             r2,  r6,  #(\i*2)
-        mov             r3,  #\i
        bl              \txfm1\()16_1d_4x16_pass1_neon
 .endr
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        b               3f
+1:
+        @ For all-zero slices in pass 1, set d28-d31 to zero, for the in-register
+        @ passthrough of coefficients to pass 2 and clear the end of the temp buffer
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+2:
+        subs            r1,  r1,  #1
+.rept 4
+        vst1.16         {q14-q15}, [r0,:128]!
+.endr
+        bne             2b
+3:
+.endif
+
 .ifc \txfm1\()_\txfm2,iadst_idct
        movrel          r12, idct_coeffs
        vld1.16         {q0-q1}, [r12,:128]
@@ -830,7 +862,7 @@ A       and             r7,  sp,  #15
 .ifnc \txfm1\()_\txfm2,idct_idct
        vpop            {q4-q7}
 .endif
-        pop             {r4-r7,pc}
+        pop             {r4-r8,pc}
 endfunc
 .endm

@@ -1110,11 +1142,16 @@ function idct32_1d_4x32_pass2_neon
        bx              lr
 endfunc

+@ Per-slice eob thresholds for the first pass of the 32x32 idct_idct,
+@ one 16-bit entry per 4-column slice (8 slices in total).
+@ ff_vp9_idct_idct_32x32_add_neon points r8 at the second entry (table + 2)
+@ and, for each slice after the first, loads the next entry and compares the
+@ eob in r3 against it: if eob <= entry, that slice and all following slices
+@ are skipped and the remainder of the temp buffer is zero-filled instead.
+@ The leading 0 entry is not read by the code shown here.
+const min_eob_idct_idct_32, align=4
+        .short  0, 9, 34, 70, 135, 240, 336, 448
+endconst
+
 function ff_vp9_idct_idct_32x32_add_neon, export=1
        cmp             r3,  #1
        beq             idct32x32_dc_add_neon
-        push            {r4-r7,lr}
+        push            {r4-r8,lr}
        vpush           {q4-q7}
+        movrel          r8,  min_eob_idct_idct_32 + 2

        @ Align the stack, allocate a temp buffer
 T       mov             r7,  sp
@@ -1129,9 +1166,29 @@ A       and             r7,  sp,  #15

 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             r0,  sp,  #(\i*64)
+.if \i > 0
+        ldrh_post       r1,  r8,  #2
+        cmp             r3,  r1
+        it              le
+        movle           r1,  #(32 - \i)/2
+        ble             1f
+.endif
        add             r2,  r6,  #(\i*2)
        bl              idct32_1d_4x32_pass1_neon
 .endr
+        b               3f
+
+1:
+        @ Write zeros to the temp buffer for pass 2
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+2:
+        subs            r1,  r1,  #1
+.rept 4
+        vst1.16         {q14-q15}, [r0,:128]!
+.endr
+        bne             2b
+3:
 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             r0,  r4,  #(\i)
        mov             r1,  r5
@@ -1141,5 +1198,5 @@ A       and             r7,  sp,  #15

        add             sp,  sp,  r7
        vpop            {q4-q7}
-        pop             {r4-r7,pc}
+        pop             {r4-r8,pc}
 endfunc
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -334,8 +334,10 @@ static void check_itxfm(void)
                // skip testing sub-IDCTs for WHT or ADST since they don't
                // implement it in any of the SIMD functions. If they do,
                // consider changing this to ensure we have complete test
-                // coverage
-                for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; sub <<= 1) {
+                // coverage. Test sub=1 for dc-only, then 2, 4, 8, 12, etc,
+                // since the arm version can distinguish them at that level.
+                for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;
+                     sub < 4 ? (sub <<= 1) : (sub += 4)) {
                    if (check_func(dsp.itxfm_add[tx][txtp],
                                   "vp9_inv_%s_%dx%d_sub%d_add_%d",
                                   tx == 4 ? "wht_wht" : txtp_types[txtp],