From c582cb8537367721bb399a5d01b652c20142b756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 14 Jan 2017 20:49:19 +0200 Subject: [PATCH 1/9] arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The theoretical maximum value of E is 193, so we can just saturate the addition to 255. Before: Cortex A7 A8 A9 A53 A53/AArch64 vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.8 88.0 87.7 vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0 136.7 vp9_loop_filter_v_16_8_neon: 497.0 419.5 379.7 293.0 275.7 vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0 452.0 After: vp9_loop_filter_v_4_8_neon: 136.0 125.7 112.6 84.0 83.0 vp9_loop_filter_v_8_8_neon: 234.0 195.5 171.5 136.0 133.7 vp9_loop_filter_v_16_8_neon: 490.0 417.5 377.7 289.0 271.0 vp9_loop_filter_v_16_16_neon: 951.2 814.7 732.3 571.0 446.7 Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9lpf_neon.S | 40 +++++++------------------------- libavcodec/arm/vp9lpf_neon.S | 11 ++++----- 2 files changed, 14 insertions(+), 37 deletions(-) diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index 5fafc7ad5c..48cac4cac6 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -51,13 +51,6 @@ // see the arm version instead. -.macro uabdl_sz dst1, dst2, in1, in2, sz - uabdl \dst1, \in1\().8b, \in2\().8b -.ifc \sz, .16b - uabdl2 \dst2, \in1\().16b, \in2\().16b -.endif -.endm - .macro add_sz dst1, dst2, in1, in2, in3, in4, sz add \dst1, \in1, \in3 .ifc \sz, .16b @@ -86,20 +79,6 @@ .endif .endm -.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz - cmhs \dst1, \in1, \in3 -.ifc \sz, .16b - cmhs \dst2, \in2, \in4 -.endif -.endm - -.macro xtn_sz dst, in1, in2, sz - xtn \dst\().8b, \in1 -.ifc \sz, .16b - xtn2 \dst\().16b, \in2 -.endif -.endm - .macro usubl_sz dst1, dst2, in1, in2, sz usubl \dst1, \in1\().8b, \in2\().8b .ifc \sz, .16b @@ -179,20 +158,20 @@ // tmpq2 == tmp3 + tmp4, etc. 
.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8 .if \mix == 0 - dup v0.8h, w2 // E - dup v1.8h, w2 // E + dup v0\sz, w2 // E dup v2\sz, w3 // I dup v3\sz, w4 // H .else - dup v0.8h, w2 // E + dup v0.8b, w2 // E dup v2.8b, w3 // I dup v3.8b, w4 // H + lsr w5, w2, #8 lsr w6, w3, #8 lsr w7, w4, #8 - ushr v1.8h, v0.8h, #8 // E + dup v1.8b, w5 // E dup v4.8b, w6 // I - bic v0.8h, #255, lsl 8 // E dup v5.8b, w7 // H + trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d .endif @@ -206,16 +185,15 @@ umax v4\sz, v4\sz, v5\sz umax v5\sz, v6\sz, v7\sz umax \tmp1\sz, \tmp1\sz, \tmp2\sz - uabdl_sz v6.8h, v7.8h, v23, v24, \sz // abs(p0 - q0) + uabd v6\sz, v23\sz, v24\sz // abs(p0 - q0) umax v4\sz, v4\sz, v5\sz - add_sz v6.8h, v7.8h, v6.8h, v7.8h, v6.8h, v7.8h, \sz // abs(p0 - q0) * 2 + uqadd v6\sz, v6\sz, v6\sz // abs(p0 - q0) * 2 uabd v5\sz, v22\sz, v25\sz // abs(p1 - q1) umax v4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), ..., abs(q2 - q3)) ushr v5\sz, v5\sz, #1 cmhs v4\sz, v2\sz, v4\sz // max(abs()) <= I - uaddw_sz v6.8h, v7.8h, v6.8h, v7.8h, v5, \sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 - cmhs_sz v6.8h, v7.8h, v0.8h, v1.8h, v6.8h, v7.8h, \sz - xtn_sz v5, v6.8h, v7.8h, \sz + uqadd v6\sz, v6\sz, v5\sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + cmhs v5\sz, v0\sz, v6\sz and v4\sz, v4\sz, v5\sz // fm // If no pixels need filtering, just exit as soon as possible diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index 1e161e0c63..e31c807cc0 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -51,7 +51,7 @@ @ and d28-d31 as temp registers, or d8-d15. @ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4 .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4 - vdup.u16 q0, r2 @ E + vdup.u8 d0, r2 @ E vdup.u8 d2, r3 @ I ldr r3, [sp] @@ -64,16 +64,15 @@ vmax.u8 d4, d4, d5 vmax.u8 d5, d6, d7 vmax.u8 \tmp1, \tmp1, \tmp2 - vabdl.u8 q3, d23, d24 @ abs(p0 - q0) + vabd.u8 d6, d23, d24 @ abs(p0 - q0) vmax.u8 d4, d4, d5 - vadd.u16 q3, q3, q3 @ abs(p0 - q0) * 2 + vqadd.u8 d6, d6, d6 @ abs(p0 - q0) * 2 vabd.u8 d5, d22, d25 @ abs(p1 - q1) vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3)) vshr.u8 d5, d5, #1 vcle.u8 d4, d4, d2 @ max(abs()) <= I - vaddw.u8 q3, q3, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 - vcle.u16 q3, q3, q0 - vmovn.u16 d5, q3 + vqadd.u8 d6, d6, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + vcle.u8 d5, d6, d0 vand d4, d4, d5 @ fm vdup.u8 d3, r3 @ H From 3bf9c48320f25f3d5557485b0202f22ae60748b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 23 Feb 2017 23:33:58 +0200 Subject: [PATCH 2/9] aarch64: vp9lpf: Use dup+rev16+uzp1 instead of dup+lsr+dup+trn1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is one cycle faster in total, and three instructions fewer. 
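Spelled out per byte lane, the new sequence ends up with the same layout as the old
dup+lsr+dup+trn1 chain (w2 packs the two per-half 8-bit E values, E0 in the low byte
and E1 in the high byte; the comments below are only illustrative):

        dup             v0.8h,  w2              // E0, E1, E0, E1, ...
        rev16           v1.16b, v0.16b          // E1, E0, E1, E0, ...
        uzp1            v0.16b, v0.16b, v1.16b  // E0 x8 | E1 x8

uzp1 picks the even-numbered byte lanes of its two sources, so the low half of the
result holds E0 in every lane and the high half holds E1, which is the split the
mix2 loop filter needs for its two 8-pixel halves; I and H are handled the same way.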
Before: vp9_loop_filter_mix2_v_44_16_neon: 123.2 After: vp9_loop_filter_mix2_v_44_16_neon: 122.2 Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9lpf_neon.S | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index 48cac4cac6..e9c497096b 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -162,18 +162,15 @@ dup v2\sz, w3 // I dup v3\sz, w4 // H .else - dup v0.8b, w2 // E - dup v2.8b, w3 // I - dup v3.8b, w4 // H - lsr w5, w2, #8 - lsr w6, w3, #8 - lsr w7, w4, #8 - dup v1.8b, w5 // E - dup v4.8b, w6 // I - dup v5.8b, w7 // H - trn1 v0.2d, v0.2d, v1.2d - trn1 v2.2d, v2.2d, v4.2d - trn1 v3.2d, v3.2d, v5.2d + dup v0.8h, w2 // E + dup v2.8h, w3 // I + dup v3.8h, w4 // H + rev16 v1.16b, v0.16b // E + rev16 v4.16b, v2.16b // I + rev16 v5.16b, v3.16b // H + uzp1 v0.16b, v0.16b, v1.16b + uzp1 v2.16b, v2.16b, v4.16b + uzp1 v3.16b, v3.16b, v5.16b .endif uabd v4\sz, v20\sz, v21\sz // abs(p3 - p2) From 575e31e931e4178e9f1e24407503c9b4ec0ef9ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 14 Jan 2017 13:22:30 +0200 Subject: [PATCH 3/9] arm: vp9lpf: Implement the mix2_44 function with one single filter pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For this case, with 8 inputs but only changing 4 of them, we can fit all 16 input pixels into a q register, and still have enough temporary registers for doing the loop filter. The wd=8 filters would require too many temporary registers for processing all 16 pixels at once though. Before: Cortex A7 A8 A9 A53 vp9_loop_filter_mix2_v_44_16_neon: 289.7 256.2 237.5 181.2 After: vp9_loop_filter_mix2_v_44_16_neon: 221.2 150.5 177.7 138.0 Signed-off-by: Martin Storsjö --- libavcodec/arm/vp9dsp_init_arm.c | 7 +- libavcodec/arm/vp9lpf_neon.S | 191 +++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+), 3 deletions(-) diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c index e99d931674..1ede1708e7 100644 --- a/libavcodec/arm/vp9dsp_init_arm.c +++ b/libavcodec/arm/vp9dsp_init_arm.c @@ -194,6 +194,8 @@ define_loop_filters(8, 8); define_loop_filters(16, 8); define_loop_filters(16, 16); +define_loop_filters(44, 16); + #define lf_mix_fn(dir, wd1, wd2, stridea) \ static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, \ ptrdiff_t stride, \ @@ -207,7 +209,6 @@ static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, lf_mix_fn(h, wd1, wd2, stride) \ lf_mix_fn(v, wd1, wd2, sizeof(uint8_t)) -lf_mix_fns(4, 4) lf_mix_fns(4, 8) lf_mix_fns(8, 4) lf_mix_fns(8, 8) @@ -227,8 +228,8 @@ static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp) dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon; dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon; - dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_neon; - dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_neon; + dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon; + dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon; dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon; dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon; dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon; diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index e31c807cc0..12984a900c 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -44,6 +44,109 @@ vtrn.8 \r2, \r3 
.endm +@ The input to and output from this macro is in the registers q8-q15, +@ and q0-q7 are used as scratch registers. +@ p3 = q8, p0 = q11, q0 = q12, q3 = q15 +.macro loop_filter_q + vdup.u8 d0, r2 @ E + lsr r2, r2, #8 + vdup.u8 d2, r3 @ I + lsr r3, r3, #8 + vdup.u8 d1, r2 @ E + vdup.u8 d3, r3 @ I + + vabd.u8 q2, q8, q9 @ abs(p3 - p2) + vabd.u8 q3, q9, q10 @ abs(p2 - p1) + vabd.u8 q4, q10, q11 @ abs(p1 - p0) + vabd.u8 q5, q12, q13 @ abs(q0 - q1) + vabd.u8 q6, q13, q14 @ abs(q1 - q2) + vabd.u8 q7, q14, q15 @ abs(q2 - q3) + vmax.u8 q2, q2, q3 + vmax.u8 q3, q4, q5 + vmax.u8 q4, q6, q7 + vabd.u8 q5, q11, q12 @ abs(p0 - q0) + vmax.u8 q2, q2, q3 + vqadd.u8 q5, q5, q5 @ abs(p0 - q0) * 2 + vabd.u8 q7, q10, q13 @ abs(p1 - q1) + vmax.u8 q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - q3)) + vshr.u8 q7, q7, #1 + vcle.u8 q2, q2, q1 @ max(abs()) <= I + vqadd.u8 q5, q5, q7 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + vcle.u8 q5, q5, q0 + vand q2, q2, q5 @ fm + + vshrn.u16 d10, q2, #4 + vmov r2, r3, d10 + orrs r2, r2, r3 + @ If no pixels need filtering, just exit as soon as possible + beq 9f + + @ Calculate the normal inner loop filter for 2 or 4 pixels + ldr r3, [sp, #64] + vabd.u8 q3, q10, q11 @ abs(p1 - p0) + vabd.u8 q4, q13, q12 @ abs(q1 - q0) + + vsubl.u8 q5, d20, d26 @ p1 - q1 + vsubl.u8 q6, d21, d27 @ p1 - q1 + vmax.u8 q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0)) + vqmovn.s16 d10, q5 @ av_clip_int8p(p1 - q1) + vqmovn.s16 d11, q6 @ av_clip_int8p(p1 - q1) + vdup.u8 d8, r3 @ H + lsr r3, r3, #8 + vdup.u8 d9, r3 @ H + vsubl.u8 q6, d24, d22 @ q0 - p0 + vsubl.u8 q7, d25, d23 @ q0 - p0 + vcle.u8 q3, q3, q4 @ hev + vmov.s16 q0, #3 + vand q3, q3, q2 @ !hev && fm && !flat8in + + vmul.s16 q6, q6, q0 @ 3 * (q0 - p0) + vmul.s16 q7, q7, q0 @ 3 * (q0 - p0) + vbic q5, q5, q3 @ if (!hev) av_clip_int8 = 0 + vaddw.s8 q6, q6, d10 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)] + vaddw.s8 q7, q7, d11 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)] + vmov.s8 q5, #4 + vqmovn.s16 d12, q6 + vqmovn.s16 d13, q7 @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f + vmov.s8 q0, #3 + + vqadd.s8 q5, q6, q5 @ FFMIN(f + 4, 127) + vqadd.s8 q0, q6, q0 @ FFMIN(f + 3, 127) + vmovl.u8 q6, d22 @ p0 + vmovl.u8 q7, d23 @ p0 + vshr.s8 q5, q5, #3 @ f1 + vshr.s8 q0, q0, #3 @ f2 + + vaddw.s8 q6, q6, d0 @ p0 + f2 + vaddw.s8 q7, q7, d1 @ p0 + f2 + vqmovun.s16 d0, q6 @ out p0 + vmovl.u8 q6, d24 @ q0 + vqmovun.s16 d1, q7 @ out p0 + vmovl.u8 q7, d25 @ q0 + vsubw.s8 q6, q6, d10 @ q0 - f1 + vsubw.s8 q7, q7, d11 @ q0 - f1 + vqmovun.s16 d12, q6 @ out q0 + vqmovun.s16 d13, q7 @ out q0 + vrshr.s8 q5, q5, #1 @ f = (f1 + 1) >> 1 + vbit q11, q0, q2 @ if (fm && !flat8in) + vbit q12, q6, q2 + + vmovl.u8 q0, d20 @ p1 + vmovl.u8 q2, d21 @ p1 + vmovl.u8 q6, d26 @ q1 + vmovl.u8 q7, d27 @ q1 + vaddw.s8 q0, q0, d10 @ p1 + f + vaddw.s8 q2, q2, d11 @ p1 + f + vsubw.s8 q6, q6, d10 @ q1 - f + vsubw.s8 q7, q7, d11 @ q1 - f + vqmovun.s16 d0, q0 @ out p1 + vqmovun.s16 d1, q2 @ out p1 + vqmovun.s16 d12, q6 @ out q1 + vqmovun.s16 d13, q7 @ out q1 + vbit q10, q0, q3 @ if (!hev && fm && !flat8in) + vbit q13, q6, q3 +.endm + @ The input to and output from this macro is in the registers d16-d31, @ and d0-d7 are used as scratch registers. @ p7 = d16 .. 
p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31 @@ -455,6 +558,94 @@ function ff_vp9_loop_filter_h_4_8_neon, export=1 bx lr endfunc +function ff_vp9_loop_filter_v_44_16_neon, export=1 + vpush {q4-q7} + sub r12, r0, r1, lsl #2 + vld1.8 {q8}, [r12,:128], r1 @ p3 + vld1.8 {q12}, [r0, :128], r1 @ q0 + vld1.8 {q9}, [r12,:128], r1 @ p2 + vld1.8 {q13}, [r0, :128], r1 @ q1 + vld1.8 {q10}, [r12,:128], r1 @ p1 + vld1.8 {q14}, [r0, :128], r1 @ q2 + vld1.8 {q11}, [r12,:128], r1 @ p0 + vld1.8 {q15}, [r0, :128], r1 @ q3 + sub r0, r0, r1, lsl #2 + sub r12, r12, r1, lsl #1 + + loop_filter_q + + vst1.8 {q10}, [r12,:128], r1 + vst1.8 {q12}, [r0, :128], r1 + vst1.8 {q11}, [r12,:128], r1 + vst1.8 {q13}, [r0, :128], r1 +9: + vpop {q4-q7} + bx lr +endfunc + +function ff_vp9_loop_filter_h_44_16_neon, export=1 + vpush {q4-q7} + sub r12, r0, #4 + add r0, r12, r1, lsl #2 + vld1.8 {d16}, [r12], r1 + vld1.8 {d24}, [r0], r1 + vld1.8 {d18}, [r12], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d20}, [r12], r1 + vld1.8 {d28}, [r0], r1 + vld1.8 {d22}, [r12], r1 + vld1.8 {d30}, [r0], r1 + mov r12, r0 + add r0, r0, r1, lsl #2 + vld1.8 {d17}, [r12], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d19}, [r12], r1 + vld1.8 {d27}, [r0], r1 + vld1.8 {d21}, [r12], r1 + vld1.8 {d29}, [r0], r1 + vld1.8 {d23}, [r12], r1 + vld1.8 {d31}, [r0], r1 + + @ Transpose the 16x8 pixels, as two 8x8 parts + transpose_8x8 q8, q9, q10, q11, q12, q13, q14, q15 + + loop_filter_q + + sub r12, r0, r1, lsl #4 + add r0, r12, r1, lsl #3 + @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the + @ outermost 2 pixels since they aren't changed. + add r12, r12, #2 + add r0, r0, #2 + + @ We only will write the mid 4 pixels back; after the loop filter, + @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels). + @ We need to transpose them to columns, done with a 4x4 transpose + @ (which in practice is four 4x4 transposes of the 4x4 blocks of + @ the 16x4 pixels; into 4x16 pixels). + transpose_4x4 q10, q11, q12, q13 + + vst1.32 {d20[0]}, [r12], r1 + vst1.32 {d21[0]}, [r0], r1 + vst1.32 {d22[0]}, [r12], r1 + vst1.32 {d23[0]}, [r0], r1 + vst1.32 {d24[0]}, [r12], r1 + vst1.32 {d25[0]}, [r0], r1 + vst1.32 {d26[0]}, [r12], r1 + vst1.32 {d27[0]}, [r0], r1 + vst1.32 {d20[1]}, [r12], r1 + vst1.32 {d21[1]}, [r0], r1 + vst1.32 {d22[1]}, [r12], r1 + vst1.32 {d23[1]}, [r0], r1 + vst1.32 {d24[1]}, [r12], r1 + vst1.32 {d25[1]}, [r0], r1 + vst1.32 {d26[1]}, [r12], r1 + vst1.32 {d27[1]}, [r0], r1 +9: + vpop {q4-q7} + bx lr +endfunc + function ff_vp9_loop_filter_v_8_8_neon, export=1 sub r12, r0, r1, lsl #2 vld1.8 {d20}, [r12,:64], r1 @ p3 From 402546a17233a8815307df9e14ff88cd70424537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 2 Jan 2017 22:50:38 +0200 Subject: [PATCH 4/9] arm: vp9itxfm: Avoid reloading the idct32 coefficients MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idct32x32 function actually pushed q4-q7 onto the stack even though it didn't clobber them; there are plenty of registers that can be used to allow keeping all the idct coefficients in registers without having to reload different subsets of them at different stages in the transform. Since the idct16 core transform avoids clobbering q4-q7 (but clobbers q2-q3 instead, to avoid needing to back up and restore q4-q7 at all in the idct16 function), and the lanewise vmul needs a register in the q0-q3 range, we move the stored coefficients from q2-q3 into q4-q5 while doing idct16. 
While keeping these coefficients in registers, we still can skip pushing q7. Before: Cortex A7 A8 A9 A53 vp9_inv_dct_dct_32x32_sub32_add_neon: 18553.8 17182.7 14303.3 12089.7 After: vp9_inv_dct_dct_32x32_sub32_add_neon: 18470.3 16717.7 14173.6 11860.8 Signed-off-by: Martin Storsjö --- libavcodec/arm/vp9itxfm_neon.S | 240 ++++++++++++++++----------------- 1 file changed, 117 insertions(+), 123 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 8dc4bbfa55..bed502eba2 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -1185,58 +1185,51 @@ function idct32x32_dc_add_neon endfunc .macro idct32_end - butterfly d16, d5, d4, d5 @ d16 = t16a, d5 = t19a + butterfly d16, d9, d8, d9 @ d16 = t16a, d9 = t19a butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18 - butterfly d18, d6, d7, d6 @ d18 = t23a, d6 = t20a + butterfly d18, d10, d11, d10 @ d18 = t23a, d10 = t20a butterfly d19, d21, d22, d21 @ d19 = t22, d21 = t21 - butterfly d4, d28, d28, d30 @ d4 = t24a, d28 = t27a + butterfly d8, d28, d28, d30 @ d8 = t24a, d28 = t27a butterfly d23, d26, d25, d26 @ d23 = t25, d26 = t26 - butterfly d7, d29, d29, d31 @ d7 = t31a, d29 = t28a + butterfly d11, d29, d29, d31 @ d11 = t31a, d29 = t28a butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29 mbutterfly d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = t29a - mbutterfly d29, d5, d0[1], d0[2], q12, q15 @ d29 = t19, d5 = t28 - mbutterfly d28, d6, d0[1], d0[2], q12, q15, neg=1 @ d28 = t27, d6 = t20 + mbutterfly d29, d9, d0[1], d0[2], q12, q15 @ d29 = t19, d9 = t28 + mbutterfly d28, d10, d0[1], d0[2], q12, q15, neg=1 @ d28 = t27, d10 = t20 mbutterfly d26, d21, d0[1], d0[2], q12, q15, neg=1 @ d26 = t26a, d21 = t21a - butterfly d31, d24, d7, d4 @ d31 = t31, d24 = t24 + butterfly d31, d24, d11, d8 @ d31 = t31, d24 = t24 butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a butterfly_r d23, d16, d16, d18 @ d23 = t23, d16 = t16 butterfly_r d22, d17, d17, d19 @ d22 = t22a, d17 = t17a butterfly d18, d21, d27, d21 @ d18 = t18, d21 = t21 - butterfly_r d27, d28, d5, d28 @ d27 = t27a, d28 = t28a - butterfly d4, d26, d20, d26 @ d4 = t29, d26 = t26 - butterfly d19, d20, d29, d6 @ d19 = t19a, d20 = t20 - vmov d29, d4 @ d29 = t29 + butterfly_r d27, d28, d9, d28 @ d27 = t27a, d28 = t28a + butterfly d8, d26, d20, d26 @ d8 = t29, d26 = t26 + butterfly d19, d20, d29, d10 @ d19 = t19a, d20 = t20 + vmov d29, d8 @ d29 = t29 - mbutterfly0 d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27, d20 = t20 - mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a - mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 - mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a + mbutterfly0 d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27, d20 = t20 + mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a + mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22 + mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a bx lr .endm function idct32_odd - movrel r12, idct_coeffs - add r12, r12, #32 - vld1.16 {q0-q1}, [r12,:128] + mbutterfly d16, d31, d4[0], d4[1], q4, q5 @ d16 = t16a, d31 = t31a + mbutterfly d24, d23, d4[2], d4[3], q4, q5 @ d24 = t17a, d23 = t30a + mbutterfly d20, d27, d5[0], d5[1], q4, q5 @ d20 = t18a, d27 = t29a + mbutterfly d28, d19, d5[2], d5[3], q4, q5 @ d28 = t19a, d19 = t28a + mbutterfly d18, d29, d6[0], d6[1], q4, q5 @ d18 = t20a, d29 = t27a + mbutterfly d26, d21, d6[2], d6[3], q4, q5 @ d26 = t21a, d21 = 
t26a + mbutterfly d22, d25, d7[0], d7[1], q4, q5 @ d22 = t22a, d25 = t25a + mbutterfly d30, d17, d7[2], d7[3], q4, q5 @ d30 = t23a, d17 = t24a - mbutterfly d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a - mbutterfly d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a - mbutterfly d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a - mbutterfly d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a - mbutterfly d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a - mbutterfly d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a - mbutterfly d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a - mbutterfly d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a - - sub r12, r12, #32 - vld1.16 {q0}, [r12,:128] - - butterfly d4, d24, d16, d24 @ d4 = t16, d24 = t17 - butterfly d5, d20, d28, d20 @ d5 = t19, d20 = t18 - butterfly d6, d26, d18, d26 @ d6 = t20, d26 = t21 - butterfly d7, d22, d30, d22 @ d7 = t23, d22 = t22 + butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17 + butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18 + butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21 + butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22 butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25 butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26 butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 @@ -1250,26 +1243,19 @@ function idct32_odd endfunc function idct32_odd_half - movrel r12, idct_coeffs - add r12, r12, #32 - vld1.16 {q0-q1}, [r12,:128] + mbutterfly_h1 d16, d31, d4[0], d4[1], q4, q5 @ d16 = t16a, d31 = t31a + mbutterfly_h2 d24, d23, d4[2], d4[3], q4, q5 @ d24 = t17a, d23 = t30a + mbutterfly_h1 d20, d27, d5[0], d5[1], q4, q5 @ d20 = t18a, d27 = t29a + mbutterfly_h2 d28, d19, d5[2], d5[3], q4, q5 @ d28 = t19a, d19 = t28a + mbutterfly_h1 d18, d29, d6[0], d6[1], q4, q5 @ d18 = t20a, d29 = t27a + mbutterfly_h2 d26, d21, d6[2], d6[3], q4, q5 @ d26 = t21a, d21 = t26a + mbutterfly_h1 d22, d25, d7[0], d7[1], q4, q5 @ d22 = t22a, d25 = t25a + mbutterfly_h2 d30, d17, d7[2], d7[3], q4, q5 @ d30 = t23a, d17 = t24a - mbutterfly_h1 d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a - mbutterfly_h2 d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a - mbutterfly_h1 d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a - mbutterfly_h2 d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a - mbutterfly_h1 d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a - mbutterfly_h2 d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a - mbutterfly_h1 d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a - mbutterfly_h2 d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a - - sub r12, r12, #32 - vld1.16 {q0}, [r12,:128] - - butterfly d4, d24, d16, d24 @ d4 = t16, d24 = t17 - butterfly d5, d20, d28, d20 @ d5 = t19, d20 = t18 - butterfly d6, d26, d18, d26 @ d6 = t20, d26 = t21 - butterfly d7, d22, d30, d22 @ d7 = t23, d22 = t22 + butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17 + butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18 + butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21 + butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22 butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25 butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26 butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 @@ -1284,45 +1270,38 @@ function idct32_odd_half endfunc function idct32_odd_quarter - movrel r12, idct_coeffs - add r12, r12, #32 - vld1.16 {q0-q1}, [r12,:128] - - vmull.s16 q2, d16, d0[0] - vmull.s16 q14, d19, d1[3] - vmull.s16 q15, d16, d0[1] - vmull.s16 q11, d17, d3[2] - vmull.s16 q3, d17, d3[3] 
- vmull.s16 q13, d19, d1[2] - vmull.s16 q10, d18, d2[0] - vmull.s16 q12, d18, d2[1] - - sub r12, r12, #32 - vld1.16 {q0}, [r12,:128] + vmull.s16 q4, d16, d4[0] + vmull.s16 q14, d19, d5[3] + vmull.s16 q15, d16, d4[1] + vmull.s16 q11, d17, d7[2] + vmull.s16 q5, d17, d7[3] + vmull.s16 q13, d19, d5[2] + vmull.s16 q10, d18, d6[0] + vmull.s16 q12, d18, d6[1] vneg.s32 q14, q14 - vneg.s32 q3, q3 + vneg.s32 q5, q5 - vrshrn.s32 d4, q2, #14 - vrshrn.s32 d5, q14, #14 + vrshrn.s32 d8, q4, #14 + vrshrn.s32 d9, q14, #14 vrshrn.s32 d29, q15, #14 vrshrn.s32 d28, q11, #14 - vrshrn.s32 d7, q3, #14 + vrshrn.s32 d11, q5, #14 vrshrn.s32 d31, q13, #14 - vrshrn.s32 d6, q10, #14 + vrshrn.s32 d10, q10, #14 vrshrn.s32 d30, q12, #14 - mbutterfly_l q8, q9, d29, d4, d0[3], d1[0] - mbutterfly_l q13, q10, d31, d5, d0[3], d1[0] + mbutterfly_l q8, q9, d29, d8, d0[3], d1[0] + mbutterfly_l q13, q10, d31, d9, d0[3], d1[0] vrshrn.s32 d23, q8, #14 vrshrn.s32 d24, q9, #14 vneg.s32 q10, q10 vrshrn.s32 d27, q13, #14 vrshrn.s32 d20, q10, #14 - mbutterfly_l q8, q9, d30, d6, d1[1], d1[2] + mbutterfly_l q8, q9, d30, d10, d1[1], d1[2] vrshrn.s32 d21, q8, #14 vrshrn.s32 d26, q9, #14 - mbutterfly_l q8, q9, d28, d7, d1[1], d1[2] + mbutterfly_l q8, q9, d28, d11, d1[1], d1[2] vrshrn.s32 d25, q8, #14 vneg.s32 q9, q9 vrshrn.s32 d22, q9, #14 @@ -1343,8 +1322,11 @@ endfunc function idct32_1d_4x32_pass1\suffix\()_neon push {lr} - movrel r12, idct_coeffs - vld1.16 {q0-q1}, [r12,:128] + @ idct16 clobbers q2-q3 (since it doesn't clobber q4-q7 at all + @ when doing the normal 16x16 idct), so move the idct32_odd coeffs + @ to q4-q5 + vmov q4, q2 + vmov q5, q3 @ Double stride of the input, since we only read every other line mov r12, #128 @@ -1372,6 +1354,11 @@ function idct32_1d_4x32_pass1\suffix\()_neon bl idct16\suffix + @ Move the idct32_odd coeffs back into q2-q3 for idct32_odd; + @ the constants for a vmul with a lane must be in q0-q3. + vmov q2, q4 + vmov q3, q5 + @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @ contain the transposed 4x4 blocks. @@ -1407,24 +1394,24 @@ function idct32_1d_4x32_pass1\suffix\()_neon .endif add r2, r2, #64 - vmov.s16 d4, #0 + vmov.s16 d8, #0 @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31) .ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64] - vst1.16 {d4}, [r2,:64], r12 + vst1.16 {d8}, [r2,:64], r12 .endr .endif .ifc \suffix,_quarter .irp i, 16, 17, 18, 19 vld1.16 {d\i}, [r2,:64] - vst1.16 {d4}, [r2,:64], r12 + vst1.16 {d8}, [r2,:64], r12 .endr .endif .ifc \suffix,_half .irp i, 16, 17, 18, 19, 20, 21, 22, 23 vld1.16 {d\i}, [r2,:64] - vst1.16 {d4}, [r2,:64], r12 + vst1.16 {d8}, [r2,:64], r12 .endr .endif @@ -1437,15 +1424,15 @@ function idct32_1d_4x32_pass1\suffix\()_neon @ from the output. .macro store_rev a, b, c, d .irp i, \a, \b, \c, \d - vld1.16 {d4}, [r0,:64] - vadd.s16 d4, d4, d\i - vst1.16 {d4}, [r0,:64]! + vld1.16 {d8}, [r0,:64] + vadd.s16 d8, d8, d\i + vst1.16 {d8}, [r0,:64]! vrev64.16 d\i, d\i .endr .irp i, \d, \c, \b, \a - vld1.16 {d4}, [r0,:64] - vsub.s16 d4, d4, d\i - vst1.16 {d4}, [r0,:64]! + vld1.16 {d8}, [r0,:64] + vsub.s16 d8, d8, d\i + vst1.16 {d8}, [r0,:64]! .endr .endm @@ -1466,8 +1453,8 @@ endfunc @ r2 = src (temp buffer) function idct32_1d_4x32_pass2\suffix\()_neon push {lr} - movrel r12, idct_coeffs - vld1.16 {q0-q1}, [r12,:128] + vmov q4, q2 + vmov q5, q3 mov r12, #128 @ d16 = IN(0), d17 = IN(2) ... 
d31 = IN(30) @@ -1492,6 +1479,9 @@ function idct32_1d_4x32_pass2\suffix\()_neon bl idct16\suffix + vmov q2, q4 + vmov q3, q5 + .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vst1.16 {d\i}, [r2,:64], r12 .endr @@ -1524,38 +1514,38 @@ function idct32_1d_4x32_pass2\suffix\()_neon mov r12, #128 .macro load_acc_store a, b, c, d, neg=0 - vld1.16 {d4}, [r2,:64], r12 - vld1.16 {d5}, [r2,:64], r12 + vld1.16 {d8}, [r2,:64], r12 + vld1.16 {d9}, [r2,:64], r12 .if \neg == 0 - vadd.s16 d4, d4, d\a - vld1.16 {d6}, [r2,:64], r12 - vadd.s16 d5, d5, d\b - vld1.16 {d7}, [r2,:64], r12 - vadd.s16 d6, d6, d\c - vadd.s16 d7, d7, d\d + vadd.s16 d8, d8, d\a + vld1.16 {d10}, [r2,:64], r12 + vadd.s16 d9, d9, d\b + vld1.16 {d11}, [r2,:64], r12 + vadd.s16 d10, d10, d\c + vadd.s16 d11, d11, d\d .else - vsub.s16 d4, d4, d\a - vld1.16 {d6}, [r2,:64], r12 - vsub.s16 d5, d5, d\b - vld1.16 {d7}, [r2,:64], r12 - vsub.s16 d6, d6, d\c - vsub.s16 d7, d7, d\d + vsub.s16 d8, d8, d\a + vld1.16 {d10}, [r2,:64], r12 + vsub.s16 d9, d9, d\b + vld1.16 {d11}, [r2,:64], r12 + vsub.s16 d10, d10, d\c + vsub.s16 d11, d11, d\d .endif - vld1.32 {d2[]}, [r0,:32], r1 - vld1.32 {d2[1]}, [r0,:32], r1 - vrshr.s16 q2, q2, #6 - vld1.32 {d3[]}, [r0,:32], r1 - vrshr.s16 q3, q3, #6 - vld1.32 {d3[1]}, [r0,:32], r1 + vld1.32 {d12[]}, [r0,:32], r1 + vld1.32 {d12[1]}, [r0,:32], r1 + vrshr.s16 q4, q4, #6 + vld1.32 {d13[]}, [r0,:32], r1 + vrshr.s16 q5, q5, #6 + vld1.32 {d13[1]}, [r0,:32], r1 sub r0, r0, r1, lsl #2 - vaddw.u8 q2, q2, d2 - vaddw.u8 q3, q3, d3 - vqmovun.s16 d4, q2 - vqmovun.s16 d5, q3 - vst1.32 {d4[0]}, [r0,:32], r1 - vst1.32 {d4[1]}, [r0,:32], r1 - vst1.32 {d5[0]}, [r0,:32], r1 - vst1.32 {d5[1]}, [r0,:32], r1 + vaddw.u8 q4, q4, d12 + vaddw.u8 q5, q5, d13 + vqmovun.s16 d8, q4 + vqmovun.s16 d9, q5 + vst1.32 {d8[0]}, [r0,:32], r1 + vst1.32 {d8[1]}, [r0,:32], r1 + vst1.32 {d9[0]}, [r0,:32], r1 + vst1.32 {d9[1]}, [r0,:32], r1 .endm load_acc_store 31, 30, 29, 28 load_acc_store 27, 26, 25, 24 @@ -1584,7 +1574,7 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 cmp r3, #1 beq idct32x32_dc_add_neon push {r4-r8,lr} - vpush {q4-q7} + vpush {q4-q6} movrel r8, min_eob_idct_idct_32 + 2 @ Align the stack, allocate a temp buffer @@ -1598,6 +1588,10 @@ A and r7, sp, #15 mov r5, r1 mov r6, r2 + movrel r12, idct_coeffs + vld1.16 {q0-q1}, [r12,:128]! + vld1.16 {q2-q3}, [r12,:128] + cmp r3, #34 ble idct32x32_quarter_add_neon cmp r3, #135 @@ -1636,7 +1630,7 @@ A and r7, sp, #15 .endr add sp, sp, r7 - vpop {q4-q7} + vpop {q4-q6} pop {r4-r8,pc} endfunc @@ -1668,7 +1662,7 @@ function idct32x32_quarter_add_neon .endr add sp, sp, r7 - vpop {q4-q7} + vpop {q4-q6} pop {r4-r8,pc} endfunc @@ -1706,6 +1700,6 @@ function idct32x32_half_add_neon .endr add sp, sp, r7 - vpop {q4-q7} + vpop {q4-q6} pop {r4-r8,pc} endfunc From 65aa002d54433154a6924dc13e498bec98451ad0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 2 Jan 2017 22:08:41 +0200 Subject: [PATCH 5/9] aarch64: vp9itxfm: Avoid reloading the idct32 coefficients MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idct32x32 function actually pushed d8-d15 onto the stack even though it didn't clobber them; there are plenty of registers that can be used to allow keeping all the idct coefficients in registers without having to reload different subsets of them at different stages in the transform. After this, we still can skip pushing d12-d15. 
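The pushes are only needed at all because the AAPCS requires d8-d15 (the low 64 bits
of v8-v15) to be preserved across calls; with the coefficients kept in v8-v9 and
v12-v15 left untouched, the save/restore that remains is just:

        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
        ...
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10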
Before: vp9_inv_dct_dct_32x32_sub32_add_neon: 8128.3 After: vp9_inv_dct_dct_32x32_sub32_add_neon: 8053.3 Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp9itxfm_neon.S | 110 +++++++++++------------------ 1 file changed, 43 insertions(+), 67 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index d35f103a79..b6c2575236 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -1123,18 +1123,14 @@ endfunc .endm function idct32_odd - ld1 {v0.8h,v1.8h}, [x11] - - dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a - dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a - dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a - dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a - dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a - dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a - dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a - dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a - - ld1 {v0.8h}, [x10] + dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 @@ -1153,18 +1149,14 @@ function idct32_odd endfunc function idct32_odd_half - ld1 {v0.8h,v1.8h}, [x11] - - dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a - dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a - dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a - dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a - dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a - dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a - dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a - dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a - - ld1 {v0.8h}, [x10] + dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a butterfly_8h v4, v24, v16, v24 // v4 = 
t16, v24 = t17 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 @@ -1183,18 +1175,14 @@ function idct32_odd_half endfunc function idct32_odd_quarter - ld1 {v0.8h,v1.8h}, [x11] - - dsmull_h v4, v5, v16, v0.h[0] - dsmull_h v28, v29, v19, v0.h[7] - dsmull_h v30, v31, v16, v0.h[1] - dsmull_h v22, v23, v17, v1.h[6] - dsmull_h v7, v6, v17, v1.h[7] - dsmull_h v26, v27, v19, v0.h[6] - dsmull_h v20, v21, v18, v1.h[0] - dsmull_h v24, v25, v18, v1.h[1] - - ld1 {v0.8h}, [x10] + dsmull_h v4, v5, v16, v8.h[0] + dsmull_h v28, v29, v19, v8.h[7] + dsmull_h v30, v31, v16, v8.h[1] + dsmull_h v22, v23, v17, v9.h[6] + dsmull_h v7, v6, v17, v9.h[7] + dsmull_h v26, v27, v19, v8.h[6] + dsmull_h v20, v21, v18, v9.h[0] + dsmull_h v24, v25, v18, v9.h[1] neg v28.4s, v28.4s neg v29.4s, v29.4s @@ -1240,12 +1228,8 @@ endfunc // x1 = unused // x2 = src // x9 = double input stride -// x10 = idct_coeffs -// x11 = idct_coeffs + 32 function idct32_1d_8x32_pass1\suffix\()_neon mov x14, x30 - ld1 {v0.8h,v1.8h}, [x10] - movi v2.8h, #0 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) @@ -1278,14 +1262,14 @@ function idct32_1d_8x32_pass1\suffix\()_neon .macro store_rev a, b // There's no rev128 instruction, but we reverse each 64 bit // half, and then flip them using an ext with 8 bytes offset. - rev64 v1.8h, \b + rev64 v3.8h, \b st1 {\a}, [x0], #16 - rev64 v0.8h, \a - ext v1.16b, v1.16b, v1.16b, #8 + rev64 v2.8h, \a + ext v3.16b, v3.16b, v3.16b, #8 st1 {\b}, [x0], #16 - ext v0.16b, v0.16b, v0.16b, #8 - st1 {v1.8h}, [x0], #16 - st1 {v0.8h}, [x0], #16 + ext v2.16b, v2.16b, v2.16b, #8 + st1 {v3.8h}, [x0], #16 + st1 {v2.8h}, [x0], #16 .endm store_rev v16.8h, v24.8h store_rev v17.8h, v25.8h @@ -1339,20 +1323,20 @@ function idct32_1d_8x32_pass1\suffix\()_neon // subtracted from the output. .macro store_rev a, b ld1 {v4.8h}, [x0] - rev64 v1.8h, \b + rev64 v3.8h, \b add v4.8h, v4.8h, \a - rev64 v0.8h, \a + rev64 v2.8h, \a st1 {v4.8h}, [x0], #16 - ext v1.16b, v1.16b, v1.16b, #8 + ext v3.16b, v3.16b, v3.16b, #8 ld1 {v5.8h}, [x0] - ext v0.16b, v0.16b, v0.16b, #8 + ext v2.16b, v2.16b, v2.16b, #8 add v5.8h, v5.8h, \b st1 {v5.8h}, [x0], #16 ld1 {v6.8h}, [x0] - sub v6.8h, v6.8h, v1.8h + sub v6.8h, v6.8h, v3.8h st1 {v6.8h}, [x0], #16 ld1 {v7.8h}, [x0] - sub v7.8h, v7.8h, v0.8h + sub v7.8h, v7.8h, v2.8h st1 {v7.8h}, [x0], #16 .endm @@ -1376,12 +1360,8 @@ endfunc // x2 = src (temp buffer) // x7 = negative double temp buffer stride // x9 = double temp buffer stride -// x10 = idct_coeffs -// x11 = idct_coeffs + 32 function idct32_1d_8x32_pass2\suffix\()_neon mov x14, x30 - ld1 {v0.8h,v1.8h}, [x10] - // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) .ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1454,15 +1434,15 @@ function idct32_1d_8x32_pass2\suffix\()_neon sub v6.8h, v6.8h, \c sub v7.8h, v7.8h, \d .endif - ld1 {v0.8b}, [x0], x1 - ld1 {v1.8b}, [x0], x1 + ld1 {v10.8b}, [x0], x1 + ld1 {v11.8b}, [x0], x1 srshr v4.8h, v4.8h, #6 ld1 {v2.8b}, [x0], x1 srshr v5.8h, v5.8h, #6 - uaddw v4.8h, v4.8h, v0.8b + uaddw v4.8h, v4.8h, v10.8b ld1 {v3.8b}, [x0], x1 srshr v6.8h, v6.8h, #6 - uaddw v5.8h, v5.8h, v1.8b + uaddw v5.8h, v5.8h, v11.8b srshr v7.8h, v7.8h, #6 sub x0, x0, x1, lsl #2 uaddw v6.8h, v6.8h, v2.8b @@ -1503,13 +1483,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 b.eq idct32x32_dc_add_neon movrel x10, idct_coeffs - add x11, x10, #32 movrel x12, min_eob_idct_idct_32, 2 mov x15, x30 - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! stp d10, d11, [sp, #-0x10]! stp d8, d9, [sp, #-0x10]! 
@@ -1523,6 +1500,9 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 mov x9, #128 neg x7, x9 + ld1 {v0.8h,v1.8h}, [x10], #32 + ld1 {v8.8h,v9.8h}, [x10] + cmp w3, #34 b.le idct32x32_quarter_add_neon cmp w3, #135 @@ -1565,8 +1545,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 ldp d8, d9, [sp], 0x10 ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 br x15 endfunc @@ -1592,8 +1570,6 @@ function idct32x32_\size\()_add_neon ldp d8, d9, [sp], 0x10 ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 br x15 endfunc From de06bdfe6c8abd8266d5c6f5c68e4df0060b61fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 31 Dec 2016 14:05:44 +0200 Subject: [PATCH 6/9] arm: vp9itxfm: Reorder the idct coefficients for better pairing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All elements are used pairwise, except for the first one. Previously, the 16th element was unused. Move the unused element to the second slot, to make the later element pairs not split across registers. This simplifies loading only parts of the coefficients, reducing the difference to the 16 bpp version. Signed-off-by: Martin Storsjö --- libavcodec/arm/vp9itxfm_neon.S | 124 ++++++++++++++++----------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index bed502eba2..1d4d6a7910 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -22,7 +22,7 @@ #include "neon.S" const itxfm4_coeffs, align=4 - .short 11585, 6270, 15137, 0 + .short 11585, 0, 6270, 15137 iadst4_coeffs: .short 5283, 15212, 9929, 13377 endconst @@ -30,8 +30,8 @@ endconst const iadst8_coeffs, align=4 .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679 idct_coeffs: - .short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606 - .short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0 + .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102 + .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756 .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520 .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404 endconst @@ -224,14 +224,14 @@ endconst .endm .macro idct4 c0, c1, c2, c3 - vmull.s16 q13, \c1, d0[2] - vmull.s16 q11, \c1, d0[1] + vmull.s16 q13, \c1, d0[3] + vmull.s16 q11, \c1, d0[2] vadd.i16 d16, \c0, \c2 vsub.i16 d17, \c0, \c2 - vmlal.s16 q13, \c3, d0[1] + vmlal.s16 q13, \c3, d0[2] vmull.s16 q9, d16, d0[0] vmull.s16 q10, d17, d0[0] - vmlsl.s16 q11, \c3, d0[2] + vmlsl.s16 q11, \c3, d0[3] vrshrn.s32 d26, q13, #14 vrshrn.s32 d18, q9, #14 vrshrn.s32 d20, q10, #14 @@ -350,9 +350,9 @@ itxfm_func4x4 iwht, iwht .macro idct8 dmbutterfly0 d16, d17, d24, d25, q8, q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a - dmbutterfly d20, d21, d28, d29, d0[1], d0[2], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a - dmbutterfly d18, d19, d30, d31, d0[3], d1[0], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a - dmbutterfly d26, d27, d22, d23, d1[1], d1[2], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a + dmbutterfly d20, d21, d28, d29, d0[2], d0[3], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a + dmbutterfly d18, d19, d30, d31, d1[0], d1[1], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a + dmbutterfly d26, d27, d22, d23, d1[2], d1[3], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a butterfly q2, q14, q8, q14 @ q2 = t0, q14 = t3 butterfly q3, q10, q12, q10 @ q3 = t1, q10 = t2 @@ -386,8 +386,8 @@ itxfm_func4x4 iwht, iwht vneg.s16 q15, q15 @ q15 = out[7] butterfly q8, q9, 
q11, q9 @ q8 = out[0], q9 = t2 - dmbutterfly_l q10, q11, q5, q7, d4, d5, d6, d7, d0[1], d0[2] @ q10,q11 = t5a, q5,q7 = t4a - dmbutterfly_l q2, q3, q13, q14, d12, d13, d8, d9, d0[2], d0[1] @ q2,q3 = t6a, q13,q14 = t7a + dmbutterfly_l q10, q11, q5, q7, d4, d5, d6, d7, d0[2], d0[3] @ q10,q11 = t5a, q5,q7 = t4a + dmbutterfly_l q2, q3, q13, q14, d12, d13, d8, d9, d0[3], d0[2] @ q2,q3 = t6a, q13,q14 = t7a dbutterfly_n d28, d29, d8, d9, q10, q11, q13, q14, q4, q6, q10, q11 @ q14 = out[6], q4 = t7 @@ -594,13 +594,13 @@ endfunc function idct16 mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a - mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a - mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a - mbutterfly d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 = t6a - mbutterfly d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 = t15a - mbutterfly d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 = t14a - mbutterfly d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 = t13a - mbutterfly d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 = t12a + mbutterfly d20, d28, d0[2], d0[3], q2, q3 @ d20 = t2a, d28 = t3a + mbutterfly d18, d30, d1[0], d1[1], q2, q3 @ d18 = t4a, d30 = t7a + mbutterfly d26, d22, d1[2], d1[3], q2, q3 @ d26 = t5a, d22 = t6a + mbutterfly d17, d31, d2[0], d2[1], q2, q3 @ d17 = t8a, d31 = t15a + mbutterfly d25, d23, d2[2], d2[3], q2, q3 @ d25 = t9a, d23 = t14a + mbutterfly d21, d27, d3[0], d3[1], q2, q3 @ d21 = t10a, d27 = t13a + mbutterfly d29, d19, d3[2], d3[3], q2, q3 @ d29 = t11a, d19 = t12a butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3 butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2 @@ -612,20 +612,20 @@ function idct16 butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a - mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a, d25 = t14a - mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a + mbutterfly d23, d25, d0[2], d0[3], q9, q15 @ d23 = t9a, d25 = t14a + mbutterfly d27, d21, d0[2], d0[3], q9, q15, neg=1 @ d27 = t13a, d21 = t10a idct16_end endfunc function idct16_half mbutterfly0_h d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a - mbutterfly_h1 d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a - mbutterfly_h1 d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a - mbutterfly_h2 d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 = t6a - mbutterfly_h1 d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 = t15a - mbutterfly_h2 d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 = t14a - mbutterfly_h1 d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 = t13a - mbutterfly_h2 d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 = t12a + mbutterfly_h1 d20, d28, d0[2], d0[3], q2, q3 @ d20 = t2a, d28 = t3a + mbutterfly_h1 d18, d30, d1[0], d1[1], q2, q3 @ d18 = t4a, d30 = t7a + mbutterfly_h2 d26, d22, d1[2], d1[3], q2, q3 @ d26 = t5a, d22 = t6a + mbutterfly_h1 d17, d31, d2[0], d2[1], q2, q3 @ d17 = t8a, d31 = t15a + mbutterfly_h2 d25, d23, d2[2], d2[3], q2, q3 @ d25 = t9a, d23 = t14a + mbutterfly_h1 d21, d27, d3[0], d3[1], q2, q3 @ d21 = t10a, d27 = t13a + mbutterfly_h2 d29, d19, d3[2], d3[3], q2, q3 @ d29 = t11a, d19 = t12a butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3 butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2 @@ -637,19 +637,19 @@ function idct16_half butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a - mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a, d25 = t14a - 
mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a + mbutterfly d23, d25, d0[2], d0[3], q9, q15 @ d23 = t9a, d25 = t14a + mbutterfly d27, d21, d0[2], d0[3], q9, q15, neg=1 @ d27 = t13a, d21 = t10a idct16_end endfunc function idct16_quarter - vmull.s16 q12, d19, d3[2] - vmull.s16 q2, d17, d1[3] - vmull.s16 q3, d18, d1[0] - vmull.s16 q15, d18, d0[3] + vmull.s16 q12, d19, d3[3] + vmull.s16 q2, d17, d2[0] + vmull.s16 q3, d18, d1[1] + vmull.s16 q15, d18, d1[0] vneg.s32 q12, q12 - vmull.s16 q14, d17, d2[0] - vmull.s16 q13, d19, d3[1] + vmull.s16 q14, d17, d2[1] + vmull.s16 q13, d19, d3[2] vmull.s16 q11, d16, d0[0] vrshrn.s32 d24, q12, #14 vrshrn.s32 d16, q2, #14 @@ -659,8 +659,8 @@ function idct16_quarter vrshrn.s32 d17, q13, #14 vrshrn.s32 d28, q11, #14 - mbutterfly_l q10, q11, d17, d24, d0[1], d0[2] - mbutterfly_l q9, q15, d29, d16, d0[1], d0[2] + mbutterfly_l q10, q11, d17, d24, d0[2], d0[3] + mbutterfly_l q9, q15, d29, d16, d0[2], d0[3] vneg.s32 q11, q11 vrshrn.s32 d27, q10, #14 vrshrn.s32 d21, q11, #14 @@ -697,16 +697,16 @@ function iadst16 movrel r12, idct_coeffs vld1.16 {q0}, [r12,:128] butterfly_n d22, d30, q3, q5, q6, q5 @ d22 = t7a, d30 = t15a - mbutterfly_l q7, q6, d23, d24, d0[3], d1[0] @ q7 = t9, q6 = t8 + mbutterfly_l q7, q6, d23, d24, d1[0], d1[1] @ q7 = t9, q6 = t8 butterfly_n d25, d17, q2, q4, q3, q4 @ d25 = t6a, d17 = t14a - mbutterfly_l q2, q3, d28, d19, d1[0], d0[3] @ q2 = t12, q3 = t13 + mbutterfly_l q2, q3, d28, d19, d1[1], d1[0] @ q2 = t12, q3 = t13 butterfly_n d23, d19, q6, q2, q4, q2 @ d23 = t8a, d19 = t12a - mbutterfly_l q5, q4, d21, d26, d1[1], d1[2] @ q5 = t11, q4 = t10 + mbutterfly_l q5, q4, d21, d26, d1[2], d1[3] @ q5 = t11, q4 = t10 butterfly_r d4, d27, d16, d27 @ d4 = t4, d27 = t0 butterfly_n d24, d28, q7, q3, q6, q3 @ d24 = t9a, d28 = t13a - mbutterfly_l q6, q7, d30, d17, d1[2], d1[1] @ q6 = t14, q7 = t15 + mbutterfly_l q6, q7, d30, d17, d1[3], d1[2] @ q6 = t14, q7 = t15 butterfly_r d5, d20, d31, d20 @ d5 = t5, d20 = t1 butterfly_n d21, d17, q4, q6, q3, q6 @ d21 = t10a, d17 = t14a butterfly_n d26, d30, q5, q7, q4, q7 @ d26 = t11a, d30 = t15a @@ -714,15 +714,15 @@ function iadst16 butterfly_r d6, d25, d18, d25 @ d6 = t6, d25 = t2 butterfly_r d7, d22, d29, d22 @ d7 = t7, d22 = t3 - mbutterfly_l q5, q4, d19, d28, d0[1], d0[2] @ q5 = t13, q4 = t12 - mbutterfly_l q6, q7, d30, d17, d0[2], d0[1] @ q6 = t14, q7 = t15 + mbutterfly_l q5, q4, d19, d28, d0[2], d0[3] @ q5 = t13, q4 = t12 + mbutterfly_l q6, q7, d30, d17, d0[3], d0[2] @ q6 = t14, q7 = t15 butterfly_n d18, d30, q4, q6, q8, q6 @ d18 = out[2], d30 = t14a butterfly_n d29, d17, q5, q7, q6, q7 @ d29 = -out[13], d17 = t15a vneg.s16 d29, d29 @ d29 = out[13] - mbutterfly_l q5, q4, d4, d5, d0[1], d0[2] @ q5 = t5a, q4 = t4a - mbutterfly_l q6, q7, d7, d6, d0[2], d0[1] @ q6 = t6a, q7 = t7a + mbutterfly_l q5, q4, d4, d5, d0[2], d0[3] @ q5 = t5a, q4 = t4a + mbutterfly_l q6, q7, d7, d6, d0[3], d0[2] @ q6 = t6a, q7 = t7a butterfly d2, d6, d27, d25 @ d2 = out[0], d6 = t2a butterfly d3, d7, d23, d21 @ d3 =-out[1], d7 = t10 @@ -1194,10 +1194,10 @@ endfunc butterfly d11, d29, d29, d31 @ d11 = t31a, d29 = t28a butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29 - mbutterfly d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = t29a - mbutterfly d29, d9, d0[1], d0[2], q12, q15 @ d29 = t19, d9 = t28 - mbutterfly d28, d10, d0[1], d0[2], q12, q15, neg=1 @ d28 = t27, d10 = t20 - mbutterfly d26, d21, d0[1], d0[2], q12, q15, neg=1 @ d26 = t26a, d21 = t21a + mbutterfly d27, d20, d0[2], d0[3], q12, q15 @ d27 = t18a, 
d20 = t29a + mbutterfly d29, d9, d0[2], d0[3], q12, q15 @ d29 = t19, d5 = t28 + mbutterfly d28, d10, d0[2], d0[3], q12, q15, neg=1 @ d28 = t27, d6 = t20 + mbutterfly d26, d21, d0[2], d0[3], q12, q15, neg=1 @ d26 = t26a, d21 = t21a butterfly d31, d24, d11, d8 @ d31 = t31, d24 = t24 butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a @@ -1235,10 +1235,10 @@ function idct32_odd butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29 - mbutterfly d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a - mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a - mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a - mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a + mbutterfly d23, d24, d1[0], d1[1], q8, q9 @ d23 = t17a, d24 = t30a + mbutterfly d27, d20, d1[0], d1[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a + mbutterfly d21, d26, d1[2], d1[3], q8, q9 @ d21 = t21a, d26 = t26a + mbutterfly d25, d22, d1[2], d1[3], q8, q9, neg=1 @ d25 = t25a, d22 = t22a idct32_end endfunc @@ -1261,10 +1261,10 @@ function idct32_odd_half butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29 - mbutterfly d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a - mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a - mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a - mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a + mbutterfly d23, d24, d1[0], d1[1], q8, q9 @ d23 = t17a, d24 = t30a + mbutterfly d27, d20, d1[0], d1[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a + mbutterfly d21, d26, d1[2], d1[3], q8, q9 @ d21 = t21a, d26 = t26a + mbutterfly d25, d22, d1[2], d1[3], q8, q9, neg=1 @ d25 = t25a, d22 = t22a idct32_end endfunc @@ -1291,17 +1291,17 @@ function idct32_odd_quarter vrshrn.s32 d10, q10, #14 vrshrn.s32 d30, q12, #14 - mbutterfly_l q8, q9, d29, d8, d0[3], d1[0] - mbutterfly_l q13, q10, d31, d9, d0[3], d1[0] + mbutterfly_l q8, q9, d29, d8, d1[0], d1[1] + mbutterfly_l q13, q10, d31, d9, d1[0], d1[1] vrshrn.s32 d23, q8, #14 vrshrn.s32 d24, q9, #14 vneg.s32 q10, q10 vrshrn.s32 d27, q13, #14 vrshrn.s32 d20, q10, #14 - mbutterfly_l q8, q9, d30, d10, d1[1], d1[2] + mbutterfly_l q8, q9, d30, d10, d1[2], d1[3] vrshrn.s32 d21, q8, #14 vrshrn.s32 d26, q9, #14 - mbutterfly_l q8, q9, d28, d11, d1[1], d1[2] + mbutterfly_l q8, q9, d28, d11, d1[2], d1[3] vrshrn.s32 d25, q8, #14 vneg.s32 q9, q9 vrshrn.s32 d22, q9, #14 From 09eb88a12e008d10a3f7a6be75d18ad98b368e68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 31 Dec 2016 14:18:31 +0200 Subject: [PATCH 7/9] aarch64: vp9itxfm: Reorder the idct coefficients for better pairing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All elements are used pairwise, except for the first one. Previously, the 16th element was unused. Move the unused element to the second slot, to make the later element pairs not split across registers. This simplifies loading only parts of the coefficients, reducing the difference to the 16 bpp version. 
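Concretely, only the first two rows of idct_coeffs change:

        // before:
        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
        // after:
        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756

With the unused slot placed next to 11585 (the one coefficient that isn't used as part
of a pair), every pair now starts at an even lane; e.g. the (1606, 16305) pair for the
t8a/t15a butterfly is read as v1.h[0]/v1.h[1] instead of straddling v0.h[7] and v1.h[0].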
Signed-off-by: Martin Storsjö
---
 libavcodec/aarch64/vp9itxfm_neon.S | 124 ++++++++++++++---------------
 1 file changed, 62 insertions(+), 62 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index b6c2575236..d4fc2163aa 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -22,7 +22,7 @@
 #include "neon.S"
 
 const itxfm4_coeffs, align=4
-        .short  11585, 6270, 15137, 0
+        .short  11585, 0, 6270, 15137
 iadst4_coeffs:
         .short  5283, 15212, 9929, 13377
 endconst
@@ -30,8 +30,8 @@ endconst
 const iadst8_coeffs, align=4
         .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
 idct_coeffs:
-        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
-        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
+        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
         .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
         .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
 endconst
@@ -192,14 +192,14 @@ endconst
 .endm
 
 .macro idct4 c0, c1, c2, c3
-        smull           v22.4s, \c1\().4h, v0.h[2]
-        smull           v20.4s, \c1\().4h, v0.h[1]
+        smull           v22.4s, \c1\().4h, v0.h[3]
+        smull           v20.4s, \c1\().4h, v0.h[2]
         add             v16.4h, \c0\().4h, \c2\().4h
         sub             v17.4h, \c0\().4h, \c2\().4h
-        smlal           v22.4s, \c3\().4h, v0.h[1]
+        smlal           v22.4s, \c3\().4h, v0.h[2]
         smull           v18.4s, v16.4h, v0.h[0]
         smull           v19.4s, v17.4h, v0.h[0]
-        smlsl           v20.4s, \c3\().4h, v0.h[2]
+        smlsl           v20.4s, \c3\().4h, v0.h[3]
         rshrn           v22.4h, v22.4s, #14
         rshrn           v18.4h, v18.4s, #14
         rshrn           v19.4h, v19.4s, #14
@@ -326,9 +326,9 @@ itxfm_func4x4 iwht, iwht
 
 .macro idct8
         dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
-        dmbutterfly     v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
-        dmbutterfly     v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
-        dmbutterfly     v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
+        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
+        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
+        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
 
         butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
         butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
@@ -361,8 +361,8 @@ itxfm_func4x4 iwht, iwht
         dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
         neg             v19.8h, v19.8h // v19 = out[3]
 
-        dmbutterfly_l   v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2] // v26,v27 = t5a, v28,v29 = t4a
-        dmbutterfly_l   v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1] // v2,v3 = t6a, v4,v5 = t7a
+        dmbutterfly_l   v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a
+        dmbutterfly_l   v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a
 
         dbutterfly_n    v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
         dbutterfly_n    v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
@@ -543,13 +543,13 @@ endfunc
 
 function idct16
         dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
-        dmbutterfly     v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
-        dmbutterfly     v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
-        dmbutterfly     v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
-        dmbutterfly     v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
-        dmbutterfly     v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
-        dmbutterfly     v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
-        dmbutterfly     v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
+        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
+        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
+        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
+        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
+        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
 
         butterfly_8h    v4, v28, v16, v28 // v4 = t0, v28 = t3
         butterfly_8h    v5, v20, v24, v20 // v5 = t1, v20 = t2
@@ -561,20 +561,20 @@ function idct16
         butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14
 
         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
-        dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
-        dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
         idct16_end
 endfunc
 
 function idct16_half
         dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
-        dmbutterfly_h1  v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
-        dmbutterfly_h1  v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
-        dmbutterfly_h2  v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
-        dmbutterfly_h1  v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
-        dmbutterfly_h2  v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
-        dmbutterfly_h1  v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
-        dmbutterfly_h2  v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
+        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
+        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
+        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
+        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
+        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
 
         butterfly_8h    v4, v28, v16, v28 // v4 = t0, v28 = t3
         butterfly_8h    v5, v20, v24, v20 // v5 = t1, v20 = t2
@@ -586,20 +586,20 @@ function idct16_half
         butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14
 
         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
-        dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
-        dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
         idct16_end
 endfunc
 
 function idct16_quarter
-        dsmull_h        v24, v25, v19, v1.h[6]
-        dsmull_h        v4,  v5,  v17, v0.h[7]
-        dsmull_h        v7,  v6,  v18, v0.h[4]
-        dsmull_h        v30, v31, v18, v0.h[3]
+        dsmull_h        v24, v25, v19, v1.h[7]
+        dsmull_h        v4,  v5,  v17, v1.h[0]
+        dsmull_h        v7,  v6,  v18, v0.h[5]
+        dsmull_h        v30, v31, v18, v0.h[4]
         neg             v24.4s, v24.4s
         neg             v25.4s, v25.4s
-        dsmull_h        v29, v28, v17, v1.h[0]
-        dsmull_h        v26, v27, v19, v1.h[5]
+        dsmull_h        v29, v28, v17, v1.h[1]
+        dsmull_h        v26, v27, v19, v1.h[6]
         dsmull_h        v22, v23, v16, v0.h[0]
         drshrn_h        v24, v24, v25, #14
         drshrn_h        v16, v4, v5, #14
@@ -609,8 +609,8 @@ function idct16_quarter
         drshrn_h        v17, v26, v27, #14
         drshrn_h        v28, v22, v23, #14
 
-        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
-        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
+        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
+        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
         neg             v22.4s, v22.4s
         neg             v23.4s, v23.4s
         drshrn_h        v27, v20, v21, #14
@@ -646,16 +646,16 @@ function iadst16
         dmbutterfly_l   v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
         ld1             {v0.8h}, [x10]
         dbutterfly_n    v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
-        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4] // v14,v15 = t9, v12,v13 = t8
+        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9, v12,v13 = t8
         dbutterfly_n    v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
 
-        dmbutterfly_l   v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3] // v4,v5 = t12, v6,v7 = t13
+        dmbutterfly_l   v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4] // v4,v5 = t12, v6,v7 = t13
         dbutterfly_n    v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
-        dmbutterfly_l   v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6] // v10,v11 = t11, v8,v9 = t10
+        dmbutterfly_l   v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9 = t10
         butterfly_8h_r  v4, v27, v16, v27 // v4 = t4, v27 = t0
         dbutterfly_n    v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
-        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5] // v12,v13 = t14, v14,v15 = t15
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15
         butterfly_8h_r  v5, v20, v31, v20 // v5 = t5, v20 = t1
         dbutterfly_n    v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
         dbutterfly_n    v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
@@ -663,15 +663,15 @@ function iadst16
         butterfly_8h_r  v6, v25, v18, v25 // v6 = t6, v25 = t2
         butterfly_8h_r  v7, v22, v29, v22 // v7 = t7, v22 = t3
 
-        dmbutterfly_l   v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2] // v10,v11 = t13, v8,v9 = t12
-        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1] // v12,v13 = t14, v14,v15 = t15
+        dmbutterfly_l   v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9 = t12
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15
 
         dbutterfly_n    v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
         dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
         neg             v29.8h, v29.8h // v29 = out[13]
 
-        dmbutterfly_l   v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2] // v10,v11 = t5a, v8,v9 = t4a
-        dmbutterfly_l   v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1] // v12,v13 = t6a, v14,v15 = t7a
+        dmbutterfly_l   v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9 = t4a
+        dmbutterfly_l   v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a
 
         butterfly_8h    v2, v6, v27, v25 // v2 = out[0], v6 = t2a
         butterfly_8h    v3, v7, v23, v21 // v3 =-out[1], v7 = t10
@@ -1101,10 +1101,10 @@ endfunc
         butterfly_8h    v7, v3, v29, v31 // v7 = t31a, v3 = t28a
         butterfly_8h    v22, v27, v24, v27 // v22 = t30, v27 = t29
 
-        dmbutterfly     v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
-        dmbutterfly     v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31 // v3 = t19, v5 = t28
-        dmbutterfly     v28, v6, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
-        dmbutterfly     v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
+        dmbutterfly     v3, v5, v0.h[2], v0.h[3], v24, v25, v30, v31 // v3 = t19, v5 = t28
+        dmbutterfly     v28, v6, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
+        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
 
         butterfly_8h    v31, v24, v7, v4 // v31 = t31, v24 = t24
         butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
@@ -1141,10 +1141,10 @@ function idct32_odd
         butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
         butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
 
-        dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
-        dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
-        dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
-        dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
         idct32_end
 endfunc
 
@@ -1167,10 +1167,10 @@ function idct32_odd_half
         butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
         butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
 
-        dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
-        dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
-        dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
-        dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
         idct32_end
 endfunc
 
@@ -1198,18 +1198,18 @@ function idct32_odd_quarter
         drshrn_h        v6, v20, v21, #14
         drshrn_h        v30, v24, v25, #14
 
-        dmbutterfly_l   v16, v17, v18, v19, v29, v4, v0.h[3], v0.h[4]
-        dmbutterfly_l   v27, v26, v20, v21, v31, v5, v0.h[3], v0.h[4]
+        dmbutterfly_l   v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5]
+        dmbutterfly_l   v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5]
         drshrn_h        v23, v16, v17, #14
         drshrn_h        v24, v18, v19, #14
         neg             v20.4s, v20.4s
         neg             v21.4s, v21.4s
         drshrn_h        v27, v27, v26, #14
         drshrn_h        v20, v20, v21, #14
-        dmbutterfly_l   v16, v17, v18, v19, v30, v6, v0.h[5], v0.h[6]
+        dmbutterfly_l   v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7]
         drshrn_h        v21, v16, v17, #14
         drshrn_h        v26, v18, v19, #14
-        dmbutterfly_l   v16, v17, v18, v19, v28, v7, v0.h[5], v0.h[6]
+        dmbutterfly_l   v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7]
         drshrn_h        v25, v16, v17, #14
         neg             v18.4s, v18.4s
         neg             v19.4s, v19.4s

From 08074c092d8c97d71c5986e5325e97ffc956119d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Sat, 31 Dec 2016 22:27:13 +0200
Subject: [PATCH 8/9] arm: vp9itxfm: Reorder iadst16 coeffs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This matches the order they are in the 16 bpp version. There they are
in this order, to make sure we access them in the same order they are
declared, easing loading only half of the coefficients at a time.

This makes the 8 bpp version match the 16 bpp version better.

Signed-off-by: Martin Storsjö
---
 libavcodec/arm/vp9itxfm_neon.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 1d4d6a7910..a612b25f4f 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -37,8 +37,8 @@ idct_coeffs:
 endconst
 
 const iadst16_coeffs, align=4
-        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
-        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
 endconst
 
 @ Do four 4x4 transposes, using q registers for the subtransposes that don't
@@ -678,19 +678,19 @@ function iadst16
         vld1.16         {q0-q1}, [r12,:128]
 
         mbutterfly_l    q3, q2, d31, d16, d0[1], d0[0] @ q3 = t1, q2 = t0
-        mbutterfly_l    q5, q4, d23, d24, d2[1], d2[0] @ q5 = t9, q4 = t8
+        mbutterfly_l    q5, q4, d23, d24, d1[1], d1[0] @ q5 = t9, q4 = t8
         butterfly_n     d31, d24, q3, q5, q6, q5 @ d31 = t1a, d24 = t9a
         mbutterfly_l    q7, q6, d29, d18, d0[3], d0[2] @ q7 = t3, q6 = t2
         butterfly_n     d16, d23, q2, q4, q3, q4 @ d16 = t0a, d23 = t8a
-        mbutterfly_l    q3, q2, d21, d26, d2[3], d2[2] @ q3 = t11, q2 = t10
+        mbutterfly_l    q3, q2, d21, d26, d1[3], d1[2] @ q3 = t11, q2 = t10
         butterfly_n     d29, d26, q7, q3, q4, q3 @ d29 = t3a, d26 = t11a
-        mbutterfly_l    q5, q4, d27, d20, d1[1], d1[0] @ q5 = t5, q4 = t4
+        mbutterfly_l    q5, q4, d27, d20, d2[1], d2[0] @ q5 = t5, q4 = t4
         butterfly_n     d18, d21, q6, q2, q3, q2 @ d18 = t2a, d21 = t10a
         mbutterfly_l    q7, q6, d19, d28, d3[1], d3[0] @ q7 = t13, q6 = t12
         butterfly_n     d20, d28, q5, q7, q2, q7 @ d20 = t5a, d28 = t13a
-        mbutterfly_l    q3, q2, d25, d22, d1[3], d1[2] @ q3 = t7, q2 = t6
+        mbutterfly_l    q3, q2, d25, d22, d2[3], d2[2] @ q3 = t7, q2 = t6
         butterfly_n     d27, d19, q4, q6, q5, q6 @ d27 = t4a, d19 = t12a
         mbutterfly_l    q5, q4, d17, d30, d3[3], d3[2] @ q5 = t15, q4 = t14

From b8f66c0838b4c645227f23a35b4d54373da4c60a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Sat, 31 Dec 2016 22:27:13 +0200
Subject: [PATCH 9/9] aarch64: vp9itxfm: Reorder iadst16 coeffs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This matches the order they are in the 16 bpp version. There they are
in this order, to make sure we access them in the same order they are
declared, easing loading only half of the coefficients at a time.

This makes the 8 bpp version match the 16 bpp version better.

Signed-off-by: Martin Storsjö
---
 libavcodec/aarch64/vp9itxfm_neon.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index d4fc2163aa..93dc736f01 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -37,8 +37,8 @@ idct_coeffs:
 endconst
 
 const iadst16_coeffs, align=4
-        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
-        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
 endconst
 
 // out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
@@ -628,19 +628,19 @@ function iadst16
         ld1             {v0.8h,v1.8h}, [x11]
 
         dmbutterfly_l   v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0
-        dmbutterfly_l   v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0] // v10,v11 = t9, v8,v9 = t8
+        dmbutterfly_l   v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4] // v10,v11 = t9, v8,v9 = t8
         dbutterfly_n    v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
         dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2
         dbutterfly_n    v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a
-        dmbutterfly_l   v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2] // v6,v7 = t11, v4,v5 = t10
+        dmbutterfly_l   v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6] // v6,v7 = t11, v4,v5 = t10
         dbutterfly_n    v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
-        dmbutterfly_l   v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4] // v10,v11 = t5, v8,v9 = t4
+        dmbutterfly_l   v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0] // v10,v11 = t5, v8,v9 = t4
         dbutterfly_n    v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a
         dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
         dbutterfly_n    v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
-        dmbutterfly_l   v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6] // v6,v7 = t7, v4,v5 = t6
+        dmbutterfly_l   v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2] // v6,v7 = t7, v4,v5 = t6
         dbutterfly_n    v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a
         dmbutterfly_l   v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14