From f43079e11cb445e6b70b149d9cdb829091ec2155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 14 Nov 2016 12:32:26 +0200 Subject: [PATCH] aarch64: vp9: Add NEON itxfm routines This work is sponsored by, and copyright, Google. These are ported from the ARM version; thanks to the larger amount of registers available, we can do the 16x16 and 32x32 transforms in slices 8 pixels wide instead of 4. This gives a speedup of around 1.4x compared to the 32 bit version. The fact that aarch64 doesn't have the same d/q register aliasing makes some of the macros quite a bit simpler as well. Examples of runtimes vs the 32 bit version, on a Cortex A53: ARM AArch64 vp9_inv_adst_adst_4x4_add_neon: 90.0 87.7 vp9_inv_adst_adst_8x8_add_neon: 400.0 354.7 vp9_inv_adst_adst_16x16_add_neon: 2526.5 1827.2 vp9_inv_dct_dct_4x4_add_neon: 74.0 72.7 vp9_inv_dct_dct_8x8_add_neon: 271.0 256.7 vp9_inv_dct_dct_16x16_add_neon: 1960.7 1372.7 vp9_inv_dct_dct_32x32_add_neon: 11988.9 8088.3 vp9_inv_wht_wht_4x4_add_neon: 63.0 57.7 The speedup vs C code (2-4x) is smaller than in the 32 bit case, mostly because the C code ends up significantly faster (around 1.6x faster, with GCC 5.4) when built for aarch64. Examples of runtimes vs C on a Cortex A57 (for a slightly older version of the patch): A57 gcc-5.3 neon vp9_inv_adst_adst_4x4_add_neon: 152.2 60.0 vp9_inv_adst_adst_8x8_add_neon: 948.2 288.0 vp9_inv_adst_adst_16x16_add_neon: 4830.4 1380.5 vp9_inv_dct_dct_4x4_add_neon: 153.0 58.6 vp9_inv_dct_dct_8x8_add_neon: 789.2 180.2 vp9_inv_dct_dct_16x16_add_neon: 3639.6 917.1 vp9_inv_dct_dct_32x32_add_neon: 20462.1 4985.0 vp9_inv_wht_wht_4x4_add_neon: 91.0 49.8 The asm is around factor 3-4 faster than C on the cortex-a57 and the asm is around 30-50% faster on the a57 compared to the a53. This is an adapted cherry-pick from libav commit 3c9546dfafcdfe8e7860aff9ebbf609318220f29. Signed-off-by: Ronald S. 
Bultje --- libavcodec/aarch64/Makefile | 3 +- libavcodec/aarch64/vp9dsp_init_aarch64.c | 54 +- libavcodec/aarch64/vp9itxfm_neon.S | 1116 ++++++++++++++++++++++ 3 files changed, 1171 insertions(+), 2 deletions(-) create mode 100644 libavcodec/aarch64/vp9itxfm_neon.S diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index e7db95ea58..e8a7f7a7fb 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -42,4 +42,5 @@ NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o # decoders/encoders NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o -NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9mc_neon.o +NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_neon.o \ + aarch64/vp9mc_neon.o diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c b/libavcodec/aarch64/vp9dsp_init_aarch64.c index 4adf363df4..284860856d 100644 --- a/libavcodec/aarch64/vp9dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c @@ -96,7 +96,7 @@ define_8tap_2d_funcs(16) define_8tap_2d_funcs(8) define_8tap_2d_funcs(4) -av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp) +static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp, int bpp) { int cpu_flags = av_get_cpu_flags(); @@ -154,3 +154,55 @@ av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp) init_mc_funcs_dirs(4, 4); } } + +#define define_itxfm(type_a, type_b, sz) \ +void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \ + ptrdiff_t stride, \ + int16_t *_block, int eob) + +#define define_itxfm_funcs(sz) \ + define_itxfm(idct, idct, sz); \ + define_itxfm(iadst, idct, sz); \ + define_itxfm(idct, iadst, sz); \ + define_itxfm(iadst, iadst, sz) + +define_itxfm_funcs(4); +define_itxfm_funcs(8); +define_itxfm_funcs(16); +define_itxfm(idct, idct, 32); +define_itxfm(iwht, iwht, 4); + + +static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp, int bpp) +{ + int 
cpu_flags = av_get_cpu_flags(); + + if (bpp != 8) + return; + + if (have_neon(cpu_flags)) { +#define init_itxfm(tx, sz) \ + dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_neon; \ + dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_neon; \ + dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_neon; \ + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon + +#define init_idct(tx, nm) \ + dsp->itxfm_add[tx][DCT_DCT] = \ + dsp->itxfm_add[tx][ADST_DCT] = \ + dsp->itxfm_add[tx][DCT_ADST] = \ + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon + + init_itxfm(TX_4X4, 4x4); + init_itxfm(TX_8X8, 8x8); + init_itxfm(TX_16X16, 16x16); + init_idct(TX_32X32, idct_idct_32x32); + init_idct(4, iwht_iwht_4x4); + } +} + +av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp) +{ + vp9dsp_mc_init_aarch64(dsp, bpp); + vp9dsp_itxfm_init_aarch64(dsp, bpp); +} diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S new file mode 100644 index 0000000000..7ce3116a14 --- /dev/null +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -0,0 +1,1116 @@ +/* + * Copyright (c) 2016 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + +const itxfm4_coeffs, align=4 + .short 11585, 6270, 15137, 0 +iadst4_coeffs: + .short 5283, 15212, 9929, 13377 +endconst + +const iadst8_coeffs, align=4 + .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679 +idct_coeffs: + .short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606 + .short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0 + .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520 + .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404 +endconst + +const iadst16_coeffs, align=4 + .short 16364, 804, 15893, 3981, 14811, 7005, 13160, 9760 + .short 11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207 +endconst + +// out1 = ((in1 + in2) * v0.h[0] + (1 << 13)) >> 14 +// out2 = ((in1 - in2) * v0.h[0] + (1 << 13)) >> 14 +// in/out are .8h registers; this can do with 4 temp registers, but is +// more efficient if 6 temp registers are available.
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0 +.if \neg > 0 + neg \tmp4\().4h, v0.4h +.endif + add \tmp1\().8h, \in1\().8h, \in2\().8h + sub \tmp2\().8h, \in1\().8h, \in2\().8h +.if \neg > 0 + smull \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0] + smull2 \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0] +.else + smull \tmp3\().4s, \tmp1\().4h, v0.h[0] + smull2 \tmp4\().4s, \tmp1\().8h, v0.h[0] +.endif +.ifb \tmp5 + rshrn \out1\().4h, \tmp3\().4s, #14 + rshrn2 \out1\().8h, \tmp4\().4s, #14 + smull \tmp3\().4s, \tmp2\().4h, v0.h[0] + smull2 \tmp4\().4s, \tmp2\().8h, v0.h[0] + rshrn \out2\().4h, \tmp3\().4s, #14 + rshrn2 \out2\().8h, \tmp4\().4s, #14 +.else + smull \tmp5\().4s, \tmp2\().4h, v0.h[0] + smull2 \tmp6\().4s, \tmp2\().8h, v0.h[0] + rshrn \out1\().4h, \tmp3\().4s, #14 + rshrn2 \out1\().8h, \tmp4\().4s, #14 + rshrn \out2\().4h, \tmp5\().4s, #14 + rshrn2 \out2\().8h, \tmp6\().4s, #14 +.endif +.endm + +// out1,out2 = in1 * coef1 - in2 * coef2 +// out3,out4 = in1 * coef2 + in2 * coef1 +// out are 4 x .4s registers, in are 2 x .8h registers +.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2 + smull \out1\().4s, \in1\().4h, \coef1 + smull2 \out2\().4s, \in1\().8h, \coef1 + smull \out3\().4s, \in1\().4h, \coef2 + smull2 \out4\().4s, \in1\().8h, \coef2 + smlsl \out1\().4s, \in2\().4h, \coef2 + smlsl2 \out2\().4s, \in2\().8h, \coef2 + smlal \out3\().4s, \in2\().4h, \coef1 + smlal2 \out4\().4s, \in2\().8h, \coef1 +.endm + +// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14 +// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14 +// inout are 2 x .8h registers +.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0 + dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2 +.if \neg > 0 + neg \tmp3\().4s, \tmp3\().4s + neg \tmp4\().4s, \tmp4\().4s +.endif + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 + rshrn \inout2\().4h, 
\tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 +.endm + +// out1 = in1 + in2 +// out2 = in1 - in2 +.macro butterfly_8h out1, out2, in1, in2 + add \out1\().8h, \in1\().8h, \in2\().8h + sub \out2\().8h, \in1\().8h, \in2\().8h +.endm + +// out1 = in1 - in2 +// out2 = in1 + in2 +.macro butterfly_8h_r out1, out2, in1, in2 + sub \out1\().8h, \in1\().8h, \in2\().8h + add \out2\().8h, \in1\().8h, \in2\().8h +.endm + +// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14 +// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14 +// out are 2 x .8h registers, in are 4 x .4s registers +.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4 + add \tmp1\().4s, \in1\().4s, \in3\().4s + add \tmp2\().4s, \in2\().4s, \in4\().4s + sub \tmp3\().4s, \in1\().4s, \in3\().4s + sub \tmp4\().4s, \in2\().4s, \in4\().4s + rshrn \out1\().4h, \tmp1\().4s, #14 + rshrn2 \out1\().8h, \tmp2\().4s, #14 + rshrn \out2\().4h, \tmp3\().4s, #14 + rshrn2 \out2\().8h, \tmp4\().4s, #14 +.endm + +.macro iwht4 c0, c1, c2, c3 + add \c0\().4h, \c0\().4h, \c1\().4h + sub v17.4h, \c2\().4h, \c3\().4h + sub v16.4h, \c0\().4h, v17.4h + sshr v16.4h, v16.4h, #1 + sub \c2\().4h, v16.4h, \c1\().4h + sub \c1\().4h, v16.4h, \c3\().4h + add \c3\().4h, v17.4h, \c2\().4h + sub \c0\().4h, \c0\().4h, \c1\().4h +.endm + +.macro idct4 c0, c1, c2, c3 + smull v22.4s, \c1\().4h, v0.h[2] + smull v20.4s, \c1\().4h, v0.h[1] + add v16.4h, \c0\().4h, \c2\().4h + sub v17.4h, \c0\().4h, \c2\().4h + smlal v22.4s, \c3\().4h, v0.h[1] + smull v18.4s, v16.4h, v0.h[0] + smull v19.4s, v17.4h, v0.h[0] + smlsl v20.4s, \c3\().4h, v0.h[2] + rshrn v22.4h, v22.4s, #14 + rshrn v18.4h, v18.4s, #14 + rshrn v19.4h, v19.4s, #14 + rshrn v20.4h, v20.4s, #14 + add \c0\().4h, v18.4h, v22.4h + sub \c3\().4h, v18.4h, v22.4h + add \c1\().4h, v19.4h, v20.4h + sub \c2\().4h, v19.4h, v20.4h +.endm + +.macro iadst4 c0, c1, c2, c3 + smull v16.4s, \c0\().4h, v0.h[4] + smlal v16.4s, \c2\().4h, v0.h[5] + smlal v16.4s, \c3\().4h, v0.h[6] + smull v17.4s, 
\c0\().4h, v0.h[6] + smlsl v17.4s, \c2\().4h, v0.h[4] + sub \c0\().4h, \c0\().4h, \c2\().4h + smlsl v17.4s, \c3\().4h, v0.h[5] + add \c0\().4h, \c0\().4h, \c3\().4h + smull v19.4s, \c1\().4h, v0.h[7] + smull v18.4s, \c0\().4h, v0.h[7] + add v20.4s, v16.4s, v19.4s + add v21.4s, v17.4s, v19.4s + rshrn \c0\().4h, v20.4s, #14 + add v16.4s, v16.4s, v17.4s + rshrn \c1\().4h, v21.4s, #14 + sub v16.4s, v16.4s, v19.4s + rshrn \c2\().4h, v18.4s, #14 + rshrn \c3\().4h, v16.4s, #14 +.endm + +// The public functions in this file have got the following signature: +// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); + +.macro itxfm_func4x4 txfm1, txfm2 +function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 +.ifc \txfm1,\txfm2 +.ifc \txfm1,idct + movrel x4, itxfm4_coeffs + ld1 {v0.4h}, [x4] +.endif +.ifc \txfm1,iadst + movrel x4, iadst4_coeffs + ld1 {v0.d}[1], [x4] +.endif +.else + movrel x4, itxfm4_coeffs + ld1 {v0.8h}, [x4] +.endif + + movi v31.8h, #0 +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 // eob is an int: the upper half of x3 is unspecified + b.ne 1f + // DC-only for idct/idct + ld1r {v2.4h}, [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + st1 {v31.h}[0], [x2] + dup v4.4h, v2.h[0] + mov v5.16b, v4.16b + mov v6.16b, v4.16b + mov v7.16b, v4.16b + b 2f +.endif + +1: + ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x2] + st1 {v31.8h}, [x2], #16 + +.ifc \txfm1,iwht + sshr v4.4h, v4.4h, #2 + sshr v5.4h, v5.4h, #2 + sshr v6.4h, v6.4h, #2 + sshr v7.4h, v7.4h, #2 +.endif + + \txfm1\()4 v4, v5, v6, v7 + + st1 {v31.8h}, [x2], #16 + // Transpose 4x4 with 16 bit elements + transpose_4x4H v4, v5, v6, v7, v16, v17, v18, v19 + + \txfm2\()4 v4, v5, v6, v7 +2: + ld1r {v0.2s}, [x0], x1 + ld1r {v1.2s}, [x0], x1 +.ifnc \txfm1,iwht + srshr v4.4h, v4.4h, #4 + srshr v5.4h, v5.4h, #4 + srshr v6.4h, v6.4h, #4 + srshr v7.4h, v7.4h, #4 +.endif + uaddw v4.8h, v4.8h, v0.8b + uaddw v5.8h, v5.8h, v1.8b + ld1r {v2.2s}, [x0], x1 + ld1r {v3.2s}, [x0], x1 + sqxtun v0.8b,
v4.8h + sqxtun v1.8b, v5.8h + sub x0, x0, x1, lsl #2 + + uaddw v6.8h, v6.8h, v2.8b + uaddw v7.8h, v7.8h, v3.8b + st1 {v0.s}[0], [x0], x1 + sqxtun v2.8b, v6.8h + sqxtun v3.8b, v7.8h + + st1 {v1.s}[0], [x0], x1 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + + ret +endfunc +.endm + +itxfm_func4x4 idct, idct +itxfm_func4x4 iadst, idct +itxfm_func4x4 idct, iadst +itxfm_func4x4 iadst, iadst +itxfm_func4x4 iwht, iwht + + +.macro idct8 + dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a + dmbutterfly v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a + dmbutterfly v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a + dmbutterfly v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a + + butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3 + butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a + butterfly_8h v30, v31, v23, v19 // v30 = t7, v31 = t6a + butterfly_8h v26, v27, v20, v18 // v26 = t1, v27 = t2 + + dmbutterfly0 v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5 + + butterfly_8h v16, v23, v24, v30 // v16 = out[0], v23 = out[7] + butterfly_8h v17, v22, v26, v31 // v17 = out[1], v22 = out[6] + butterfly_8h v18, v21, v27, v29 // v18 = out[2], v21 = out[5] + butterfly_8h v19, v20, v25, v28 // v19 = out[3], v20 = out[4] +.endm + +.macro iadst8 + dmbutterfly_l v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a + dmbutterfly_l v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a + dmbutterfly_l v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a + dmbutterfly_l v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a + + dbutterfly_n v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4 + dbutterfly_n v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5 + dbutterfly_n v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6 + dbutterfly_n v30, v31, v28,
v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7 + + butterfly_8h v16, v6, v4, v24 // v16 = out[0], v6 = t2 + butterfly_8h v23, v7, v2, v30 // v23 = -out[7], v7 = t3 + neg v23.8h, v23.8h // v23 = out[7] + + dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4] + neg v19.8h, v19.8h // v19 = out[3] + + dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2] // v26,v27 = t5a, v28,v29 = t4a + dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1] // v2,v3 = t6a, v4,v5 = t7a + + dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6 + dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7 + neg v17.8h, v17.8h // v17 = out[1] + + dmbutterfly0 v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5] + neg v21.8h, v21.8h // v21 = out[5] +.endm + + +.macro itxfm_func8x8 txfm1, txfm2 +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 + // The iadst also uses a few coefficients from + // idct, so those always need to be loaded. 
+.ifc \txfm1\()_\txfm2,idct_idct + movrel x4, idct_coeffs + ld1 {v0.8h}, [x4] +.else + movrel x4, iadst8_coeffs + ld1 {v1.8h}, [x4], #16 + ld1 {v0.8h}, [x4] +.endif + + movi v2.16b, #0 + movi v3.16b, #0 + movi v4.16b, #0 + movi v5.16b, #0 + +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 // eob is an int: the upper half of x3 is unspecified + b.ne 1f + // DC-only for idct/idct + ld1r {v2.4h}, [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + st1 {v3.h}[0], [x2] + dup v16.8h, v2.h[0] + mov v17.16b, v16.16b + mov v18.16b, v16.16b + mov v19.16b, v16.16b + mov v20.16b, v16.16b + mov v21.16b, v16.16b + mov v22.16b, v16.16b + mov v23.16b, v16.16b + b 2f +.endif +1: + ld1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x2], #64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b}, [x2], #64 + sub x2, x2, #128 + st1 {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64 + st1 {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64 + + \txfm1\()8 + + // Transpose 8x8 with 16 bit elements + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + + \txfm2\()8 +2: + mov x3, x0 + // Add into the destination + ld1 {v0.8b}, [x0], x1 + srshr v16.8h, v16.8h, #5 + ld1 {v1.8b}, [x0], x1 + srshr v17.8h, v17.8h, #5 + ld1 {v2.8b}, [x0], x1 + srshr v18.8h, v18.8h, #5 + uaddw v16.8h, v16.8h, v0.8b + ld1 {v3.8b}, [x0], x1 + srshr v19.8h, v19.8h, #5 + uaddw v17.8h, v17.8h, v1.8b + ld1 {v4.8b}, [x0], x1 + srshr v20.8h, v20.8h, #5 + uaddw v18.8h, v18.8h, v2.8b + sqxtun v0.8b, v16.8h + ld1 {v5.8b}, [x0], x1 + srshr v21.8h, v21.8h, #5 + uaddw v19.8h, v19.8h, v3.8b + sqxtun v1.8b, v17.8h + ld1 {v6.8b}, [x0], x1 + srshr v22.8h, v22.8h, #5 + uaddw v20.8h, v20.8h, v4.8b + sqxtun v2.8b, v18.8h + ld1 {v7.8b}, [x0], x1 + srshr v23.8h, v23.8h, #5 + uaddw v21.8h, v21.8h, v5.8b + sqxtun v3.8b, v19.8h + + st1 {v0.8b}, [x3], x1 + uaddw v22.8h, v22.8h, v6.8b + st1 {v1.8b}, [x3], x1 + sqxtun v4.8b, v20.8h + st1 {v2.8b}, [x3], x1 + uaddw v23.8h, v23.8h, v7.8b + st1 {v3.8b}, [x3], x1 + sqxtun v5.8b, v21.8h + st1 {v4.8b}, [x3], x1 + sqxtun
v6.8b, v22.8h + st1 {v5.8b}, [x3], x1 + sqxtun v7.8b, v23.8h + + st1 {v6.8b}, [x3], x1 + st1 {v7.8b}, [x3], x1 + + ret +endfunc +.endm + +itxfm_func8x8 idct, idct +itxfm_func8x8 iadst, idct +itxfm_func8x8 idct, iadst +itxfm_func8x8 iadst, iadst + + +function idct16x16_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + + movi v1.4h, #0 + + ld1r {v2.4h}, [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + dup v2.8h, v2.h[0] + st1 {v1.h}[0], [x2] + + srshr v2.8h, v2.8h, #6 + + mov x4, #16 +1: + // Loop to add the constant from v2 into all 16x16 outputs + ld1 {v3.16b}, [x0] + uaddw v4.8h, v2.8h, v3.8b + uaddw2 v5.8h, v2.8h, v3.16b + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v5.8h + st1 {v4.16b}, [x0], x1 + subs x4, x4, #1 + b.ne 1b + + ret +endfunc + +.macro idct16 + dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a + dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a + dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a + dmbutterfly v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a + dmbutterfly v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a + dmbutterfly v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a + dmbutterfly v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a + dmbutterfly v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a + + butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, 
v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + + butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a + butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6 + butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5 + butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4 + butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a + butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10 + butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13 + butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a + + dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a + dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11 + + butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15] + butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14] + butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6] + butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8] + butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13] + butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12] + butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11] + butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10] +.endm + +.macro iadst16 + ld1 {v0.8h,v1.8h}, [x11] + + dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0 + dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0] // v10,v11 = t9, v8,v9 = t8 + dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a + dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2 + dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a + + dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2] // v6,v7 = t11, v4,v5 = t10 + dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // 
v29 = t3a, v26 = t11a + dmbutterfly_l v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4] // v10,v11 = t5, v8,v9 = t4 + dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a + + dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12 + dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a + dmbutterfly_l v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6] // v6,v7 = t7, v4,v5 = t6 + dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a + + dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14 + ld1 {v0.8h}, [x10] + dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a + dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4] // v14,v15 = t9, v12,v13 = t8 + dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a + + dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3] // v4,v5 = t12, v6,v7 = t13 + dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a + dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6] // v10,v11 = t11, v8,v9 = t10 + butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0 + dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a + + dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5] // v12,v13 = t14, v14,v15 = t15 + butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1 + dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a + dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a + + butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2 + butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3 + + dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2] // v10,v11 = t13, v8,v9 = t12 + dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1] // v12,v13 = t14, v14,v15 = t15 + + dbutterfly_n v18, v30, v8, v9, v12, 
v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a + dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a + neg v29.8h, v29.8h // v29 = out[13] + + dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2] // v10,v11 = t5a, v8,v9 = t4a + dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1] // v12,v13 = t6a, v14,v15 = t7a + + butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a + butterfly_8h v3, v7, v23, v21 // v3 =-out[1], v7 = t10 + + dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6 + neg v19.8h, v19.8h // v19 = out[3] + dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7 + + butterfly_8h v5, v8, v20, v22 // v5 =-out[15],v8 = t3a + butterfly_8h v4, v9, v24, v26 // v4 = out[14],v9 = t11 + + dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8] + dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10] + dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11] + dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9] + + neg v31.8h, v5.8h // v31 = out[15] + neg v17.8h, v3.8h // v17 = out[1] + + mov v16.16b, v2.16b + mov v30.16b, v4.16b +.endm + +// Helper macros; we can't use these expressions directly within +// e.g. .irp due to the extra concatenation \(). Therefore wrap +// them in macros to allow using .irp below. +.macro load i, src, inc + ld1 {v\i\().8h}, [\src], \inc +.endm +.macro store i, dst, inc + st1 {v\i\().8h}, [\dst], \inc +.endm +.macro load_clear i, src, inc + ld1 {v\i\().8h}, [\src] + st1 {v2.8h}, [\src], \inc +.endm + +// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, +// transpose into a horizontal 16x8 slice and store. 
+// x0 = dst (temp buffer) +// x1 = unused +// x2 = src +// x3 = slice offset +.macro itxfm16_1d_funcs txfm +function \txfm\()16_1d_8x16_pass1_neon + mov x9, #32 + movi v2.8h, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr + + \txfm\()16 + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + cmp x3, #8 + b.eq 1f +.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 + store \i, x0, #16 +.endr + ret +1: + // Special case: For the last input column (x3 == 8), + // which would be stored as the last row in the temp buffer, + // don't store the first 8x8 block, but keep it in registers + // for the first slice of the second pass (where it is the + // last 8x8 block). +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + add x0, x0, #16 + store \i, x0, #16 +.endr + mov v24.16b, v16.16b + mov v25.16b, v17.16b + mov v26.16b, v18.16b + mov v27.16b, v19.16b + mov v28.16b, v20.16b + mov v29.16b, v21.16b + mov v30.16b, v22.16b + mov v31.16b, v23.16b + ret +endfunc + +// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, +// load the destination pixels (from a similar 8x16 slice), add and store back. 
+// x0 = dst +// x1 = dst stride +// x2 = src (temp buffer) +// x3 = slice offset +function \txfm\()16_1d_8x16_pass2_neon + mov x9, #32 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + cbz x3, 1f +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + \txfm\()16 + +.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 + srshr \coef0, \coef0, #6 + ld1 {v2.8b}, [x0], x1 + srshr \coef1, \coef1, #6 + ld1 {v3.8b}, [x3], x1 + srshr \coef2, \coef2, #6 + ld1 {v4.8b}, [x0], x1 + srshr \coef3, \coef3, #6 + uaddw \coef0, \coef0, v2.8b + ld1 {v5.8b}, [x3], x1 + uaddw \coef1, \coef1, v3.8b + srshr \coef4, \coef4, #6 + ld1 {v6.8b}, [x0], x1 + srshr \coef5, \coef5, #6 + ld1 {v7.8b}, [x3], x1 + sqxtun v2.8b, \coef0 + srshr \coef6, \coef6, #6 + sqxtun v3.8b, \coef1 + srshr \coef7, \coef7, #6 + uaddw \coef2, \coef2, v4.8b + ld1 {\tmp1}, [x0], x1 + uaddw \coef3, \coef3, v5.8b + ld1 {\tmp2}, [x3], x1 + sqxtun v4.8b, \coef2 + sub x0, x0, x1, lsl #2 + sub x3, x3, x1, lsl #2 + sqxtun v5.8b, \coef3 + uaddw \coef4, \coef4, v6.8b + st1 {v2.8b}, [x0], x1 + uaddw \coef5, \coef5, v7.8b + st1 {v3.8b}, [x3], x1 + sqxtun v6.8b, \coef4 + st1 {v4.8b}, [x0], x1 + sqxtun v7.8b, \coef5 + st1 {v5.8b}, [x3], x1 + uaddw \coef6, \coef6, \tmp1 + st1 {v6.8b}, [x0], x1 + uaddw \coef7, \coef7, \tmp2 + st1 {v7.8b}, [x3], x1 + sqxtun \tmp1, \coef6 + sqxtun \tmp2, \coef7 + st1 {\tmp1}, [x0], x1 + st1 {\tmp2}, [x3], x1 +.endm + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b +.purgem load_add_store + + ret +endfunc +.endm + +itxfm16_1d_funcs idct +itxfm16_1d_funcs iadst + +.macro itxfm_func16x16 txfm1, txfm2 +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 // eob is an int: the upper half of x3 is unspecified + b.eq idct16x16_dc_add_neon +.endif + mov x15, x30 + //
iadst16 requires clobbering v8-v15, but idct16 doesn't need to. +.ifnc \txfm1\()_\txfm2,idct_idct + stp d14, d15, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! +.endif + + sub sp, sp, #512 + + mov x4, x0 + mov x5, x1 + mov x6, x2 + + movrel x10, idct_coeffs +.ifnc \txfm1\()_\txfm2,idct_idct + movrel x11, iadst16_coeffs +.endif +.ifc \txfm1,idct + ld1 {v0.8h,v1.8h}, [x10] +.endif + +.irp i, 0, 8 + add x0, sp, #(\i*32) + add x2, x6, #(\i*2) + mov x3, #\i + bl \txfm1\()16_1d_8x16_pass1_neon +.endr +.ifc \txfm1\()_\txfm2,iadst_idct + ld1 {v0.8h,v1.8h}, [x10] +.endif +.irp i, 0, 8 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + mov x3, #\i + bl \txfm2\()16_1d_8x16_pass2_neon +.endr + + add sp, sp, #512 +.ifnc \txfm1\()_\txfm2,idct_idct + ldp d8, d9, [sp], 0x10 + ldp d10, d11, [sp], 0x10 + ldp d12, d13, [sp], 0x10 + ldp d14, d15, [sp], 0x10 +.endif + br x15 +endfunc +.endm + +itxfm_func16x16 idct, idct +itxfm_func16x16 iadst, idct +itxfm_func16x16 idct, iadst +itxfm_func16x16 iadst, iadst + + +function idct32x32_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + + movi v1.4h, #0 + + ld1r {v2.4h}, [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + dup v2.8h, v2.h[0] + st1 {v1.h}[0], [x2] + + srshr v0.8h, v2.8h, #6 + + mov x4, #32 +1: + // Loop to add the constant v0 into all 32x32 outputs + ld1 {v1.16b,v2.16b}, [x0] + uaddw v3.8h, v0.8h, v1.8b + uaddw2 v4.8h, v0.8h, v1.16b + uaddw v5.8h, v0.8h, v2.8b + uaddw2 v6.8h, v0.8h, v2.16b + sqxtun v3.8b, v3.8h + sqxtun2 v3.16b, v4.8h + sqxtun v4.8b, v5.8h + sqxtun2 v4.16b, v6.8h + st1 {v3.16b,v4.16b}, [x0], x1 + subs x4, x4, #1 + b.ne 1b + + ret +endfunc + +.macro idct32_odd + ld1 {v0.8h,v1.8h}, [x11] + + dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly v20, v27, 
v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + ld1 {v0.8h}, [x10] + + butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22 + butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + + butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a + butterfly_8h v17, v20, v23, v20 // v17 = t17, v20 = t18 + butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a + butterfly_8h v19, v21, v22, v21 // v19 = t22, v21 = t21 + butterfly_8h v4, v28, v28, v30 // v4 = t24a, v28 = t27a + butterfly_8h v23, v26, v25, v26 // v23 = t25, v26 = t26 + butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a + butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29 + + dmbutterfly v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31 // v27 = t18a, v20 = t29a + dmbutterfly v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31 // v3 = t19, v5 = t28 + dmbutterfly v28, v6, v0.h[1], v0.h[2], v24, v25, 
v30, v31, neg=1 // v28 = t27, v6 = t20
+        dmbutterfly     v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+
+        butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
+        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
+        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
+        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
+        butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
+        butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
+        butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
+        butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20
+
+        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
+        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
+        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
+        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
+.endm
+
+// Do a 32-point IDCT of a 8x32 slice out of a 32x32 matrix.
+// The 32-point IDCT can be decomposed into two 16-point IDCTs;
+// a normal IDCT16 with every other input component (the even ones, with
+// each output written twice), followed by a separate 16-point IDCT
+// of the odd inputs, added/subtracted onto the outputs of the first idct16.
+//
+// Every input row that is read is overwritten with zeros in the same
+// loop, so the slice of the coefficient block is cleared as a side effect.
+// The result is stored transposed: 8 rows of 32 int16 (64 bytes per row,
+// 512 bytes in total) starting at x0.
+//
+// x0 = dst (temp buffer); advanced by 512 bytes on return
+// x1 = unused
+// x2 = src
+// x10 = idct_coeffs
+// x11 = idct_coeffs + 32
+// Overwrites x2, x9, v0-v7 and v16-v31 in the code visible here.
+function idct32_1d_8x32_pass1_neon
+        ld1             {v0.8h,v1.8h}, [x10]
+
+        // Double stride of the input, since we only read every other line
+        mov             x9,  #128
+        // Zero vector used to clear each input row right after loading it
+        movi            v4.8h, #0
+
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x2]
+        st1             {v4.8h},  [x2], x9
+.endr
+
+        idct16
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
+        // two transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the registers a, b horizontally, followed by the
+        // same registers b, a mirrored.
+        // Each invocation writes one 64 byte output row and advances x0.
+.macro store_rev a, b
+        // There's no rev128 instruction, but we reverse each 64 bit
+        // half, and then flip them using an ext with 8 bytes offset.
+        rev64           v1.8h, v\b\().8h
+        st1             {v\a\().8h},  [x0], #16
+        rev64           v0.8h, v\a\().8h
+        ext             v1.16b, v1.16b, v1.16b, #8
+        st1             {v\b\().8h},  [x0], #16
+        ext             v0.16b, v0.16b, v0.16b, #8
+        st1             {v1.8h},  [x0], #16
+        st1             {v0.8h},  [x0], #16
+.endm
+        store_rev       16, 24
+        store_rev       17, 25
+        store_rev       18, 26
+        store_rev       19, 27
+        store_rev       20, 28
+        store_rev       21, 29
+        store_rev       22, 30
+        store_rev       23, 31
+        // Rewind x0 over the 8 rows (8 * 64 bytes) just written, so that
+        // the odd half below can be accumulated onto the same rows.
+        sub             x0,  x0,  #512
+.purgem store_rev
+
+        // Move x2 back to the start of the input, and move
+        // to the first odd row
+        sub             x2,  x2,  x9, lsl #4
+        add             x2,  x2,  #64
+
+        // v4 was used as scratch above; set it to zero again before
+        // clearing the odd input rows.
+        movi            v4.8h, #0
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x2]
+        st1             {v4.8h},  [x2], x9
+.endr
+
+        idct32_odd
+
+        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
+        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
+
+        // Store the registers a, b horizontally,
+        // adding into the output first, and the mirrored,
+        // subtracted from the output.
+        // Each invocation read-modify-writes one 64 byte row at x0.
+.macro store_rev a, b
+        ld1             {v4.8h},  [x0]
+        rev64           v1.8h, v\b\().8h
+        add             v4.8h, v4.8h, v\a\().8h
+        rev64           v0.8h, v\a\().8h
+        st1             {v4.8h},  [x0], #16
+        ext             v1.16b, v1.16b, v1.16b, #8
+        ld1             {v5.8h},  [x0]
+        ext             v0.16b, v0.16b, v0.16b, #8
+        add             v5.8h, v5.8h, v\b\().8h
+        st1             {v5.8h},  [x0], #16
+        ld1             {v6.8h},  [x0]
+        sub             v6.8h, v6.8h, v1.8h
+        st1             {v6.8h},  [x0], #16
+        ld1             {v7.8h},  [x0]
+        sub             v7.8h, v7.8h, v0.8h
+        st1             {v7.8h},  [x0], #16
+.endm
+
+        store_rev       31, 23
+        store_rev       30, 22
+        store_rev       29, 21
+        store_rev       28, 20
+        store_rev       27, 19
+        store_rev       26, 18
+        store_rev       25, 17
+        store_rev       24, 16
+.purgem store_rev
+        ret
+endfunc
+
+// This is mostly the same as 8x32_pass1, but without the transpose,
+// and use the source as temp buffer between the two idct passes, and
+// add into the destination.
+//
+// The idct16 outputs are written back over the even input rows of the
+// temp buffer, and are re-read from there while the odd half is
+// accumulated. The sums are rounded with a shift by 6, added to the
+// 8 bit destination pixels and stored with unsigned saturation.
+//
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x10 = idct_coeffs
+// x11 = idct_coeffs + 32
+// Overwrites x0, x2, x9, v0-v7 and v16-v31 in the code visible here.
+function idct32_1d_8x32_pass2_neon
+        ld1             {v0.8h,v1.8h}, [x10]
+
+        // Stride between every other row of the temp buffer (2 * 64 bytes)
+        mov             x9, #128
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x2], x9
+.endr
+        // Rewind over the 16 rows just read (16 * 128 bytes)
+        sub             x2,  x2,  x9, lsl #4
+
+        idct16
+
+        // Park the idct16 outputs in the even row slots they were read from
+        mov             x9, #128
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        st1             {v\i\().8h}, [x2], x9
+.endr
+
+        // Back to the start of the buffer, then on to the first odd row
+        sub             x2,  x2,  x9, lsl #4
+        add             x2,  x2,  #64
+
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x2], x9
+.endr
+        // Return x2 to the first even row slot, i.e. the first idct16 output
+        sub             x2,  x2,  x9, lsl #4
+        sub             x2,  x2,  #64
+
+        idct32_odd
+
+        mov             x9, #128
+        // Handle four output rows: load four parked idct16 outputs from x2
+        // (stepping by x9), add (neg=0) or subtract (neg=1) the odd-half
+        // registers a-d, round-shift by 6, add the four dst rows and
+        // store them back as saturated unsigned bytes. x0 ends up four
+        // rows further down.
+.macro load_acc_store a, b, c, d, neg=0
+        ld1             {v4.8h},  [x2], x9
+        ld1             {v5.8h},  [x2], x9
+.if \neg == 0
+        add             v4.8h, v4.8h, v\a\().8h
+        ld1             {v6.8h},  [x2], x9
+        add             v5.8h, v5.8h, v\b\().8h
+        ld1             {v7.8h},  [x2], x9
+        add             v6.8h, v6.8h, v\c\().8h
+        add             v7.8h, v7.8h, v\d\().8h
+.else
+        sub             v4.8h, v4.8h, v\a\().8h
+        ld1             {v6.8h},  [x2], x9
+        sub             v5.8h, v5.8h, v\b\().8h
+        ld1             {v7.8h},  [x2], x9
+        sub             v6.8h, v6.8h, v\c\().8h
+        sub             v7.8h, v7.8h, v\d\().8h
+.endif
+        ld1             {v0.8b}, [x0], x1
+        ld1             {v1.8b}, [x0], x1
+        srshr           v4.8h, v4.8h, #6
+        ld1             {v2.8b}, [x0], x1
+        srshr           v5.8h, v5.8h, #6
+        uaddw           v4.8h, v4.8h, v0.8b
+        ld1             {v3.8b}, [x0], x1
+        srshr           v6.8h, v6.8h, #6
+        uaddw           v5.8h, v5.8h, v1.8b
+        srshr           v7.8h, v7.8h, #6
+        // Step x0 back over the four dst rows just loaded
+        sub             x0,  x0,  x1, lsl #2
+        uaddw           v6.8h, v6.8h, v2.8b
+        sqxtun          v4.8b, v4.8h
+        uaddw           v7.8h, v7.8h, v3.8b
+        sqxtun          v5.8b, v5.8h
+        st1             {v4.8b}, [x0], x1
+        sqxtun          v6.8b, v6.8h
+        st1             {v5.8b}, [x0], x1
+        sqxtun          v7.8b, v7.8h
+        st1             {v6.8b}, [x0], x1
+        st1             {v7.8b}, [x0], x1
+.endm
+        load_acc_store  31, 30, 29, 28
+        load_acc_store  27, 26, 25, 24
+        load_acc_store  23, 22, 21, 20
+        load_acc_store  19, 18, 17, 16
+        // Step back onto the last parked idct16 row and walk the temp
+        // buffer backwards for the second, subtracted half of the outputs.
+        sub             x2,  x2,  x9
+        neg             x9,  x9
+        load_acc_store  16, 17, 18, 19, 1
+        load_acc_store  20, 21, 22, 23, 1
+        load_acc_store  24, 25, 26, 27, 1
+        load_acc_store  28, 29, 30, 31, 1
+.purgem load_acc_store
+        ret
+endfunc
+
+// Full 32x32 inverse DCT, added onto the destination. Per the C prototype:
+// x0 = dst
+// x1 = dst stride
+// x2 = block (coefficients)
+// w3 = eob
+// When eob is 1 the work is handed to idct32x32_dc_add_neon. Otherwise the
+// transform runs as two passes over four 8 pixel wide slices each, with a
+// 2048 byte (32x32 int16) temp buffer on the stack in between.
+function ff_vp9_idct_idct_32x32_add_neon, export=1
+        // eob is a 32 bit int; the upper half of x3 is unspecified by the
+        // procedure call standard, so only w3 may be compared.
+        cmp             w3,  #1
+        b.eq            idct32x32_dc_add_neon
+
+        movrel          x10, idct_coeffs
+        add             x11, x10, #32
+
+        // Keep the return address in x15; the bl calls below overwrite x30
+        mov             x15, x30
+
+        // d8-d15 are callee-saved under AAPCS64
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+
+        sub             sp,  sp,  #2048
+
+        // The pass functions take their own arguments in x0-x2, so keep
+        // the original dst, stride and block pointers in x4-x6.
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+        // Pass 1: 8 input columns (i*2 bytes into the block) per call,
+        // producing 8 transposed rows (i*64 bytes into the temp buffer).
+.irp i, 0, 8, 16, 24
+        add             x0,  sp,  #(\i*64)
+        add             x2,  x6,  #(\i*2)
+        bl              idct32_1d_8x32_pass1_neon
+.endr
+        // Pass 2: 8 columns of the temp buffer per call, added onto the
+        // matching 8 pixel wide column of dst.
+.irp i, 0, 8, 16, 24
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        bl              idct32_1d_8x32_pass2_neon
+.endr
+
+        add             sp,  sp,  #2048
+
+        ldp             d8,  d9,  [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+
+        // Return through the saved link register. ret (rather than br)
+        // marks this as a function return for the branch predictor.
+        ret             x15
+endfunc