/* -*-arm64-*- * vim: syntax=arm64asm * * Copyright (c) 2022 J. Dekker * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/aarch64/asm.S" #define MAX_PB_SIZE 64 const qpel_filters, align=4 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte -1, 4,-10, 58, 17, -5, 1, 0 .byte -1, 4,-11, 40, 40,-11, 4, -1 .byte 0, 1, -5, 17, 58,-10, 4, -1 endconst const qpel_filters_abs, align=4 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 1, 4, 10, 58, 17, 5, 1, 0 .byte 1, 4, 11, 40, 40, 11, 4, 1 .byte 0, 1, 5, 17, 58, 10, 4, 1 endconst .macro load_filter m movrel x15, qpel_filters add x15, x15, \m, lsl #3 ld1 {v0.8b}, [x15] sxtl v0.8h, v0.8b .endm .macro load_qpel_filterb freg, xreg movrel \xreg, qpel_filters_abs add \xreg, \xreg, \freg, lsl #3 ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4 ld4r {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg] .endm .macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7 umull \dst\().8h, \src1\().8b, v1.8b umlsl \dst\().8h, \src0\().8b, v0.8b umlsl \dst\().8h, \src2\().8b, v2.8b umlal \dst\().8h, \src3\().8b, v3.8b umlal \dst\().8h, \src4\().8b, v4.8b umlsl \dst\().8h, \src5\().8b, v5.8b umlal \dst\().8h, \src6\().8b, v6.8b umlsl \dst\().8h, \src7\().8b, v7.8b .endm .macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7 umull2 \dst\().8h, 
\src1\().16b, v1.16b umlsl2 \dst\().8h, \src0\().16b, v0.16b umlsl2 \dst\().8h, \src2\().16b, v2.16b umlal2 \dst\().8h, \src3\().16b, v3.16b umlal2 \dst\().8h, \src4\().16b, v4.16b umlsl2 \dst\().8h, \src5\().16b, v5.16b umlal2 \dst\().8h, \src6\().16b, v6.16b umlsl2 \dst\().8h, \src7\().16b, v7.16b .endm .macro load_qpel_filterh freg, xreg movrel \xreg, qpel_filters add \xreg, \xreg, \freg, lsl #3 ld1 {v0.8b}, [\xreg] sxtl v0.8h, v0.8b .endm .macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6 smull \dst\().4s, \src0\().4h, v0.h[0] smlal \dst\().4s, \src1\().4h, v0.h[1] smlal \dst\().4s, \src2\().4h, v0.h[2] smlal \dst\().4s, \src3\().4h, v0.h[3] smlal \dst\().4s, \src4\().4h, v0.h[4] smlal \dst\().4s, \src5\().4h, v0.h[5] smlal \dst\().4s, \src6\().4h, v0.h[6] smlal \dst\().4s, \src7\().4h, v0.h[7] .ifc \op, sshr sshr \dst\().4s, \dst\().4s, \shift .else \op \dst\().4h, \dst\().4s, \shift .endif .endm .macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6 smull2 \dstt\().4s, \src0\().8h, v0.h[0] smlal2 \dstt\().4s, \src1\().8h, v0.h[1] smlal2 \dstt\().4s, \src2\().8h, v0.h[2] smlal2 \dstt\().4s, \src3\().8h, v0.h[3] smlal2 \dstt\().4s, \src4\().8h, v0.h[4] smlal2 \dstt\().4s, \src5\().8h, v0.h[5] smlal2 \dstt\().4s, \src6\().8h, v0.h[6] smlal2 \dstt\().4s, \src7\().8h, v0.h[7] .ifc \op, sshr sshr \dst\().4s, \dstt\().4s, \shift .else \op \dst\().8h, \dstt\().4s, \shift .endif .endm .macro calc_all calc v23, v16, v17, v18, v19, v20, v21, v22, v23 b.eq 2f calc v16, v17, v18, v19, v20, v21, v22, v23, v16 b.eq 2f calc v17, v18, v19, v20, v21, v22, v23, v16, v17 b.eq 2f calc v18, v19, v20, v21, v22, v23, v16, v17, v18 b.eq 2f calc v19, v20, v21, v22, v23, v16, v17, v18, v19 b.eq 2f calc v20, v21, v22, v23, v16, v17, v18, v19, v20 b.eq 2f calc v21, v22, v23, v16, v17, v18, v19, v20, v21 b.eq 2f calc v22, v23, v16, v17, v18, v19, v20, v21, v22 b.hi 1b .endm .macro calc_all2 calc v30, v31, v16, v18, v20, 
v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31 b.eq 2f calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17 b.eq 2f calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19 b.eq 2f calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21 b.eq 2f calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23 b.eq 2f calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25 b.eq 2f calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27 b.eq 2f calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29 b.hi 1b .endm .macro put_hevc type .ifc \type, qpel // void put_hevc_qpel_h(int16_t *dst, // uint8_t *_src, ptrdiff_t _srcstride, // int height, intptr_t mx, intptr_t my, int width) dst .req x0 dststride .req x7 src .req x1 srcstride .req x2 height .req x3 heightw .req w3 mx .req x4 width .req w6 .endif .ifc \type, qpel_uni // void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride, // uint8_t *_src, ptrdiff_t _srcstride, // int height, intptr_t mx, intptr_t my, int width) dst .req x0 dststride .req x1 src .req x2 srcstride .req x3 height .req x4 heightw .req w4 mx .req x5 width .req w7 .endif .ifc \type, qpel_bi // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride, // uint8_t *_src, ptrdiff_t _srcstride, // int16_t *src2, int height, intptr_t mx, // intptr_t my, int width) dst .req x0 dststride .req x1 src .req x2 srcstride .req x3 height .req x5 heightw .req w5 mx .req x6 width .req w8 .endif .ifc \type, qpel function ff_hevc_put_hevc_h4_8_neon, export=0 uxtl v16.8h, v16.8b uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b mul v23.4h, v16.4h, v0.h[0] mul v24.4h, v18.4h, v0.h[0] .irpc i, 1234567 ext v20.16b, v16.16b, v17.16b, #(2*\i) ext v21.16b, v18.16b, v19.16b, #(2*\i) mla 
v23.4h, v20.4h, v0.h[\i] mla v24.4h, v21.4h, v0.h[\i] .endr ret endfunc .endif function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1 load_filter mx .ifc \type, qpel_bi mov x16, #(MAX_PB_SIZE << 2) // src2bstridel add x15, x4, #(MAX_PB_SIZE << 1) // src2b .endif sub src, src, #3 mov mx, x30 .ifc \type, qpel mov dststride, #(MAX_PB_SIZE << 1) lsl x13, srcstride, #1 // srcstridel mov x14, #(MAX_PB_SIZE << 2) .else lsl x14, dststride, #1 // dststridel lsl x13, srcstride, #1 // srcstridel .endif add x10, dst, dststride // dstb add x12, src, srcstride // srcb 0: ld1 {v16.8b, v17.8b}, [src], x13 ld1 {v18.8b, v19.8b}, [x12], x13 .ifc \type, qpel_bi ld1 {v25.8h}, [ x4], x16 ld1 {v26.8h}, [x15], x16 .endif bl ff_hevc_put_hevc_h4_8_neon subs heightw, heightw, #2 .ifc \type, qpel st1 {v23.4h}, [dst], x14 st1 {v24.4h}, [x10], x14 .else .ifc \type, qpel_bi sqadd v23.4h, v23.4h, v25.4h sqadd v24.4h, v24.4h, v26.4h sqrshrun v23.8b, v23.8h, #7 sqrshrun v24.8b, v24.8h, #7 .else sqrshrun v23.8b, v23.8h, #6 sqrshrun v24.8b, v24.8h, #6 .endif st1 {v23.s}[0], [dst], x14 st1 {v24.s}[0], [x10], x14 .endif b.gt 0b // double line ret mx endfunc .ifc \type, qpel function ff_hevc_put_hevc_h8_8_neon, export=0 uxtl v16.8h, v16.8b uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b mul v23.8h, v16.8h, v0.h[0] mul v24.8h, v18.8h, v0.h[0] .irpc i, 1234567 ext v20.16b, v16.16b, v17.16b, #(2*\i) ext v21.16b, v18.16b, v19.16b, #(2*\i) mla v23.8h, v20.8h, v0.h[\i] mla v24.8h, v21.8h, v0.h[\i] .endr ret endfunc .endif function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1 load_filter mx .ifc \type, qpel_bi mov x16, #(MAX_PB_SIZE << 2) // src2bstridel add x15, x4, #(MAX_PB_SIZE << 1) // src2b .endif sub src, src, #3 mov mx, x30 .ifc \type, qpel mov dststride, #(MAX_PB_SIZE << 1) lsl x13, srcstride, #1 // srcstridel mov x14, #((MAX_PB_SIZE << 2) - 8) .else lsl x14, dststride, #1 // dststridel lsl x13, srcstride, #1 // srcstridel sub x14, x14, #4 .endif add x10, dst, dststride // dstb add x12, 
src, srcstride // srcb 0: ld1 {v16.8b, v17.8b}, [src], x13 ld1 {v18.8b, v19.8b}, [x12], x13 .ifc \type, qpel_bi ld1 {v25.8h}, [ x4], x16 ld1 {v26.8h}, [x15], x16 .endif bl ff_hevc_put_hevc_h8_8_neon subs heightw, heightw, #2 .ifc \type, qpel st1 {v23.4h}, [dst], #8 st1 {v24.4h}, [x10], #8 st1 {v23.s}[2], [dst], x14 st1 {v24.s}[2], [x10], x14 .else .ifc \type, qpel_bi sqadd v23.8h, v23.8h, v25.8h sqadd v24.8h, v24.8h, v26.8h sqrshrun v23.8b, v23.8h, #7 sqrshrun v24.8b, v24.8h, #7 .else sqrshrun v23.8b, v23.8h, #6 sqrshrun v24.8b, v24.8h, #6 .endif st1 {v23.s}[0], [dst], #4 st1 {v24.s}[0], [x10], #4 st1 {v23.h}[2], [dst], x14 st1 {v24.h}[2], [x10], x14 .endif b.gt 0b // double line ret mx endfunc function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1 load_filter mx .ifc \type, qpel_bi mov x16, #(MAX_PB_SIZE << 2) // src2bstridel add x15, x4, #(MAX_PB_SIZE << 1) // src2b .endif sub src, src, #3 mov mx, x30 .ifc \type, qpel mov dststride, #(MAX_PB_SIZE << 1) lsl x13, srcstride, #1 // srcstridel mov x14, #(MAX_PB_SIZE << 2) .else lsl x14, dststride, #1 // dststridel lsl x13, srcstride, #1 // srcstridel .endif add x10, dst, dststride // dstb add x12, src, srcstride // srcb 0: ld1 {v16.8b, v17.8b}, [src], x13 ld1 {v18.8b, v19.8b}, [x12], x13 .ifc \type, qpel_bi ld1 {v25.8h}, [ x4], x16 ld1 {v26.8h}, [x15], x16 .endif bl ff_hevc_put_hevc_h8_8_neon subs heightw, heightw, #2 .ifc \type, qpel st1 {v23.8h}, [dst], x14 st1 {v24.8h}, [x10], x14 .else .ifc \type, qpel_bi sqadd v23.8h, v23.8h, v25.8h sqadd v24.8h, v24.8h, v26.8h sqrshrun v23.8b, v23.8h, #7 sqrshrun v24.8b, v24.8h, #7 .else sqrshrun v23.8b, v23.8h, #6 sqrshrun v24.8b, v24.8h, #6 .endif st1 {v23.8b}, [dst], x14 st1 {v24.8b}, [x10], x14 .endif b.gt 0b // double line ret mx endfunc .ifc \type, qpel function ff_hevc_put_hevc_h16_8_neon, export=0 uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b mul v26.8h, v16.8h, v0.h[0] mul v27.8h, v17.8h, v0.h[0] mul v28.8h, v19.8h, v0.h[0] mul v29.8h, 
v20.8h, v0.h[0] .irpc i, 1234567 ext v22.16b, v16.16b, v17.16b, #(2*\i) ext v23.16b, v17.16b, v18.16b, #(2*\i) ext v24.16b, v19.16b, v20.16b, #(2*\i) ext v25.16b, v20.16b, v21.16b, #(2*\i) mla v26.8h, v22.8h, v0.h[\i] mla v27.8h, v23.8h, v0.h[\i] mla v28.8h, v24.8h, v0.h[\i] mla v29.8h, v25.8h, v0.h[\i] .endr ret endfunc .endif function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1 load_filter mx sxtw height, heightw .ifc \type, qpel_bi ldrh w8, [sp] // width mov x16, #(MAX_PB_SIZE << 2) // src2bstridel lsl x17, height, #7 // src2b reset (height * (MAX_PB_SIZE << 1)) add x15, x4, #(MAX_PB_SIZE << 1) // src2b .endif sub src, src, #3 mov mx, x30 .ifc \type, qpel mov dststride, #(MAX_PB_SIZE << 1) lsl x13, srcstride, #1 // srcstridel mov x14, #((MAX_PB_SIZE << 2) - 16) .else lsl x14, dststride, #1 // dststridel lsl x13, srcstride, #1 // srcstridel sub x14, x14, #8 .endif add x10, dst, dststride // dstb add x12, src, srcstride // srcb 0: mov x9, height 1: ld1 {v16.8b-v18.8b}, [src], x13 ld1 {v19.8b-v21.8b}, [x12], x13 uxtl v16.8h, v16.8b uxtl v19.8h, v19.8b bl ff_hevc_put_hevc_h16_8_neon subs x9, x9, #2 .ifc \type, qpel st1 {v26.8h}, [dst], #16 st1 {v28.8h}, [x10], #16 st1 {v27.4h}, [dst], x14 st1 {v29.4h}, [x10], x14 .else .ifc \type, qpel_bi ld1 {v16.8h, v17.8h}, [ x4], x16 ld1 {v18.8h, v19.8h}, [x15], x16 sqadd v26.8h, v26.8h, v16.8h sqadd v27.8h, v27.8h, v17.8h sqadd v28.8h, v28.8h, v18.8h sqadd v29.8h, v29.8h, v19.8h sqrshrun v26.8b, v26.8h, #7 sqrshrun v27.8b, v27.8h, #7 sqrshrun v28.8b, v28.8h, #7 sqrshrun v29.8b, v29.8h, #7 .else sqrshrun v26.8b, v26.8h, #6 sqrshrun v27.8b, v27.8h, #6 sqrshrun v28.8b, v28.8h, #6 sqrshrun v29.8b, v29.8h, #6 .endif st1 {v26.8b}, [dst], #8 st1 {v28.8b}, [x10], #8 st1 {v27.s}[0], [dst], x14 st1 {v29.s}[0], [x10], x14 .endif b.gt 1b // double line subs width, width, #12 // reset src msub src, srcstride, height, src msub x12, srcstride, height, x12 // reset dst msub dst, dststride, height, dst msub x10, dststride, height, x10 .ifc 
\type, qpel_bi // reset xsrc sub x4, x4, x17 sub x15, x15, x17 add x4, x4, #24 add x15, x15, #24 .endif add src, src, #12 add x12, x12, #12 .ifc \type, qpel add dst, dst, #24 add x10, x10, #24 .else add dst, dst, #12 add x10, x10, #12 .endif b.gt 0b ret mx endfunc function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1 load_filter mx sxtw height, heightw mov mx, x30 .ifc \type, qpel_bi ldrh w8, [sp] // width mov x16, #(MAX_PB_SIZE << 2) // src2bstridel add x15, x4, #(MAX_PB_SIZE << 1) // src2b .endif sub src, src, #3 mov mx, x30 .ifc \type, qpel mov dststride, #(MAX_PB_SIZE << 1) lsl x13, srcstride, #1 // srcstridel mov x14, #(MAX_PB_SIZE << 2) .else lsl x14, dststride, #1 // dststridel lsl x13, srcstride, #1 // srcstridel .endif add x10, dst, dststride // dstb add x12, src, srcstride // srcb 1: ld1 {v16.8b-v18.8b}, [src], x13 ld1 {v19.8b-v21.8b}, [x12], x13 uxtl v16.8h, v16.8b uxtl v19.8h, v19.8b bl ff_hevc_put_hevc_h16_8_neon subs height, height, #2 .ifc \type, qpel st1 {v26.8h, v27.8h}, [dst], x14 st1 {v28.8h, v29.8h}, [x10], x14 .else .ifc \type, qpel_bi ld1 {v16.8h, v17.8h}, [ x4], x16 ld1 {v18.8h, v19.8h}, [x15], x16 sqadd v26.8h, v26.8h, v16.8h sqadd v27.8h, v27.8h, v17.8h sqadd v28.8h, v28.8h, v18.8h sqadd v29.8h, v29.8h, v19.8h sqrshrun v26.8b, v26.8h, #7 sqrshrun v27.8b, v27.8h, #7 sqrshrun v28.8b, v28.8h, #7 sqrshrun v29.8b, v29.8h, #7 .else sqrshrun v26.8b, v26.8h, #6 sqrshrun v27.8b, v27.8h, #6 sqrshrun v28.8b, v28.8h, #6 sqrshrun v29.8b, v29.8h, #6 .endif st1 {v26.8b, v27.8b}, [dst], x14 st1 {v28.8b, v29.8b}, [x10], x14 .endif b.gt 1b // double line ret mx endfunc function ff_hevc_put_hevc_\type\()_h32_8_neon, export=1 load_filter mx sxtw height, heightw mov mx, x30 .ifc \type, qpel_bi ldrh w8, [sp] // width mov x16, #(MAX_PB_SIZE << 2) // src2bstridel lsl x17, x5, #7 // src2b reset add x15, x4, #(MAX_PB_SIZE << 1) // src2b sub x16, x16, width, uxtw #1 .endif sub src, src, #3 mov mx, x30 .ifc \type, qpel mov dststride, #(MAX_PB_SIZE << 1) lsl x13, 
srcstride, #1 // srcstridel mov x14, #(MAX_PB_SIZE << 2) sub x14, x14, width, uxtw #1 .else lsl x14, dststride, #1 // dststridel lsl x13, srcstride, #1 // srcstridel sub x14, x14, width, uxtw .endif sub x13, x13, width, uxtw sub x13, x13, #8 add x10, dst, dststride // dstb add x12, src, srcstride // srcb 0: mov w9, width ld1 {v16.8b}, [src], #8 ld1 {v19.8b}, [x12], #8 uxtl v16.8h, v16.8b uxtl v19.8h, v19.8b 1: ld1 {v17.8b-v18.8b}, [src], #16 ld1 {v20.8b-v21.8b}, [x12], #16 bl ff_hevc_put_hevc_h16_8_neon subs w9, w9, #16 mov v16.16b, v18.16b mov v19.16b, v21.16b .ifc \type, qpel st1 {v26.8h, v27.8h}, [dst], #32 st1 {v28.8h, v29.8h}, [x10], #32 .else .ifc \type, qpel_bi ld1 {v20.8h, v21.8h}, [ x4], #32 ld1 {v22.8h, v23.8h}, [x15], #32 sqadd v26.8h, v26.8h, v20.8h sqadd v27.8h, v27.8h, v21.8h sqadd v28.8h, v28.8h, v22.8h sqadd v29.8h, v29.8h, v23.8h sqrshrun v26.8b, v26.8h, #7 sqrshrun v27.8b, v27.8h, #7 sqrshrun v28.8b, v28.8h, #7 sqrshrun v29.8b, v29.8h, #7 .else sqrshrun v26.8b, v26.8h, #6 sqrshrun v27.8b, v27.8h, #6 sqrshrun v28.8b, v28.8h, #6 sqrshrun v29.8b, v29.8h, #6 .endif st1 {v26.8b, v27.8b}, [dst], #16 st1 {v28.8b, v29.8b}, [x10], #16 .endif b.gt 1b // double line subs height, height, #2 add src, src, x13 add x12, x12, x13 add dst, dst, x14 add x10, x10, x14 .ifc \type, qpel_bi add x4, x4, x16 add x15, x15, x16 .endif b.gt 0b ret mx endfunc .unreq height .unreq heightw .unreq width .unreq src .unreq dst .unreq srcstride .unreq dststride .unreq mx .endm put_hevc qpel put_hevc qpel_uni put_hevc qpel_bi function ff_hevc_put_hevc_qpel_v4_8_neon, export=1 load_qpel_filterb x5, x4 sub x1, x1, x2, lsl #1 mov x9, #(MAX_PB_SIZE * 2) sub x1, x1, x2 ldr s16, [x1] ldr s17, [x1, x2] add x1, x1, x2, lsl #1 ldr s18, [x1] ldr s19, [x1, x2] add x1, x1, x2, lsl #1 ldr s20, [x1] ldr s21, [x1, x2] add x1, x1, x2, lsl #1 ldr s22, [x1] add x1, x1, x2 .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 ld1 {\tmp\().s}[0], [x1], x2 movi v24.8h, #0 calc_qpelb v24, 
\src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 st1 {v24.4h}, [x0], x9 subs w3, w3, #1 b.eq 2f .endm 1: calc_all .purgem calc 2: ret endfunc function ff_hevc_put_hevc_qpel_v6_8_neon, export=1 load_qpel_filterb x5, x4 sub x1, x1, x2, lsl #1 mov x9, #(MAX_PB_SIZE * 2 - 8) sub x1, x1, x2 ldr d16, [x1] ldr d17, [x1, x2] add x1, x1, x2, lsl #1 ldr d18, [x1] ldr d19, [x1, x2] add x1, x1, x2, lsl #1 ldr d20, [x1] ldr d21, [x1, x2] add x1, x1, x2, lsl #1 ldr d22, [x1] add x1, x1, x2 .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 ld1 {\tmp\().8b}, [x1], x2 movi v24.8h, #0 calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 st1 {v24.4h}, [x0], #8 st1 {v24.s}[2], [x0], x9 subs w3, w3, #1 .endm 1: calc_all .purgem calc 2: ret endfunc function ff_hevc_put_hevc_qpel_v8_8_neon, export=1 load_qpel_filterb x5, x4 sub x1, x1, x2, lsl #1 mov x9, #(MAX_PB_SIZE * 2) sub x1, x1, x2 ldr d16, [x1] ldr d17, [x1, x2] add x1, x1, x2, lsl #1 ldr d18, [x1] ldr d19, [x1, x2] add x1, x1, x2, lsl #1 ldr d20, [x1] ldr d21, [x1, x2] add x1, x1, x2, lsl #1 ldr d22, [x1] add x1, x1, x2 .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 ld1 {\tmp\().8b}, [x1], x2 movi v24.8h, #0 calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 st1 {v24.8h}, [x0], x9 subs w3, w3, #1 .endm 1: calc_all .purgem calc 2: ret endfunc function ff_hevc_put_hevc_qpel_v12_8_neon, export=1 load_qpel_filterb x5, x4 sub x1, x1, x2, lsl #1 mov x9, #(MAX_PB_SIZE * 2 - 16) sub x1, x1, x2 ldr q16, [x1] ldr q17, [x1, x2] add x1, x1, x2, lsl #1 ldr q18, [x1] ldr q19, [x1, x2] add x1, x1, x2, lsl #1 ldr q20, [x1] ldr q21, [x1, x2] add x1, x1, x2, lsl #1 ldr q22, [x1] add x1, x1, x2 .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 ld1 {\tmp\().16b}, [x1], x2 movi v24.8h, #0 movi v25.8h, #0 calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 st1 {v24.8h}, 
[x0], #16 subs w3, w3, #1 st1 {v25.4h}, [x0], x9 .endm 1: calc_all .purgem calc 2: ret endfunc function ff_hevc_put_hevc_qpel_v16_8_neon, export=1 load_qpel_filterb x5, x4 sub x1, x1, x2, lsl #1 mov x9, #(MAX_PB_SIZE * 2) sub x1, x1, x2 ldr q16, [x1] ldr q17, [x1, x2] add x1, x1, x2, lsl #1 ldr q18, [x1] ldr q19, [x1, x2] add x1, x1, x2, lsl #1 ldr q20, [x1] ldr q21, [x1, x2] add x1, x1, x2, lsl #1 ldr q22, [x1] add x1, x1, x2 .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 ld1 {\tmp\().16b}, [x1], x2 movi v24.8h, #0 movi v25.8h, #0 calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 subs w3, w3, #1 st1 {v24.8h, v25.8h}, [x0], x9 .endm 1: calc_all .purgem calc 2: ret endfunc // todo: reads #32 bytes function ff_hevc_put_hevc_qpel_v24_8_neon, export=1 sub sp, sp, #32 st1 {v8.8b, v9.8b, v10.8b}, [sp] load_qpel_filterb x5, x4 sub x1, x1, x2, lsl #1 sub x1, x1, x2 mov x9, #(MAX_PB_SIZE * 2) ld1 {v16.16b, v17.16b}, [x1], x2 ld1 {v18.16b, v19.16b}, [x1], x2 ld1 {v20.16b, v21.16b}, [x1], x2 ld1 {v22.16b, v23.16b}, [x1], x2 ld1 {v24.16b, v25.16b}, [x1], x2 ld1 {v26.16b, v27.16b}, [x1], x2 ld1 {v28.16b, v29.16b}, [x1], x2 .macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15 ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2 movi v8.8h, #0 movi v9.8h, #0 movi v10.8h, #0 calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15 subs w3, w3, #1 st1 {v8.8h, v9.8h, v10.8h}, [x0], x9 .endm 1: calc_all2 .purgem calc 2: ld1 {v8.8b, v9.8b, v10.8b}, [sp] add sp, sp, #32 ret endfunc function ff_hevc_put_hevc_qpel_v32_8_neon, export=1 sub sp, sp, #32 st1 {v8.8b-v11.8b}, [sp] load_qpel_filterb x5, x4 sub x1, x1, x2, lsl #1 mov x9, #(MAX_PB_SIZE * 2) sub x1, x1, 
x2 ld1 {v16.16b, v17.16b}, [x1], x2 ld1 {v18.16b, v19.16b}, [x1], x2 ld1 {v20.16b, v21.16b}, [x1], x2 ld1 {v22.16b, v23.16b}, [x1], x2 ld1 {v24.16b, v25.16b}, [x1], x2 ld1 {v26.16b, v27.16b}, [x1], x2 ld1 {v28.16b, v29.16b}, [x1], x2 .macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15 ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2 movi v8.8h, #0 movi v9.8h, #0 movi v10.8h, #0 movi v11.8h, #0 calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15 calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15 subs w3, w3, #1 st1 {v8.8h-v11.8h}, [x0], x9 .endm 1: calc_all2 .purgem calc 2: ld1 {v8.8b-v11.8b}, [sp], #32 ret endfunc function ff_hevc_put_hevc_qpel_v48_8_neon, export=1 stp x2, x3, [sp, #-48]! stp x0, x1, [sp, #16] stp x5, x30, [sp, #32] bl X(ff_hevc_put_hevc_qpel_v24_8_neon) ldr x5, [sp, #32] ldp x0, x1, [sp, #16] ldp x2, x3, [sp], #32 add x0, x0, #48 add x1, x1, #24 bl X(ff_hevc_put_hevc_qpel_v24_8_neon) ldr x30, [sp, #8] add sp, sp, #16 ret endfunc function ff_hevc_put_hevc_qpel_v64_8_neon, export=1 sub sp, sp, #32 st1 {v8.8b-v11.8b}, [sp] load_qpel_filterb x5, x4 sub x1, x1, x2, lsl #1 sub x1, x1, x2 mov x9, #(MAX_PB_SIZE * 2) 0: mov x8, x1 // src ld1 {v16.16b, v17.16b}, [x8], x2 mov w11, w3 // height ld1 {v18.16b, v19.16b}, [x8], x2 mov x10, x0 // dst ld1 {v20.16b, v21.16b}, [x8], x2 ld1 {v22.16b, v23.16b}, [x8], x2 ld1 {v24.16b, v25.16b}, [x8], x2 ld1 {v26.16b, v27.16b}, [x8], x2 ld1 {v28.16b, v29.16b}, [x8], x2 .macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15 ld1 {\tmp0\().16b, \tmp1\().16b}, [x8], x2 movi v8.8h, #0 movi v9.8h, #0 movi v10.8h, #0 movi v11.8h, #0 calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 
calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15 calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15 subs x11, x11, #1 st1 {v8.8h-v11.8h}, [x10], x9 .endm 1: calc_all2 .purgem calc 2: add x0, x0, #64 add x1, x1, #32 subs w6, w6, #32 b.hi 0b ld1 {v8.8b-v11.8b}, [sp], #32 ret endfunc function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1 load_qpel_filterb x7, x6 sub x2, x2, x3, lsl #1 sub x2, x2, x3 mov x12, #(MAX_PB_SIZE * 2) ld1 {v16.s}[0], [x2], x3 ld1 {v17.s}[0], [x2], x3 ld1 {v18.s}[0], [x2], x3 ld1 {v19.s}[0], [x2], x3 ld1 {v20.s}[0], [x2], x3 ld1 {v21.s}[0], [x2], x3 ld1 {v22.s}[0], [x2], x3 .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 ld1 {\tmp\().s}[0], [x2], x3 movi v24.8h, #0 calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 ld1 {v25.4h}, [x4], x12 // src2 sqadd v24.8h, v24.8h, v25.8h sqrshrun v25.8b, v24.8h, #7 subs w5, w5, #1 st1 {v25.s}[0], [x0], x1 .endm 1: calc_all .purgem calc 2: ret endfunc function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1 load_qpel_filterb x7, x6 sub x2, x2, x3, lsl #1 sub x2, x2, x3 ld1 {v16.8b}, [x2], x3 sub x1, x1, #4 ld1 {v17.8b}, [x2], x3 mov x12, #(MAX_PB_SIZE * 2) ld1 {v18.8b}, [x2], x3 ld1 {v19.8b}, [x2], x3 ld1 {v20.8b}, [x2], x3 ld1 {v21.8b}, [x2], x3 ld1 {v22.8b}, [x2], x3 .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 ld1 {\tmp\().8b}, [x2], x3 movi v24.8h, #0 calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 ld1 {v25.8h}, [x4], x12 // src2 sqadd v24.8h, v24.8h, v25.8h sqrshrun v25.8b, v24.8h, #7 st1 {v25.s}[0], [x0], #4 subs w5, w5, #1 st1 {v25.h}[2], [x0], x1 .endm 1: calc_all .purgem calc 2: ret endfunc function ff_hevc_put_hevc_qpel_bi_v8_8_neon, export=1 load_qpel_filterb x7, x6 sub x2, x2, x3, lsl #1 sub x2, x2, x3 mov x12, #(MAX_PB_SIZE * 2) ld1 {v16.8b}, [x2], x3 ld1 {v17.8b}, [x2], x3 ld1 {v18.8b}, [x2], x3 
ld1 {v19.8b}, [x2], x3 ld1 {v20.8b}, [x2], x3 ld1 {v21.8b}, [x2], x3 ld1 {v22.8b}, [x2], x3 .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 ld1 {\tmp\().8b}, [x2], x3 movi v24.8h, #0 calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 ld1 {v25.8h}, [x4], x12 // src2 sqadd v24.8h, v24.8h, v25.8h sqrshrun v25.8b, v24.8h, #7 subs w5, w5, #1 st1 {v25.8b}, [x0], x1 .endm 1: calc_all .purgem calc 2: ret endfunc function ff_hevc_put_hevc_qpel_bi_v12_8_neon, export=1 load_qpel_filterb x7, x6 sub x2, x2, x3, lsl #1 sub x2, x2, x3 sub x1, x1, #8 ld1 {v16.16b}, [x2], x3 mov x12, #(MAX_PB_SIZE * 2) ld1 {v17.16b}, [x2], x3 ld1 {v18.16b}, [x2], x3 ld1 {v19.16b}, [x2], x3 ld1 {v20.16b}, [x2], x3 ld1 {v21.16b}, [x2], x3 ld1 {v22.16b}, [x2], x3 .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 ld1 {\tmp\().16b}, [x2], x3 movi v24.8h, #0 movi v25.8h, #0 calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 ld1 {v26.8h, v27.8h}, [x4], x12 // src2 sqadd v24.8h, v24.8h, v26.8h sqadd v25.8h, v25.8h, v27.8h sqrshrun v26.8b, v24.8h, #7 sqrshrun2 v26.16b, v25.8h, #7 st1 {v26.8b}, [x0], #8 subs w5, w5, #1 st1 {v26.s}[2], [x0], x1 .endm 1: calc_all .purgem calc 2: ret endfunc function ff_hevc_put_hevc_qpel_bi_v16_8_neon, export=1 load_qpel_filterb x7, x6 sub x2, x2, x3, lsl #1 sub x2, x2, x3 mov x12, #(MAX_PB_SIZE * 2) ld1 {v16.16b}, [x2], x3 ld1 {v17.16b}, [x2], x3 ld1 {v18.16b}, [x2], x3 ld1 {v19.16b}, [x2], x3 ld1 {v20.16b}, [x2], x3 ld1 {v21.16b}, [x2], x3 ld1 {v22.16b}, [x2], x3 .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 ld1 {\tmp\().16b}, [x2], x3 movi v24.8h, #0 movi v25.8h, #0 calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 ld1 {v26.8h, v27.8h}, [x4], x12 // src2 sqadd v24.8h, v24.8h, v26.8h sqadd v25.8h, v25.8h, v27.8h sqrshrun 
v26.8b, v24.8h, #7 subs w5, w5, #1 sqrshrun2 v26.16b, v25.8h, #7 st1 {v26.16b}, [x0], x1 .endm 1: calc_all .purgem calc 2: ret endfunc function ff_hevc_put_hevc_qpel_bi_v24_8_neon, export=1 stp x4, x5, [sp, #-64]! stp x2, x3, [sp, #16] stp x0, x1, [sp, #32] stp x7, x30, [sp, #48] bl X(ff_hevc_put_hevc_qpel_bi_v16_8_neon) ldp x2, x3, [sp, #16] ldp x0, x1, [sp, #32] ldr x7, [sp, #48] ldp x4, x5, [sp], #48 add x0, x0, #16 add x2, x2, #16 add x4, x4, #32 bl X(ff_hevc_put_hevc_qpel_bi_v8_8_neon) ldr x30, [sp, #8] add sp, sp, #16 ret endfunc function ff_hevc_put_hevc_qpel_bi_v32_8_neon, export=1 stp d8, d9, [sp, #-64]! stp d10, d11, [sp, #16] stp d12, d13, [sp, #32] stp d14, d15, [sp, #48] sub x2, x2, x3, lsl #1 sub x2, x2, x3 load_qpel_filterb x7, x6 ldr w6, [sp, #64] mov x12, #(MAX_PB_SIZE * 2) 0: mov x8, x2 // src ld1 {v16.16b, v17.16b}, [x8], x3 mov w11, w5 // height ld1 {v18.16b, v19.16b}, [x8], x3 mov x10, x0 // dst ld1 {v20.16b, v21.16b}, [x8], x3 mov x9, x4 // src2 ld1 {v22.16b, v23.16b}, [x8], x3 ld1 {v24.16b, v25.16b}, [x8], x3 ld1 {v26.16b, v27.16b}, [x8], x3 ld1 {v28.16b, v29.16b}, [x8], x3 .macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15 ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x3 movi v8.8h, #0 movi v9.8h, #0 movi v10.8h, #0 movi v11.8h, #0 calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15 calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15 ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12 // src2 sqadd v8.8h, v8.8h, v12.8h sqadd v9.8h, v9.8h, v13.8h sqadd v10.8h, v10.8h, v14.8h sqadd v11.8h, v11.8h, v15.8h sqrshrun v12.8b, v8.8h, #7 sqrshrun2 v12.16b, v9.8h, #7 sqrshrun v13.8b, v10.8h, #7 sqrshrun2 v13.16b, v11.8h, #7 subs x11, x11, #1 st1 {v12.16b, v13.16b}, [x10], x1 .endm 1: calc_all2 .purgem 
calc 2: add x0, x0, #32 // dst add x2, x2, #32 // src add x4, x4, #64 // src2 subs w6, w6, #32 b.ne 0b ldp d10, d11, [sp, #16] ldp d12, d13, [sp, #32] ldp d14, d15, [sp, #48] ldp d8, d9, [sp], #64 ret endfunc function ff_hevc_put_hevc_qpel_bi_v48_8_neon, export=1 mov x8, #32 str x8, [sp, #-80]! stp x4, x5, [sp, #16] stp x2, x3, [sp, #32] stp x0, x1, [sp, #48] stp x7, x30, [sp, #64] bl X(ff_hevc_put_hevc_qpel_bi_v32_8_neon) ldp x4, x5, [sp, #16] ldp x2, x3, [sp, #32] ldp x0, x1, [sp, #48] ldr x7, [sp, #64] add sp, sp, #64 add x0, x0, #32 add x2, x2, #32 add x4, x4, #64 bl X(ff_hevc_put_hevc_qpel_bi_v16_8_neon) ldr x30, [sp, #8] add sp, sp, #16 ret endfunc function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1 b X(ff_hevc_put_hevc_qpel_bi_v32_8_neon) endfunc function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1 1: ldr s0, [x2] ldr s1, [x2, x3] subs w4, w4, #2 add x2, x2, x3, lsl #1 str s0, [x0] str s1, [x0, x1] add x0, x0, x1, lsl #1 b.hi 1b ret endfunc function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1 sub x1, x1, #4 1: ldr d0, [x2] ldr d1, [x2, x3] subs w4, w4, #2 add x2, x2, x3, lsl #1 str s0, [x0], #4 st1 {v0.h}[2], [x0], x1 str s1, [x0], #4 st1 {v1.h}[2], [x0], x1 b.hi 1b ret endfunc function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1 1: ldr d0, [x2] ldr d1, [x2, x3] subs w4, w4, #2 add x2, x2, x3, lsl #1 str d0, [x0] str d1, [x0, x1] add x0, x0, x1, lsl #1 b.hi 1b ret endfunc function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1 sub x1, x1, #8 1: ldr q0, [x2] ldr q1, [x2, x3] subs w4, w4, #2 add x2, x2, x3, lsl #1 str d0, [x0], #8 st1 {v0.s}[2], [x0], x1 str d1, [x0], #8 st1 {v1.s}[2], [x0], x1 b.hi 1b ret endfunc function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1 1: ldr q0, [x2] ldr q1, [x2, x3] subs w4, w4, #2 add x2, x2, x3, lsl #1 str q0, [x0] str q1, [x0, x1] add x0, x0, x1, lsl #1 b.hi 1b ret endfunc function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1 1: ld1 {v0.8b, v1.8b, v2.8b}, [x2], x3 subs w4, w4, #1 st1 
{v0.8b, v1.8b, v2.8b}, [x0], x1
        b.hi            1b
        ret
endfunc
// NOTE: the four lines above finish a function that starts earlier in the
// file; they are kept exactly as they were.

// Plain row copy, 32 bytes per row.
// Each pass loads one row from [x2] (pointer then advanced by x3), stores it
// to [x0] (pointer then advanced by x1) and decrements the row counter w4.
function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1
1:      ld1             {v0.16b, v1.16b}, [x2], x3
        subs            w4, w4, #1              // sets the flags tested by b.hi; st1 does not touch them
        st1             {v0.16b, v1.16b}, [x0], x1
        b.hi            1b
        ret
endfunc

// Plain row copy, 48 bytes per row (same loop shape as the 32-byte variant).
function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1
1:      ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
        subs            w4, w4, #1
        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
        b.hi            1b
        ret
endfunc

// Plain row copy, 64 bytes per row (same loop shape as the 32-byte variant).
function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
        subs            w4, w4, #1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        b.hi            1b
        ret
endfunc

// Vertical 8-tap filter, 4 pixels per row, 8-bit output.
// load_qpel_filterb broadcasts the eight tap magnitudes selected by x6 into
// v0-v7 (x5 is scratch for the table address).
// x2 is moved back three rows so that the first seven source rows can be
// preloaded into v16-v22. Each calc invocation then reads one more row,
// filters the eight-row window into v24, narrows it with a rounding shift of
// 6 and stores 4 bytes to [x0] (advanced by x1).
function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
        load_qpel_filterb x6, x5
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3              // x2 -= 3 * x3 in total
        ldr             s16, [x2]
        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s18, [x2]
        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s20, [x2]
        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s22, [x2]
        add             x2, x2, x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().s}[0], [x2], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        sqrshrun        v24.8b, v24.8h, #6
        subs            w4, w4, #1              // flags are consumed by the branches inside calc_all
        st1             {v24.s}[0], [x0], x1
.endm
        // calc_all expands calc eight times with the row registers rotated;
        // it leaves for 2: when the counter hits zero and loops back to 1:.
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter, 6 pixels per row.
// Same structure as the 4-pixel version but rows are held in 64-bit
// registers. x1 is reduced by 4 up front because each output row is written
// as a 4-byte store that post-increments by 4 followed by a 2-byte store
// that post-increments by x1.
function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
        load_qpel_filterb x6, x5
        sub             x2, x2, x3, lsl #1
        sub             x1, x1, #4
        sub             x2, x2, x3
        ldr             d16, [x2]
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]
        add             x2, x2, x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x2], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        sqrshrun        v24.8b, v24.8h, #6
        st1             {v24.s}[0], [x0], #4    // bytes 0-3
        subs            w4, w4, #1
        st1             {v24.h}[2], [x0], x1    // bytes 4-5, then step to the next row
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter, 8 pixels per row (one 8-byte store per row).
function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
        load_qpel_filterb x6, x5
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        ldr             d16, [x2]
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]
        add             x2, x2, x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x2], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        sqrshrun        v24.8b, v24.8h, #6
        subs            w4, w4, #1
        st1             {v24.8b}, [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter in 12-column strips.
// The outer loop (label 0) restarts from the top of the block for every
// strip using private copies of src (x8), height (x11) and dst (x10); after
// a strip x0 and x2 move 12 bytes to the right and w7 drops by 12.
// w7 is presumably the width argument; the loop ends only when it reaches
// exactly zero. x1 is reduced by 8 because each row is written as an 8-byte
// store (post-increment 8) plus a 4-byte store (post-increment x1).
function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
        load_qpel_filterb x6, x5
        sub             x2, x2, x3, lsl #1
        sub             x1, x1, #8
        sub             x2, x2, x3
0:      mov             x8, x2                  // src
        mov             w11, w4                 // height
        mov             x10, x0                 // dst
        ldr             q16, [x8]
        ldr             q17, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q18, [x8]
        ldr             q19, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q20, [x8]
        ldr             q21, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q22, [x8]
        add             x8, x8, x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x8], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        sqrshrun        v24.8b, v24.8h, #6
        sqrshrun2       v24.16b, v25.8h, #6
        st1             {v24.8b}, [x10], #8     // columns 0-7
        subs            x11, x11, #1
        st1             {v24.s}[2], [x10], x1   // columns 8-11, then step to the next row
.endm
1:      calc_all
.purgem calc
2:      add             x0, x0, #12
        add             x2, x2, #12
        subs            w7, w7, #12
        b.ne            0b
        ret
endfunc

// Vertical 8-tap filter in 16-column strips; same strip loop as the
// 12-column version, with one 16-byte store per row and a step of 16.
function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
        load_qpel_filterb x6, x5
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
0:      mov             x8, x2                  // src
        mov             w11, w4                 // height
        mov             x10, x0                 // dst
        ldr             q16, [x8]
        ldr             q17, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q18, [x8]
        ldr             q19, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q20, [x8]
        ldr             q21, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q22, [x8]
        add             x8, x8, x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x8], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        sqrshrun        v24.8b, v24.8h, #6
        sqrshrun2       v24.16b, v25.8h, #6
        subs            x11, x11, #1
        st1             {v24.16b}, [x10], x1
.endm
1:      calc_all
.purgem calc
2:      add             x0, x0, #16
        add             x2, x2, #16
        subs            w7, w7, #16
        b.ne            0b
        ret
endfunc

// The wider variants are tail branches into the strip loops above; the
// number of strips processed is governed by the value the caller left in w7.
function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
endfunc

function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc

function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc

function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc

// Weighted copy, 4 pixels per row, two rows per iteration.
//   v30 = w6 in every 16-bit lane  (multiplier)
//   v31 = -6 - w5 in every 32-bit lane (sqrshl count; a negative count is a
//         rounding shift to the right)
//   v29 = w7 in every 32-bit lane  (added after the shift)
// Per pixel: widen with a left shift of 6, multiply by v30, apply the
// rounding shift, add v29, then saturate down to an unsigned byte.
// w4 drops by 2 per iteration and the loop ends only when it is exactly 0.
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:      ldr             s0, [x2]
        ldr             s1, [x2, x3]
        add             x2, x2, x3, lsl #1
        ushll           v0.8h, v0.8b, #6
        ushll           v1.8h, v1.8b, #6
        smull           v0.4s, v0.4h, v30.4h
        smull           v1.4s, v1.4h, v30.4h
        sqrshl          v0.4s, v0.4s, v31.4s
        sqrshl          v1.4s, v1.4s, v31.4s
        sqadd           v0.4s, v0.4s, v29.4s
        sqadd           v1.4s, v1.4s, v29.4s
        sqxtn           v0.4h, v0.4s
        sqxtn           v1.4h, v1.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        str             s0, [x0]
        str             s1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc

// Weighted copy, 6 pixels per row, two rows per iteration.
// Eight pixels are computed per row but only six are stored: a 4-byte store
// (post-increment 4) followed by a 2-byte store (post-increment x1), which
// is why x1 is reduced by 4 before the loop.
function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
        sub             x1, x1, #4
1:      ldr             d0, [x2]
        ldr             d1, [x2, x3]
        add             x2, x2, x3, lsl #1
        ushll           v0.8h, v0.8b, #6
        ushll           v1.8h, v1.8b, #6
        smull           v4.4s, v0.4h, v30.4h
        smull2          v5.4s, v0.8h, v30.8h
        smull           v6.4s, v1.4h, v30.4h
        smull2          v7.4s, v1.8h, v30.8h
        sqrshl          v4.4s, v4.4s, v31.4s
        sqrshl          v5.4s, v5.4s, v31.4s
        sqrshl          v6.4s, v6.4s, v31.4s
        sqrshl          v7.4s, v7.4s, v31.4s
        sqadd           v4.4s, v4.4s, v29.4s
        sqadd           v5.4s, v5.4s, v29.4s
        sqadd           v6.4s, v6.4s, v29.4s
        sqadd           v7.4s, v7.4s, v29.4s
        sqxtn           v0.4h, v4.4s
        sqxtn2          v0.8h, v5.4s
        sqxtn           v1.4h, v6.4s
        sqxtn2          v1.8h, v7.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        str             s0, [x0], #4
        st1             {v0.h}[2], [x0], x1
        str             s1, [x0], #4
        st1             {v1.h}[2], [x0], x1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc

// Weighted copy, 8 pixels per row. Only the first part of the constant
// setup appears here; the function continues on the following source line.
function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
// Continuation of ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon: last constant
// (v29 = w7, the value added after the shift), then the two-rows-per-pass
// loop. Eight pixels per row are widened (<< 6), multiplied by v30, shifted
// with rounding by v31, offset by v29 and saturated back to bytes.
        dup             v29.4s, w7
1:      ldr             d0, [x2]
        ldr             d1, [x2, x3]
        add             x2, x2, x3, lsl #1
        ushll           v0.8h, v0.8b, #6
        ushll           v1.8h, v1.8b, #6
        smull           v4.4s, v0.4h, v30.4h
        smull2          v5.4s, v0.8h, v30.8h
        smull           v6.4s, v1.4h, v30.4h
        smull2          v7.4s, v1.8h, v30.8h
        sqrshl          v4.4s, v4.4s, v31.4s
        sqrshl          v5.4s, v5.4s, v31.4s
        sqrshl          v6.4s, v6.4s, v31.4s
        sqrshl          v7.4s, v7.4s, v31.4s
        sqadd           v4.4s, v4.4s, v29.4s
        sqadd           v5.4s, v5.4s, v29.4s
        sqadd           v6.4s, v6.4s, v29.4s
        sqadd           v7.4s, v7.4s, v29.4s
        sqxtn           v0.4h, v4.4s
        sqxtn2          v0.8h, v5.4s
        sqxtn           v1.4h, v6.4s
        sqxtn2          v1.8h, v7.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        str             d0, [x0]
        str             d1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc

// Weighted copy, 12 pixels per row, two rows per iteration.
// Sixteen pixels are computed per row; twelve are stored as an 8-byte store
// (post-increment 8) plus a 4-byte store (post-increment x1), hence the
// initial x1 -= 8.
function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
        sub             x1, x1, #8
1:      ldr             q0, [x2]
        ldr             q1, [x2, x3]
        add             x2, x2, x3, lsl #1
        ushll           v4.8h, v0.8b, #6
        ushll2          v5.8h, v0.16b, #6
        ushll           v6.8h, v1.8b, #6
        ushll2          v7.8h, v1.16b, #6
        smull           v16.4s, v4.4h, v30.4h
        smull2          v17.4s, v4.8h, v30.8h
        smull           v18.4s, v5.4h, v30.4h
        smull2          v19.4s, v5.8h, v30.8h
        smull           v20.4s, v6.4h, v30.4h
        smull2          v21.4s, v6.8h, v30.8h
        smull           v22.4s, v7.4h, v30.4h
        smull2          v23.4s, v7.8h, v30.8h
        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v17.4s, v17.4s, v31.4s
        sqrshl          v18.4s, v18.4s, v31.4s
        sqrshl          v19.4s, v19.4s, v31.4s
        sqrshl          v20.4s, v20.4s, v31.4s
        sqrshl          v21.4s, v21.4s, v31.4s
        sqrshl          v22.4s, v22.4s, v31.4s
        sqrshl          v23.4s, v23.4s, v31.4s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v17.4s, v17.4s, v29.4s
        sqadd           v18.4s, v18.4s, v29.4s
        sqadd           v19.4s, v19.4s, v29.4s
        sqadd           v20.4s, v20.4s, v29.4s
        sqadd           v21.4s, v21.4s, v29.4s
        sqadd           v22.4s, v22.4s, v29.4s
        sqadd           v23.4s, v23.4s, v29.4s
        sqxtn           v0.4h, v16.4s
        sqxtn2          v0.8h, v17.4s
        sqxtn           v1.4h, v18.4s
        sqxtn2          v1.8h, v19.4s
        sqxtn           v2.4h, v20.4s
        sqxtn2          v2.8h, v21.4s
        sqxtn           v3.4h, v22.4s
        sqxtn2          v3.8h, v23.4s
        sqxtun          v0.8b, v0.8h
        sqxtun2         v0.16b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun2         v2.16b, v3.8h
        str             d0, [x0], #8
        st1             {v0.s}[2], [x0], x1
        str             d2, [x0], #8
        st1             {v2.s}[2], [x0], x1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc

// Weighted transform of 16 pixels held in s0, performed in place.
//   s0      source bytes on entry, result bytes on exit
//   t0, t1  16-bit temporaries (low / high eight pixels)
//   d0-d3   32-bit temporaries (four pixels each)
// Uses the constants v30 (multiplier), v31 (shift count) and v29 (addend)
// that the calling function has broadcast beforehand.
.macro PEL_UNI_W_PIXEL_CALC s0, t0, t1, d0, d1, d2, d3
        ushll           \t0\().8h, \s0\().8b, #6
        ushll2          \t1\().8h, \s0\().16b, #6
        smull           \d0\().4s, \t0\().4h, v30.4h
        smull2          \d1\().4s, \t0\().8h, v30.8h
        smull           \d2\().4s, \t1\().4h, v30.4h
        smull2          \d3\().4s, \t1\().8h, v30.8h
        sqrshl          \d0\().4s, \d0\().4s, v31.4s
        sqrshl          \d1\().4s, \d1\().4s, v31.4s
        sqrshl          \d2\().4s, \d2\().4s, v31.4s
        sqrshl          \d3\().4s, \d3\().4s, v31.4s
        sqadd           \d0\().4s, \d0\().4s, v29.4s
        sqadd           \d1\().4s, \d1\().4s, v29.4s
        sqadd           \d2\().4s, \d2\().4s, v29.4s
        sqadd           \d3\().4s, \d3\().4s, v29.4s
        sqxtn           \t0\().4h, \d0\().4s
        sqxtn2          \t0\().8h, \d1\().4s
        sqxtn           \t1\().4h, \d2\().4s
        sqxtn2          \t1\().8h, \d3\().4s
        sqxtun          \s0\().8b, \t0\().8h
        sqxtun2         \s0\().16b, \t1\().8h
.endm

// Weighted copy, 16 pixels per row, two rows per iteration.
function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:      ldr             q0, [x2]
        ldr             q1, [x2, x3]
        add             x2, x2, x3, lsl #1
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        str             q0, [x0]
        str             q1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc

// Weighted copy, 24 pixels per row, one row per iteration.
// 32 bytes are loaded but only the first 24 are processed (all of v0 and the
// low half of v1); the result is stored as three 8-byte registers.
function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:      ld1             {v0.16b, v1.16b}, [x2], x3
        ushll           v4.8h, v0.8b, #6
        ushll2          v5.8h, v0.16b, #6
        ushll           v6.8h, v1.8b, #6
        smull           v16.4s, v4.4h, v30.4h
        smull2          v17.4s, v4.8h, v30.8h
        smull           v18.4s, v5.4h, v30.4h
        smull2          v19.4s, v5.8h, v30.8h
        smull           v20.4s, v6.4h, v30.4h
        smull2          v21.4s, v6.8h, v30.8h
        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v17.4s, v17.4s, v31.4s
        sqrshl          v18.4s, v18.4s, v31.4s
        sqrshl          v19.4s, v19.4s, v31.4s
        sqrshl          v20.4s, v20.4s, v31.4s
        sqrshl          v21.4s, v21.4s, v31.4s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v17.4s, v17.4s, v29.4s
        sqadd           v18.4s, v18.4s, v29.4s
        sqadd           v19.4s, v19.4s, v29.4s
        sqadd           v20.4s, v20.4s, v29.4s
        sqadd           v21.4s, v21.4s, v29.4s
        sqxtn           v0.4h, v16.4s
        sqxtn2          v0.8h, v17.4s
        sqxtn           v1.4h, v18.4s
        sqxtn2          v1.8h, v19.4s
        sqxtn           v2.4h, v20.4s
        sqxtn2          v2.8h, v21.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc

// Weighted copy, 32 pixels per row, one row per iteration.
function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:      ld1             {v0.16b, v1.16b}, [x2], x3
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc

// Weighted copy, 48 pixels per row; the temporaries of the first macro call
// are reused for the third 16-pixel group.
function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:      ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc

// Weighted copy, 64 pixels per row (four 16-pixel groups per row).
function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc

// Common setup for the weighted vertical filters.
//   x12     = my, read from the stack at [sp, #8]
//   x2      moved back three rows
//   v0-v7   = the eight tap magnitudes of qpel_filters_abs[my], one tap
//             broadcast per register
//   v30/v31/v29 = wx / shift count (-6 - w5) / ox
// Clobbers x9, x10, x12 and v28.
.macro QPEL_UNI_W_V_HEADER
        ldur            x12, [sp, #8]           // my
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        movrel          x9, qpel_filters_abs
        add             x9, x9, x12, lsl #3     // 8 bytes per filter
        ldr             d28, [x9]
        dup             v0.16b, v28.b[0]
        dup             v1.16b, v28.b[1]
        dup             v2.16b, v28.b[2]
        dup             v3.16b, v28.b[3]
        dup             v4.16b, v28.b[4]
        dup             v5.16b, v28.b[5]
        dup             v6.16b, v28.b[6]
        dup             v7.16b, v28.b[7]
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6              // wx
        dup             v31.4s, w10             // shift
        dup             v29.4s, w7              // ox
.endm

// 8-tap filter on the low eight bytes of src0-src7 into 16-bit lanes of dst.
// The taps in v0-v7 are magnitudes; the sign of each term is carried by the
// instruction: taps 1, 3, 4 and 6 are added, taps 0, 2, 5 and 7 subtracted.
.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
        umull           \dst\().8h, \src1\().8b, v1.8b
        umlsl           \dst\().8h, \src0\().8b, v0.8b
        umlsl           \dst\().8h, \src2\().8b, v2.8b
        umlal           \dst\().8h, \src3\().8b, v3.8b
        umlal           \dst\().8h, \src4\().8b, v4.8b
        umlsl           \dst\().8h, \src5\().8b, v5.8b
        umlal           \dst\().8h, \src6\().8b, v6.8b
        umlsl           \dst\().8h, \src7\().8b, v7.8b
.endm

// Same filter applied to the high eight bytes of src0-src7.
.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
        umull2          \dst\().8h, \src1\().16b, v1.16b
        umlsl2          \dst\().8h, \src0\().16b, v0.16b
        umlsl2          \dst\().8h, \src2\().16b, v2.16b
        umlal2          \dst\().8h, \src3\().16b, v3.16b
        umlal2          \dst\().8h, \src4\().16b, v4.16b
        umlsl2          \dst\().8h, \src5\().16b, v5.16b
        umlal2          \dst\().8h, \src6\().16b, v6.16b
        umlsl2          \dst\().8h, \src7\().16b, v7.16b
.endm

// Weighting stage for four filtered values in v24.4h: multiply by wx, apply
// the rounding shift, add ox, saturate to bytes and store 4 bytes to [x0]
// (advanced by x1).
.macro QPEL_UNI_W_V_4
        smull           v24.4s, v24.4h, v30.4h
        sqrshl          v24.4s, v24.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtun          v24.8b, v24.8h
        st1             {v24.s}[0], [x0], x1
.endm

// Weighted vertical 8-tap filter, 4 pixels per row.
// Seven rows are preloaded into v16-v22. The loop body is unrolled eight
// times so that the eight-row window rotates through v16-v23 without any
// register moves; rows are fetched alternately from [x2] and [x2, x3], with
// x2 advanced two rows at a time. w4 is the remaining row count.
function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        ldr             s16, [x2]
        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s18, [x2]
        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s20, [x2]
        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s22, [x2]
1:      ldr             s23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f
        ldr             s16, [x2]
        QPEL_FILTER_B   v24, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f
        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f
        ldr             s18, [x2]
        QPEL_FILTER_B   v24, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f
        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f
        ldr             s20, [x2]
        QPEL_FILTER_B   v24, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f
        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f
        ldr             s22, [x2]
        QPEL_FILTER_B   v24, v23, v16, v17, v18, v19, v20, v21, v22
// Continuation of ff_hevc_put_hevc_qpel_uni_w_v4_8_neon: weighting/store for
// the eighth unrolled step, then the loop back-edge.
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.ne            1b
2:      ret
endfunc

// Weighting stage for eight filtered 16-bit values in v26: multiply by v30,
// apply the rounding shift in v31, add v29, saturate to bytes and store
// 8 bytes to [x0] (advanced by x1). Clobbers v24 and v25.
.macro QPEL_UNI_W_V_8
        smull           v24.4s, v26.4h, v30.4h
        smull2          v25.4s, v26.8h, v30.8h
        sqrshl          v24.4s, v24.4s, v31.4s
        sqrshl          v25.4s, v25.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqadd           v25.4s, v25.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtun          v24.8b, v24.8h
        st1             {v24.d}[0], [x0], x1
.endm

// Weighted vertical 8-tap filter, 8 pixels per row.
// Seven rows are preloaded into v16-v22; the loop is unrolled eight times so
// the eight-row window rotates through v16-v23. Rows are fetched alternately
// from [x2] and [x2, x3], with x2 advanced two rows at a time. w4 is the
// remaining row count.
function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        ldr             d16, [x2]
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]
1:      ldr             d23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f
        ldr             d16, [x2]
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f
        ldr             d18, [x2]
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f
        ldr             d20, [x2]
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f
        ldr             d22, [x2]
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.ne            1b
2:      ret
endfunc

// Weighting stage for sixteen filtered 16-bit values in v26 (low eight) and
// v27 (high eight). Only the multiply, shift and add steps appear here; the
// macro continues on the following source line.
.macro QPEL_UNI_W_V_16
        smull           v24.4s, v26.4h, v30.4h
        smull2          v25.4s, v26.8h, v30.8h
        smull           v26.4s, v27.4h, v30.4h  // v26 is overwritten only after both of its halves were read
        smull2          v27.4s, v27.8h, v30.8h
        sqrshl          v24.4s, v24.4s, v31.4s
        sqrshl          v25.4s, v25.4s, v31.4s
        sqrshl          v26.4s, v26.4s, v31.4s
        sqrshl          v27.4s, v27.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqadd           v25.4s, v25.4s, v29.4s
        sqadd           v26.4s, v26.4s, v29.4s
        sqadd           v27.4s, v27.4s, v29.4s
// Continuation of the QPEL_UNI_W_V_16 macro: narrow the four 32-bit vectors
// v24-v27 to 16 bits, saturate to unsigned bytes and store 16 bytes to [x0]
// (advanced by x1).
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtn           v26.4h, v26.4s
        sqxtn2          v26.8h, v27.4s
        sqxtun          v24.8b, v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v24.16b}, [x0], x1
.endm

// Weighted vertical 8-tap filter, 16 pixels per row.
// Seven rows are preloaded into v16-v22; the loop is unrolled eight times so
// the eight-row window rotates through v16-v23. QPEL_FILTER_B handles the
// low eight columns (into v26) and QPEL_FILTER_B2 the high eight (into v27).
function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]
1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q16, [x2]
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q18, [x2]
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q20, [x2]
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q22, [x2]
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.ne            1b
2:      ret
endfunc

// Weighted vertical 8-tap filter processed in 16-column strips.
// w13 is read from the stack at [sp, #16] and serves as the remaining-column
// counter (presumably the width argument - confirm against the C prototype).
// x14/x15/w11 keep the strip base for dst, the (already rewound) src and the
// row count, so that x0/x2/w4 can be reloaded for every strip.
function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        ldur            w13, [sp, #16]
        mov             x14, x0
        mov             x15, x2
        mov             w11, w4
3:      ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]
1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q16, [x2]
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q18, [x2]
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q20, [x2]
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f
        ldr             q22, [x2]
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.ne            1b
2:      subs            w13, w13, #16           // flags survive the add/mov sequence below
        add             x14, x14, #16
        add             x15, x15, #16
        mov             x0, x14
        mov             x2, x15
        mov             w4, w11
        b.hi            3b                      // next strip while columns remain
        ret
endfunc

// The functions from here on carry the _i8mm suffix and are assembled only
// when HAVE_I8MM is set.
#if HAVE_I8MM
ENABLE_I8MM

// Horizontal + vertical filter, 4 pixels per row.
// Reserves (w4 + 7) * 128 bytes below sp as the intermediate array
// (MAX_PB_SIZE * 2 = 128 bytes per row) and then pushes a 48-byte frame
// holding x30. The function continues on the following source line.
function ff_hevc_put_hevc_qpel_uni_hv4_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10             // tmp_array
        str             x30, [sp, #-48]!
// Continuation of ff_hevc_put_hevc_qpel_uni_hv4_8_neon_i8mm (the function
// header, the tmp_array reservation and the push of x30 precede this point).
// Frame layout: [sp] x30, [sp, #16] x4/x6, [sp, #32] x0/x1; the intermediate
// array starts at sp + 48.
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        // Arguments for the horizontal pass: x0 = tmp_array,
        // x1 = x2 - 3 * x3 (three rows above the block), x2 = x3,
        // w3 = w4 + 7 rows, x4 = x5.
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        add             x0, sp, #48
        mov             x2, x3
        add             w3, w4, #7              // 32-bit form like the sibling hv functions: the row count is a 32-bit
                                                // value and the callee only decrements w3
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48          // sp now points at the first intermediate row
        mov             x9, #(MAX_PB_SIZE * 2)  // byte stride of the intermediate rows
        load_qpel_filterh x6, x5
        // Vertical pass. sp itself walks through the intermediate array, so
        // the array is released as it is read: seven rows are preloaded here
        // and calc consumes one more row per output row.
        ldr             d16, [sp]
        ldr             d17, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             d18, [sp]
        ldr             d19, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             d20, [sp]
        ldr             d21, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             d22, [sp]
        add             sp, sp, x9
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().4h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        sqxtun          v1.8b, v1.8h
        subs            w4, w4, #1              // flags are consumed by the branches inside calc_all
        st1             {v1.s}[0], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Horizontal + vertical filter, 6 pixels per row.
// Same scheme as the 4-pixel version with 8 intermediate values per row.
// x1 is reduced by 4 because every output row is written as a 4-byte store
// (post-increment 4) followed by a 2-byte store (post-increment x1).
function ff_hevc_put_hevc_qpel_uni_hv6_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10             // tmp_array
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        add             x0, sp, #48
        mov             x2, x3
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x6, x5
        sub             x1, x1, #4
        ldr             q16, [sp]
        ldr             q17, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x9
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
        sqxtun          v1.8b, v1.8h
        st1             {v1.s}[0], [x0], #4     // bytes 0-3
        subs            w4, w4, #1
        st1             {v1.h}[2], [x0], x1     // bytes 4-5, then step to the next row
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Horizontal + vertical filter, 8 pixels per row. Only the stack
// reservation and the push of x30 appear here; the function continues on
// the following source line.
function ff_hevc_put_hevc_qpel_uni_hv8_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10             // tmp_array
        str             x30, [sp, #-48]!
// Continuation of ff_hevc_put_hevc_qpel_uni_hv8_8_neon_i8mm (header, the
// tmp_array reservation and the push of x30 precede this point).
// Frame layout: [sp] x30, [sp, #16] x4/x6, [sp, #32] x0/x1; the intermediate
// array starts at sp + 48.
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        // Arguments for the horizontal pass: x0 = tmp_array,
        // x1 = x2 - 3 * x3, x2 = x3, w3 = w4 + 7 rows, x4 = x5.
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        add             x0, sp, #48
        mov             x2, x3
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48          // sp now points at the first intermediate row
        mov             x9, #(MAX_PB_SIZE * 2)  // byte stride of the intermediate rows
        load_qpel_filterh x6, x5
        // Vertical pass reading through sp: seven rows are preloaded and
        // calc consumes one more per output row, releasing the array as it
        // goes.
        ldr             q16, [sp]
        ldr             q17, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x9
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
        sqxtun          v1.8b, v1.8h
        subs            w4, w4, #1              // flags are consumed by the branches inside calc_all
        st1             {v1.8b}, [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Horizontal + vertical filter, 12 pixels per row. Only the stack
// reservation and the push of x7/x30 appear here; the function continues on
// the following source line.
function ff_hevc_put_hevc_qpel_uni_hv12_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10             // tmp_array
        stp             x7, x30, [sp, #-48]!
// Continuation of ff_hevc_put_hevc_qpel_uni_hv12_8_neon_i8mm (header, the
// tmp_array reservation and the push of x7/x30 precede this point).
// Frame layout: [sp] x7/x30, [sp, #16] x4/x6, [sp, #32] x0/x1; the
// intermediate array starts at sp + 48.
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        // Arguments for the horizontal pass: x0 = tmp_array,
        // x1 = x2 - 3 * x3, x2 = x3, w3 = w4 + 7 rows, x4 = x5.
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        mov             x2, x3
        add             x0, sp, #48
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon_i8mm)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at the first intermediate row
        mov             x9, #(MAX_PB_SIZE * 2)  // byte stride of the intermediate rows
        load_qpel_filterh x6, x5
        sub             x1, x1, #8              // rows are stored as 8 bytes (+8) then 4 bytes (+x1)
        // Preload seven intermediate rows (two registers = 16 values each);
        // every load also moves sp one row forward.
        ld1             {v16.8h, v17.8h}, [sp], x9
        ld1             {v18.8h, v19.8h}, [sp], x9
        ld1             {v20.8h, v21.8h}, [sp], x9
        ld1             {v22.8h, v23.8h}, [sp], x9
        ld1             {v24.8h, v25.8h}, [sp], x9
        ld1             {v26.8h, v27.8h}, [sp], x9
        ld1             {v28.8h, v29.8h}, [sp], x9
        // src0-src7 are the registers holding columns 0-7 of the eight-row
        // window, src8-src15 those holding columns 8-15; only the low four
        // lanes of the latter are filtered (columns 8-11).
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
        calc_qpelh      v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
        sqxtun          v1.8b, v1.8h
        sqxtun2         v1.16b, v2.8h
        st1             {v1.8b}, [x0], #8       // columns 0-7
        subs            w4, w4, #1
        st1             {v1.s}[2], [x0], x1     // columns 8-11, then step to the next row
.endm
        // calc_all2 is defined earlier in the file and drives the row loop
        // through the calc macro above.
1:      calc_all2
.purgem calc
2:      ret
endfunc

// Horizontal + vertical filter, 16 pixels per row. Only the stack
// reservation and the push of x7/x30 appear here; the function continues on
// the following source line.
function ff_hevc_put_hevc_qpel_uni_hv16_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10             // tmp_array
        stp             x7, x30, [sp, #-48]!
// Continuation of ff_hevc_put_hevc_qpel_uni_hv16_8_neon_i8mm (header, the
// tmp_array reservation and the push of x7/x30 precede this point).
// Frame layout: [sp] x7/x30, [sp, #16] x4/x6, [sp, #32] x0/x1; the
// intermediate array starts at sp + 48.
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        // Arguments for the horizontal pass: x0 = tmp_array,
        // x1 = x2 - 3 * x3, x2 = x3, w3 = w4 + 7 rows, x4 = x5.
        add             x0, sp, #48
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        mov             x2, x3
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at the first intermediate row
// Shared vertical pass; other functions in this file branch here after
// running their own horizontal pass. On entry sp points at the intermediate
// array, w7 holds the column count (it is decremented by 16 per strip and
// must reach exactly zero), w4 the row count, x0/x1 the destination and its
// stride, x6 the vertical filter index.
.Lqpel_uni_hv16_loop:
        mov             x9, #(MAX_PB_SIZE * 2)  // byte stride of the intermediate rows
        load_qpel_filterh x6, x5
        sub             w12, w9, w7, lsl #1     // 128 - 2 * w7: part of the first row not stepped over by the strip loop
0:      mov             x8, sp                  // src
        ld1             {v16.8h, v17.8h}, [x8], x9
        mov             w11, w4                 // height
        ld1             {v18.8h, v19.8h}, [x8], x9
        mov             x10, x0                 // dst
        ld1             {v20.8h, v21.8h}, [x8], x9
        ld1             {v22.8h, v23.8h}, [x8], x9
        ld1             {v24.8h, v25.8h}, [x8], x9
        ld1             {v26.8h, v27.8h}, [x8], x9
        ld1             {v28.8h, v29.8h}, [x8], x9
        // src0-src7 hold columns 0-7 of the eight-row window, src8-src15
        // columns 8-15.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
        calc_qpelh      v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn2, #12
        sqxtun          v1.8b, v1.8h
        subs            x11, x11, #1
        sqxtun2         v1.16b, v2.8h
        st1             {v1.16b}, [x10], x1
.endm
        // calc_all2 is defined earlier in the file and drives the row loop
        // through the calc macro above.
1:      calc_all2
.purgem calc
2:      add             x0, x0, #16             // next 16-column strip of the destination
        add             sp, sp, #32             // next strip of the intermediate array (16 values * 2 bytes)
        subs            w7, w7, #16
        b.ne            0b
        // Release the intermediate array: the strip loop already advanced sp
        // by 2 bytes per column, x12 covers the remainder of that first row
        // and (w4 + 6) * 128 the rows after it.
        add             w10, w4, #6
        add             sp, sp, x12             // discard rest of first line
        lsl             x10, x10, #7
        add             sp, sp, x10             // tmp_array without first line
        ret
endfunc

// 24 pixels per row, built from the 16- and 8-pixel functions. Only the
// first register save (which also creates the 64-byte frame) appears here;
// the function continues on the following source line.
function ff_hevc_put_hevc_qpel_uni_hv24_8_neon_i8mm, export=1
        stp             x4, x5, [sp, #-64]!
// Continuation of ff_hevc_put_hevc_qpel_uni_hv24_8_neon_i8mm (the 64-byte
// frame was created by the preceding store of x4/x5 at [sp]).
// Frame layout: [sp] x4/x5, [sp, #16] x2/x3, [sp, #32] x0/x1,
// [sp, #48] x6/x30.
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
        stp             x6, x30, [sp, #48]
        mov             x7, #16                 // column count for the 16-wide call
        bl              X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon_i8mm)
        // Second call: same arguments with source and destination moved 16
        // bytes to the right.
        ldp             x2, x3, [sp, #16]
        add             x2, x2, #16
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp], #48       // pops 48 bytes; x6/x30 are now at [sp] and [sp, #8]
        mov             x7, #8
        add             x0, x0, #16
        ldr             x6, [sp]
        bl              X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon_i8mm)
        ldr             x30, [sp, #8]
        add             sp, sp, #16             // release the rest of the frame
        ret
endfunc

// 32 pixels per row: reserves (w4 + 7) * 128 bytes for the intermediate
// array, runs the 32-wide horizontal pass into it and then branches to the
// vertical loop .Lqpel_uni_hv16_loop, which performs the final return.
// x7 is saved across the call and restored before the branch.
function ff_hevc_put_hevc_qpel_uni_hv32_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10             // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        sub             x1, x2, x3, lsl #1
        add             x0, sp, #48
        sub             x1, x1, x3
        mov             x2, x3
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48
        b               .Lqpel_uni_hv16_loop
endfunc

// 48 pixels per row: same scheme as the 32-pixel version with the 48-wide
// horizontal pass.
function ff_hevc_put_hevc_qpel_uni_hv48_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10             // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        mov             x2, x3
        add             x0, sp, #48
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon_i8mm)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48
        b               .Lqpel_uni_hv16_loop
endfunc

// 64 pixels per row. Only the stack reservation and the push of x7/x30
// appear here; the function continues on the following source line.
function ff_hevc_put_hevc_qpel_uni_hv64_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10             // tmp_array
        stp             x7, x30, [sp, #-48]!
// Continuation of ff_hevc_put_hevc_qpel_uni_hv64_8_neon_i8mm (header, the
// tmp_array reservation and the push of x7/x30 precede this point): run the
// 64-wide horizontal pass into the intermediate array at sp + 48, restore
// the saved registers and continue in the shared vertical loop.
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        add             x0, sp, #48
        sub             x1, x2, x3, lsl #1
        mov             x2, x3
        sub             x1, x1, x3
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon_i8mm)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48
        b               .Lqpel_uni_hv16_loop
endfunc

// Common setup for the weighted horizontal filters.
//   x12     = filter index read from the stack at [sp]
//   x2      moved back 3 bytes so an 8-tap window starts 3 pixels to the left
//   v28     = the 8 signed taps of qpel_filters[x12], replicated in both
//             64-bit halves
//   v30/v31/v29 = wx / shift count (-6 - w5) / ox, as 32-bit lanes
// Clobbers x9, x10 and x12.
.macro QPEL_UNI_W_H_HEADER
        ldr             x12, [sp]
        sub             x2, x2, #3
        movrel          x9, qpel_filters
        add             x9, x9, x12, lsl #3     // 8 bytes per filter
        ld1r            {v28.2d}, [x9]
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.4s, w6              // wx
        dup             v31.4s, w10             // shift
        dup             v29.4s, w7              // ox
.endm

// Weighted horizontal 8-tap filter, 4 pixels per row.
// ext/zip1 build one 8-byte window per output pixel (two windows per
// register). usdot yields two partial sums per window (taps 0-3 and 4-7)
// and addp adds them, leaving outputs 0-3 in v16. The sums are then
// multiplied by wx, shifted with rounding, offset by ox and saturated to
// bytes.
function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:      ld1             {v0.16b}, [x2], x3
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        zip1            v0.2d, v0.2d, v1.2d     // windows 0 and 1
        zip1            v2.2d, v2.2d, v3.2d     // windows 2 and 3
        movi            v16.16b, #0
        movi            v17.16b, #0
        usdot           v16.4s, v0.16b, v28.16b
        usdot           v17.4s, v2.16b, v28.16b
        addp            v16.4s, v16.4s, v17.4s
        mul             v16.4s, v16.4s, v30.4s
        sqrshl          v16.4s, v16.4s, v31.4s
        sqadd           v16.4s, v16.4s, v29.4s
        sqxtn           v16.4h, v16.4s
        sqxtun          v16.8b, v16.8h
        str             s16, [x0]
        add             x0, x0, x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap filter, 6 pixels per row.
// Outputs 0-3 are computed in v16 and outputs 4-5 in the low half of v18.
// x1 is reduced by 4 because each row is written as a 4-byte store
// (post-increment 4) followed by a 2-byte store (post-increment x1).
function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        sub             x1, x1, #4
1:      ld1             {v0.16b}, [x2], x3
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        ext             v4.16b, v0.16b, v0.16b, #4
        ext             v5.16b, v0.16b, v0.16b, #5
        zip1            v0.2d, v0.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        movi            v16.16b, #0
        movi            v17.16b, #0
        movi            v18.16b, #0
        usdot           v16.4s, v0.16b, v28.16b
        usdot           v17.4s, v2.16b, v28.16b
        usdot           v18.4s, v4.16b, v28.16b
        addp            v16.4s, v16.4s, v17.4s
        addp            v18.4s, v18.4s, v18.4s
        mul             v16.4s, v16.4s, v30.4s
        mul             v18.2s, v18.2s, v30.2s
        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v18.2s, v18.2s, v31.2s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v18.2s, v18.2s, v29.2s
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v18.4s
        sqxtun          v16.8b, v16.8h
        str             s16, [x0], #4
        st1             {v16.h}[2], [x0], x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Filter and weight four registers of 8-byte windows (two windows each).
//   s0-s3  window registers (inputs)
//   d0, d2 results: the pairwise sums of s0/s1 land in d0 and those of
//          s2/s3 in d2, already multiplied by v30, shifted by v31 and
//          offset by v29 (32-bit lanes)
//   d1, d3 scratch
.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
        movi            \d0\().16b, #0
        movi            \d1\().16b, #0
        movi            \d2\().16b, #0
        movi            \d3\().16b, #0
        usdot           \d0\().4s, \s0\().16b, v28.16b
        usdot           \d1\().4s, \s1\().16b, v28.16b
        usdot           \d2\().4s, \s2\().16b, v28.16b
        usdot           \d3\().4s, \s3\().16b, v28.16b
        addp            \d0\().4s, \d0\().4s, \d1\().4s
        addp            \d2\().4s, \d2\().4s, \d3\().4s
        mul             \d0\().4s, \d0\().4s, v30.4s
        mul             \d2\().4s, \d2\().4s, v30.4s
        sqrshl          \d0\().4s, \d0\().4s, v31.4s
        sqrshl          \d2\().4s, \d2\().4s, v31.4s
        sqadd           \d0\().4s, \d0\().4s, v29.4s
        sqadd           \d2\().4s, \d2\().4s, v29.4s
.endm

// Half-width version of QPEL_UNI_W_H_CALC: two window registers in, four
// weighted results in d0; d1 is scratch.
.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
        movi            \d0\().16b, #0
        movi            \d1\().16b, #0
        usdot           \d0\().4s, \s0\().16b, v28.16b
        usdot           \d1\().4s, \s1\().16b, v28.16b
        addp            \d0\().4s, \d0\().4s, \d1\().4s
        mul             \d0\().4s, \d0\().4s, v30.4s
        sqrshl          \d0\().4s, \d0\().4s, v31.4s
        sqadd           \d0\().4s, \d0\().4s, v29.4s
.endm

// Weighted horizontal 8-tap filter, 8 pixels per row.
// The windows for outputs 0-7 are paired by zip1 so that the results come
// out in natural order: v18 = outputs 0-3, v20 = outputs 4-7.
function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:      ld1             {v16.16b, v17.16b}, [x2], x3
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        zip1            v0.2d, v16.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        zip1            v6.2d, v6.2d, v7.2d
        QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21
        sqxtn           v18.4h, v18.4s
        sqxtn2          v18.8h, v20.4s
        sqxtun          v18.8b, v18.8h
        str             d18, [x0]
        add             x0, x0, x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap filter, 12 pixels per row.
// zip1 pairs the low halves of the shifted copies (windows 0-7) and zip2 the
// high halves of the first four (windows 8-11). x13 is a second destination
// pointer kept 8 bytes ahead of x0 for the last four pixels.
function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        add             x13, x0, #8
1:      ld1             {v16.16b, v17.16b}, [x2], x3
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        zip1            v18.2d, v16.2d, v1.2d
        zip1            v19.2d, v2.2d, v3.2d
        zip1            v20.2d, v4.2d, v5.2d
        zip1            v21.2d, v6.2d, v7.2d
        zip2            v22.2d, v16.2d, v1.2d
        zip2            v23.2d, v2.2d, v3.2d
        QPEL_UNI_W_H_CALC v18, v19, v20, v21, v0, v2, v4, v6
        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v4.4s
        sqxtn           v1.4h, v24.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        str             d0, [x0]
        str             s1, [x13]
        add             x0, x0, x1
        add             x13, x13, x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap filter, 16 pixels per row.
// Here the shifted copies are used directly as window registers, so each one
// yields outputs n and n + 8. The lane comments give the resulting output
// order; trn1/trn2 on the narrowed values restore natural order
// (0-7 in v2, 8-15 in v3).
function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:      ld1             {v16.16b, v17.16b}, [x2], x3
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21 // v18: 0, 8, 2, 10  v20: 1, 9, 3, 11
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25  // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
        sqxtn           v0.4h, v18.4s
        sqxtn2          v0.8h, v22.4s
        sqxtn           v1.4h, v20.4s
        sqxtn2          v1.8h, v24.4s
        trn1            v2.8h, v0.8h, v1.8h
        trn2            v3.8h, v0.8h, v1.8h
        sqxtun          v0.8b, v2.8h
        sqxtun2         v0.16b, v3.8h
        st1             {v0.16b}, [x0], x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap filter, 24 pixels per row.
// Pixels 0-15 are computed like the 16-pixel version; pixels 16-23 use
// windows built from v17 alone, in the style of the 8-pixel version. x1 is
// reduced by 16 because the row is written as a 16-byte store
// (post-increment 16) plus an 8-byte store (post-increment x1).
function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        sub             x1, x1, #16
1:      ld1             {v16.16b, v17.16b}, [x2], x3
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
        sqxtn           v18.4h, v18.4s
        sqxtn2          v18.8h, v22.4s
        sqxtn           v19.4h, v20.4s
        sqxtn2          v19.8h, v24.4s
        trn1            v20.8h, v18.8h, v19.8h
        trn2            v21.8h, v18.8h, v19.8h
        sqxtun          v26.8b, v20.8h
        sqxtun2         v26.16b, v21.8h         // 0-15
        ext             v1.16b, v17.16b, v17.16b, #1
        ext             v2.16b, v17.16b, v17.16b, #2
        ext             v3.16b, v17.16b, v17.16b, #3
        ext             v4.16b, v17.16b, v17.16b, #4
        ext             v5.16b, v17.16b, v17.16b, #5
        ext             v6.16b, v17.16b, v17.16b, #6
        ext             v7.16b, v17.16b, v17.16b, #7
        zip1            v0.2d, v17.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        zip1            v6.2d, v6.2d, v7.2d
        QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21
        sqxtn           v18.4h, v18.4s
        sqxtn2          v18.8h, v20.4s
        sqxtun          v27.8b, v18.8h          // 16-23
        st1             {v26.16b}, [x0], #16
        st1             {v27.8b}, [x0], x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap filter, 32 pixels per row: two passes of the
// 16-pixel scheme, over v16/v17 and then v17/v18.
function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:      ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v0, v19, v20, v21
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v22.4s
        sqxtn           v19.4h, v20.4s
        sqxtn2          v19.8h, v24.4s
        trn1            v20.8h, v0.8h, v19.8h
        trn2            v21.8h, v0.8h, v19.8h
        sqxtun          v26.8b, v20.8h
        sqxtun2         v26.16b, v21.8h         // 0-15
        ext             v1.16b, v17.16b, v18.16b, #1
        ext             v2.16b, v17.16b, v18.16b, #2
        ext             v3.16b, v17.16b, v18.16b, #3
        ext             v4.16b, v17.16b, v18.16b, #4
        ext             v5.16b, v17.16b, v18.16b, #5
        ext             v6.16b, v17.16b, v18.16b, #6
        ext             v7.16b, v17.16b, v18.16b, #7
        QPEL_UNI_W_H_CALC v17, v2, v1, v3, v0, v19, v20, v21
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v22.4s
        sqxtn           v19.4h, v20.4s
        sqxtn2          v19.8h, v24.4s
        trn1            v20.8h, v0.8h, v19.8h
        trn2            v21.8h, v0.8h, v19.8h
        sqxtun          v27.8b, v20.8h
        sqxtun2         v27.16b, v21.8h         // 16-31
        st1             {v26.16b, v27.16b}, [x0], x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap filter, 48 pixels per row: three passes of the
// 16-pixel scheme. v24 and v0 are the scratch registers of every
// QPEL_UNI_W_H_CALC call.
function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:      ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v25.8b, v22.8h
        sqxtun2         v25.16b, v23.8h         // 0-15
        ext             v1.16b, v17.16b, v18.16b, #1
        ext             v2.16b, v17.16b, v18.16b, #2
        ext             v3.16b, v17.16b, v18.16b, #3
        ext             v4.16b, v17.16b, v18.16b, #4
        ext             v5.16b, v17.16b, v18.16b, #5
        ext             v6.16b, v17.16b, v18.16b, #6
        ext             v7.16b, v17.16b, v18.16b, #7
        QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v26.8b, v22.8h
        sqxtun2         v26.16b, v23.8h         // 16-31
        ext             v1.16b, v18.16b, v19.16b, #1
        ext             v2.16b, v18.16b, v19.16b, #2
        ext             v3.16b, v18.16b, v19.16b, #3
        ext             v4.16b, v18.16b, v19.16b, #4
        ext             v5.16b, v18.16b, v19.16b, #5
        ext             v6.16b, v18.16b, v19.16b, #6
        ext             v7.16b, v18.16b, v19.16b, #7
        QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v27.8b, v22.8h
        sqxtun2         v27.16b, v23.8h         // 32-47
        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap filter, 64 pixels per row.
// x3 is reduced by 64 because the main load post-increments x2 by 64; the
// later 16-byte load of the trailing source bytes post-increments by the
// reduced x3, so that x2 advances by the original x3 per row in total.
// Each finished 16-pixel group is written back over the source register it
// no longer needs. The function continues on the following source line.
function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        sub             x3, x3, #64
1:      ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v16.8b, v22.8h
        sqxtun2         v16.16b, v23.8h         // 0-15
ext v1.16b, v17.16b, v18.16b, #1 ext v2.16b, v17.16b, v18.16b, #2 ext v3.16b, v17.16b, v18.16b, #3 ext v4.16b, v17.16b, v18.16b, #4 ext v5.16b, v17.16b, v18.16b, #5 ext v6.16b, v17.16b, v18.16b, #6 ext v7.16b, v17.16b, v18.16b, #7 QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0 QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0 sqxtn v20.4h, v20.4s sqxtn2 v20.8h, v22.4s sqxtn v21.4h, v21.4s sqxtn2 v21.8h, v23.4s trn1 v22.8h, v20.8h, v21.8h trn2 v23.8h, v20.8h, v21.8h sqxtun v17.8b, v22.8h sqxtun2 v17.16b, v23.8h // 16-31 ext v1.16b, v18.16b, v19.16b, #1 ext v2.16b, v18.16b, v19.16b, #2 ext v3.16b, v18.16b, v19.16b, #3 ext v4.16b, v18.16b, v19.16b, #4 ext v5.16b, v18.16b, v19.16b, #5 ext v6.16b, v18.16b, v19.16b, #6 ext v7.16b, v18.16b, v19.16b, #7 QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0 QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0 ld1 {v0.16b}, [x2], x3 sqxtn v20.4h, v20.4s sqxtn2 v20.8h, v22.4s sqxtn v21.4h, v21.4s sqxtn2 v21.8h, v23.4s trn1 v22.8h, v20.8h, v21.8h trn2 v23.8h, v20.8h, v21.8h sqxtun v18.8b, v22.8h sqxtun2 v18.16b, v23.8h // 32-47 ext v1.16b, v19.16b, v0.16b, #1 ext v2.16b, v19.16b, v0.16b, #2 ext v3.16b, v19.16b, v0.16b, #3 ext v4.16b, v19.16b, v0.16b, #4 ext v5.16b, v19.16b, v0.16b, #5 ext v6.16b, v19.16b, v0.16b, #6 ext v7.16b, v19.16b, v0.16b, #7 QPEL_UNI_W_H_CALC v19, v2, v1, v3, v20, v24, v21, v0 QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0 sqxtn v20.4h, v20.4s sqxtn2 v20.8h, v22.4s sqxtn v21.4h, v21.4s sqxtn2 v21.8h, v23.4s trn1 v22.8h, v20.8h, v21.8h trn2 v23.8h, v20.8h, v21.8h sqxtun v19.8b, v22.8h sqxtun2 v19.16b, v23.8h // 48-63 st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 subs w4, w4, #1 b.hi 1b ret endfunc .macro QPEL_H_HEADER movrel x9, qpel_filters add x9, x9, x4, lsl #3 ldr x11, [x9] dup v31.2d, x11 sub x1, x1, #3 .endm function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1 QPEL_H_HEADER mov x10, #MAX_PB_SIZE * 2 1: ld1 {v0.16b}, [x1], x2 ext v1.16b, v0.16b, v0.16b, #1 ext v2.16b, v0.16b, 
v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        // Pack two 8-byte source windows per register: v0 = {+0, +1}, v2 = {+2, +3}.
        zip1            v0.2d, v0.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        movi            v16.16b, #0
        movi            v17.16b, #0
        // v31 holds the 8 taps in both halves (see QPEL_H_HEADER), so each
        // 32-bit lane accumulates a 4-tap partial sum.
        usdot           v16.4s, v0.16b, v31.16b
        usdot           v17.4s, v2.16b, v31.16b
        addp            v16.4s, v16.4s, v17.4s          // join the partial sums: 4 pixels
        sqxtn           v16.4h, v16.4s
        str             d16, [x0]                       // 4 x 16-bit results
        add             x0, x0, x10
        subs            w3, w3, #1
        b.ne            1b
        ret
endfunc

// Horizontal 8-tap qpel filter, 6 pixels per row, 16-bit output.
// Register use visible in this function:
//   x0 = dst (advanced by MAX_PB_SIZE * 2 bytes per row)
//   x1 = src, x2 = src stride, w3 = number of rows
//   x4 = filter index (consumed by QPEL_H_HEADER)
function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2
        add             x15, x0, #8                     // second store address: pixels 4-5
1:
        ld1             {v0.16b}, [x1], x2
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        ext             v4.16b, v0.16b, v0.16b, #4
        ext             v5.16b, v0.16b, v0.16b, #5
        // Two 8-byte source windows per register.
        zip1            v0.2d, v0.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        movi            v16.16b, #0
        movi            v17.16b, #0
        movi            v18.16b, #0
        usdot           v16.4s, v0.16b, v31.16b
        usdot           v17.4s, v2.16b, v31.16b
        usdot           v18.4s, v4.16b, v31.16b
        addp            v16.4s, v16.4s, v17.4s          // pixels 0-3
        addp            v18.4s, v18.4s, v18.4s          // pixels 4-5 (in the low two lanes)
        sqxtn           v16.4h, v16.4s
        sqxtn           v18.4h, v18.4s
        str             d16, [x0]
        str             s18, [x15]                      // only two 16-bit results are written
        add             x0, x0, x10
        add             x15, x15, x10
        subs            w3, w3, #1
        b.ne            1b
        ret
endfunc

// Horizontal 8-tap qpel filter, 8 pixels per row, 16-bit output.
// Same register roles as the 6-wide variant above.
function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2
1:
        ld1             {v0.16b}, [x1], x2
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        ext             v4.16b, v0.16b, v0.16b, #4
        ext             v5.16b, v0.16b, v0.16b, #5
        ext             v6.16b, v0.16b, v0.16b, #6
        ext             v7.16b, v0.16b, v0.16b, #7
        // Two 8-byte source windows per register.
        zip1            v0.2d, v0.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        zip1            v6.2d, v6.2d, v7.2d
        movi            v16.16b, #0
        movi            v17.16b, #0
        movi            v18.16b, #0
        movi            v19.16b, #0
        usdot           v16.4s, v0.16b, v31.16b
        usdot           v17.4s, v2.16b, v31.16b
        usdot           v18.4s, v4.16b, v31.16b
        usdot           v19.4s, v6.16b, v31.16b
        addp            v16.4s, v16.4s, v17.4s          // pixels 0-3
        addp            v18.4s, v18.4s, v19.4s          // pixels 4-7
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v18.4s
        str             q16, [x0]
        add             x0, x0, x10
        subs            w3, w3, #1
        b.ne            1b
        ret
endfunc

// Zero the four accumulators \d0-\d3 and accumulate into each the
// unsigned-by-signed byte dot products of \s0-\s3 with the taps in v31.
.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
        movi            \d0\().16b, #0
        movi            \d1\().16b, #0
        movi            \d2\().16b, #0
        movi            \d3\().16b, #0
        usdot           \d0\().4s, \s0\().16b, v31.16b
        usdot           \d1\().4s, \s1\().16b,
v31.16b
        usdot           \d2\().4s, \s2\().16b, v31.16b
        usdot           \d3\().4s, \s3\().16b, v31.16b
.endm

// Horizontal 8-tap qpel filter, 12 pixels per row, 16-bit output.
// Register use visible in this function:
//   x0 = dst (advanced by MAX_PB_SIZE * 2 bytes per row)
//   x1 = src, x2 = src stride, w3 = number of rows
//   x4 = filter index (consumed by QPEL_H_HEADER)
function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2
        add             x15, x0, #16                    // second store address: pixels 8-11
1:
        ld1             {v16.16b, v17.16b}, [x1], x2
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        // Two 8-byte windows per register for pixels 4-7.
        zip1            v18.2d, v4.2d, v5.2d
        zip1            v19.2d, v6.2d, v7.2d
        // Full 16-byte windows: each accumulator carries pixel n in its low
        // half and pixel n+8 in its high half.
        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        movi            v24.16b, #0
        movi            v25.16b, #0
        usdot           v24.4s, v18.16b, v31.16b
        usdot           v25.4s, v19.16b, v31.16b
        addp            v24.4s, v24.4s, v25.4s          // pixels 4-7
        trn1            v26.4s, v20.4s, v21.4s          // pixels 0-3
        trn2            v27.4s, v20.4s, v21.4s          // pixels 8-11
        sqxtn           v26.4h, v26.4s
        sqxtn           v27.4h, v27.4s
        sqxtn2          v26.8h, v24.4s
        str             q26, [x0]
        str             d27, [x15]
        add             x0, x0, x10
        add             x15, x15, x10
        subs            w3, w3, #1
        b.ne            1b
        ret
endfunc

// Filter one group of 16 output pixels.
//   \lo, \hi : two consecutive source vectors (\hi supplies the 7 look-ahead
//              bytes)
//   \d0      : receives pixels 0-7 as 16-bit values
//   \d1      : receives pixels 8-15 as 16-bit values
// Each QPEL_H_CALC accumulator carries the partial sums of pixel n in its low
// half and pixel n+8 in its high half; addp joins the partial sums and
// trn1/trn2 separate the two pixel ranges before narrowing.
// Clobbers v1-v7 and v20-v27 (\d0/\d1 may be among them, as they are written
// last).
.macro QPEL_H_CALC_X16 lo, hi, d0, d1
        ext             v1.16b, \lo\().16b, \hi\().16b, #1
        ext             v2.16b, \lo\().16b, \hi\().16b, #2
        ext             v3.16b, \lo\().16b, \hi\().16b, #3
        ext             v4.16b, \lo\().16b, \hi\().16b, #4
        ext             v5.16b, \lo\().16b, \hi\().16b, #5
        ext             v6.16b, \lo\().16b, \hi\().16b, #6
        ext             v7.16b, \lo\().16b, \hi\().16b, #7
        QPEL_H_CALC     \lo, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           \d0\().4h, v22.4s
        sqxtn2          \d0\().8h, v26.4s
        sqxtn           \d1\().4h, v23.4s
        sqxtn2          \d1\().8h, v27.4s
.endm

// Horizontal 8-tap qpel filter, 16 pixels per row, 16-bit output.
// Same register roles as the 12-wide variant.
function ff_hevc_put_hevc_qpel_h16_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2
1:
        ld1             {v16.16b, v17.16b}, [x1], x2
        QPEL_H_CALC_X16 v16, v17, v18, v19
        stp             q18, q19, [x0]
        add             x0, x0, x10
        subs            w3, w3, #1
        b.ne            1b
        ret
endfunc

// Horizontal 8-tap qpel filter, 24 pixels per row, 16-bit output.
// Pixels 0-15 go through the 16-pixel helper; pixels 16-23 are computed from
// v17 alone using the two-windows-per-register layout and stored through x15
// (dst + 32 bytes).
function ff_hevc_put_hevc_qpel_h24_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2
        add             x15, x0, #32
1:
        ld1             {v16.16b, v17.16b}, [x1], x2
        QPEL_H_CALC_X16 v16, v17, v18, v19
        stp             q18, q19, [x0]
        add             x0, x0, x10
        ext             v1.16b, v17.16b, v17.16b, #1
        ext             v2.16b, v17.16b, v17.16b, #2
        ext             v3.16b, v17.16b, v17.16b, #3
        ext             v4.16b, v17.16b, v17.16b, #4
        ext             v5.16b, v17.16b, v17.16b, #5
        ext             v6.16b, v17.16b, v17.16b, #6
        ext             v7.16b, v17.16b, v17.16b, #7
        zip1            v0.2d, v17.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        zip1            v6.2d, v6.2d, v7.2d
        QPEL_H_CALC     v0, v2, v4, v6, v20, v21, v22, v23
        addp            v20.4s, v20.4s, v21.4s
        addp            v22.4s, v22.4s, v23.4s
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        str             q20, [x15]
        add             x15, x15, x10
        subs            w3, w3, #1
        b.ne            1b
        ret
endfunc

// Horizontal 8-tap qpel filter, 32 pixels per row, 16-bit output.
// The second 16-pixel group is stored through x15 (dst + 32 bytes).
function ff_hevc_put_hevc_qpel_h32_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2
        add             x15, x0, #32
1:
        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
        QPEL_H_CALC_X16 v16, v17, v20, v21
        stp             q20, q21, [x0]
        add             x0, x0, x10
        QPEL_H_CALC_X16 v17, v18, v20, v21
        stp             q20, q21, [x15]
        add             x15, x15, x10
        subs            w3, w3, #1
        b.ne            1b
        ret
endfunc

// Horizontal 8-tap qpel filter, 48 pixels per row, 16-bit output.
// dst is post-incremented by 32 bytes after the first two groups, so the
// per-row advance in x10 is reduced by 64.
function ff_hevc_put_hevc_qpel_h48_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2 - 64
1:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
        QPEL_H_CALC_X16 v16, v17, v20, v21
        stp             q20, q21, [x0], #32
        QPEL_H_CALC_X16 v17, v18, v20, v21
        stp             q20, q21, [x0], #32
        QPEL_H_CALC_X16 v18, v19, v20, v21
        stp             q20, q21, [x0]
        add             x0, x0, x10
        subs            w3, w3, #1
        b.ne            1b
        ret
endfunc

// Horizontal 8-tap qpel filter, 64 pixels per row, 16-bit output.
// Each row is read in two steps (64 bytes, then 8 look-ahead bytes), so the
// source stride is pre-reduced by 64. The four post-incremented stores cover
// exactly MAX_PB_SIZE * 2 bytes, so no extra dst advance is needed.
function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
        QPEL_H_HEADER
        sub             x2, x2, #64
1:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
        QPEL_H_CALC_X16 v16, v17, v20, v21
        stp             q20, q21, [x0], #32
        QPEL_H_CALC_X16 v17, v18, v20, v21
        stp             q20, q21, [x0], #32
        QPEL_H_CALC_X16 v18, v19, v20, v21
        stp             q20, q21, [x0], #32
        ld1             {v28.8b}, [x1], x2
        QPEL_H_CALC_X16 v19, v28, v20, v21
        stp             q20, q21, [x0], #32
        subs            w3, w3, #1
        b.ne            1b
        ret
endfunc

function ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm, export=1
        add             w10, w3, #7
        mov             x7, #128
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
        stp             x5, x30, [sp, #-32]!
stp             x0, x3, [sp, #16]
        add             x0, sp, #32                     // horizontal pass writes into tmp_array
        sub             x1, x1, x2, lsl #1
        add             x3, x3, #7                      // filter height + 7 rows
        sub             x1, x1, x2                      // src -= 3 * srcstride
        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32              // sp now points at tmp_array
        load_qpel_filterh x5, x4                        // v0.8h = taps selected by x5; x4 is scratch
        // Preload the first 7 rows; sp itself walks through tmp_array in
        // steps of x7 (128 bytes).
        ldr             d16, [sp]
        ldr             d17, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             d18, [sp]
        ldr             d19, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             d20, [sp]
        ldr             d21, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             d22, [sp]
        add             sp, sp, x7
// One output row: fetch the next intermediate row into \tmp, apply the
// 8-tap vertical filter (narrowing shift by 6) and store 4 16-bit results.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().4h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        subs            w3, w3, #1
        st1             {v1.4h}, [x0], x7
.endm
        // calc_all rotates v16-v23 and leaves through 2f once w3 reaches 0;
        // by then sp has stepped over all height + 7 rows of tmp_array.
1:      calc_all
.purgem calc
2:      ret
endfunc

// Separable (horizontal then vertical) 8-tap qpel filter, 6 pixels wide,
// 16-bit output. Register use visible in this function:
//   x0 = dst (row pitch 128 bytes), x1 = src, x2 = src stride
//   w3 = height, x4 = horizontal filter index (passed on to the h6 function)
//   x5 = vertical filter index
// The horizontal pass filters height + 7 rows into a 128-bytes-per-row
// temporary array on the stack.
function ff_hevc_put_hevc_qpel_hv6_8_neon_i8mm, export=1
        add             w10, w3, #7
        mov             x7, #128
        lsl             x10, x10, #7
        sub             sp, sp, x10 // tmp_array
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
        sub             x1, x1, x2, lsl #1
        add             x3, x3, #7
        sub             x1, x1, x2                      // src -= 3 * srcstride
        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
        mov             x8, #120                        // 128 minus the 8 bytes post-incremented below
        load_qpel_filterh x5, x4
        ldr             q16, [sp]
        ldr             q17, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x7
// One output row: 8 results are computed, 6 are stored (4 + 2 halfwords).
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        st1             {v1.4h}, [x0], #8
        subs            w3, w3, #1
        st1             {v1.s}[2], [x0], x8
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Separable 8-tap qpel filter, 8 pixels wide, 16-bit output.
// Same register roles and temporary-array scheme as the 6-wide variant.
function ff_hevc_put_hevc_qpel_hv8_8_neon_i8mm, export=1
        add             w10, w3, #7
        lsl             x10, x10, #7
        sub             x1, x1, x2, lsl #1
        sub             sp, sp, x10 // tmp_array
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
        add             x3, x3, #7
        sub             x1, x1, x2                      // src -= 3 * srcstride
        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
        mov             x7, #128                        // row pitch of tmp_array and of dst
        load_qpel_filterh x5, x4
        ldr             q16, [sp]
        ldr             q17, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x7
// One output row of 8 16-bit results.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        subs            w3, w3, #1
        st1             {v1.8h}, [x0], x7
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Separable 8-tap qpel filter, 12 pixels wide, 16-bit output.
// Rows are held as register pairs; the loop is driven by calc_all2
// (defined near the top of the file).
function ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm, export=1
        add             w10, w3, #7
        lsl             x10, x10, #7
        sub             x1, x1, x2, lsl #1
        sub             sp, sp, x10 // tmp_array
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32
        add             x3, x3, #7
        sub             x1, x1, x2                      // src -= 3 * srcstride
        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
        mov             x7, #128
        load_qpel_filterh x5, x4
        mov             x8, #112                        // 128 minus the 16 bytes post-incremented below
        ld1             {v16.8h, v17.8h}, [sp], x7
        ld1             {v18.8h, v19.8h}, [sp], x7
        ld1             {v20.8h, v21.8h}, [sp], x7
        ld1             {v22.8h, v23.8h}, [sp], x7
        ld1             {v24.8h, v25.8h}, [sp], x7
        ld1             {v26.8h, v27.8h}, [sp], x7
        ld1             {v28.8h, v29.8h}, [sp], x7
// One output row: 8 results from \src0-\src7, 4 more from the low halves of
// \src8-\src15.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        calc_qpelh      v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
        st1             {v1.8h}, [x0], #16
        subs            w3, w3, #1
        st1             {v2.4h}, [x0], x8
.endm
1:      calc_all2
.purgem calc
2:      ret
endfunc

// Separable 8-tap qpel filter, 16 pixels wide, 16-bit output.
function ff_hevc_put_hevc_qpel_hv16_8_neon_i8mm, export=1
        add             w10, w3, #7
        lsl             x10, x10, #7
        sub             x1, x1, x2, lsl #1
        sub             sp, sp, x10 // tmp_array
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x3, x3, #7
        add             x0, sp, #32
        sub             x1, x1, x2                      // src -= 3 * srcstride
        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
        mov             x7, #128
        load_qpel_filterh x5, x4
        ld1             {v16.8h, v17.8h}, [sp], x7
        ld1             {v18.8h, v19.8h}, [sp], x7
        ld1             {v20.8h, v21.8h}, [sp], x7
        ld1             {v22.8h, v23.8h}, [sp], x7
        ld1             {v24.8h, v25.8h}, [sp], x7
        ld1             {v26.8h, v27.8h}, [sp], x7
        ld1             {v28.8h, v29.8h}, [sp], x7
// One output row of 16 16-bit results.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        calc_qpelh      v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
        subs            w3, w3, #1
        st1             {v1.8h, v2.8h}, [x0], x7
.endm
1:      calc_all2
.purgem calc
2:      ret
endfunc

// 24-wide separable filter: two calls of the 12-wide function, the second
// with src advanced by 12 bytes and dst by 24 bytes. x0-x5 and x30 are kept
// in a 64-byte stack frame that is released in two steps (48, then 16).
function ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm, export=1
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x30, [sp, #48]
        bl              X(ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x2, x3, [sp, #16]
        ldp             x4, x5, [sp], #48
        add             x1, x1, #12
        add             x0, x0, #24
        bl              X(ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm)
        ldr             x30, [sp], #16
        ret
endfunc

// Separable 8-tap qpel filter for widths that are a multiple of 16 (w6 holds
// the remaining width and is reduced by 16 per column pass). The horizontal
// pass is done once by the h32 function; the vertical pass then runs per
// 16-pixel column, reading tmp_array through x8 and writing dst through x5.
function ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm, export=1
        add             w10, w3, #7
        sub             x1, x1, x2, lsl #1
        lsl             x10, x10, #7
        sub             x1, x1, x2                      // src -= 3 * srcstride
        sub             sp, sp, x10 // tmp_array
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x3, x3, #7
        add             x0, sp, #32
        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32
        mov             x7, #128
        load_qpel_filterh x5, x4
0:      mov             x8, sp          // src
        ld1             {v16.8h, v17.8h}, [x8], x7
        mov             w9, w3          // height
        ld1             {v18.8h, v19.8h}, [x8], x7
        mov             x5, x0          // dst
        ld1             {v20.8h, v21.8h}, [x8], x7
        ld1             {v22.8h, v23.8h}, [x8], x7
        ld1             {v24.8h, v25.8h}, [x8], x7
        ld1             {v26.8h, v27.8h}, [x8], x7
        ld1             {v28.8h, v29.8h}, [x8], x7
// One output row of 16 16-bit results for the current column.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        calc_qpelh      v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
        subs            x9, x9, #1
        st1             {v1.8h, v2.8h}, [x5], x7
.endm
1:      calc_all2
.purgem calc
        // Next column: dst and the tmp_array base (sp) both move by 32 bytes.
2:      add             x0, x0, #32
        add             sp, sp, #32
        subs            w6, w6, #16
        b.hi            0b
        add             w10, w3, #6
        add             sp, sp, #64          // discard rest of first line
        lsl             x10, x10, #7
        add             sp, sp, x10         // tmp_array without first line
        ret
endfunc

// 48-wide separable filter: two calls of the 24-wide function, the second
// with src advanced by 24 bytes and dst by 48 bytes.
function ff_hevc_put_hevc_qpel_hv48_8_neon_i8mm, export=1
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x30, [sp, #48]
        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x2, x3, [sp, #16]
        ldp             x4, x5, [sp], #48
        add             x1, x1, #24
        add             x0, x0, #48
        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
        ldr             x30, [sp], #16
        ret
endfunc

// 64-wide separable filter: two calls of the 32-wide function with the width
// register x6 forced to 32, the second with src advanced by 32 bytes and dst
// by 64 bytes.
function ff_hevc_put_hevc_qpel_hv64_8_neon_i8mm, export=1
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x30, [sp, #48]
        mov             x6, #32
        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x2, x3, [sp, #16]
        ldp             x4, x5, [sp], #48
        add             x1, x1, #32
        add             x0, x0, #64
        mov             x6, #32
        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
        ldr             x30, [sp], #16
        ret
endfunc

// Prologue shared by the weighted uni-directional hv functions.
// Saves x19-x27 and x30, reserves a 9088-byte (71 * 128) temporary array
// below the frame, runs the horizontal pass of the given \width over
// height + 7 rows into it, then loads the vertical taps and weights.
// State on exit:
//   sp  = base of the temporary array   x19 = sp to restore in the epilogue
//   x20 = dst, x21 = dst stride         w22 = height, w27 = width
//   v0.8h = vertical taps (selected by my), x10 = temporary row pitch
//   v28 = wx, v29 = ox, v30 = -shift (computed as -6 - w5), each replicated
.macro QPEL_UNI_W_HV_HEADER width
        ldp             x14, x15, [sp]          // mx, my
        ldr             w13, [sp, #16]          // width
        stp             x19, x30, [sp, #-80]!
        stp             x20, x21, [sp, #16]
        stp             x22, x23, [sp, #32]
        stp             x24, x25, [sp, #48]
        stp             x26, x27, [sp, #64]
        mov             x19, sp
        mov             x11, #9088
        sub             sp, sp, x11
        mov             x20, x0
        mov             x21, x1
        mov             x0, sp
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        mov             x2, x3
        add             w3, w4, #7
        mov             w22, w4                 // height
        mov             x4, x14                 // mx
        mov             x23, x15                // my
        mov             w24, w6                 // wx
        mov             w25, w7                 // ox
        mov             w26, #-6
        sub             w26, w26, w5            // -shift
        mov             w27, w13                // width
        bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_neon_i8mm)
        movrel          x9, qpel_filters
        add             x9, x9, x23, lsl #3
        ld1             {v0.8b}, [x9]
        sxtl            v0.8h, v0.8b
        mov             x10, #(MAX_PB_SIZE * 2)
        dup             v28.4s, w24
        dup             v29.4s, w25
        dup             v30.4s, w26
.endm

// Epilogue matching QPEL_UNI_W_HV_HEADER: drop the temporary array by
// restoring sp from x19, then reload the saved registers and pop the frame.
.macro QPEL_UNI_W_HV_END
        mov             sp, x19
        ldp             x20, x21, [sp, #16]
        ldp             x22, x23, [sp, #32]
        ldp             x24, x25, [sp, #48]
        ldp             x26, x27, [sp, #64]
        ldp             x19, x30, [sp], #80
.endm

// Weight and store 4 pixels from the 32-bit sums in v26:
// (sum >> 6) * wx, rounding shift by v30, + ox, saturate to unsigned bytes,
// store 4 bytes at x20 and advance it by x21.
.macro QPEL_UNI_W_HV_4
        sshr            v26.4s, v26.4s, #6
        mul             v24.4s, v26.4s, v28.4s
        sqrshl          v24.4s, v24.4s, v30.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtun          v24.8b, v24.8h
        st1             {v24.s}[0], [x20], x21
.endm

// 8-tap filter over the low four 16-bit lanes of \src0-\src7 with the taps
// in v0.h[0..7]; 32-bit sums are left in \dst.
.macro QPEL_FILTER_H dst, src0, src1, src2, src3, src4, src5, src6, src7
        smull           \dst\().4s, \src0\().4h, v0.h[0]
        smlal           \dst\().4s, \src1\().4h, v0.h[1]
        smlal           \dst\().4s, \src2\().4h, v0.h[2]
        smlal           \dst\().4s, \src3\().4h, v0.h[3]
        smlal           \dst\().4s, \src4\().4h, v0.h[4]
        smlal           \dst\().4s, \src5\().4h, v0.h[5]
        smlal           \dst\().4s, \src6\().4h, v0.h[6]
        smlal           \dst\().4s, \src7\().4h, v0.h[7]
.endm

// Same as QPEL_FILTER_H, but over the high four 16-bit lanes.
.macro QPEL_FILTER_H2 dst, src0, src1, src2, src3, src4, src5, src6, src7
        smull2          \dst\().4s, \src0\().8h, v0.h[0]
        smlal2          \dst\().4s, \src1\().8h, v0.h[1]
        smlal2          \dst\().4s, \src2\().8h, v0.h[2]
        smlal2
\dst\().4s, \src3\().8h, v0.h[3]
        smlal2          \dst\().4s, \src4\().8h, v0.h[4]
        smlal2          \dst\().4s, \src5\().8h, v0.h[5]
        smlal2          \dst\().4s, \src6\().8h, v0.h[6]
        smlal2          \dst\().4s, \src7\().8h, v0.h[7]
.endm

// One output row of the 4-wide weighted hv filter.
// Arguments are the register numbers of the 8-row window, oldest first; the
// newest row (\r7) is loaded from the temporary array at sp, which is then
// advanced by one row (x10). Leaves the flags of the height decrement (w22)
// for the caller's branch.
.macro QPEL_UNI_W_HV_4_ROW r0, r1, r2, r3, r4, r5, r6, r7
        ldr             d\r7, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v\r0, v\r1, v\r2, v\r3, v\r4, v\r5, v\r6, v\r7
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
.endm

// Weighted uni-directional separable qpel filter, 4 pixels wide.
// QPEL_UNI_W_HV_HEADER runs the horizontal pass into a temporary array at sp
// and sets up the taps and weights; the loop below slides an 8-row window
// held in v16-v23 over that array, rotating register roles instead of moving
// data.
function ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_i8mm, export=1
        QPEL_UNI_W_HV_HEADER 4
        // Preload the first 7 rows.
        ldr             d16, [sp]
        ldr             d17, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             d18, [sp]
        ldr             d19, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             d20, [sp]
        ldr             d21, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             d22, [sp]
        add             sp, sp, x10
1:
        QPEL_UNI_W_HV_4_ROW 16, 17, 18, 19, 20, 21, 22, 23
        b.eq            2f
        QPEL_UNI_W_HV_4_ROW 17, 18, 19, 20, 21, 22, 23, 16
        b.eq            2f
        QPEL_UNI_W_HV_4_ROW 18, 19, 20, 21, 22, 23, 16, 17
        b.eq            2f
        QPEL_UNI_W_HV_4_ROW 19, 20, 21, 22, 23, 16, 17, 18
        b.eq            2f
        QPEL_UNI_W_HV_4_ROW 20, 21, 22, 23, 16, 17, 18, 19
        b.eq            2f
        QPEL_UNI_W_HV_4_ROW 21, 22, 23, 16, 17, 18, 19, 20
        b.eq            2f
        QPEL_UNI_W_HV_4_ROW 22, 23, 16, 17, 18, 19, 20, 21
        b.eq            2f
        QPEL_UNI_W_HV_4_ROW 23, 16, 17, 18, 19, 20, 21, 22
        b.hi            1b
2:
        QPEL_UNI_W_HV_END
        ret
endfunc

// Weight and store 8 pixels from the 32-bit sums in v26 (low four) and v27
// (high four): (sum >> 6) * wx, rounding shift by v30, + ox, saturate to
// unsigned bytes, store 8 bytes at x20 and advance it by x21.
.macro QPEL_UNI_W_HV_8
        sshr            v26.4s, v26.4s, #6
        sshr            v27.4s, v27.4s, #6
        mul             v24.4s, v26.4s, v28.4s
        mul             v25.4s, v27.4s, v28.4s
        sqrshl          v24.4s, v24.4s, v30.4s
        sqrshl          v25.4s, v25.4s, v30.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqadd           v25.4s, v25.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtun          v24.8b, v24.8h
        st1             {v24.d}[0], [x20], x21
.endm

// One output row of the 8-wide weighted hv filter; same conventions as
// QPEL_UNI_W_HV_4_ROW, with full 8-lane rows.
.macro QPEL_UNI_W_HV_8_ROW r0, r1, r2, r3, r4, r5, r6, r7
        ldr             q\r7, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v\r0, v\r1, v\r2, v\r3, v\r4, v\r5, v\r6, v\r7
        QPEL_FILTER_H2  v27, v\r0, v\r1, v\r2, v\r3, v\r4, v\r5, v\r6, v\r7
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
.endm

// Weighted uni-directional separable qpel filter, 8 pixels wide.
// Same structure as the 4-wide variant.
function ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_i8mm, export=1
        QPEL_UNI_W_HV_HEADER 8
        // Preload the first 7 rows.
        ldr             q16, [sp]
        ldr             q17, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x10
1:
        QPEL_UNI_W_HV_8_ROW 16, 17, 18, 19, 20, 21, 22, 23
        b.eq            2f
        QPEL_UNI_W_HV_8_ROW 17, 18, 19, 20, 21, 22, 23, 16
        b.eq            2f
        QPEL_UNI_W_HV_8_ROW 18, 19, 20, 21, 22, 23, 16, 17
        b.eq            2f
        QPEL_UNI_W_HV_8_ROW 19, 20, 21, 22, 23, 16, 17, 18
        b.eq            2f
        QPEL_UNI_W_HV_8_ROW 20, 21, 22, 23, 16, 17, 18, 19
        b.eq            2f
        QPEL_UNI_W_HV_8_ROW 21, 22, 23, 16, 17, 18, 19, 20
        b.eq            2f
        QPEL_UNI_W_HV_8_ROW 22, 23, 16, 17, 18, 19, 20, 21
        b.eq            2f
        QPEL_UNI_W_HV_8_ROW 23, 16, 17, 18, 19, 20, 21, 22
        b.hi            1b
2:
        QPEL_UNI_W_HV_END
        ret
endfunc

.macro QPEL_UNI_W_HV_16
        sshr            v24.4s, v24.4s, #6
        sshr            v25.4s, v25.4s, #6
        sshr            v26.4s, v26.4s, #6
        sshr            v27.4s, v27.4s, #6
        mul             v24.4s, v24.4s, v28.4s
        mul             v25.4s, v25.4s, v28.4s
        mul             v26.4s, v26.4s, v28.4s
        mul             v27.4s, v27.4s, v28.4s
        sqrshl          v24.4s, v24.4s, v30.4s
sqrshl v25.4s, v25.4s, v30.4s sqrshl v26.4s, v26.4s, v30.4s sqrshl v27.4s, v27.4s, v30.4s sqadd v24.4s, v24.4s, v29.4s sqadd v25.4s, v25.4s, v29.4s sqadd v26.4s, v26.4s, v29.4s sqadd v27.4s, v27.4s, v29.4s sqxtn v24.4h, v24.4s sqxtn2 v24.8h, v25.4s sqxtn v26.4h, v26.4s sqxtn2 v26.8h, v27.4s sqxtun v24.8b, v24.8h sqxtun2 v24.16b, v26.8h st1 {v24.16b}, [x20], x21 .endm function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_i8mm, export=1 QPEL_UNI_W_HV_HEADER 16 ldp q16, q1, [sp] add sp, sp, x10 ldp q17, q2, [sp] add sp, sp, x10 ldp q18, q3, [sp] add sp, sp, x10 ldp q19, q4, [sp] add sp, sp, x10 ldp q20, q5, [sp] add sp, sp, x10 ldp q21, q6, [sp] add sp, sp, x10 ldp q22, q7, [sp] add sp, sp, x10 1: ldp q23, q31, [sp] add sp, sp, x10 QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23 QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23 QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31 QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q16, q1, [sp] add sp, sp, x10 QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16 QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16 QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1 QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q17, q2, [sp] add sp, sp, x10 QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17 QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17 QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2 QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q18, q3, [sp] add sp, sp, x10 QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18 QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18 QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3 QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q19, q4, [sp] add sp, sp, x10 QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19 QPEL_FILTER_H2 v25, 
v20, v21, v22, v23, v16, v17, v18, v19 QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4 QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q20, q5, [sp] add sp, sp, x10 QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20 QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20 QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5 QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q21, q6, [sp] add sp, sp, x10 QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21 QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21 QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6 QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q22, q7, [sp] add sp, sp, x10 QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22 QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22 QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7 QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.hi 1b 2: QPEL_UNI_W_HV_END ret endfunc function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1 QPEL_UNI_W_HV_HEADER 32 mov x11, sp mov w12, w22 mov x13, x20 mov x14, sp 3: ldp q16, q1, [x11] add x11, x11, x10 ldp q17, q2, [x11] add x11, x11, x10 ldp q18, q3, [x11] add x11, x11, x10 ldp q19, q4, [x11] add x11, x11, x10 ldp q20, q5, [x11] add x11, x11, x10 ldp q21, q6, [x11] add x11, x11, x10 ldp q22, q7, [x11] add x11, x11, x10 1: ldp q23, q31, [x11] add x11, x11, x10 QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23 QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23 QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31 QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q16, q1, [x11] add x11, x11, x10 QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16 QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16 QPEL_FILTER_H v26, v2, v3, v4, 
v5, v6, v7, v31, v1 QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q17, q2, [x11] add x11, x11, x10 QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17 QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17 QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2 QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q18, q3, [x11] add x11, x11, x10 QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18 QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18 QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3 QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q19, q4, [x11] add x11, x11, x10 QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19 QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19 QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4 QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q20, q5, [x11] add x11, x11, x10 QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20 QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20 QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5 QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q21, q6, [x11] add x11, x11, x10 QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21 QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21 QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6 QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.eq 2f ldp q22, q7, [x11] add x11, x11, x10 QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22 QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22 QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7 QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7 QPEL_UNI_W_HV_16 subs w22, w22, #1 b.hi 1b 2: subs w27, w27, #16 add x11, x14, #32 add x20, x13, #16 mov w22, w12 mov x14, x11 
// Closing statements of the function whose body begins before this point.
// b.hi still tests the flags of the "subs w27, w27, #16" a few instructions
// earlier, because mov/add do not modify the flags.
        mov             x13, x20
        b.hi            3b
        QPEL_UNI_W_HV_END
        ret
endfunc

// ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm
//
// Processes the intermediate rows found at sp in strips. For every row of a
// strip two q registers (32 bytes) are read and passed, together with the seven
// previously read rows, to QPEL_FILTER_H/QPEL_FILTER_H2 and QPEL_UNI_W_HV_16.
// Those macros, as well as QPEL_UNI_W_HV_HEADER/QPEL_UNI_W_HV_END, are defined
// elsewhere in this file; x10, x20, w22 and w27 are not initialised here and
// are presumably set up by QPEL_UNI_W_HV_HEADER (TODO confirm).
//
// Register roles as used below:
//   x11      read pointer into the intermediate rows, advanced by x10 per row
//   w22      rows left in the current strip, reloaded from w12 for each strip
//   w27      reduced by 16 per strip; the strip loop repeats while "hi" holds
//   x13/x20  advanced by 16 per strip (presumably the destination pointer)
//   x14      start of the current strip in the intermediate rows, +32 per strip
//   v16-v23 and v1-v7,v31
//            two windows of eight rows each; every step overwrites the oldest
//            row and rotates the macro arguments instead of moving registers
function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
        QPEL_UNI_W_HV_HEADER 64
        mov             x11, sp
        mov             w12, w22                // keep the per-strip row count
        mov             x13, x20
        mov             x14, sp
3:      // strip loop: preload the first seven rows of the strip
        ldp             q16, q1, [x11]
        add             x11, x11, x10
        ldp             q17, q2, [x11]
        add             x11, x11, x10
        ldp             q18, q3, [x11]
        add             x11, x11, x10
        ldp             q19, q4, [x11]
        add             x11, x11, x10
        ldp             q20, q5, [x11]
        add             x11, x11, x10
        ldp             q21, q6, [x11]
        add             x11, x11, x10
        ldp             q22, q7, [x11]
        add             x11, x11, x10
1:      // row loop, unrolled eight times (one step per window rotation)
        ldp             q23, q31, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_H   v26, v1, v2, v3, v4, v5, v6, v7, v31
        QPEL_FILTER_H2  v27, v1, v2, v3, v4, v5, v6, v7, v31
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f
        ldp             q16, q1, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_H   v26, v2, v3, v4, v5, v6, v7, v31, v1
        QPEL_FILTER_H2  v27, v2, v3, v4, v5, v6, v7, v31, v1
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f
        ldp             q17, q2, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_H   v26, v3, v4, v5, v6, v7, v31, v1, v2
        QPEL_FILTER_H2  v27, v3, v4, v5, v6, v7, v31, v1, v2
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f
        ldp             q18, q3, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_H   v26, v4, v5, v6, v7, v31, v1, v2, v3
        QPEL_FILTER_H2  v27, v4, v5, v6, v7, v31, v1, v2, v3
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f
        ldp             q19, q4, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_H   v26, v5, v6, v7, v31, v1, v2, v3, v4
        QPEL_FILTER_H2  v27, v5, v6, v7, v31, v1, v2, v3, v4
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f
        ldp             q20, q5, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_H   v26, v6, v7, v31, v1, v2, v3, v4, v5
        QPEL_FILTER_H2  v27, v6, v7, v31, v1, v2, v3, v4, v5
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f
        ldp             q21, q6, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_H   v26, v7, v31, v1, v2, v3, v4, v5, v6
        QPEL_FILTER_H2  v27, v7, v31, v1, v2, v3, v4, v5, v6
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f
        ldp             q22, q7, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_H   v26, v31, v1, v2, v3, v4, v5, v6, v7
        QPEL_FILTER_H2  v27, v31, v1, v2, v3, v4, v5, v6, v7
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.hi            1b
2:      // strip finished: step every per-strip pointer and reload the row count
        subs            w27, w27, #16
        add             x11, x14, #32
        add             x20, x13, #16
        mov             w22, w12
        mov             x14, x11
        mov             x13, x20
        b.hi            3b                      // flags still come from the subs above
        QPEL_UNI_W_HV_END
        ret
endfunc

// ff_hevc_put_hevc_qpel_bi_hv4_8_neon_i8mm
//
// Two passes. ff_hevc_put_hevc_qpel_h4_8_neon_i8mm is called to fill a
// temporary array on the stack with (w5 + 7) rows of MAX_PB_SIZE * 2 bytes;
// afterwards the 8-tap filter selected by x7 is run down the columns of that
// array, the matching src2 row is added, and 4 bytes per row are stored.
//
// Registers on entry, as read by the code below:
//   x0      destination pointer        x1  destination stride
//   x2, x3  combined into x2 - 3 * x3 for the horizontal pass, which also
//           receives x3 unchanged (presumably source pointer and stride)
//   x4      src2 pointer, 16-bit elements, MAX_PB_SIZE * 2 bytes per row
//   w5      number of output rows
//   x6      handed to the horizontal pass in x4 (presumably its filter index)
//   x7      row index into qpel_filters for the vertical pass
//
// The temporary array is never released with an explicit add: every ld1 from
// [sp] post-increments sp by one row, and 7 preloads plus one load per output
// row add up to exactly the (w5 + 7) rows that were reserved.
function ff_hevc_put_hevc_qpel_bi_hv4_8_neon_i8mm, export=1
        add             w10, w5, #7
        lsl             x10, x10, #7            // (rows + 7) * MAX_PB_SIZE * 2 bytes
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        add             x0, sp, #48             // output of the horizontal pass: tmp_array
        mov             x2, x3
        add             w3, w5, #7
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at tmp_array
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x7, x6
        ld1             {v16.4h}, [sp], x9
        ld1             {v17.4h}, [sp], x9
        ld1             {v18.4h}, [sp], x9
        ld1             {v19.4h}, [sp], x9
        ld1             {v20.4h}, [sp], x9
        ld1             {v21.4h}, [sp], x9
        ld1             {v22.4h}, [sp], x9
// One output row: load the next tmp row, filter (>> 6), add src2, round and
// narrow by 7 bits, saturate to unsigned bytes, store 4 of them.
// The subs sets the flags that calc_all tests after each expansion.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().4h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        ld1             {v5.4h}, [x4], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        rshrn           v1.4h, v1.4s, #7
        sqxtun          v1.8b, v1.8h
        subs            w5, w5, #1
        st1             {v1.s}[0], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// ff_hevc_put_hevc_qpel_bi_hv6_8_neon_i8mm
//
// Same structure and register use as the 4-wide variant, with 8 columns
// filtered per row and 6 bytes stored: a 4-byte store that post-increments x0
// by 4, followed by a 2-byte store that adds the stride. The stride in x1 is
// therefore reduced by 4 up front.
function ff_hevc_put_hevc_qpel_bi_hv6_8_neon_i8mm, export=1
        add             w10, w5, #7
        lsl             x10, x10, #7            // (rows + 7) * MAX_PB_SIZE * 2 bytes
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        add             x0, sp, #48             // output of the horizontal pass: tmp_array
        mov             x2, x3
        // 32-bit add: the row count is only ever used as w5 in this function,
        // so bits 63:32 of x5 must not leak into the argument passed on in x3.
        add             w3, w5, #7
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at tmp_array
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x7, x6
        sub             x1, x1, #4              // compensates the "#4" store below
        ld1             {v16.8h}, [sp], x9
        ld1             {v17.8h}, [sp], x9
        ld1             {v18.8h}, [sp], x9
        ld1             {v19.8h}, [sp], x9
        ld1             {v20.8h}, [sp], x9
        ld1             {v21.8h}, [sp], x9
        ld1             {v22.8h}, [sp], x9
// One output row; the subs sets the flags that calc_all tests afterwards
// (the st1 that follows it does not touch them).
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        calc_qpelh2     v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        ld1             {v5.8h}, [x4], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        saddw2          v2.4s, v2.4s, v5.8h
        rshrn           v1.4h, v1.4s, #7
        rshrn2          v1.8h, v2.4s, #7
        sqxtun          v1.8b, v1.8h
        st1             {v1.s}[0], [x0], #4
        subs            w5, w5, #1
        st1             {v1.h}[2], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// ff_hevc_put_hevc_qpel_bi_hv8_8_neon_i8mm
//
// Same structure and register use as the 4-wide variant, with 8 columns
// filtered and 8 bytes stored per row.
function ff_hevc_put_hevc_qpel_bi_hv8_8_neon_i8mm, export=1
        add             w10, w5, #7
        lsl             x10, x10, #7            // (rows + 7) * MAX_PB_SIZE * 2 bytes
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        add             x0, sp, #48             // output of the horizontal pass: tmp_array
        mov             x2, x3
        // 32-bit add: the row count is only ever used as w5 in this function,
        // so bits 63:32 of x5 must not leak into the argument passed on in x3.
        add             w3, w5, #7
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at tmp_array
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x7, x6
        ld1             {v16.8h}, [sp], x9
        ld1             {v17.8h}, [sp], x9
        ld1             {v18.8h}, [sp], x9
        ld1             {v19.8h}, [sp], x9
        ld1             {v20.8h}, [sp], x9
        ld1             {v21.8h}, [sp], x9
        ld1             {v22.8h}, [sp], x9
// One output row; the subs sets the flags that calc_all tests afterwards.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        calc_qpelh2     v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        ld1             {v5.8h}, [x4], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        saddw2          v2.4s, v2.4s, v5.8h
        rshrn           v1.4h, v1.4s, #7
        rshrn2          v1.8h, v2.4s, #7
        sqxtun          v1.8b, v1.8h
        subs            w5, w5, #1
        st1             {v1.8b}, [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// ff_hevc_put_hevc_qpel_bi_hv12_8_neon_i8mm
//
// Built from the 8-wide and the 4-wide function: x0-x7 and x30 are saved, the
// 8-wide function is called, the arguments are restored, and the 4-wide
// function is called with x0 and x2 moved on by 8 bytes and x4 by 16 bytes
// (8 16-bit elements). The 80-byte frame is popped in two steps (64 + 16).
function ff_hevc_put_hevc_qpel_bi_hv12_8_neon_i8mm, export=1
        stp             x6, x7, [sp, #-80]!
        stp             x4, x5, [sp, #16]
        stp             x2, x3, [sp, #32]
        stp             x0, x1, [sp, #48]
        str             x30, [sp, #64]
        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x2, x3, [sp, #32]
        ldp             x0, x1, [sp, #48]
        ldp             x6, x7, [sp], #64       // x30 stays on the stack, now at [sp]
        add             x4, x4, #16
        add             x2, x2, #8
        add             x0, x0, #8
        bl              X(ff_hevc_put_hevc_qpel_bi_hv4_8_neon_i8mm)
        ldr             x30, [sp], #16
        ret
endfunc

// ff_hevc_put_hevc_qpel_bi_hv16_8_neon_i8mm
//
// Same two-pass scheme and entry registers as the 4-wide variant, but the
// vertical pass runs over 16-column strips. The code from
// .Lqpel_bi_hv16_loop onwards is shared: the 32-, 48- and 64-wide functions
// branch to it with their width in x6.
//
// Registers inside the shared loop:
//   x6   total width                x10  columns left
//   x8   read pointer into tmp_array for the current strip
//   w11  rows left in the strip     x12  src2 pointer for the strip
//   x7   destination pointer for the strip (x7 is free once the filter
//        coefficients have been loaded)
// After each strip sp is moved on by 32 bytes (16 16-bit columns), so the next
// strip again starts reading at sp; the rest of the array is released at the end.
function ff_hevc_put_hevc_qpel_bi_hv16_8_neon_i8mm, export=1
        add             w10, w5, #7
        lsl             x10, x10, #7            // (rows + 7) * MAX_PB_SIZE * 2 bytes
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        add             x0, sp, #48             // output of the horizontal pass: tmp_array
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        mov             x2, x3
        add             w3, w5, #7
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at tmp_array
        mov             x6, #16 // width
.Lqpel_bi_hv16_loop:
        load_qpel_filterh x7, x8
        mov             x9, #(MAX_PB_SIZE * 2)
        mov             x10, x6
0:      mov             x8, sp // src
        ld1             {v16.8h, v17.8h}, [x8], x9
        mov             w11, w5 // height
        ld1             {v18.8h, v19.8h}, [x8], x9
        mov             x12, x4 // src2
        ld1             {v20.8h, v21.8h}, [x8], x9
        mov             x7, x0 // dst
        ld1             {v22.8h, v23.8h}, [x8], x9
        ld1             {v24.8h, v25.8h}, [x8], x9
        ld1             {v26.8h, v27.8h}, [x8], x9
        ld1             {v28.8h, v29.8h}, [x8], x9
// One output row of 16 bytes: src0-src7 are the rows for columns 0-7,
// src8-src15 the rows for columns 8-15. The subs sets the flags that
// calc_all2 tests afterwards.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        calc_qpelh2     v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        calc_qpelh      v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
        calc_qpelh2     v4, v4, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
        ld1             {v5.8h, v6.8h}, [x12], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        saddw2          v2.4s, v2.4s, v5.8h
        saddw           v3.4s, v3.4s, v6.4h
        saddw2          v4.4s, v4.4s, v6.8h
        rshrn           v1.4h, v1.4s, #7
        rshrn2          v1.8h, v2.4s, #7
        rshrn           v2.4h, v3.4s, #7
        rshrn2          v2.8h, v4.4s, #7
        sqxtun          v1.8b, v1.8h
        sqxtun2         v1.16b, v2.8h
        subs            x11, x11, #1
        st1             {v1.16b}, [x7], x1
.endm
1:      calc_all2
.purgem calc
2:      add             x0, x0, #16             // next strip: dst += 16 bytes
        add             sp, sp, #32             // tmp_array += 16 columns
        subs            x10, x10, #16
        add             x4, x4, #32             // src2 += 16 columns
        b.ne            0b
        add             w10, w5, #7
        lsl             x10, x10, #7
        sub             x10, x10, x6, lsl #1 // part of first line
        add             sp, sp, x10 // tmp_array without first line
        ret
endfunc

// ff_hevc_put_hevc_qpel_bi_hv24_8_neon_i8mm
//
// Built from the 16-wide and the 8-wide function: x0-x7 and x30 are saved, the
// 16-wide function is called, the arguments are restored, and the 8-wide
// function is called with x0 and x2 moved on by 16 bytes and x4 by 32 bytes
// (16 16-bit elements). The 80-byte frame is popped in two steps (64 + 16).
function ff_hevc_put_hevc_qpel_bi_hv24_8_neon_i8mm, export=1
        stp             x6, x7, [sp, #-80]!
        stp             x4, x5, [sp, #16]
        stp             x2, x3, [sp, #32]
        stp             x0, x1, [sp, #48]
        str             x30, [sp, #64]
        bl              X(ff_hevc_put_hevc_qpel_bi_hv16_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x2, x3, [sp, #32]
        ldp             x0, x1, [sp, #48]
        ldp             x6, x7, [sp], #64       // x30 stays on the stack, now at [sp]
        add             x4, x4, #32
        add             x2, x2, #16
        add             x0, x0, #16
        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon_i8mm)
        ldr             x30, [sp], #16
        ret
endfunc

// ff_hevc_put_hevc_qpel_bi_hv32_8_neon_i8mm
//
// Runs the 32-wide horizontal pass into the temporary array on the stack, then
// continues in the shared strip loop of the 16-wide function with x6 = 32.
// That loop also releases the temporary array and returns to the caller.
function ff_hevc_put_hevc_qpel_bi_hv32_8_neon_i8mm, export=1
        add             w10, w5, #7
        lsl             x10, x10, #7            // (rows + 7) * MAX_PB_SIZE * 2 bytes
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        add             x0, sp, #48             // output of the horizontal pass: tmp_array
        sub             x1, x2, x3, lsl #1
        mov             x2, x3
        sub             x1, x1, x3              // x1 = (old x2) - 3 * x3
        add             w3, w5, #7
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at tmp_array
        mov             x6, #32 // width
        b               .Lqpel_bi_hv16_loop
endfunc

// ff_hevc_put_hevc_qpel_bi_hv48_8_neon_i8mm
//
// Runs the 48-wide horizontal pass into the temporary array on the stack, then
// continues in the shared strip loop of the 16-wide function with x6 = 48.
// That loop also releases the temporary array and returns to the caller.
function ff_hevc_put_hevc_qpel_bi_hv48_8_neon_i8mm, export=1
        add             w10, w5, #7
        lsl             x10, x10, #7            // (rows + 7) * MAX_PB_SIZE * 2 bytes
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        add             x0, sp, #48             // output of the horizontal pass: tmp_array
        sub             x1, x2, x3, lsl #1
        mov             x2, x3
        sub             x1, x1, x3              // x1 = (old x2) - 3 * x3
        add             w3, w5, #7
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at tmp_array
        mov             x6, #48 // width
        b               .Lqpel_bi_hv16_loop
endfunc

// ff_hevc_put_hevc_qpel_bi_hv64_8_neon_i8mm
//
// Runs the 64-wide horizontal pass into the temporary array on the stack, then
// continues in the shared strip loop of the 16-wide function with x6 = 64.
// That loop also releases the temporary array and returns to the caller.
function ff_hevc_put_hevc_qpel_bi_hv64_8_neon_i8mm, export=1
        add             w10, w5, #7
        lsl             x10, x10, #7            // (rows + 7) * MAX_PB_SIZE * 2 bytes
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        add             x0, sp, #48             // output of the horizontal pass: tmp_array
        sub             x1, x2, x3, lsl #1
        mov             x2, x3
        sub             x1, x1, x3              // x1 = (old x2) - 3 * x3
        add             w3, w5, #7
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at tmp_array
        mov             x6, #64 // width
        b               .Lqpel_bi_hv16_loop
endfunc

DISABLE_I8MM
#endif // HAVE_I8MM