1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-19 05:49:09 +02:00
FFmpeg/libavcodec/aarch64/hevcdsp_qpel_neon.S
Logan Lyu 0b7356c1b4 lavc/aarch64: new optimization for 8-bit hevc_pel_uni_w_pixels and qpel_uni_w_v
Signed-off-by: Martin Storsjö <martin@martin.st>
2023-06-06 12:50:18 +03:00

1195 lines
41 KiB
ArmAsm

/* -*-arm64-*-
* vim: syntax=arm64asm
*
* Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#define MAX_PB_SIZE 64
// Signed 8-tap qpel filter coefficients, one 8-byte row per filter index.
// Row 0 is all zeros. load_filter selects a row as (index << 3).
const qpel_filters, align=4
        .byte            0,  0,   0,  0,  0,   0,  0,  0
        .byte           -1,  4, -10, 58, 17,  -5,  1,  0
        .byte           -1,  4, -11, 40, 40, -11,  4, -1
        .byte            0,  1,  -5, 17, 58, -10,  4, -1
endconst
// Magnitudes of the qpel_filters coefficients, same row layout.
// Used with unsigned multiplies; QPEL_FILTER_B / QPEL_FILTER_B2 apply the
// sign of each tap by choosing umlal or umlsl.
const qpel_filters_abs, align=4
        .byte            0,  0,   0,  0,  0,   0,  0,  0
        .byte            1,  4,  10, 58, 17,   5,  1,  0
        .byte            1,  4,  11, 40, 40,  11,  4,  1
        .byte            0,  1,   5, 17, 58,  10,  4,  1
endconst
// Load row \m of qpel_filters into v0 and widen it to eight signed
// 16-bit taps (v0.8h). Clobbers x15.
.macro load_filter m
        movrel          x15, qpel_filters
        add             x15, x15, \m, lsl #3            // 8 bytes per filter row
        ld1             {v0.8b}, [x15]
        sxtl            v0.8h, v0.8b                    // int8 -> int16
.endm
.macro put_hevc type
// Register aliases for the flavour being instantiated (qpel, qpel_uni or
// qpel_bi). They are released again by the .unreq list at the end of the
// macro so the next instantiation can rebind them.
.ifc \type, qpel
        // void put_hevc_qpel_h(int16_t *dst,
        //                      uint8_t *_src, ptrdiff_t _srcstride,
        //                      int height, intptr_t mx, intptr_t my, int width)
        dst             .req x0
        src             .req x1
        srcstride       .req x2
        height          .req x3
        heightw         .req w3
        mx              .req x4
        width           .req w6
        dststride       .req x7         // not in the prototype above; each function sets it itself
.endif
.ifc \type, qpel_uni
        // void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
        //                          uint8_t *_src, ptrdiff_t _srcstride,
        //                          int height, intptr_t mx, intptr_t my, int width)
        dst             .req x0
        dststride       .req x1
        src             .req x2
        srcstride       .req x3
        height          .req x4
        heightw         .req w4
        mx              .req x5
        width           .req w7
.endif
.ifc \type, qpel_bi
        // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
        //                         uint8_t *_src, ptrdiff_t _srcstride,
        //                         int16_t *src2, int height, intptr_t mx,
        //                         intptr_t my, int width)
        dst             .req x0
        dststride       .req x1
        src             .req x2
        srcstride       .req x3
        height          .req x5
        heightw         .req w5
        mx              .req x6
        width           .req w8         // ninth argument: loaded from the stack where needed
.endif
.ifc \type, qpel
// Shared 4-wide horizontal 8-tap kernel; emitted once, by the qpel instance.
// In:  v16.8b, v17.8b = 16 consecutive source bytes of the first row
//      v18.8b, v19.8b = 16 consecutive source bytes of the second row
//      v0.8h          = filter taps (see load_filter)
// Out: v23.4h, v24.4h = 16-bit filter sums for the first / second row
// Clobbers v16-v21.
function ff_hevc_put_hevc_h4_8_neon, export=0
        uxtl            v16.8h, v16.8b
        uxtl            v18.8h, v18.8b
        uxtl            v17.8h, v17.8b
        uxtl            v19.8h, v19.8b
        mul             v23.4h, v16.4h, v0.h[0]         // tap 0, first row
        mul             v24.4h, v18.4h, v0.h[0]         // tap 0, second row
.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)  // first row, shifted by i pixels
        mla             v23.4h, v20.4h, v0.h[\i]
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)  // second row, shifted by i pixels
        mla             v24.4h, v21.4h, v0.h[\i]
.endr
        ret
endfunc
.endif
// Horizontal 8-tap filter, 4 pixels wide, two rows per loop iteration.
//   qpel:     stores the 16-bit sums, row pitch MAX_PB_SIZE * 2 bytes
//   qpel_uni: narrows to 8 bit with a rounding shift by 6
//   qpel_bi:  adds the int16 rows read through x4, then rounding shift by 7
function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
.endif
        mov             mx, x30                         // bl below overwrites x30; mx is free after load_filter
        sub             src, src, #3                    // start 3 bytes left of the first output pixel
.ifc \type, qpel
        lsl             x13, srcstride, #1              // srcstridel
        mov             dststride, #(MAX_PB_SIZE << 1)
        mov             x14, #(MAX_PB_SIZE << 2)
.else
        lsl             x13, srcstride, #1              // srcstridel
        lsl             x14, dststride, #1              // dststridel
.endif
        add             x12, src, srcstride             // srcb
        add             x10, dst, dststride             // dstb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [x4],  x16
        ld1             {v26.8h}, [x15], x16
.endif
        bl              ff_hevc_put_hevc_h4_8_neon
        subs            heightw, heightw, #2            // flags are consumed by b.gt below
.ifc \type, qpel
        st1             {v23.4h}, [dst], x14
        st1             {v24.4h}, [x10], x14
.else
.ifc \type, qpel_bi
        sqadd           v23.4h, v23.4h, v25.4h
        sqadd           v24.4h, v24.4h, v26.4h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.s}[0], [dst], x14
        st1             {v24.s}[0], [x10], x14
.endif
        b.gt            0b                              // double line
        ret             mx
endfunc
.ifc \type, qpel
// Shared 8-wide horizontal 8-tap kernel; emitted once, by the qpel instance.
// In:  v16.8b, v17.8b = 16 consecutive source bytes of the first row
//      v18.8b, v19.8b = 16 consecutive source bytes of the second row
//      v0.8h          = filter taps (see load_filter)
// Out: v23.8h, v24.8h = 16-bit filter sums for the first / second row
// Clobbers v16-v21.
function ff_hevc_put_hevc_h8_8_neon, export=0
        uxtl            v16.8h, v16.8b
        uxtl            v18.8h, v18.8b
        uxtl            v17.8h, v17.8b
        uxtl            v19.8h, v19.8b
        mul             v23.8h, v16.8h, v0.h[0]         // tap 0, first row
        mul             v24.8h, v18.8h, v0.h[0]         // tap 0, second row
.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)  // first row, shifted by i pixels
        mla             v23.8h, v20.8h, v0.h[\i]
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)  // second row, shifted by i pixels
        mla             v24.8h, v21.8h, v0.h[\i]
.endr
        ret
endfunc
.endif
// Horizontal 8-tap filter, 6 pixels wide, two rows per loop iteration.
// Uses the 8-wide kernel and stores only the first 6 results of each row
// (a 4-element store followed by a 2-element store).
function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
.endif
        mov             mx, x30                         // bl below overwrites x30; mx is free after load_filter
        sub             src, src, #3                    // start 3 bytes left of the first output pixel
.ifc \type, qpel
        lsl             x13, srcstride, #1              // srcstridel
        mov             dststride, #(MAX_PB_SIZE << 1)
        mov             x14, #((MAX_PB_SIZE << 2) - 8)  // two rows minus the 8 bytes already post-incremented
.else
        lsl             x13, srcstride, #1              // srcstridel
        lsl             x14, dststride, #1              // dststridel
        sub             x14, x14, #4                    // minus the 4 bytes already post-incremented
.endif
        add             x12, src, srcstride             // srcb
        add             x10, dst, dststride             // dstb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [x4],  x16
        ld1             {v26.8h}, [x15], x16
.endif
        bl              ff_hevc_put_hevc_h8_8_neon
        subs            heightw, heightw, #2            // flags are consumed by b.gt below
.ifc \type, qpel
        st1             {v23.4h}, [dst], #8             // results 0-3
        st1             {v24.4h}, [x10], #8
        st1             {v23.s}[2], [dst], x14          // results 4-5
        st1             {v24.s}[2], [x10], x14
.else
.ifc \type, qpel_bi
        sqadd           v23.8h, v23.8h, v25.8h
        sqadd           v24.8h, v24.8h, v26.8h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.s}[0], [dst], #4           // pixels 0-3
        st1             {v24.s}[0], [x10], #4
        st1             {v23.h}[2], [dst], x14          // pixels 4-5
        st1             {v24.h}[2], [x10], x14
.endif
        b.gt            0b                              // double line
        ret             mx
endfunc
// Horizontal 8-tap filter, 8 pixels wide, two rows per loop iteration.
//   qpel:     stores the 16-bit sums, row pitch MAX_PB_SIZE * 2 bytes
//   qpel_uni: narrows to 8 bit with a rounding shift by 6
//   qpel_bi:  adds the int16 rows read through x4, then rounding shift by 7
function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
.endif
        mov             mx, x30                         // bl below overwrites x30; mx is free after load_filter
        sub             src, src, #3                    // start 3 bytes left of the first output pixel
.ifc \type, qpel
        lsl             x13, srcstride, #1              // srcstridel
        mov             dststride, #(MAX_PB_SIZE << 1)
        mov             x14, #(MAX_PB_SIZE << 2)
.else
        lsl             x13, srcstride, #1              // srcstridel
        lsl             x14, dststride, #1              // dststridel
.endif
        add             x12, src, srcstride             // srcb
        add             x10, dst, dststride             // dstb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [x4],  x16
        ld1             {v26.8h}, [x15], x16
.endif
        bl              ff_hevc_put_hevc_h8_8_neon
        subs            heightw, heightw, #2            // flags are consumed by b.gt below
.ifc \type, qpel
        st1             {v23.8h}, [dst], x14
        st1             {v24.8h}, [x10], x14
.else
.ifc \type, qpel_bi
        sqadd           v23.8h, v23.8h, v25.8h
        sqadd           v24.8h, v24.8h, v26.8h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.8b}, [dst], x14
        st1             {v24.8b}, [x10], x14
.endif
        b.gt            0b                              // double line
        ret             mx
endfunc
.ifc \type, qpel
// Shared 16-wide horizontal 8-tap kernel; emitted once, by the qpel instance.
// In:  v16.8b-v18.8b = 24 consecutive source bytes of the first row
//      v19.8b-v21.8b = 24 consecutive source bytes of the second row
//      v0.8h         = filter taps (see load_filter)
//      x9            = remaining row count
// Out: v26.8h, v27.8h = 16-bit filter sums for the first row
//      v28.8h, v29.8h = 16-bit filter sums for the second row
//      x9 reduced by 2, with the condition flags set from that subtraction
// Clobbers v16-v25.
function ff_hevc_put_hevc_h16_8_neon, export=0
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        uxtl            v20.8h, v20.8b
        uxtl            v21.8h, v21.8b
        mul             v26.8h, v16.8h, v0.h[0]         // tap 0, first row, columns 0-7
        mul             v27.8h, v17.8h, v0.h[0]         // tap 0, first row, columns 8-15
        mul             v28.8h, v19.8h, v0.h[0]         // tap 0, second row, columns 0-7
        mul             v29.8h, v20.8h, v0.h[0]         // tap 0, second row, columns 8-15
.irpc i, 1234567
        ext             v22.16b, v16.16b, v17.16b, #(2*\i)
        mla             v26.8h, v22.8h, v0.h[\i]
        ext             v23.16b, v17.16b, v18.16b, #(2*\i)
        mla             v27.8h, v23.8h, v0.h[\i]
        ext             v24.16b, v19.16b, v20.16b, #(2*\i)
        mla             v28.8h, v24.8h, v0.h[\i]
        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
        mla             v29.8h, v25.8h, v0.h[\i]
.endr
        subs            x9, x9, #2                      // callers branch on these flags after storing
        ret
endfunc
.endif
// Horizontal 8-tap filter, 12 pixels wide per column strip, two rows per
// inner-loop iteration. Uses the 16-wide kernel and stores the first 12
// results of each row. The outer loop steps to the next strip until width
// is exhausted.
//   x9       = row counter for the current strip (decremented in the kernel)
//   x10, x12 = destination / source pointers of the second row
//   x13, x14 = source / destination advance for two rows
//   x15-x17  = qpel_bi only: second src2 row pointer, src2 two-row stride,
//              and the number of bytes to rewind src2 after a strip
function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
        load_filter     mx
        sxtw            height, heightw
.ifc \type, qpel_bi
        ldr             w8, [sp]                        // width: int, first stack argument (full 32-bit load)
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        lsl             x17, height, #7                 // src2b reset (height * (MAX_PB_SIZE << 1))
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
.endif
        sub             src, src, #3                    // start 3 bytes left of the first output pixel
        mov             mx, x30                         // bl below overwrites x30; mx is free after load_filter
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1              // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 16) // two rows minus the 16 bytes already post-incremented
.else
        lsl             x14, dststride, #1              // dststridel
        lsl             x13, srcstride, #1              // srcstridel
        sub             x14, x14, #8                    // minus the 8 bytes already post-incremented
.endif
        add             x10, dst, dststride             // dstb
        add             x12, src, srcstride             // srcb
0:      mov             x9, height
1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13
        bl              ff_hevc_put_hevc_h16_8_neon     // also does subs x9, x9, #2
.ifc \type, qpel
        st1             {v26.8h}, [dst], #16            // results 0-7
        st1             {v28.8h}, [x10], #16
        st1             {v27.4h}, [dst], x14            // results 8-11
        st1             {v29.4h}, [x10], x14
.else
.ifc \type, qpel_bi
        ld1             {v16.8h, v17.8h}, [x4],  x16
        ld1             {v18.8h, v19.8h}, [x15], x16
        sqadd           v26.8h, v26.8h, v16.8h
        sqadd           v27.8h, v27.8h, v17.8h
        sqadd           v28.8h, v28.8h, v18.8h
        sqadd           v29.8h, v29.8h, v19.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        st1             {v26.8b}, [dst], #8             // pixels 0-7
        st1             {v28.8b}, [x10], #8
        st1             {v27.s}[0], [dst], x14          // pixels 8-11
        st1             {v29.s}[0], [x10], x14
.endif
        b.gt            1b                              // double line
        subs            width, width, #12               // flags are consumed by the final b.gt
        // reset src
        msub            src, srcstride, height, src
        msub            x12, srcstride, height, x12
        // reset dst
        msub            dst, dststride, height, dst
        msub            x10, dststride, height, x10
.ifc \type, qpel_bi
        // reset xsrc
        sub             x4,  x4,  x17
        sub             x15, x15, x17
        add             x4,  x4,  #24
        add             x15, x15, #24
.endif
        add             src, src, #12
        add             x12, x12, #12
.ifc \type, qpel
        add             dst, dst, #24                   // 12 int16 results
        add             x10, x10, #24
.else
        add             dst, dst, #12
        add             x10, x10, #12
.endif
        b.gt            0b
        ret             mx
endfunc
// Horizontal 8-tap filter, 16 pixels wide per column strip, two rows per
// inner-loop iteration. The outer loop steps to the next 16-pixel strip
// until width is exhausted.
//   x9       = row counter for the current strip (decremented in the kernel)
//   x10, x12 = destination / source pointers of the second row
//   x13, x14 = source / destination advance for two rows
//   x15-x17  = qpel_bi only: second src2 row pointer, src2 two-row stride,
//              and the number of bytes to rewind src2 after a strip
function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
        load_filter     mx
        sxtw            height, heightw
.ifc \type, qpel_bi
        ldr             w8, [sp]                        // width: int, first stack argument (full 32-bit load)
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        lsl             x17, x5, #7                     // src2b reset
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
.endif
        sub             src, src, #3                    // start 3 bytes left of the first output pixel
        mov             mx, x30                         // bl below overwrites x30; mx is free after load_filter
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1              // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 16) // two rows minus the 16 bytes already post-incremented
.else
        lsl             x14, dststride, #1              // dststridel
        lsl             x13, srcstride, #1              // srcstridel
        sub             x14, x14, #8                    // minus the 8 bytes already post-incremented
.endif
        add             x10, dst, dststride             // dstb
        add             x12, src, srcstride             // srcb
0:      mov             x9, height
1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13
        bl              ff_hevc_put_hevc_h16_8_neon     // also does subs x9, x9, #2
.ifc \type, qpel
        st1             {v26.8h}, [dst], #16            // results 0-7
        st1             {v28.8h}, [x10], #16
        st1             {v27.8h}, [dst], x14            // results 8-15
        st1             {v29.8h}, [x10], x14
.else
.ifc \type, qpel_bi
        ld1             {v16.8h, v17.8h}, [x4],  x16
        ld1             {v18.8h, v19.8h}, [x15], x16
        sqadd           v26.8h, v26.8h, v16.8h
        sqadd           v27.8h, v27.8h, v17.8h
        sqadd           v28.8h, v28.8h, v18.8h
        sqadd           v29.8h, v29.8h, v19.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        st1             {v26.8b}, [dst], #8             // pixels 0-7
        st1             {v28.8b}, [x10], #8
        st1             {v27.8b}, [dst], x14            // pixels 8-15
        st1             {v29.8b}, [x10], x14
.endif
        b.gt            1b                              // double line
        subs            width, width, #16               // flags are consumed by the final b.gt
        // reset src
        msub            src, srcstride, height, src
        msub            x12, srcstride, height, x12
        // reset dst
        msub            dst, dststride, height, dst
        msub            x10, dststride, height, x10
.ifc \type, qpel_bi
        // reset xsrc
        sub             x4,  x4,  x17
        sub             x15, x15, x17
        add             x4,  x4,  #32
        add             x15, x15, #32
.endif
        add             src, src, #16
        add             x12, x12, #16
.ifc \type, qpel
        add             dst, dst, #32                   // 16 int16 results
        add             x10, x10, #32
.else
        add             dst, dst, #16
        add             x10, x10, #16
.endif
        b.gt            0b
        ret             mx
endfunc
.unreq height
.unreq heightw
.unreq width
.unreq src
.unreq dst
.unreq srcstride
.unreq dststride
.unreq mx
.endm
// Instantiate the horizontal filter functions for each flavour. The qpel
// instance must come first in the sense that it is the one that also emits
// the shared non-exported h4/h8/h16 kernels called by all three.
put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi
// Weighted copy, 4 pixels wide, two rows per iteration.
// Per pixel: clip_u8(sat_rshift_round((src << 6) * w6, 6 + w5) + w7).
// Registers as used below: x0 = dst, x1 = dst stride, x2 = src,
// x3 = src stride, w4 = row count (must be a non-zero multiple of 2 for the
// loop to terminate), w5 = extra shift, w6 = 16-bit multiplier, w7 = offset.
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        mov             w10, #-6
        dup             v29.4s, w7                      // offset
        sub             w10, w10, w5
        dup             v31.4s, w10                     // -(6 + w5): negative count makes sqrshl shift right
1:
        ldr             s0, [x2]
        ldr             s1, [x2, x3]
        add             x2, x2, x3, lsl #1
        ushll           v0.8h, v0.8b, #6
        ushll           v1.8h, v1.8b, #6
        smull           v0.4s, v0.4h, v30.4h
        smull           v1.4s, v1.4h, v30.4h
        sqrshl          v0.4s, v0.4s, v31.4s
        sqrshl          v1.4s, v1.4s, v31.4s
        sqadd           v0.4s, v0.4s, v29.4s
        sqadd           v1.4s, v1.4s, v29.4s
        sqxtn           v0.4h, v0.4s
        sqxtn           v1.4h, v1.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        str             s0, [x0]
        str             s1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc
// Weighted copy, 6 pixels wide, two rows per iteration.
// Same arithmetic and register use as the 4-wide variant; 8 source bytes
// are loaded per row and the first 6 results are stored (4 + 2 bytes).
function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        mov             w10, #-6
        dup             v29.4s, w7                      // offset
        sub             w10, w10, w5
        dup             v31.4s, w10                     // -(6 + w5): negative count makes sqrshl shift right
        sub             x1, x1, #4                      // the first store of each row already advances x0 by 4
1:
        ldr             d0, [x2]
        ldr             d1, [x2, x3]
        add             x2, x2, x3, lsl #1
        ushll           v0.8h, v0.8b, #6
        ushll           v1.8h, v1.8b, #6
        smull           v4.4s, v0.4h, v30.4h
        smull2          v5.4s, v0.8h, v30.8h
        smull           v6.4s, v1.4h, v30.4h
        smull2          v7.4s, v1.8h, v30.8h
        sqrshl          v4.4s, v4.4s, v31.4s
        sqrshl          v5.4s, v5.4s, v31.4s
        sqrshl          v6.4s, v6.4s, v31.4s
        sqrshl          v7.4s, v7.4s, v31.4s
        sqadd           v4.4s, v4.4s, v29.4s
        sqadd           v5.4s, v5.4s, v29.4s
        sqadd           v6.4s, v6.4s, v29.4s
        sqadd           v7.4s, v7.4s, v29.4s
        sqxtn           v0.4h, v4.4s
        sqxtn2          v0.8h, v5.4s
        sqxtn           v1.4h, v6.4s
        sqxtn2          v1.8h, v7.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        str             s0, [x0], #4                    // first row, pixels 0-3
        st1             {v0.h}[2], [x0], x1             // first row, pixels 4-5
        str             s1, [x0], #4                    // second row, pixels 0-3
        st1             {v1.h}[2], [x0], x1             // second row, pixels 4-5
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc
// Weighted copy, 8 pixels wide, two rows per iteration.
// Same arithmetic and register use as the 4-wide variant.
function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        mov             w10, #-6
        dup             v29.4s, w7                      // offset
        sub             w10, w10, w5
        dup             v31.4s, w10                     // -(6 + w5): negative count makes sqrshl shift right
1:
        ldr             d0, [x2]
        ldr             d1, [x2, x3]
        add             x2, x2, x3, lsl #1
        ushll           v0.8h, v0.8b, #6
        ushll           v1.8h, v1.8b, #6
        smull           v4.4s, v0.4h, v30.4h
        smull2          v5.4s, v0.8h, v30.8h
        smull           v6.4s, v1.4h, v30.4h
        smull2          v7.4s, v1.8h, v30.8h
        sqrshl          v4.4s, v4.4s, v31.4s
        sqrshl          v5.4s, v5.4s, v31.4s
        sqrshl          v6.4s, v6.4s, v31.4s
        sqrshl          v7.4s, v7.4s, v31.4s
        sqadd           v4.4s, v4.4s, v29.4s
        sqadd           v5.4s, v5.4s, v29.4s
        sqadd           v6.4s, v6.4s, v29.4s
        sqadd           v7.4s, v7.4s, v29.4s
        sqxtn           v0.4h, v4.4s
        sqxtn2          v0.8h, v5.4s
        sqxtn           v1.4h, v6.4s
        sqxtn2          v1.8h, v7.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        str             d0, [x0]
        str             d1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc
// Weighted copy, 12 pixels wide, two rows per iteration.
// Same arithmetic and register use as the 4-wide variant; 16 source bytes
// are loaded per row and the first 12 results are stored (8 + 4 bytes).
function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        mov             w10, #-6
        dup             v29.4s, w7                      // offset
        sub             w10, w10, w5
        dup             v31.4s, w10                     // -(6 + w5): negative count makes sqrshl shift right
        sub             x1, x1, #8                      // the first store of each row already advances x0 by 8
1:
        ldr             q0, [x2]
        ldr             q1, [x2, x3]
        add             x2, x2, x3, lsl #1
        ushll           v4.8h, v0.8b, #6
        ushll2          v5.8h, v0.16b, #6
        ushll           v6.8h, v1.8b, #6
        ushll2          v7.8h, v1.16b, #6
        smull           v16.4s, v4.4h, v30.4h
        smull2          v17.4s, v4.8h, v30.8h
        smull           v18.4s, v5.4h, v30.4h
        smull2          v19.4s, v5.8h, v30.8h
        smull           v20.4s, v6.4h, v30.4h
        smull2          v21.4s, v6.8h, v30.8h
        smull           v22.4s, v7.4h, v30.4h
        smull2          v23.4s, v7.8h, v30.8h
        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v17.4s, v17.4s, v31.4s
        sqrshl          v18.4s, v18.4s, v31.4s
        sqrshl          v19.4s, v19.4s, v31.4s
        sqrshl          v20.4s, v20.4s, v31.4s
        sqrshl          v21.4s, v21.4s, v31.4s
        sqrshl          v22.4s, v22.4s, v31.4s
        sqrshl          v23.4s, v23.4s, v31.4s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v17.4s, v17.4s, v29.4s
        sqadd           v18.4s, v18.4s, v29.4s
        sqadd           v19.4s, v19.4s, v29.4s
        sqadd           v20.4s, v20.4s, v29.4s
        sqadd           v21.4s, v21.4s, v29.4s
        sqadd           v22.4s, v22.4s, v29.4s
        sqadd           v23.4s, v23.4s, v29.4s
        sqxtn           v0.4h, v16.4s
        sqxtn2          v0.8h, v17.4s
        sqxtn           v1.4h, v18.4s
        sqxtn2          v1.8h, v19.4s
        sqxtn           v2.4h, v20.4s
        sqxtn2          v2.8h, v21.4s
        sqxtn           v3.4h, v22.4s
        sqxtn2          v3.8h, v23.4s
        sqxtun          v0.8b, v0.8h
        sqxtun2         v0.16b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun2         v2.16b, v3.8h
        str             d0, [x0], #8                    // first row, pixels 0-7
        st1             {v0.s}[2], [x0], x1             // first row, pixels 8-11
        str             d2, [x0], #8                    // second row, pixels 0-7
        st1             {v2.s}[2], [x0], x1             // second row, pixels 8-11
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc
// Weight 16 pixels in place.
//   \s0        in: 16 source bytes, out: 16 weighted and clipped bytes
//   \t0, \t1   16-bit temporaries (clobbered)
//   \d0 - \d3  32-bit temporaries (clobbered)
// Expects v30.8h = multiplier, v31.4s = negative shift count and
// v29.4s = offset, as set up at the top of each pel_uni_w_pixels function.
.macro PEL_UNI_W_PIXEL_CALC s0, t0, t1, d0, d1, d2, d3
        ushll           \t0\().8h, \s0\().8b, #6
        ushll2          \t1\().8h, \s0\().16b, #6
        smull           \d0\().4s, \t0\().4h, v30.4h
        smull2          \d1\().4s, \t0\().8h, v30.8h
        smull           \d2\().4s, \t1\().4h, v30.4h
        smull2          \d3\().4s, \t1\().8h, v30.8h
        sqrshl          \d0\().4s, \d0\().4s, v31.4s
        sqrshl          \d1\().4s, \d1\().4s, v31.4s
        sqrshl          \d2\().4s, \d2\().4s, v31.4s
        sqrshl          \d3\().4s, \d3\().4s, v31.4s
        sqadd           \d0\().4s, \d0\().4s, v29.4s
        sqadd           \d1\().4s, \d1\().4s, v29.4s
        sqadd           \d2\().4s, \d2\().4s, v29.4s
        sqadd           \d3\().4s, \d3\().4s, v29.4s
        sqxtn           \t0\().4h, \d0\().4s
        sqxtn2          \t0\().8h, \d1\().4s
        sqxtn           \t1\().4h, \d2\().4s
        sqxtn2          \t1\().8h, \d3\().4s
        sqxtun          \s0\().8b, \t0\().8h
        sqxtun2         \s0\().16b, \t1\().8h
.endm
// Weighted copy, 16 pixels wide, two rows per iteration.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count
// (must be a non-zero multiple of 2 for the loop to terminate),
// w5 = extra shift, w6 = multiplier, w7 = offset.
function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        mov             w10, #-6
        dup             v29.4s, w7                      // offset
        sub             w10, w10, w5
        dup             v31.4s, w10                     // -(6 + w5): negative count makes sqrshl shift right
1:
        ldr             q0, [x2]
        ldr             q1, [x2, x3]
        add             x2, x2, x3, lsl #1
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        str             q0, [x0]
        str             q1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc
// Weighted copy, 24 pixels wide, one row per iteration.
// Per pixel: clip_u8(sat_rshift_round((src << 6) * w6, 6 + w5) + w7).
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count
// (must be non-zero), w5 = extra shift, w6 = multiplier, w7 = offset.
// Exactly 24 source bytes are read per row.
function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6                      // multiplier
        dup             v31.4s, w10                     // -(6 + w5): negative count makes sqrshl shift right
        dup             v29.4s, w7                      // offset
1:
        ld1             {v0.8b, v1.8b, v2.8b}, [x2], x3 // pixels 0-7, 8-15, 16-23
        ushll           v4.8h, v0.8b, #6
        ushll           v5.8h, v1.8b, #6
        ushll           v6.8h, v2.8b, #6
        smull           v16.4s, v4.4h, v30.4h
        smull2          v17.4s, v4.8h, v30.8h
        smull           v18.4s, v5.4h, v30.4h
        smull2          v19.4s, v5.8h, v30.8h
        smull           v20.4s, v6.4h, v30.4h
        smull2          v21.4s, v6.8h, v30.8h
        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v17.4s, v17.4s, v31.4s
        sqrshl          v18.4s, v18.4s, v31.4s
        sqrshl          v19.4s, v19.4s, v31.4s
        sqrshl          v20.4s, v20.4s, v31.4s
        sqrshl          v21.4s, v21.4s, v31.4s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v17.4s, v17.4s, v29.4s
        sqadd           v18.4s, v18.4s, v29.4s
        sqadd           v19.4s, v19.4s, v29.4s
        sqadd           v20.4s, v20.4s, v29.4s
        sqadd           v21.4s, v21.4s, v29.4s
        sqxtn           v0.4h, v16.4s
        sqxtn2          v0.8h, v17.4s
        sqxtn           v1.4h, v18.4s
        sqxtn2          v1.8h, v19.4s
        sqxtn           v2.4h, v20.4s
        sqxtn2          v2.8h, v21.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc
// Weighted copy, 32 pixels wide, one row per iteration.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count
// (must be non-zero), w5 = extra shift, w6 = multiplier, w7 = offset.
function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        mov             w10, #-6
        dup             v29.4s, w7                      // offset
        sub             w10, w10, w5
        dup             v31.4s, w10                     // -(6 + w5): negative count makes sqrshl shift right
1:
        ld1             {v0.16b, v1.16b}, [x2], x3
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc
// Weighted copy, 48 pixels wide, one row per iteration.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count
// (must be non-zero), w5 = extra shift, w6 = multiplier, w7 = offset.
function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        mov             w10, #-6
        dup             v29.4s, w7                      // offset
        sub             w10, w10, w5
        dup             v31.4s, w10                     // -(6 + w5): negative count makes sqrshl shift right
1:
        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc
// Weighted copy, 64 pixels wide, one row per iteration.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count
// (must be non-zero), w5 = extra shift, w6 = multiplier, w7 = offset.
function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        mov             w10, #-6
        dup             v29.4s, w7                      // offset
        sub             w10, w10, w5
        dup             v31.4s, w10                     // -(6 + w5): negative count makes sqrshl shift right
1:
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc
// Common prologue of the qpel_uni_w_v functions.
//   - loads my from [sp, #8] and uses it to pick a row of qpel_filters_abs
//   - moves x2 (src) up by three rows of stride x3
//   - broadcasts the eight tap magnitudes into v0-v7 (one tap per register)
//   - sets v30.8h = wx (w6), v31.4s = -(6 + w5), v29.4s = ox (w7)
// Clobbers x9, x10, x12 and v28.
.macro QPEL_UNI_W_V_HEADER
        ldur            x12, [sp, #8]                   // my
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3                      // src -= 3 * srcstride
        movrel          x9, qpel_filters_abs
        add             x9, x9, x12, lsl #3             // 8 bytes per filter row
        ldr             d28, [x9]
        dup             v0.16b, v28.b[0]
        dup             v1.16b, v28.b[1]
        dup             v2.16b, v28.b[2]
        dup             v3.16b, v28.b[3]
        dup             v4.16b, v28.b[4]
        dup             v5.16b, v28.b[5]
        dup             v6.16b, v28.b[6]
        dup             v7.16b, v28.b[7]
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6                      // wx
        dup             v31.4s, w10                     // shift
        dup             v29.4s, w7                      // ox
.endm
// 8-tap filter over the low 8 bytes of \src0 .. \src7 into \dst\().8h.
// v0-v7 hold the tap magnitudes; taps 0, 2, 5 and 7 are subtracted and
// taps 1, 3, 4 and 6 are added.
.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
        umull           \dst\().8h, \src1\().8b, v1.8b  // starts the sum with an added tap
        umlsl           \dst\().8h, \src0\().8b, v0.8b
        umlsl           \dst\().8h, \src2\().8b, v2.8b
        umlal           \dst\().8h, \src3\().8b, v3.8b
        umlal           \dst\().8h, \src4\().8b, v4.8b
        umlsl           \dst\().8h, \src5\().8b, v5.8b
        umlal           \dst\().8h, \src6\().8b, v6.8b
        umlsl           \dst\().8h, \src7\().8b, v7.8b
.endm
// Same as QPEL_FILTER_B, but operating on the high 8 bytes of
// \src0 .. \src7 (the "2" forms of the multiplies).
.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
        umull2          \dst\().8h, \src1\().16b, v1.16b    // starts the sum with an added tap
        umlsl2          \dst\().8h, \src0\().16b, v0.16b
        umlsl2          \dst\().8h, \src2\().16b, v2.16b
        umlal2          \dst\().8h, \src3\().16b, v3.16b
        umlal2          \dst\().8h, \src4\().16b, v4.16b
        umlsl2          \dst\().8h, \src5\().16b, v5.16b
        umlal2          \dst\().8h, \src6\().16b, v6.16b
        umlsl2          \dst\().8h, \src7\().16b, v7.16b
.endm
// Weight the four filter sums in v24.4h (multiply by v30, rounding shift by
// v31, add v29), clip to 8 bit and store 4 bytes at x0; x0 += x1.
// Clobbers v24.
.macro QPEL_UNI_W_V_4
        smull           v24.4s, v24.4h, v30.4h
        sqrshl          v24.4s, v24.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtun          v24.8b, v24.8h
        st1             {v24.s}[0], [x0], x1
.endm
// Vertical 8-tap filter with weighting, 4 pixels wide.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_V_HEADER.
// v16-v23 act as a rotating window of eight source rows: seven rows are
// preloaded, then each step loads one new row into the oldest register,
// filters, and emits one output row. The loop body is unrolled eight times
// so no register moves are needed; it exits as soon as w4 reaches zero.
function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        // preload the first seven rows
        ldr             s16, [x2]
        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s18, [x2]
        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s20, [x2]
        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s22, [x2]
1:      ldr             s23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        ldr             s16, [x2]
        QPEL_FILTER_B   v24, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        ldr             s18, [x2]
        QPEL_FILTER_B   v24, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        ldr             s20, [x2]
        QPEL_FILTER_B   v24, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        ldr             s22, [x2]
        QPEL_FILTER_B   v24, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.ne            1b
2:
        ret
endfunc
// Weight the eight filter sums in v26.8h (multiply by v30, rounding shift
// by v31, add v29), clip to 8 bit and store 8 bytes at x0; x0 += x1.
// Clobbers v24 and v25; v26 is only read.
.macro QPEL_UNI_W_V_8
        smull           v24.4s, v26.4h, v30.4h
        smull2          v25.4s, v26.8h, v30.8h
        sqrshl          v24.4s, v24.4s, v31.4s
        sqrshl          v25.4s, v25.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqadd           v25.4s, v25.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtun          v24.8b, v24.8h
        st1             {v24.d}[0], [x0], x1
.endm
// Vertical 8-tap filter with weighting, 8 pixels wide.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_V_HEADER.
// v16-v23 act as a rotating window of eight source rows: seven rows are
// preloaded, then each step loads one new row into the oldest register,
// filters into v26, and emits one output row. The loop body is unrolled
// eight times; it exits as soon as w4 reaches zero.
function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        // preload the first seven rows
        ldr             d16, [x2]
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]
1:      ldr             d23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        ldr             d16, [x2]
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        ldr             d18, [x2]
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        ldr             d20, [x2]
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        ldr             d22, [x2]
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.ne            1b
2:
        ret
endfunc
// Weight the sixteen filter sums in v26.8h (columns 0-7) and v27.8h
// (columns 8-15): multiply by v30, rounding shift by v31, add v29, clip to
// 8 bit and store 16 bytes at x0; x0 += x1.
// Clobbers v24-v27 (v26 and v27 are reused as 32-bit temporaries).
.macro QPEL_UNI_W_V_16
        smull           v24.4s, v26.4h, v30.4h
        smull2          v25.4s, v26.8h, v30.8h
        smull           v26.4s, v27.4h, v30.4h          // v26 input is fully consumed above
        smull2          v27.4s, v27.8h, v30.8h
        sqrshl          v24.4s, v24.4s, v31.4s
        sqrshl          v25.4s, v25.4s, v31.4s
        sqrshl          v26.4s, v26.4s, v31.4s
        sqrshl          v27.4s, v27.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqadd           v25.4s, v25.4s, v29.4s
        sqadd           v26.4s, v26.4s, v29.4s
        sqadd           v27.4s, v27.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtn           v26.4h, v26.4s
        sqxtn2          v26.8h, v27.4s
        sqxtun          v24.8b, v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v24.16b}, [x0], x1
.endm
// Vertical 8-tap filter with weighting, 16 pixels wide.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_V_HEADER.
// v16-v23 act as a rotating window of eight source rows: seven rows are
// preloaded, then each step loads one new row into the oldest register,
// filters both halves into v26 / v27, and emits one output row. The loop
// body is unrolled eight times; it exits as soon as w4 reaches zero.
function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        // preload the first seven rows
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]
1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q16, [x2]
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q18, [x2]
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q20, [x2]
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q22, [x2]
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.ne            1b
2:
        ret
endfunc
// Vertical 8-tap filter with weighting, processed in 16-pixel column strips.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_V_HEADER.
//   w13 = 32-bit value loaded from [sp, #16] (presumably width; confirm
//         against the C prototype), reduced by 16 after every strip
//   x14 = dst of the current strip, x15 = src of the current strip
//         (already moved up three rows by the header), w11 = saved row count
// Each strip runs the same rotating-window loop as the 16-wide function.
function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        ldur            w13, [sp, #16]
        mov             x14, x0
        mov             x15, x2
        mov             w11, w4
3:
        // preload the first seven rows of this strip
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]
1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q16, [x2]
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q18, [x2]
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q20, [x2]
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        ldr             q22, [x2]
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.ne            1b
2:
        // strip finished: step 16 columns to the right and restart the rows
        subs            w13, w13, #16                   // flags are consumed by b.hi below
        add             x14, x14, #16
        add             x15, x15, #16
        mov             x0, x14
        mov             x2, x15
        mov             w4, w11
        b.hi            3b
        ret
endfunc