1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00
FFmpeg/libavcodec/aarch64/h264dsp_neon.S
Janne Grunau 28a8b5413b h264/aarch64: add intra loop filter neon asm
Add my neon asm from x264 relicensed under the LGPL 2.1 or later. Ported
(x264 uses nv12 chroma) and optimized.

Cycle count for checkasm --bench on a Snapdragon 820e:
h264_h_loop_filter_luma_intra_8bpp_c: 60.0
h264_h_loop_filter_luma_intra_8bpp_neon: 54.2
h264_v_loop_filter_luma_intra_8bpp_c: 148.3
h264_v_loop_filter_luma_intra_8bpp_neon: 73.8
h264_h_loop_filter_chroma_intra_8bpp_c: 27.8
h264_h_loop_filter_chroma_intra_8bpp_neon: 21.4
h264_h_loop_filter_chroma_mbaff_intra_8bpp_c: 15.8
h264_h_loop_filter_chroma_mbaff_intra_8bpp_neon: 15.7
h264_v_loop_filter_chroma_intra_8bpp_c: 45.8
h264_v_loop_filter_chroma_intra_8bpp_neon: 17.3
2019-01-26 12:05:10 +01:00

804 lines
29 KiB
ArmAsm

/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#include "neon.S"
// h264_loop_filter_start
// Common entry check for the non-intra loop filters. It expands in place
// at the top of a function and may return to the caller directly.
//   In:       w2 = alpha, w3 = beta,
//             x4 = pointer to a 32-bit word that is copied into v24.s[0]
//             (presumably the four tc0 bytes, confirm against the C side)
//   Out:      v24.s[0] = the word loaded from [x4]
//   Clobbers: w6, flags
// Returns early when alpha == 0, when beta == 0, or when all four bytes
// of the loaded word have their sign bit set.
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        // alpha != 0: the flags come from comparing beta with 0.
        // alpha == 0: ccmp substitutes NZCV = 0b0100 (Z set) so that the
        // b.eq below also leaves early. With the flags forced to 0 that
        // branch was never taken for alpha == 0 and the whole filter ran
        // just to find that no lane satisfies abs(p0 - q0) < 0.
        ccmp            w3,  #0,  #4,  ne
        mov             v24.s[0], w6
        and             w6,  w6,  w6,  lsl #16  // plain and: flags are kept
        b.eq            1f
        ands            w6,  w6,  w6,  lsl #8   // bit 31 = AND of the four sign bits
        b.ge            2f                      // N clear: at least one byte is >= 0
1:
        ret
2:
.endm
// h264_loop_filter_luma
// Normal (non-intra) luma edge filter working on 16 pixels at once.
//   In:       v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//             w2 = alpha, w3 = beta, v24.s[0] = four clip bytes
//   Out:      v17 = filtered p1, v16 = filtered p0,
//             v0  = filtered q0, v19 = filtered q1
//   Clobbers: x7, v4, v20-v24, v28, v30
// Branches forward to local label 9, which the expanding function has to
// provide, when no lane passes the filter conditions.
.macro h264_loop_filter_luma
dup v22.16B, w2 // alpha
// The uxtl/uxtl/sli/sli chain interleaved below replicates each of the
// four bytes of v24.s[0] into four consecutive byte lanes.
uxtl v24.8H, v24.8B
uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
uxtl v24.4S, v24.4H
uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
sli v24.8H, v24.8H, #8
uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
sli v24.4S, v24.4S, #16
cmhi v21.16B, v22.16B, v21.16B // < alpha
dup v22.16B, w3 // beta
// v23 = lanes whose clip byte is negative, they are removed from the mask
cmlt v23.16B, v24.16B, #0
cmhi v28.16B, v22.16B, v28.16B // < beta
cmhi v30.16B, v22.16B, v30.16B // < beta
bic v21.16B, v21.16B, v23.16B
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
and v21.16B, v21.16B, v28.16B
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
and v21.16B, v21.16B, v30.16B // < beta
// v21 is now the per-lane filter mask. Narrow it to one nibble per lane
// so that all 16 lanes can be tested in one general purpose register.
shrn v30.8b, v21.8h, #4
mov x7, v30.d[0]
cmhi v17.16B, v22.16B, v17.16B // < beta
cmhi v19.16B, v22.16B, v19.16B // < beta
cbz x7, 9f
// v17 / v19 = lanes in which p1 / q1 get filtered as well
and v17.16B, v17.16B, v21.16B
and v19.16B, v19.16B, v21.16B
// clip bytes are zeroed in lanes that are not filtered
and v24.16B, v24.16B, v21.16B
// p1 candidate = clamp((p2 + ((p0 + q0 + 1) >> 1)) >> 1,
//                      p1 - clip, p1 + clip), kept in v23;
// the q1 candidate is built the same way from q2 and q1 in v28.
// v21 becomes clip plus one for every side mask that is set, since the
// masks are -1 in true lanes and get subtracted.
urhadd v28.16B, v16.16B, v0.16B
sub v21.16B, v24.16B, v17.16B
uqadd v23.16B, v18.16B, v24.16B
uhadd v20.16B, v20.16B, v28.16B
sub v21.16B, v21.16B, v19.16B
uhadd v28.16B, v4.16B, v28.16B
umin v23.16B, v23.16B, v20.16B
uqsub v22.16B, v18.16B, v24.16B
uqadd v4.16B, v2.16B, v24.16B
umax v23.16B, v23.16B, v22.16B
uqsub v22.16B, v2.16B, v24.16B
umin v28.16B, v4.16B, v28.16B
// delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3, computed in 16-bit lanes
// (low half in v4, high half in v20) and narrowed back into v4
uxtl v4.8H, v0.8B
umax v28.16B, v28.16B, v22.16B
uxtl2 v20.8H, v0.16B
usubw v4.8H, v4.8H, v16.8B
usubw2 v20.8H, v20.8H, v16.16B
shl v4.8H, v4.8H, #2
shl v20.8H, v20.8H, #2
uaddw v4.8H, v4.8H, v18.8B
uaddw2 v20.8H, v20.8H, v18.16B
usubw v4.8H, v4.8H, v2.8B
usubw2 v20.8H, v20.8H, v2.16B
rshrn v4.8B, v4.8H, #3
rshrn2 v4.16B, v20.8H, #3
// take the p1 / q1 candidates where their masks are set, else keep p1 / q1
bsl v17.16B, v23.16B, v18.16B
bsl v19.16B, v28.16B, v2.16B
// clamp delta to [-v21, v21]
neg v23.16B, v21.16B
uxtl v28.8H, v16.8B
smin v4.16B, v4.16B, v21.16B
uxtl2 v21.8H, v16.16B
smax v4.16B, v4.16B, v23.16B
uxtl v22.8H, v0.8B
uxtl2 v24.8H, v0.16B
// p0 += delta, q0 -= delta, then saturate back to unsigned bytes
saddw v28.8H, v28.8H, v4.8B
saddw2 v21.8H, v21.8H, v4.16B
ssubw v22.8H, v22.8H, v4.8B
ssubw2 v24.8H, v24.8H, v4.16B
sqxtun v16.8B, v28.8H
sqxtun2 v16.16B, v21.8H
sqxtun v0.8B, v22.8H
sqxtun2 v0.16B, v24.8H
.endm
// ff_h264_v_loop_filter_luma_neon
// Filters a horizontal luma edge, 16 pixels wide.
//   x0 = address of the first row below the edge (loaded as q0)
//   w1 = row stride (sign-extended here)
//   w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// Rewrites the rows p1, p0, q0 and q1. h264_loop_filter_start may return
// before anything is loaded.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        ld1             {v0.16b},  [x0], x1     // q0
        ld1             {v2.16b},  [x0], x1     // q1
        ld1             {v4.16b},  [x0], x1     // q2
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1    // six rows back: row of p2
        ld1             {v20.16b}, [x0], x1     // p2
        ld1             {v18.16b}, [x0], x1     // p1
        ld1             {v16.16b}, [x0], x1     // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1    // back to the row of p1
        st1             {v17.16b}, [x0], x1     // p1
        st1             {v16.16b}, [x0], x1     // p0
        st1             {v0.16b},  [x0], x1     // q0
        st1             {v19.16b}, [x0]         // q1
9:                                              // early exit of the filter macro
        ret
endfunc
// ff_h264_h_loop_filter_luma_neon
// Filters a vertical luma edge over 16 rows.
//   x0 = address of the first pixel right of the edge in the first row
//   w1 = row stride (sign-extended here)
//   w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// Reads 8 bytes per row starting 4 bytes left of x0 and writes back the
// 4 bytes per row that start 2 bytes left of x0.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  #4
        // rows 0-7 fill the low halves, rows 8-15 the high halves
        ld1             {v6.8b},    [x0], x1
        ld1             {v20.8b},   [x0], x1
        ld1             {v18.8b},   [x0], x1
        ld1             {v16.8b},   [x0], x1
        ld1             {v0.8b},    [x0], x1
        ld1             {v2.8b},    [x0], x1
        ld1             {v4.8b},    [x0], x1
        ld1             {v26.8b},   [x0], x1
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v26.d}[1], [x0], x1

        // transpose_* come from neon.S. Presumably the registers hold one
        // pixel column each afterwards (v20 = p2 ... v4 = q2), which is
        // the layout h264_loop_filter_luma reads.
        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4    // back to row 0
        add             x0,  x0,  #2            // column of p1
        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2],  [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3],  [x0], x1
        st1             {v19.s}[3], [x0], x1
9:                                              // early exit of the filter macro
        ret
endfunc
// h264_loop_filter_start_intra
// Common prologue of the intra loop filters, expanded at function entry.
//   In:       w1 = stride, w2 = alpha, w3 = beta
//   Out:      x1 = sign-extended stride,
//             v30 = alpha in every byte lane, v31 = beta in every byte lane
//   Clobbers: w4
// Returns to the caller right away when alpha and beta are both zero.
.macro  h264_loop_filter_start_intra
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        sxtw            x1,  w1
        dup             v30.16b, w2             // alpha
        dup             v31.16b, w3             // beta
.endm
// h264_loop_filter_luma_intra
// Intra luma edge filter working on 16 pixels at once.
//   In:       v4 = p3, v5 = p2, v6 = p1, v7 = p0,
//             v0 = q0, v1 = q1, v2 = q2, v3 = q3,
//             v30 = alpha per byte lane, v31 = beta per byte lane
//   Out:      v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) are
//             updated in place in the lanes selected by the masks
//   Clobbers: x4, v16-v31
// Branches forward to local label 9, which the expanding function has to
// provide, when no lane passes the basic filter condition.
.macro h264_loop_filter_luma_intra
uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
cmhi v19.16b, v30.16b, v16.16b // < alpha
cmhi v17.16b, v31.16b, v17.16b // < beta
cmhi v18.16b, v31.16b, v18.16b // < beta
movi v29.16b, #2
ushr v30.16b, v30.16b, #2 // alpha >> 2
add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
// v16 = if_2: abs(p0 - q0) < (alpha >> 2) + 2
cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
// v19 = if_1: all three basic conditions hold
and v19.16b, v19.16b, v17.16b
and v19.16b, v19.16b, v18.16b
// narrow the mask to one nibble per lane and leave when it is all zero
shrn v20.8b, v19.8h, #4
mov x4, v20.d[0]
cbz x4, 9f
// v20/v21 = 2*p1 + p0 + q1 and v22/v23 = 2*q1 + q0 + p1 as 16-bit sums
// (low / high halves). They are reused further down.
ushll v20.8h, v6.8b, #1
ushll v22.8h, v1.8b, #1
ushll2 v21.8h, v6.16b, #1
ushll2 v23.8h, v1.16b, #1
uaddw v20.8h, v20.8h, v7.8b
uaddw v22.8h, v22.8h, v0.8b
uaddw2 v21.8h, v21.8h, v7.16b
uaddw2 v23.8h, v23.8h, v0.16b
uaddw v20.8h, v20.8h, v1.8b
uaddw v22.8h, v22.8h, v6.8b
uaddw2 v21.8h, v21.8h, v1.16b
uaddw2 v23.8h, v23.8h, v6.16b
rshrn v24.8b, v20.8h, #2 // p0'_1
rshrn v25.8b, v22.8h, #2 // q0'_1
rshrn2 v24.16b, v21.8h, #2 // p0'_1
rshrn2 v25.16b, v23.8h, #2 // q0'_1
// if_3 / if_4: the p side / q side is smooth enough for the strong filter
uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
cmhi v17.16b, v31.16b, v17.16b // < beta
cmhi v18.16b, v31.16b, v18.16b // < beta
and v17.16b, v16.16b, v17.16b // if_2 && if_3
and v18.16b, v16.16b, v18.16b // if_2 && if_4
// from here on v30 / v31 no longer hold alpha / beta, they become the
// masks of the lanes that only get the weak p0 / q0 update
not v30.16b, v17.16b
not v31.16b, v18.16b
and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
// Strong filter, p side. v26/v27 accumulate p2 + p0 + q0 (and later + p1):
//   p0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3   in v20
//   p1 = (p2 + p1 + p0 + q0 + 2) >> 2              in v21
//   p2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3     in v19
//calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
uaddl v26.8h, v5.8b, v7.8b
uaddl2 v27.8h, v5.16b, v7.16b
uaddw v26.8h, v26.8h, v0.8b
uaddw2 v27.8h, v27.8h, v0.16b
add v20.8h, v20.8h, v26.8h
add v21.8h, v21.8h, v27.8h
uaddw v20.8h, v20.8h, v0.8b
uaddw2 v21.8h, v21.8h, v0.16b
rshrn v20.8b, v20.8h, #3 // p0'_2
rshrn2 v20.16b, v21.8h, #3 // p0'_2
uaddw v26.8h, v26.8h, v6.8b
uaddw2 v27.8h, v27.8h, v6.16b
rshrn v21.8b, v26.8h, #2 // p1'_2
rshrn2 v21.16b, v27.8h, #2 // p1'_2
uaddl v28.8h, v4.8b, v5.8b
uaddl2 v29.8h, v4.16b, v5.16b
shl v28.8h, v28.8h, #1
shl v29.8h, v29.8h, #1
add v28.8h, v28.8h, v26.8h
add v29.8h, v29.8h, v27.8h
rshrn v19.8b, v28.8h, #3 // p2'_2
rshrn2 v19.16b, v29.8h, #3 // p2'_2
// Strong filter, q side: the mirror image of the block above with p and
// q swapped, results in v22 (q0), v23 (q1) and v26 (q2). The if_1 mask
// in v19 was overwritten above, only v17/v18/v30/v31 are needed now.
//calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
uaddl v26.8h, v2.8b, v0.8b
uaddl2 v27.8h, v2.16b, v0.16b
uaddw v26.8h, v26.8h, v7.8b
uaddw2 v27.8h, v27.8h, v7.16b
add v22.8h, v22.8h, v26.8h
add v23.8h, v23.8h, v27.8h
uaddw v22.8h, v22.8h, v7.8b
uaddw2 v23.8h, v23.8h, v7.16b
rshrn v22.8b, v22.8h, #3 // q0'_2
rshrn2 v22.16b, v23.8h, #3 // q0'_2
uaddw v26.8h, v26.8h, v1.8b
uaddw2 v27.8h, v27.8h, v1.16b
rshrn v23.8b, v26.8h, #2 // q1'_2
rshrn2 v23.16b, v27.8h, #2 // q1'_2
uaddl v28.8h, v2.8b, v3.8b
uaddl2 v29.8h, v2.16b, v3.16b
shl v28.8h, v28.8h, #1
shl v29.8h, v29.8h, #1
add v28.8h, v28.8h, v26.8h
add v29.8h, v29.8h, v27.8h
rshrn v26.8b, v28.8h, #3 // q2'_2
rshrn2 v26.16b, v29.8h, #3 // q2'_2
// Insert the results. The weak and the strong mask of one side never
// overlap, so the order of the two inserts into v7 / v0 does not matter.
bit v7.16b, v24.16b, v30.16b // p0'_1
bit v0.16b, v25.16b, v31.16b // q0'_1
bit v7.16b, v20.16b, v17.16b // p0'_2
bit v6.16b, v21.16b, v17.16b // p1'_2
bit v5.16b, v19.16b, v17.16b // p2'_2
bit v0.16b, v22.16b, v18.16b // q0'_2
bit v1.16b, v23.16b, v18.16b // q1'_2
bit v2.16b, v26.16b, v18.16b // q2'_2
.endm
// ff_h264_v_loop_filter_luma_intra_neon
// Intra filter for a horizontal luma edge, 16 pixels wide.
//   x0 = address of the first row below the edge (q0)
//   w1 = row stride, w2 = alpha, w3 = beta
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// Loads the rows p3..q3 and stores the rows p2..q2.
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1             {v0.16b}, [x0], x1      // q0
        ld1             {v1.16b}, [x0], x1      // q1
        ld1             {v2.16b}, [x0], x1      // q2
        ld1             {v3.16b}, [x0], x1      // q3
        sub             x0,  x0,  x1, lsl #3    // eight rows back: row of p3
        ld1             {v4.16b}, [x0], x1      // p3
        ld1             {v5.16b}, [x0], x1      // p2
        ld1             {v6.16b}, [x0], x1      // p1
        ld1             {v7.16b}, [x0]          // p0, x0 stays on this row

        h264_loop_filter_luma_intra

        sub             x0,  x0,  x1, lsl #1    // row of p2
        st1             {v5.16b}, [x0], x1      // p2
        st1             {v6.16b}, [x0], x1      // p1
        st1             {v7.16b}, [x0], x1      // p0
        st1             {v0.16b}, [x0], x1      // q0
        st1             {v1.16b}, [x0], x1      // q1
        st1             {v2.16b}, [x0]          // q2
9:                                              // early exit of the filter macro
        ret
endfunc
// ff_h264_h_loop_filter_luma_intra_neon
// Intra filter for a vertical luma edge over 16 rows.
//   x0 = address of the first pixel right of the edge in the first row
//   w1 = row stride, w2 = alpha, w3 = beta
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// Reads and rewrites 8 bytes per row starting 4 bytes left of x0.
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  #4
        // rows 0-7 fill the low halves, rows 8-15 the high halves
        ld1             {v4.8b},   [x0], x1
        ld1             {v5.8b},   [x0], x1
        ld1             {v6.8b},   [x0], x1
        ld1             {v7.8b},   [x0], x1
        ld1             {v0.8b},   [x0], x1
        ld1             {v1.8b},   [x0], x1
        ld1             {v2.8b},   [x0], x1
        ld1             {v3.8b},   [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1

        // transpose_8x16B comes from neon.S. The same call is used to
        // get into the column layout (v4 = p3 ... v3 = q3) and back.
        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0,  x0,  x1, lsl #4    // back to row 0
        st1             {v4.8b},   [x0], x1
        st1             {v5.8b},   [x0], x1
        st1             {v6.8b},   [x0], x1
        st1             {v7.8b},   [x0], x1
        st1             {v0.8b},   [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v2.8b},   [x0], x1
        st1             {v3.8b},   [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
9:                                              // early exit of the filter macro
        ret
endfunc
// h264_loop_filter_chroma
// Normal (non-intra) chroma edge filter working on 8 pixels at once.
//   In:       v18 = p1, v16 = p0, v0 = q0, v2 = q1
//             w2 = alpha, w3 = beta, v24.s[0] = four clip bytes
//   Out:      v16 = filtered p0, v0 = filtered q0
//   Clobbers: x2 (alpha is gone afterwards), v4, v22-v26, v28, v30
// Branches forward to local label 9, which the expanding function has to
// provide, when no lane passes the filter conditions.
.macro  h264_loop_filter_chroma
        dup             v22.8b,  w2                     // alpha
        dup             v23.8b,  w3                     // beta
        uxtl            v24.8h,  v24.8b                 // with the sli below: every clip byte twice
        uabd            v26.8b,  v16.8b,  v0.8b         // abs(p0 - q0)
        uabd            v28.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v30.8b,  v2.8b,   v0.8b         // abs(q1 - q0)
        cmhi            v26.8b,  v22.8b,  v26.8b        // < alpha
        cmhi            v28.8b,  v23.8b,  v28.8b        // < beta
        cmhi            v30.8b,  v23.8b,  v30.8b        // < beta
        // delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3, started early and
        // interleaved with the mask computation
        uxtl            v4.8h,   v0.8b
        and             v26.8b,  v26.8b,  v28.8b
        usubw           v4.8h,   v4.8h,   v16.8b
        and             v26.8b,  v26.8b,  v30.8b        // v26 = filter mask
        shl             v4.8h,   v4.8h,   #2
        mov             x2,  v26.d[0]
        sli             v24.8h,  v24.8h,  #8
        uaddw           v4.8h,   v4.8h,   v18.8b
        cbz             x2,  9f                         // no lane to filter
        usubw           v4.8h,   v4.8h,   v2.8b
        rshrn           v4.8b,   v4.8h,   #3
        smin            v4.8b,   v4.8b,   v24.8b        // clamp delta to [-clip, clip]
        neg             v25.8b,  v24.8b
        smax            v4.8b,   v4.8b,   v25.8b
        uxtl            v22.8h,  v0.8b
        and             v4.8b,   v4.8b,   v26.8b        // delta = 0 outside the mask
        uxtl            v28.8h,  v16.8b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        sqxtun          v16.8b,  v28.8h
        sqxtun          v0.8b,   v22.8h
.endm
// ff_h264_v_loop_filter_chroma_neon
// Filters a horizontal chroma edge, 8 pixels wide.
//   x0 = address of the first row below the edge (q0)
//   w1 = row stride (sign-extended here)
//   w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// Loads the rows p1, p0, q0, q1 and rewrites p0 and q0.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  x1, lsl #1    // row of p1
        ld1             {v18.8b}, [x0], x1      // p1
        ld1             {v16.8b}, [x0], x1      // p0
        ld1             {v0.8b},  [x0], x1      // q0
        ld1             {v2.8b},  [x0]          // q1, x0 stays on this row

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1    // row of p0
        st1             {v16.8b}, [x0], x1      // p0
        st1             {v0.8b},  [x0], x1      // q0
9:                                              // early exit of the filter macro
        ret
endfunc
// ff_h264_h_loop_filter_chroma_neon
// Filters a vertical chroma edge over 8 rows.
//   x0 = address of the first pixel right of the edge in the first row
//   w1 = row stride (sign-extended here)
//   w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// Reads and rewrites 4 bytes per row starting 2 bytes left of x0.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  #2
        // rows 0-3 go to lane s[0], rows 4-7 to lane s[1]
        ld1             {v18.s}[0], [x0], x1
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v18.s}[1], [x0], x1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1],  [x0], x1
        ld1             {v2.s}[1],  [x0], x1

        // transpose_4x8B comes from neon.S. The same call is used to get
        // into the column layout (v18 = p1 ... v2 = q1) and back.
        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3    // back to row 0
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
9:                                              // early exit of the filter macro
        ret
endfunc
// h264_loop_filter_chroma_intra
// Intra chroma edge filter working on 8 pixels at once.
//   In:       v18 = p1, v16 = p0, v17 = q0, v19 = q1,
//             v30 = alpha per byte lane, v31 = beta per byte lane
//   Out:      v16 = filtered p0, v17 = filtered q0
//   Clobbers: x2, v4, v6, v20, v22, v24-v28
// Branches forward to local label 9, which the expanding function has to
// provide, when no lane passes the filter conditions.
.macro  h264_loop_filter_chroma_intra
        uabd            v26.8b,  v16.8b,  v17.8b        // abs(p0 - q0)
        uabd            v27.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v28.8b,  v19.8b,  v17.8b        // abs(q1 - q0)
        cmhi            v26.8b,  v30.8b,  v26.8b        // < alpha
        cmhi            v27.8b,  v31.8b,  v27.8b        // < beta
        cmhi            v28.8b,  v31.8b,  v28.8b        // < beta
        and             v26.8b,  v26.8b,  v27.8b
        and             v26.8b,  v26.8b,  v28.8b        // v26 = filter mask
        mov             x2,  v26.d[0]

        ushll           v4.8h,   v18.8b,  #1            // 2 * p1
        ushll           v6.8h,   v19.8b,  #1            // 2 * q1
        cbz             x2,  9f                         // no lane to filter
        uaddl           v20.8h,  v16.8b,  v19.8b        // p0 + q1
        uaddl           v22.8h,  v17.8b,  v18.8b        // q0 + p1
        add             v20.8h,  v20.8h,  v4.8h
        add             v22.8h,  v22.8h,  v6.8h
        uqrshrn         v24.8b,  v20.8h,  #2            // (2*p1 + p0 + q1 + 2) >> 2
        uqrshrn         v25.8b,  v22.8h,  #2            // (2*q1 + q0 + p1 + 2) >> 2
        bit             v16.8b,  v24.8b,  v26.8b
        bit             v17.8b,  v25.8b,  v26.8b
.endm
// ff_h264_v_loop_filter_chroma_intra_neon
// Intra filter for a horizontal chroma edge, 8 pixels wide.
//   x0 = address of the first row below the edge (q0)
//   w1 = row stride, w2 = alpha, w3 = beta
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// Loads the rows p1, p0, q0, q1 and rewrites p0 and q0.
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  x1, lsl #1    // row of p1
        ld1             {v18.8b}, [x0], x1      // p1
        ld1             {v16.8b}, [x0], x1      // p0
        ld1             {v17.8b}, [x0], x1      // q0
        ld1             {v19.8b}, [x0]          // q1, x0 stays on this row

        h264_loop_filter_chroma_intra

        sub             x0,  x0,  x1, lsl #1    // row of p0
        st1             {v16.8b}, [x0], x1      // p0
        st1             {v17.8b}, [x0], x1      // q0
9:                                              // early exit of the filter macro
        ret
endfunc
// ff_h264_h_loop_filter_chroma_mbaff_intra_neon
// Intra filter for a vertical chroma edge over 4 rows.
//   x0 = address of the first pixel right of the edge in the first row
//   w1 = row stride, w2 = alpha, w3 = beta
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// Each row is loaded as 8 bytes starting 2 bytes left of x0. Only the
// two bytes around the edge (p0, q0) are written back per row.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2            // load pointer: column of p1
        sub             x0,  x0,  #1            // store pointer: column of p0
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

        // transpose_4x8B comes from neon.S. Afterwards the registers are
        // used as p1, p0, q0, q1 by the filter macro.
        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        // st2 interleaves one byte of v16 and v17: p0, q0 of one row
        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
9:                                              // early exit of the filter macro
        ret
endfunc
// ff_h264_h_loop_filter_chroma_intra_neon
// Intra filter for a vertical chroma edge over 8 rows.
//   x0 = address of the first pixel right of the edge in the first row
//   w1 = row stride, w2 = alpha, w3 = beta
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// Rows 0-3 are loaded as 8 bytes each, rows 4-7 as 4 bytes each into the
// upper word, all starting 2 bytes left of x0. Only the two bytes around
// the edge (p0, q0) are written back per row.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2            // load pointer: column of p1
        sub             x0,  x0,  #1            // store pointer: column of p0
        ld1             {v18.8b},   [x4], x1
        ld1             {v16.8b},   [x4], x1
        ld1             {v17.8b},   [x4], x1
        ld1             {v19.8b},   [x4], x1
        ld1             {v18.s}[1], [x4], x1    // replaces the upper word loaded above
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4]

        // transpose_4x8B comes from neon.S. Afterwards the registers are
        // used as p1, p0, q0, q1 by the filter macro.
        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        // st2 interleaves one byte of v16 and v17: p0, q0 of one row
        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1
9:                                              // early exit of the filter macro
        ret
endfunc
// biweight_16 macs, macd
// Row loop of the 16 pixel wide bi-directional weighting, two rows per
// iteration. The macro ends in ret and never falls through.
//   macd = widening multiply-accumulate applied to the x0 rows (weight w5)
//   macs = the same for the x1 rows (weight w6)
//   In:   x0, x1 = the two inputs, x7 = store pointer, x2 = stride,
//         w3 = number of rows, w5, w6 = byte weights,
//         v16 = start value of every 16-bit accumulator,
//         v18 = per-lane shift count handed to sshl
//   Clobbers: v0, v1, v4, v6, v20, v22, v24, v26, v28, v30, flags
.macro  biweight_16     macs, macd
        dup             v0.16b,  w5
        dup             v1.16b,  w6
        mov             v4.16b,  v16.16b
        mov             v6.16b,  v16.16b
1:      subs            w3,  w3,  #2            // flags are consumed by b.ne below
        ld1             {v20.16b}, [x0], x2
        \macd           v4.8h,   v0.8b,  v20.8b
        \macd\()2       v6.8h,   v0.16b, v20.16b
        ld1             {v22.16b}, [x1], x2
        \macs           v4.8h,   v1.8b,  v22.8b
        \macs\()2       v6.8h,   v1.16b, v22.16b
        mov             v24.16b, v16.16b
        ld1             {v28.16b}, [x0], x2
        mov             v26.16b, v16.16b
        \macd           v24.8h,  v0.8b,  v28.8b
        \macd\()2       v26.8h,  v0.16b, v28.16b
        ld1             {v30.16b}, [x1], x2
        \macs           v24.8h,  v1.8b,  v30.8b
        \macs\()2       v26.8h,  v1.16b, v30.16b
        sshl            v4.8h,   v4.8h,  v18.8h
        sshl            v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun2         v4.16b,  v6.8h
        sshl            v24.8h,  v24.8h, v18.8h
        sshl            v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        mov             v6.16b,  v16.16b        // re-arm the accumulators
        st1             {v4.16b},  [x7], x2
        mov             v4.16b,  v16.16b
        st1             {v24.16b}, [x7], x2
        b.ne            1b
        ret
.endm
// biweight_8 macs, macd
// Row loop of the 8 pixel wide bi-directional weighting, two rows per
// iteration. The macro ends in ret and never falls through.
//   macd = widening multiply-accumulate applied to the x0 rows (weight w5)
//   macs = the same for the x1 rows (weight w6)
//   In:   x0, x1 = the two inputs, x7 = store pointer, x2 = stride,
//         w3 = number of rows, w5, w6 = byte weights,
//         v16 = start value of every 16-bit accumulator,
//         v18 = per-lane shift count handed to sshl
//   Clobbers: v0-v2, v4-v7, v20, flags
.macro  biweight_8      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #2            // flags are consumed by b.ne below
        ld1             {v4.8b}, [x0], x2
        \macd           v2.8h,   v0.8b,  v4.8b
        ld1             {v5.8b}, [x1], x2
        \macs           v2.8h,   v1.8b,  v5.8b
        ld1             {v6.8b}, [x0], x2
        \macd           v20.8h,  v0.8b,  v6.8b
        ld1             {v7.8b}, [x1], x2
        \macs           v20.8h,  v1.8b,  v7.8b
        sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        sshl            v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b        // re-arm the accumulators
        st1             {v2.8b}, [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.8b}, [x7], x2
        b.ne            1b
        ret
.endm
// biweight_4 macs, macd
// Row loop of the 4 pixel wide bi-directional weighting. Two rows are
// packed into one 8-byte vector. An iteration handles four rows, with a
// two row tail at label 2 once the counter drops below zero. Both paths
// end in ret, the macro never falls through.
//   macd = widening multiply-accumulate applied to the x0 rows (weight w5)
//   macs = the same for the x1 rows (weight w6)
//   In:   x0, x1 = the two inputs, x7 = store pointer, x2 = stride,
//         w3 = number of rows, w5, w6 = byte weights,
//         v16 = start value of every 16-bit accumulator,
//         v18 = per-lane shift count handed to sshl
//   Clobbers: v0-v2, v4-v7, v20, flags
.macro  biweight_4      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #4
        ld1             {v4.s}[0], [x0], x2
        ld1             {v4.s}[1], [x0], x2
        \macd           v2.8h,   v0.8b,  v4.8b
        ld1             {v5.s}[0], [x1], x2
        ld1             {v5.s}[1], [x1], x2
        \macs           v2.8h,   v1.8b,  v5.8b
        b.lt            2f                      // flags still from subs: two rows left
        ld1             {v6.s}[0], [x0], x2
        ld1             {v6.s}[1], [x0], x2
        \macd           v20.8h,  v0.8b,  v6.8b
        ld1             {v7.s}[0], [x1], x2
        ld1             {v7.s}[1], [x1], x2
        \macs           v20.8h,  v1.8b,  v7.8b
        sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        sshl            v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b        // re-arm the accumulators
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.s}[0], [x7], x2
        st1             {v4.s}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        ret
.endm
// biweight_func w
// Emits ff_biweight_h264_pixels_<w>_neon for w = 16, 8 or 4.
//   x0 = first input, also the destination (x7 is set to x0 for stores)
//   x1 = second input, w2 = stride (sign-extended here), w3 = row count
//   w4 = shift base: the rows are shifted right by w4 + 1
//   w5 = weight applied to the x0 rows, w6 = weight applied to the x1 rows
//   w7 = offset: ((w7 + 1) | 1) << w4 seeds every accumulator (v16)
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// The multiplies are unsigned, so the weights are turned into magnitudes
// and the sign is expressed by choosing umlal or umlsl. w8 selects the
// variant: 0 = both weights >= 0, 1 = only w5 negative,
// 2 = both negative, 3 = only w6 negative.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2
        lsr             w8,  w5,  #31           // 1 when w5 is negative
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30  // top two bits of w6 mixed in
        orr             w7,  w7,  #1
        dup             v18.8h,  w4
        lsl             w7,  w7,  w4
        not             v18.16b, v18.16b        // -(w4 + 1): sshl shifts right
        dup             v16.8h,  w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5,  w5
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5
        neg             w6,  w6
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4
// weight_16 add
// Row loop of the 16 pixel wide single-input weighting, two rows per
// iteration, done in place. The macro ends in ret and never falls through.
//   add = instruction that combines v16 with the product (the callers
//         pass add, sub, shadd or shsub)
//   In:   x0 = load pointer, x5 = store pointer, x1 = stride,
//         w2 = number of rows, w4 = byte weight,
//         v16 = first operand of the combine step,
//         v18 = per-lane shift count handed to srshl
//   Clobbers: v0, v4, v6, v20, v24, v26, v28, flags
.macro  weight_16       add
        dup             v0.16b,  w4
1:      subs            w2,  w2,  #2            // flags are consumed by b.ne below
        ld1             {v20.16b}, [x0], x1
        umull           v4.8h,   v0.8b,  v20.8b
        umull2          v6.8h,   v0.16b, v20.16b
        ld1             {v28.16b}, [x0], x1
        umull           v24.8h,  v0.8b,  v28.8b
        umull2          v26.8h,  v0.16b, v28.16b
        \add            v4.8h,   v16.8h, v4.8h
        srshl           v4.8h,   v4.8h,  v18.8h
        \add            v6.8h,   v16.8h, v6.8h
        srshl           v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun2         v4.16b,  v6.8h
        \add            v24.8h,  v16.8h, v24.8h
        srshl           v24.8h,  v24.8h, v18.8h
        \add            v26.8h,  v16.8h, v26.8h
        srshl           v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v4.16b},  [x5], x1
        st1             {v24.16b}, [x5], x1
        b.ne            1b
        ret
.endm
// weight_8 add
// Row loop of the 8 pixel wide single-input weighting, two rows per
// iteration, done in place. The macro ends in ret and never falls through.
//   add = instruction that combines v16 with the product (the callers
//         pass add, sub, shadd or shsub)
//   In:   x0 = load pointer, x5 = store pointer, x1 = stride,
//         w2 = number of rows, w4 = byte weight,
//         v16 = first operand of the combine step,
//         v18 = per-lane shift count handed to srshl
//   Clobbers: v0, v2, v4, v6, v20, flags
.macro  weight_8        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #2            // flags are consumed by b.ne below
        ld1             {v4.8b}, [x0], x1
        umull           v2.8h,   v0.8b,  v4.8b
        ld1             {v6.8b}, [x0], x1
        umull           v20.8h,  v0.8b,  v6.8b
        \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        \add            v20.8h,  v16.8h, v20.8h
        srshl           v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.8b}, [x5], x1
        st1             {v4.8b}, [x5], x1
        b.ne            1b
        ret
.endm
// weight_4 add
// Row loop of the 4 pixel wide single-input weighting, done in place.
// Two rows are packed into one 8-byte vector. An iteration handles four
// rows, with a two row tail at label 2 once the counter drops below
// zero. Both paths end in ret, the macro never falls through.
//   add = instruction that combines v16 with the product (the callers
//         pass add, sub, shadd or shsub)
//   In:   x0 = load pointer, x5 = store pointer, x1 = stride,
//         w2 = number of rows, w4 = byte weight,
//         v16 = first operand of the combine step,
//         v18 = per-lane shift count handed to srshl
//   Clobbers: v0, v2, v4, v6, v20, flags
.macro  weight_4        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #4
        ld1             {v4.s}[0], [x0], x1
        ld1             {v4.s}[1], [x0], x1
        umull           v2.8h,   v0.8b,  v4.8b
        b.lt            2f                      // flags still from subs: two rows left
        ld1             {v6.s}[0], [x0], x1
        ld1             {v6.s}[1], [x0], x1
        umull           v20.8h,  v0.8b,  v6.8b
        \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        \add            v20.8h,  v16.8h, v20.8h
        srshl           v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        st1             {v4.s}[0], [x5], x1
        st1             {v4.s}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        ret
.endm
// weight_func w
// Emits ff_weight_h264_pixels_<w>_neon for w = 16, 8 or 4.
//   x0 = pixel block, weighted in place (x5 is set to x0 for stores)
//   w1 = stride (sign-extended here), w2 = row count
//   w3 = shift amount, w4 = weight, w5 = offset
// NOTE(review): argument meaning is taken from the register usage in
// this file, confirm against the C prototype.
// v16 holds w5 << w3. For w3 > 1 the combine step is a halving add or
// subtract and the remaining rounding shift is w3 - 1, otherwise a plain
// add or subtract is followed by a rounding shift of w3. A negative
// weight is negated and the subtracting variant is used, because the
// multiply in the row loops is unsigned.
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1
        cmp             w3,  #1                 // consumed by b.le below
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8h,  w5
        mov             x5,  x0
        b.le            20f
        sub             w6,  w6,  w3            // 1 - w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3                 // -w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f                     // the second label 10, below
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4