mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
5c363d3e59
* commit '7e42d5f0ab2aeac811fd01e122627c9198b13f01': aarch64: vp8: Optimize vp8_idct_add_neon for aarch64 Merged-by: James Almer <jamrial@gmail.com>
1791 lines
65 KiB
ArmAsm
1791 lines
65 KiB
ArmAsm
/*
|
|
* VP8 NEON optimisations
|
|
*
|
|
* Copyright (c) 2010 Rob Clark <rob@ti.com>
|
|
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
|
* Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
|
|
* Copyright (c) 2019 Martin Storsjo <martin@martin.st>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "libavutil/aarch64/asm.S"
|
|
#include "neon.S"
|
|
|
|
// Two-pass 4x4 butterfly transform of 16-bit coefficients.
//   x0: output; 16 halfwords are written, one every 32 bytes
//   x1: 16 input halfwords (4 rows of 4); zeroed once they have been loaded
// Clobbers x3, v0-v7, v16, v30.
function ff_vp8_luma_dc_wht_neon, export=1
        ld1             {v0.4h - v3.4h}, [x1]
        movi            v30.8h, #0
        movi            v16.4h, #3              // bias added ahead of the final >> 3

        // Pass 1: butterflies between the four input rows. The two stores
        // wipe the 32 bytes that were just read.
        add             v4.4h,  v0.4h,  v3.4h
        add             v6.4h,  v1.4h,  v2.4h
        st1             {v30.8h}, [x1], #16
        sub             v7.4h,  v1.4h,  v2.4h
        sub             v5.4h,  v0.4h,  v3.4h
        st1             {v30.8h}, [x1]
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // Pass 2: identical butterflies on the transposed data, with the
        // bias folded into the first operand.
        add             v0.4h,  v0.4h,  v16.4h
        add             v4.4h,  v0.4h,  v3.4h
        add             v6.4h,  v1.4h,  v2.4h
        sub             v7.4h,  v1.4h,  v2.4h
        sub             v5.4h,  v0.4h,  v3.4h
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        sshr            v0.4h,  v0.4h,  #3
        sshr            v1.4h,  v1.4h,  #3
        sshr            v2.4h,  v2.4h,  #3
        sshr            v3.4h,  v3.4h,  #3

        // Scatter the results: lane i of v0..v3 goes to four consecutive
        // 32-byte slots, for i = 0..3.
        mov             x3,  #32
.irp i, 0, 1, 2, 3
        st1             {v0.h}[\i], [x0], x3
        st1             {v1.h}[\i], [x0], x3
        st1             {v2.h}[\i], [x0], x3
        st1             {v3.h}[\i], [x0], x3
.endr

        ret
endfunc
|
|
|
|
// 4x4 inverse transform of 16-bit coefficients, added onto 8-bit pixels.
//   x0: 4x4 pixel block, read and rewritten in place
//   x1: 16 coefficients (32 bytes); zeroed by this function
//   x2: byte distance between pixel rows
// Clobbers w4, v0-v7, v16-v27, v29.
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.8b - v3.8b}, [x1]
        mov             w4,  #20091
        movk            w4,  #35468/2, lsl #16
        dup             v4.2s,  w4              // v4.h[0] = 20091, v4.h[1] = 35468/2

        // ---- first pass ----
        // v21/v22 = x + ((x * 20091) >> 16)     for rows 1 and 3
        // v20/v23 = (2 * x * (35468/2)) >> 16   for rows 1 and 3
        smull           v26.4s, v1.4h,  v4.h[0]
        smull           v27.4s, v3.4h,  v4.h[0]
        sqdmulh         v20.4h, v1.4h,  v4.h[1]
        sqdmulh         v23.4h, v3.4h,  v4.h[1]
        shrn            v21.4h, v26.4s, #16
        shrn            v22.4h, v27.4s, #16
        add             v21.4h, v21.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h

        add             v16.4h, v0.4h,  v2.4h
        sub             v17.4h, v0.4h,  v2.4h

        add             v18.4h, v21.4h, v23.4h
        sub             v19.4h, v20.4h, v22.4h

        add             v0.4h,  v16.4h, v18.4h
        add             v1.4h,  v17.4h, v19.4h
        sub             v3.4h,  v16.4h, v18.4h
        sub             v2.4h,  v17.4h, v19.4h

        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7

        // ---- second pass ----
        // Same arithmetic on the transposed data; the coefficient buffer is
        // cleared and the destination pixels are fetched in between.
        movi            v29.8h, #0
        smull           v26.4s, v1.4h,  v4.h[0]
        st1             {v29.8h}, [x1], #16
        smull           v27.4s, v3.4h,  v4.h[0]
        st1             {v29.16b}, [x1]
        sqdmulh         v21.4h, v1.4h,  v4.h[1]
        sqdmulh         v23.4h, v3.4h,  v4.h[1]
        shrn            v20.4h, v26.4s, #16
        shrn            v22.4h, v27.4s, #16
        add             v20.4h, v20.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h
        add             v16.4h, v0.4h,  v2.4h
        sub             v17.4h, v0.4h,  v2.4h

        add             v18.4h, v20.4h, v23.4h
        ld1             {v24.s}[0], [x0], x2
        sub             v19.4h, v21.4h, v22.4h
        ld1             {v25.s}[0], [x0], x2
        add             v0.4h,  v16.4h, v18.4h
        add             v1.4h,  v17.4h, v19.4h
        ld1             {v26.s}[0], [x0], x2
        sub             v3.4h,  v16.4h, v18.4h
        sub             v2.4h,  v17.4h, v19.4h
        ld1             {v27.s}[0], [x0], x2
        srshr           v0.4h,  v0.4h,  #3      // rounding shift
        srshr           v1.4h,  v1.4h,  #3
        srshr           v2.4h,  v2.4h,  #3
        srshr           v3.4h,  v3.4h,  #3

        sub             x0,  x0,  x2,  lsl #2   // rewind the four rows just read

        transpose_4x4H  v0, v1, v2, v3, v5, v6, v7, v16

        // Add the residual to the pixels and saturate back to 8 bits.
        uaddw           v0.8h,  v0.8h,  v24.8b
        uaddw           v1.8h,  v1.8h,  v25.8b
        uaddw           v2.8h,  v2.8h,  v26.8b
        uaddw           v3.8h,  v3.8h,  v27.8b
        sqxtun          v0.8b,  v0.8h
        sqxtun          v1.8b,  v1.8h
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h

        st1             {v0.s}[0], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v2.s}[0], [x0], x2
        st1             {v3.s}[0], [x0], x2

        ret
endfunc
|
|
|
|
// Adds four DC values to an 8-pixel-wide, 8-row area.
//   x0: pixels, 8 bytes per row, 8 rows, updated in place
//   x1: coefficients; one halfword is read and then zeroed every 32 bytes
//   x2: byte distance between pixel rows
// DC 0/1 cover the left/right halves of rows 0-3, DC 2/3 those of rows 4-7.
// Clobbers x3, v0-v7, v16-v27.
function ff_vp8_idct_dc_add4uv_neon, export=1
        movi            v0.4h,  #0
        mov             x3,  #32

        // Broadcast each DC and clear it in memory.
        ld1r            {v16.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v18.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ins             v16.d[1], v17.d[0]      // v16 = dc0 x4 | dc1 x4
        ins             v18.d[1], v19.d[0]      // v18 = dc2 x4 | dc3 x4

        mov             x3,  x0                 // x3 trails x0 as the store pointer
        srshr           v16.8h, v16.8h, #3      // dc = (dc + 4) >> 3
        ld1             {v0.8b}, [x0], x2
        srshr           v18.8h, v18.8h, #3
        ld1             {v1.8b}, [x0], x2
        uaddw           v20.8h, v16.8h, v0.8b
        ld1             {v2.8b}, [x0], x2
        uaddw           v0.8h,  v16.8h, v1.8b
        ld1             {v3.8b}, [x0], x2
        uaddw           v22.8h, v16.8h, v2.8b
        ld1             {v4.8b}, [x0], x2
        uaddw           v2.8h,  v16.8h, v3.8b
        ld1             {v5.8b}, [x0], x2
        uaddw           v24.8h, v18.8h, v4.8b
        ld1             {v6.8b}, [x0], x2
        uaddw           v4.8h,  v18.8h, v5.8b
        ld1             {v7.8b}, [x0], x2
        uaddw           v26.8h, v18.8h, v6.8b

        // Saturate to 8 bits and write the rows back.
        sqxtun          v20.8b, v20.8h
        uaddw           v6.8h,  v18.8h, v7.8b
        sqxtun          v21.8b, v0.8h
        sqxtun          v22.8b, v22.8h
        st1             {v20.8b}, [x3], x2
        sqxtun          v23.8b, v2.8h
        st1             {v21.8b}, [x3], x2
        sqxtun          v24.8b, v24.8h
        st1             {v22.8b}, [x3], x2
        sqxtun          v25.8b, v4.8h
        st1             {v23.8b}, [x3], x2
        sqxtun          v26.8b, v26.8h
        st1             {v24.8b}, [x3], x2
        sqxtun          v27.8b, v6.8h
        st1             {v25.8b}, [x3], x2
        st1             {v26.8b}, [x3], x2
        st1             {v27.8b}, [x3], x2

        ret
endfunc
|
|
|
|
// Adds four DC values to a 16-pixel-wide, 4-row area.
//   x0: pixels, 16 bytes per row, 4 rows, updated in place
//   x1: coefficients; one halfword is read and then zeroed every 32 bytes
//   x2: byte distance between pixel rows
// DC k applies to pixel columns 4k..4k+3 of every row.
// Clobbers x3, v0-v3, v16-v23.
function ff_vp8_idct_dc_add4y_neon, export=1
        movi            v0.16b, #0
        mov             x3,  #32

        // Broadcast each DC and clear it in memory.
        ld1r            {v16.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v16.2d, v16.2d, v17.2d  // v16 = dc0 x4 | dc1 x4
        ld1r            {v18.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v18.2d, v18.2d, v19.2d  // v18 = dc2 x4 | dc3 x4

        srshr           v16.8h, v16.8h, #3      // dc = (dc + 4) >> 3
        ld1             {v0.16b}, [x0], x2
        srshr           v18.8h, v18.8h, #3
        ld1             {v1.16b}, [x0], x2

        // Low eight pixels of a row take v16, high eight take v18.
        uaddw           v20.8h, v16.8h, v0.8b
        ld1             {v2.16b}, [x0], x2
        uaddw2          v0.8h,  v18.8h, v0.16b
        ld1             {v3.16b}, [x0], x2
        uaddw           v21.8h, v16.8h, v1.8b
        uaddw2          v1.8h,  v18.8h, v1.16b
        uaddw           v22.8h, v16.8h, v2.8b
        uaddw2          v2.8h,  v18.8h, v2.16b
        uaddw           v23.8h, v16.8h, v3.8b
        uaddw2          v3.8h,  v18.8h, v3.16b

        sub             x0,  x0,  x2,  lsl #2   // rewind to the first row

        // Saturate to 8 bits and write the rows back.
        sqxtun          v20.8b,  v20.8h
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b,  v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b,  v22.8h
        st1             {v20.16b}, [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b}, [x0], x2
        sqxtun          v23.8b,  v23.8h
        st1             {v22.16b}, [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b}, [x0], x2

        ret
endfunc
|
|
|
|
// Adds a single DC value to a 4x4 pixel block.
//   x0: 4x4 pixel block, updated in place
//   x1: points at the DC halfword, which is zeroed after being read
//   x2: byte distance between pixel rows
// Clobbers w3, v0-v4.
function ff_vp8_idct_dc_add_neon, export=1
        mov             w3,  #0
        ld1r            {v2.8h}, [x1]
        strh            w3,  [x1]               // clear the coefficient
        srshr           v2.8h,  v2.8h,  #3      // dc = (dc + 4) >> 3

        // Rows 0/1 share v0, rows 2/3 share v1.
        ld1             {v0.s}[0], [x0], x2
        ld1             {v0.s}[1], [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b
        ld1             {v1.s}[0], [x0], x2
        ld1             {v1.s}[1], [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b

        sqxtun          v0.8b,  v3.8h
        sqxtun          v1.8b,  v4.8h

        sub             x0,  x0,  x2,  lsl #2   // rewind to the first row
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v1.s}[1], [x0], x2
        ret
endfunc
|
|
|
|
// Shared loop filter core, operating on 16 pixel positions at once.
//
// Register layout on entry:
//   P3..Q3     -> v0..v7   (simple=1 only reads P1..Q1, i.e. v2..v5)
//   flim_E     -> v22
//   flim_I     -> v23      (not read when simple=1)
//   hev_thresh -> general-purpose register named by the \hev_thresh
//                 argument (not read when simple=1)
//
// On exit the filtered pixels are back in their input registers:
//   simple=1: v3, v4 changed (v2, v5 round-trip unchanged)
//   inner=1:  v2..v5
//   default:  v1..v6
// Clobbers v16-v23.
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
    .if \simple
        uabd            v17.16b, v3.16b,  v4.16b    // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b    // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b   // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1        // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b, v18.16b   // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        movi            v21.16b, #0x80
        cmhs            v16.16b, v22.16b, v19.16b   // ... <= flim_E
    .else
        // Build the hev and normal_limit masks.
        uabd            v20.16b, v2.16b,  v3.16b    // abs(P1-P0)
        uabd            v21.16b, v5.16b,  v4.16b    // abs(Q1-Q0)
        uabd            v18.16b, v0.16b,  v1.16b    // abs(P3-P2)
        uabd            v19.16b, v1.16b,  v2.16b    // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b   // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b   // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b   // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b   // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b,  v6.16b    // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b,  v5.16b    // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b   // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b   // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b,  v4.16b    // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b    // abs(P1-Q1); flim_I no longer needed
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b   // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1        // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh        // hev_thresh in every lane
        uqadd           v19.16b, v17.16b, v18.16b   // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b   // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b   // ... <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b   // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        movi            v21.16b, #0x80
        orr             v17.16b, v20.16b, v22.16b
    .endif

        // Masks now available:
        //   v16: normal_limit
        //   v17: hev (only meaningful when simple=0)

        // Flip the top bit so the pixels can be handled as signed bytes.
        eor             v3.16b,  v3.16b,  v21.16b   // PS0 = P0 ^ 0x80
        eor             v4.16b,  v4.16b,  v21.16b   // QS0 = Q0 ^ 0x80

        movi            v20.8h,  #3
        ssubl           v18.8h,  v4.8b,   v3.8b     // QS0 - PS0, widened to 16 bit
        ssubl2          v19.8h,  v4.16b,  v3.16b
        eor             v2.16b,  v2.16b,  v21.16b   // PS1 = P1 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b   // QS1 = Q1 ^ 0x80
        mul             v18.8h,  v18.8h,  v20.8h    // w = 3 * (QS0 - PS0)
        mul             v19.8h,  v19.8h,  v20.8h

        sqsub           v20.16b, v2.16b,  v5.16b    // clamp(PS1-QS1)
        movi            v22.16b, #4
        movi            v23.16b, #3
    .if \inner
        and             v20.16b, v20.16b, v17.16b   // keep clamp(PS1-QS1) only where hev
    .endif
        saddw           v18.8h,  v18.8h,  v20.8b    // w += clamp(PS1-QS1)
        saddw2          v19.8h,  v19.8h,  v20.16b
        sqxtn           v18.8b,  v18.8h             // saturate w back to 8 bit, into v18
        sqxtn2          v18.16b, v19.8h
    .if !\inner && !\simple
        eor             v1.16b,  v1.16b,  v21.16b   // PS2 = P2 ^ 0x80
        eor             v6.16b,  v6.16b,  v21.16b   // QS2 = Q2 ^ 0x80
    .endif
        and             v18.16b, v18.16b, v16.16b   // w &= normal_limit

        // Register state at this point:
        //   v0         -> P3 (must be preserved)
        //   v1-v6      -> PS2-QS2 (v1/v6 only converted in the default variant)
        //   v7         -> Q3 (must be preserved)
        //   v17        -> hev
        //   v18        -> w
        //   v21        -> #0x80
        //   v22        -> #4
        //   v23        -> #3
        //   v16, v19, v20 -> free for reuse
        //
        // filter_common, is4tap==1:
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);

    .if \simple
        sqadd           v19.16b, v18.16b, v22.16b   // c1 = clamp(w+4)
        sqadd           v20.16b, v18.16b, v23.16b   // c2 = clamp(w+3)
        sshr            v19.16b, v19.16b, #3        // c1 >>= 3
        sshr            v20.16b, v20.16b, #3        // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b   // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b   // PS0 = clamp(PS0+c2)
        eor             v4.16b,  v4.16b,  v21.16b   // Q0 = QS0 ^ 0x80
        eor             v3.16b,  v3.16b,  v21.16b   // P0 = PS0 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b   // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b   // P1 = PS1 ^ 0x80
    .elseif \inner
        // The !is4tap case of filter_common, only used for inner blocks:
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        sqadd           v19.16b, v18.16b, v22.16b   // c1 = clamp(w+4)
        sqadd           v20.16b, v18.16b, v23.16b   // c2 = clamp(w+3)
        sshr            v19.16b, v19.16b, #3        // c1 >>= 3
        sshr            v20.16b, v20.16b, #3        // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b   // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b   // PS0 = clamp(PS0+c2)
        bic             v19.16b, v19.16b, v17.16b   // c1 & ~hev
        eor             v4.16b,  v4.16b,  v21.16b   // Q0 = QS0 ^ 0x80
        srshr           v19.16b, v19.16b, #1        // c3 = (c1 + 1) >> 1
        eor             v3.16b,  v3.16b,  v21.16b   // P0 = PS0 ^ 0x80
        sqsub           v5.16b,  v5.16b,  v19.16b   // QS1 = clamp(QS1-c3)
        sqadd           v2.16b,  v2.16b,  v19.16b   // PS1 = clamp(PS1+c3)
        eor             v5.16b,  v5.16b,  v21.16b   // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b   // P1 = PS1 ^ 0x80
    .else
        and             v20.16b, v18.16b, v17.16b   // w & hev
        sqadd           v19.16b, v20.16b, v22.16b   // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v20.16b, v23.16b   // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3        // c1 >>= 3
        sshr            v20.16b, v20.16b, #3        // c2 >>= 3
        bic             v18.16b, v18.16b, v17.16b   // w &= ~hev
        sqsub           v4.16b,  v4.16b,  v19.16b   // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b   // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        movi            v17.8h,  #63
        sshll           v22.8h,  v18.8b,  #3        // 8*w
        sshll2          v23.8h,  v18.16b, #3
        saddw           v22.8h,  v22.8h,  v18.8b    // 9*w
        saddw2          v23.8h,  v23.8h,  v18.16b
        add             v16.8h,  v17.8h,  v22.8h
        add             v17.8h,  v17.8h,  v23.8h    //  9*w + 63
        add             v19.8h,  v16.8h,  v22.8h
        add             v20.8h,  v17.8h,  v23.8h    // 18*w + 63
        add             v22.8h,  v19.8h,  v22.8h
        add             v23.8h,  v20.8h,  v23.8h    // 27*w + 63
        sqshrn          v16.8b,  v16.8h,  #7
        sqshrn2         v16.16b, v17.8h,  #7        // clamp(( 9*w + 63)>>7)
        sqshrn          v19.8b,  v19.8h,  #7
        sqshrn2         v19.16b, v20.8h,  #7        // clamp((18*w + 63)>>7)
        sqshrn          v22.8b,  v22.8h,  #7
        sqshrn2         v22.16b, v23.8h,  #7        // clamp((27*w + 63)>>7)
        sqadd           v1.16b,  v1.16b,  v16.16b   // PS2 = clamp(PS2+a)
        sqsub           v6.16b,  v6.16b,  v16.16b   // QS2 = clamp(QS2-a)
        sqadd           v2.16b,  v2.16b,  v19.16b   // PS1 = clamp(PS1+a)
        sqsub           v5.16b,  v5.16b,  v19.16b   // QS1 = clamp(QS1-a)
        sqadd           v3.16b,  v3.16b,  v22.16b   // PS0 = clamp(PS0+a)
        sqsub           v4.16b,  v4.16b,  v22.16b   // QS0 = clamp(QS0-a)
        eor             v3.16b,  v3.16b,  v21.16b   // P0 = PS0 ^ 0x80
        eor             v4.16b,  v4.16b,  v21.16b   // Q0 = QS0 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b   // P1 = PS1 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b   // Q1 = QS1 ^ 0x80
        eor             v1.16b,  v1.16b,  v21.16b   // P2 = PS2 ^ 0x80
        eor             v6.16b,  v6.16b,  v21.16b   // Q2 = QS2 ^ 0x80
    .endif
.endm
|
|
|
|
// Emits ff_vp8_v_loop_filter16<name>_neon: filters across a horizontal edge,
// 16 pixels wide.
//   x0: first row below the edge (Q0)
//   x1: byte distance between rows
//   w2: flim_E
//   w3: flim_I      (not read when simple=1)
//   w4: hev_thresh  (not read when simple=1)
// Normal/inner variants read P3..Q3 and write back P2..Q2; the simple
// variant reads and writes P1..Q1 only.
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        // Step back to the first row needed: 4 rows normally, 2 when simple.
        sub             x0,  x0,  x1,  lsl #1+!\simple

        dup             v22.16b, w2                 // flim_E
    .if !\simple
        dup             v23.16b, w3                 // flim_I
    .endif

        // Fetch the rows straddling the edge.
    .if !\simple
        ld1             {v0.16b}, [x0], x1          // P3
        ld1             {v1.16b}, [x0], x1          // P2
    .endif
        ld1             {v2.16b}, [x0], x1          // P1
        ld1             {v3.16b}, [x0], x1          // P0
        ld1             {v4.16b}, [x0], x1          // Q0
        ld1             {v5.16b}, [x0], x1          // Q1
    .if !\simple
        ld1             {v6.16b}, [x0], x1          // Q2
        ld1             {v7.16b}, [x0]              // Q3
    .endif

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // Rewind to the first row that gets written: 6 rows back to P2
        // normally, 4 rows back to P1 when simple.
        sub             x0,  x0,  x1,  lsl #2
    .if !\simple
        sub             x0,  x0,  x1,  lsl #1

        st1             {v1.16b}, [x0], x1          // P2
    .endif
        st1             {v2.16b}, [x0], x1          // P1
        st1             {v3.16b}, [x0], x1          // P0
        st1             {v4.16b}, [x0], x1          // Q0
        st1             {v5.16b}, [x0], x1          // Q1
    .if !\simple
        st1             {v6.16b}, [x0]              // Q2
    .endif

        ret
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
|
|
|
|
// Emits ff_vp8_v_loop_filter8uv<name>_neon: filters across a horizontal edge
// in two 8-pixel-wide planes at once. The plane at x0 occupies the low half
// of each vector, the plane at x1 the high half.
//   x0, x1: first row below the edge in each plane (Q0)
//   x2:     byte distance between rows (shared by both planes)
//   w3:     flim_E
//   w4:     flim_I
//   w5:     hev_thresh
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2       // back up four rows to P3
        sub             x1,  x1,  x2,  lsl #2

        // Load P3..Q2 (v0..v6) with post-increment, then Q3 (v7) without.
.irp i, 0, 1, 2, 3, 4, 5, 6
        ld1             {v\i\().d}[0], [x0], x2
        ld1             {v\i\().d}[1], [x1], x2
.endr
        ld1             {v7.d}[0], [x0]             // Q3
        ld1             {v7.d}[1], [x1]             // Q3

        dup             v22.16b, w3                 // flim_E
        dup             v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // Rewind six rows, from Q3 to P2.
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        sub             x0,  x0,  x2,  lsl #1
        sub             x1,  x1,  x2,  lsl #1

        // Store P2..Q1 (v1..v5) with post-increment, then Q2 (v6) without.
.irp i, 1, 2, 3, 4, 5
        st1             {v\i\().d}[0], [x0], x2
        st1             {v\i\().d}[1], [x1], x2
.endr
        st1             {v6.d}[0], [x0]             // Q2
        st1             {v6.d}[1], [x1]             // Q2

        ret
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
|
|
|
|
// Emits ff_vp8_h_loop_filter16<name>_neon: filters across a vertical edge,
// 16 rows tall. Eight bytes per row (four on each side of the edge) are
// loaded, transposed so that v0..v7 hold P3..Q3, filtered, transposed back
// and stored.
//   x0: first pixel right of the edge in the top row
//   x1: byte distance between rows
//   w2: flim_E
//   w3: flim_I      (not read when simple=1)
//   w4: hev_thresh  (not read when simple=1)
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0,  x0,  #4                // four pixels left of the edge

        // Rows 0-7 fill the low halves of v0..v7, rows 8-15 the high halves.
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        ld1             {v\i\().d}[0], [x0], x1
.endr
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        ld1             {v\i\().d}[1], [x0], x1
.endr

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        dup             v22.16b, w2                 // flim_E
    .if !\simple
        dup             v23.16b, w3                 // flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        sub             x0,  x0,  x1,  lsl #4       // rewind the 16 rows read above

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        // Write the rows back in load order; the last store leaves x0 alone.
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        st1             {v\i\().d}[0], [x0], x1
.endr
.irp i, 0, 1, 2, 3, 4, 5, 6
        st1             {v\i\().d}[1], [x0], x1
.endr
        st1             {v7.d}[1], [x0]

        ret
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
|
|
|
|
// Emits ff_vp8_h_loop_filter8uv<name>_neon: filters across a vertical edge
// in two planes at once, 8 rows each. Rows of the plane at x0 occupy the low
// half of v0..v7, rows of the plane at x1 the high half.
//   x0, x1: first pixel right of the edge in the top row of each plane
//   x2:     byte distance between rows (shared by both planes)
//   w3:     flim_E
//   w4:     flim_I
//   w5:     hev_thresh
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  #4                // four pixels left of the edge
        sub             x1,  x1,  #4

        // Load eight bytes per row from each plane.
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        ld1             {v\i\().d}[0], [x0], x2     // row from the x0 plane
        ld1             {v\i\().d}[1], [x1], x2     // row from the x1 plane
.endr

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        dup             v22.16b, w3                 // flim_E
        dup             v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        sub             x0,  x0,  x2,  lsl #3       // rewind 8 rows in the x0 plane
        sub             x1,  x1,  x2,  lsl #3       // rewind 8 rows in the x1 plane

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        // Store the rows back; the final pair leaves the pointers alone.
.irp i, 0, 1, 2, 3, 4, 5, 6
        st1             {v\i\().d}[0], [x0], x2     // row to the x0 plane
        st1             {v\i\().d}[1], [x1], x2     // row to the x1 plane
.endr
        st1             {v7.d}[0], [x0]
        st1             {v7.d}[1], [x1]

        ret

endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
|
|
|
|
|
|
// Copies 16-byte-wide rows from source to destination, four per iteration.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count; the loop runs until the count drops to zero or below
function ff_put_vp8_pixels16_neon, export=1
1:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x2], x3
        ld1             {v2.16b}, [x2], x3
        ld1             {v3.16b}, [x2], x3
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x0], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        subs            w4,  w4,  #4                // four rows done
        b.gt            1b
        ret
endfunc
|
|
|
|
// Copies 8-byte-wide rows from source to destination, four per iteration.
// Two rows are packed into each 128-bit register.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count; the loop runs until the count drops to zero or below
function ff_put_vp8_pixels8_neon, export=1
1:
        ld1             {v0.8b},   [x2], x3
        ld1             {v0.d}[1], [x2], x3
        ld1             {v1.8b},   [x2], x3
        ld1             {v1.d}[1], [x2], x3
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            w4,  w4,  #4                // four rows done
        b.gt            1b
        ret
endfunc
|
|
|
|
/* 4/6-tap 8th-pel MC */
|
|
|
|
// Horizontal 6-tap filter producing 8 output pixels.
//   \d:       destination register (result in its .8b half)
//   \s0, \s1: 16 consecutive source bytes, 8 in each register
//   v0.h[0..5]: tap magnitudes; taps 1 and 4 are subtracted, the rest added
// Output i = sat_u8((sum over k of +/-tap[k] * src[i+k] + 64) >> 7).
// Clobbers v18, v19, v21-v26.
.macro  vp8_epel8_h6    d,   s0,   s1
        ext             v22.8b, \s0\().8b, \s1\().8b, #1
        uxtl            v18.8h, \s0\().8b               // src[i+0]
        ext             v23.8b, \s0\().8b, \s1\().8b, #2
        uxtl            v19.8h, v22.8b                  // src[i+1]
        ext             v24.8b, \s0\().8b, \s1\().8b, #3
        uxtl            v21.8h, v23.8b                  // src[i+2]
        ext             v25.8b, \s0\().8b, \s1\().8b, #4
        uxtl            v22.8h, v24.8b                  // src[i+3]
        ext             v26.8b, \s0\().8b, \s1\().8b, #5
        uxtl            v25.8h, v25.8b                  // src[i+4]
        mul             v21.8h, v21.8h, v0.h[2]
        uxtl            v26.8h, v26.8b                  // src[i+5]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v21.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        mla             v21.8h, v18.8h, v0.h[0]
        mla             v22.8h, v26.8h, v0.h[5]
        sqadd           v22.8h, v21.8h, v22.8h          // combine both partial sums
        sqrshrun        \d\().8b, v22.8h, #7
.endm
|
|
|
|
// Horizontal 6-tap filter producing 16 output pixels.
//   \d0:      destination register (.16b)
//   \v0, \v1: 32 consecutive source bytes, 16 in each register
//   v0.h[0..5]: tap magnitudes; taps 1 and 4 are subtracted, the rest added
// Clobbers v1, v2, v3, v16-v23 by literal name, independent of the
// arguments; \v0 is only read before v1 gets overwritten.
.macro  vp8_epel16_h6   d0,  v0,  v1
        ext             v22.16b, \v0\().16b, \v1\().16b, #3
        ext             v23.16b, \v0\().16b, \v1\().16b, #4
        uxtl            v19.8h,  v22.8b                 // src[i+3], low half
        uxtl2           v22.8h,  v22.16b                // src[i+3], high half
        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
        uxtl            v20.8h,  v23.8b                 // src[i+4]
        uxtl2           v23.8h,  v23.16b
        ext             v16.16b, \v0\().16b, \v1\().16b, #1
        uxtl            v18.8h,  v3.8b                  // src[i+2]
        uxtl2           v3.8h,   v3.16b
        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
        uxtl            v21.8h,  v2.8b                  // src[i+5]
        uxtl2           v2.8h,   v2.16b
        uxtl            v17.8h,  v16.8b                 // src[i+1]
        uxtl2           v16.8h,  v16.16b
        mul             v19.8h,  v19.8h,  v0.h[3]
        mul             v18.8h,  v18.8h,  v0.h[2]
        mul             v3.8h,   v3.8h,   v0.h[2]
        mul             v22.8h,  v22.8h,  v0.h[3]
        mls             v19.8h,  v20.8h,  v0.h[4]
        uxtl            v20.8h,  \v0\().8b              // src[i+0]
        uxtl2           v1.8h,   \v0\().16b
        mls             v18.8h,  v17.8h,  v0.h[1]
        mls             v3.8h,   v16.8h,  v0.h[1]
        mls             v22.8h,  v23.8h,  v0.h[4]
        mla             v18.8h,  v20.8h,  v0.h[0]
        mla             v19.8h,  v21.8h,  v0.h[5]
        mla             v3.8h,   v1.8h,   v0.h[0]
        mla             v22.8h,  v2.8h,   v0.h[5]
        sqadd           v19.8h,  v18.8h,  v19.8h        // low 8 outputs
        sqadd           v22.8h,  v3.8h,   v22.8h        // high 8 outputs
        sqrshrun        \d0\().8b,   v19.8h, #7
        sqrshrun2       \d0\().16b,  v22.8h, #7
.endm
|
|
|
|
// Vertical 6-tap filter producing two output rows of 8 pixels.
//   \d0: first output row, filtered from \s0..\s5
//   \d1: second output row, filtered from \s1..\s6
//   \s0..\s6: seven consecutive source rows, 8 bytes each (.8b)
//   v0.h[0..5]: tap magnitudes; taps 1 and 4 are subtracted, the rest added
// All seven source registers are widened in place and therefore destroyed;
// v31 is used as scratch.
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        // Widen every row to 16 bit.
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b

        // Four partial sums of three taps each:
        //   \s0 and v31 -> row 0,   \s3 and \s6 -> row 1
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h,    \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]

        mls             \s0\().8h, \s1\().8h, v0.h[1]
        mls             v31.8h,    \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]

        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h,    \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]

        sqadd           v31.8h,    \s0\().8h, v31.8h
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
        sqrshrun        \d0\().8b, v31.8h,    #7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm
|
|
|
|
// Horizontal 4-tap filter producing 8 output pixels.
//   \d:       destination register (result in its .8b half)
//   \v0, \v1: 16 consecutive source bytes, 8 in each register
//   v0.h[1..4]: tap magnitudes; taps 1 and 4 are subtracted, 2 and 3 added
// Clobbers v19, v20, v22, v23, v25.
.macro  vp8_epel8_h4    d,   v0,   v1
        ext             v22.8b, \v0\().8b, \v1\().8b, #1
        uxtl            v19.8h, \v0\().8b               // src[i+0] -> tap 1
        ext             v23.8b, \v0\().8b, \v1\().8b, #2
        uxtl            v20.8h, v22.8b                  // src[i+1] -> tap 2
        ext             v25.8b, \v0\().8b, \v1\().8b, #3
        uxtl            v22.8h, v23.8b                  // src[i+2] -> tap 3
        uxtl            v25.8h, v25.8b                  // src[i+3] -> tap 4
        mul             v20.8h, v20.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v20.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        sqadd           v22.8h, v20.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm
|
|
|
|
// Vertical 4-tap filter producing two output rows of 8 pixels, packed into
// one 128-bit register.
//   \d0: low half = row filtered from \s0..\s3,
//        high half = row filtered from \s1..\s4
//   \s0..\s4: five consecutive source rows, 8 bytes each (.8b)
//   v0.h[1..4]: tap magnitudes; taps 1 and 4 are subtracted, 2 and 3 added
// The source registers are widened in place; v21-v23 are scratch.
.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s4\().8h, \s4\().8b

        // v21/v23 build the first row, \s2/v22 the second.
        mul             v21.8h,    \s1\().8h, v0.h[2]
        mul             v23.8h,    \s2\().8h, v0.h[3]
        mul             \s2\().8h, \s2\().8h, v0.h[2]
        mul             v22.8h,    \s3\().8h, v0.h[3]
        mls             v21.8h,    \s0\().8h, v0.h[1]
        mls             v23.8h,    \s3\().8h, v0.h[4]
        mls             \s2\().8h, \s1\().8h, v0.h[1]
        mls             v22.8h,    \s4\().8h, v0.h[4]
        sqadd           v21.8h,    v21.8h,    v23.8h
        sqadd           \s2\().8h, \s2\().8h, v22.8h
        sqrshrun        \d0\().8b,  v21.8h,    #7
        sqrshrun2       \d0\().16b, \s2\().8h, #7
.endm
|
|
|
|
|
|
// Sub-pixel filter taps: seven rows of eight halfwords (six taps plus two
// halfwords of padding, so every row is 16 bytes). Only magnitudes are
// stored; the filter macros subtract taps 1 and 4 and add the others.
// Users address the table with a -16 byte bias, so index 1 selects the
// first row.
//
// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
// arithmetic can be used to apply filters
const   subpel_filters, align=4
        .short          0,   6, 123,  12,   1,   0,   0,   0
        .short          2,  11, 108,  36,   8,   1,   0,   0
        .short          0,   9,  93,  50,   6,   0,   0,   0
        .short          3,  16,  77,  77,  16,   3,   0,   0
        .short          0,   6,  50,  93,   9,   0,   0,   0
        .short          1,   8,  36, 108,  11,   2,   0,   0
        .short          0,   1,  12, 123,   6,   0,   0,   0
endconst
|
|
|
|
// Vertical 6-tap interpolation, 16 pixels wide, two rows per iteration.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count (the loop stops when it reaches exactly zero)
//   w6: vertical filter index into subpel_filters (index 1 = first row)
function ff_put_vp8_epel16_v6_neon, export=1
        movrel          x17, subpel_filters, -16
        sub             x2,  x2,  x3,  lsl #1       // start two rows above

        sxtw            x4,  w4
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4       // 16 bytes per filter row
        ld1             {v0.8h}, [x6]
1:
        // Seven source rows; each row is split into a left/right register.
        ld1             {v1.1d  - v2.1d},  [x2], x3
        ld1             {v3.1d  - v4.1d},  [x2], x3
        ld1             {v16.1d - v17.1d}, [x2], x3
        ld1             {v18.1d - v19.1d}, [x2], x3
        ld1             {v20.1d - v21.1d}, [x2], x3
        ld1             {v22.1d - v23.1d}, [x2], x3
        ld1             {v24.1d - v25.1d}, [x2]
        sub             x2,  x2,  x3,  lsl #2       // net advance: two rows

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24   // left halves
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25   // right halves

        st1             {v1.1d - v2.1d}, [x0], x1
        st1             {v3.1d - v4.1d}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            1b

        ret
endfunc
|
|
|
|
// Horizontal 6-tap interpolation, 16 pixels wide, one row per iteration.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count (the loop stops when it reaches exactly zero)
//   w5: horizontal filter index into subpel_filters (index 1 = first row)
function ff_put_vp8_epel16_h6_neon, export=1
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        sub             x2,  x2,  #2                // start two pixels to the left

        add             x5,  x17, x5,  lsl #4       // 16 bytes per filter row
        ld1             {v0.8h}, [x5]
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x0], x1

        subs            w4,  w4,  #1
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
// Two-pass 6-tap interpolation (horizontal, then vertical), 16 pixels wide.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count
//   w5: horizontal filter index, w6: vertical filter index
//       (both into subpel_filters, index 1 = first row)
// The horizontal pass writes w4 + 5 rows of 16 bytes into a stack buffer;
// 336 bytes hold 21 rows, plus 16 bytes of slack for aligning the buffer.
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1       // two rows up ...
        sub             x2,  x2,  #2                // ... and two pixels left

        // ---- first pass (horizontal) ----
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x16, x17, x5,  lsl #4       // horizontal filter row
        sub             sp,  sp,  #336+16
        ld1             {v0.8h}, [x16]
        add             x7,  sp,  #15
        sxtw            x4,  w4
        add             x16, x4,  #5                // rows to filter horizontally
        bic             x7,  x7,  #15               // x7 = 16-byte aligned buffer
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        b.ne            1b

        // ---- second pass (vertical) ----
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4       // vertical filter row
        add             x7,  sp,  #15
        ld1             {v0.8h}, [x6]
        bic             x7,  x7,  #15               // back to the buffer start
2:
        // Seven buffered rows, each as a left/right 8-byte pair.
        ld1             {v1.8b  - v4.8b},  [x7], #32
        ld1             {v16.8b - v19.8b}, [x7], #32
        ld1             {v20.8b - v23.8b}, [x7], #32
        ld1             {v24.8b - v25.8b}, [x7]
        sub             x7,  x7,  #64               // net advance: two rows

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24   // left halves
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25   // right halves
        trn1            v1.2d,  v1.2d,  v2.2d       // join halves of output row 0
        trn1            v3.2d,  v3.2d,  v4.2d       // join halves of output row 1

        st1             {v1.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #336+16
        ret
endfunc
|
|
|
|
// Vertical 6-tap interpolation, 8 pixels wide, two rows per iteration.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count (the loop stops when it reaches exactly zero)
//   w6: vertical filter index into subpel_filters (index 1 = first row)
function ff_put_vp8_epel8_v6_neon, export=1
        movrel          x7,  subpel_filters, -16
        sub             x2,  x2,  x3,  lsl #1       // start two rows above

        add             x6,  x7,  w6,  uxtw #4      // 16 bytes per filter row
        ld1             {v0.8h}, [x6]
1:
        // Seven source rows feed two output rows.
        ld1             {v2.8b},  [x2], x3
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2], x3
        ld1             {v7.8b},  [x2], x3
        ld1             {v28.8b}, [x2]

        sub             x2,  x2,  x3,  lsl #2       // net advance: two rows

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc
|
|
|
|
// Horizontal 6-tap interpolation, 8 pixels wide, one row per iteration.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count (the loop stops when it reaches exactly zero)
//   w5: horizontal filter index into subpel_filters (index 1 = first row)
function ff_put_vp8_epel8_h6_neon, export=1
        movrel          x7,  subpel_filters, -16
        sub             x2,  x2,  #2                // start two pixels to the left

        add             x5,  x7,  w5,  uxtw #4      // 16 bytes per filter row
        ld1             {v0.8h}, [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3

        vp8_epel8_h6    v2,  v2,  v3

        st1             {v2.8b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
|
|
|
|
// Two-pass interpolation, 8 pixels wide: 6-tap horizontal, 6-tap vertical.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count
//   w5: horizontal filter index, w6: vertical filter index
//       (both into subpel_filters, index 1 = first row)
// The horizontal pass writes w4 + 5 rows of 8 bytes into a stack buffer.
// 168 bytes hold 21 rows; the reservation is rounded up to 176 so that,
// together with the 16 bytes of alignment slack, SP moves by a multiple of
// 16 and stays 16-byte aligned for the whole function.
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1       // two rows up ...
        sub             x2,  x2,  #2                // ... and two pixels left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4       // x
        sub             sp,  sp,  #176+16
        ld1             {v0.8h}, [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5                // h
        bic             x7,  x7,  #15               // x7 = 16-byte aligned buffer
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4       // y
        add             x7,  sp,  #15
        ld1             {v0.8h}, [x6]
        bic             x7,  x7,  #15               // back to the buffer start
2:
        // Seven buffered rows feed two output rows.
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16               // net advance: two rows

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
|
|
|
|
// Vertical 4-tap interpolation, 8 pixels wide, two rows per iteration.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count (the loop stops when it reaches exactly zero)
//   w6: vertical filter index into subpel_filters (index 1 = first row)
function ff_put_vp8_epel8_v4_neon, export=1
        movrel          x7,  subpel_filters, -16
        sub             x2,  x2,  x3                // start one row above

        add             x6,  x7,  w6,  uxtw #4      // 16 bytes per filter row
        ld1             {v0.8h}, [x6]
1:
        // Five source rows feed two output rows.
        ld1             {v2.8b}, [x2], x3
        ld1             {v3.8b}, [x2], x3
        ld1             {v4.8b}, [x2], x3
        ld1             {v5.8b}, [x2], x3
        ld1             {v6.8b}, [x2]
        sub             x2,  x2,  x3,  lsl #1       // net advance: two rows

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc
|
|
|
|
// Horizontal 4-tap interpolation, 8 pixels wide, one row per iteration.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count (the loop stops when it reaches exactly zero)
//   w5: horizontal filter index into subpel_filters (index 1 = first row)
function ff_put_vp8_epel8_h4_neon, export=1
        movrel          x7,  subpel_filters, -16
        sub             x2,  x2,  #1                // start one pixel to the left

        add             x5,  x7,  w5,  uxtw #4      // 16 bytes per filter row
        ld1             {v0.8h}, [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3

        vp8_epel8_h4    v2,  v2,  v3

        st1             {v2.8b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
|
|
|
|
// Two-pass interpolation, 8 pixels wide: 4-tap horizontal, 6-tap vertical.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count
//   w5: horizontal filter index, w6: vertical filter index
//       (both into subpel_filters, index 1 = first row)
// The horizontal pass writes w4 + 5 rows of 8 bytes into a stack buffer.
// 168 bytes hold 21 rows; the reservation is rounded up to 176 so that,
// together with the 16 bytes of alignment slack, SP moves by a multiple of
// 16 and stays 16-byte aligned for the whole function.
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1       // two rows up ...
        sub             x2,  x2,  #1                // ... and one pixel left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4       // x
        sub             sp,  sp,  #176+16
        ld1             {v0.8h}, [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5                // h
        bic             x7,  x7,  #15               // x7 = 16-byte aligned buffer
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4       // y
        add             x7,  sp,  #15
        ld1             {v0.8h}, [x6]
        bic             x7,  x7,  #15               // back to the buffer start
2:
        // Seven buffered rows feed two output rows.
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16               // net advance: two rows

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
|
|
|
|
// Two-pass interpolation, 8 pixels wide: 4-tap horizontal, 4-tap vertical.
//   x0: destination        x1: destination row stride in bytes
//   x2: source             x3: source row stride in bytes
//   w4: row count
//   w5: horizontal filter index, w6: vertical filter index
//       (both into subpel_filters, index 1 = first row)
// The horizontal pass writes w4 + 3 rows of 8 bytes into a stack buffer.
// The reservation is 176 bytes plus 16 bytes of alignment slack, so SP
// moves by a multiple of 16 and stays 16-byte aligned for the whole
// function.
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2,  x2,  x3                // one row up ...
        sub             x2,  x2,  #1                // ... and one pixel left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4       // x
        sub             sp,  sp,  #176+16
        ld1             {v0.8h}, [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #3                // h
        bic             x7,  x7,  #15               // x7 = 16-byte aligned buffer
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4       // y
        add             x7,  sp,  #15
        ld1             {v0.8h}, [x6]
        bic             x7,  x7,  #15               // back to the buffer start
2:
        // Five buffered rows feed two output rows; only the first load
        // advances x7, so the net step is two rows.
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
|
|
|
|
// ff_put_vp8_epel8_h6v4_neon: two-pass filter for 8-byte-wide rows.
// Pass 1 runs vp8_epel8_h6 over h + 3 source rows into a stack scratch
// buffer; pass 2 runs vp8_epel8_v4_y2 over that buffer and stores two rows
// per iteration.
//
// Registers as used below (presumably the usual VP8 MC arguments dst,
// dststride, src, srcstride, h, mx, my -- confirm against the C prototype):
//   x0  store pointer, post-incremented by x1 for every output row
//   x2  load pointer, post-incremented by x3 for every input row
//   w4  h, number of output rows; consumed two at a time, so it has to be
//       even and non-zero
//   w5  selects the 16-byte subpel_filters entry loaded for pass 1 ("x")
//   w6  selects the 16-byte subpel_filters entry loaded for pass 2 ("y")
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2,  x2,  x3                    // start one row above ...
        sub             x2,  x2,  #2                    // ... and two bytes to the left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4           // x
        // Pass 1 writes (h + 3) * 8 bytes of scratch. The frame size is a
        // multiple of 16 so that sp stays 16-byte aligned for the whole
        // function; 176 bytes hold 22 rows.
        // NOTE(review): assumes h <= 16 and a 16-byte aligned sp on entry.
        sub             sp,  sp,  #176
        ld1             {v0.8h},  [x5]
        mov             x7,  sp                         // scratch write pointer
        add             x16, x4,  #3                    // h
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4           // y
        mov             x7,  sp                         // back to the first scratch row
        ld1             {v0.8h},  [x6]
2:
        ld1             {v1.8b - v2.8b}, [x7], #16      // scratch rows n, n+1
        ld1             {v3.8b - v5.8b}, [x7]           // rows n+2 .. n+4, no advance

        vp8_epel8_v4_y2 v1,  v1,  v2,  v3,  v4,  v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #176
        ret
endfunc
|
|
|
|
// ff_put_vp8_epel4_v6_neon: vertical-only filter for 4-byte-wide rows,
// producing four output rows per loop iteration via vp8_epel8_v6_y2.
//
// Registers as used below (presumably dst, dststride, src, srcstride, h and
// my of the usual VP8 MC prototype -- confirm against the C declaration):
//   x0  store pointer, post-incremented by x1 after each 4-byte row
//   x2  load pointer, moved in steps of x3
//   w4  row count, decremented by 4 per iteration (must be a non-zero
//       multiple of 4 for the b.ne loop to terminate)
//   w6  selects the 16-byte subpel_filters entry that is loaded into v0
function ff_put_vp8_epel4_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1           // start two rows above

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},    [x6]
1:
        // 32-bit lane 0 of v2..v7, v28 (ld1r also fills lane 1): rows n .. n+6
        ld1r            {v2.2s},    [x2], x3
        ld1r            {v3.2s},    [x2], x3
        ld1r            {v4.2s},    [x2], x3
        ld1r            {v5.2s},    [x2], x3
        ld1r            {v6.2s},    [x2], x3
        ld1r            {v7.2s},    [x2], x3
        ld1r            {v28.2s},   [x2]
        sub             x2,  x2,  x3,  lsl #2           // x2: row n+6 -> row n+2
        // 32-bit lane 1 of the same registers: rows n+2 .. n+8
        ld1             {v2.s}[1],  [x2], x3
        ld1             {v3.s}[1],  [x2], x3
        ld1             {v4.s}[1],  [x2], x3
        ld1             {v5.s}[1],  [x2], x3
        ld1             {v6.s}[1],  [x2], x3
        ld1             {v7.s}[1],  [x2], x3
        ld1             {v28.s}[1], [x2]
        sub             x2,  x2,  x3,  lsl #2           // x2: row n+8 -> row n+4

        vp8_epel8_v6_y2 v2,  v3,  v2,  v3,  v4,  v5,  v6,  v7,  v28

        // lane 0 of v2/v3 is stored first, lane 1 of v2/v3 afterwards
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_epel4_h6_neon: horizontal-only filter, one 4-byte output row per
// loop iteration via vp8_epel8_h6 (only the low 32 bits of its result are
// stored).
//
// Registers as used below (presumably dst, dststride, src, srcstride, h and
// mx of the usual VP8 MC prototype -- confirm against the C declaration):
//   x0  store pointer, post-incremented by x1 per row
//   x2  load pointer, post-incremented by x3 per row (16 bytes are loaded)
//   w4  row count, decremented by 1 per iteration; must be non-zero
//   w5  selects the 16-byte subpel_filters entry that is loaded into v0
function ff_put_vp8_epel4_h6_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},       [x5]

        sub             x2,  x2,  #2                    // start two bytes to the left
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0],     [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_epel4_h6v6_neon: two-pass filter for 4-byte-wide rows.
// Pass 1 runs vp8_epel8_h6 over h + 5 source rows and keeps the low 4 bytes
// of each result in a stack scratch buffer; pass 2 runs vp8_epel8_v6_y2 over
// that buffer and stores four rows per iteration.
//
// Registers as used below (presumably the usual VP8 MC arguments dst,
// dststride, src, srcstride, h, mx, my -- confirm against the C prototype):
//   x0  store pointer, post-incremented by x1 for every output row
//   x2  load pointer, post-incremented by x3 for every input row
//   w4  h, number of output rows; must be a non-zero multiple of 4
//   w5  selects the 16-byte subpel_filters entry loaded for pass 1
//   w6  selects the 16-byte subpel_filters entry loaded for pass 2
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1           // start two rows above ...
        sub             x2,  x2,  #2                    // ... and two bytes to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},       [x5]

        // Pass 1 writes (h + 5) * 4 bytes, i.e. 52 bytes for h = 8. The frame
        // is rounded up to 64 so that sp stays 16-byte aligned for the whole
        // function. NOTE(review): assumes h <= 8.
        sub             sp,  sp,  #64
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},       [x6]
        mov             x9,  sp
2:
        // rN = 4-byte scratch row N relative to x9 at loop entry
        ld1             {v2.8b,v3.8b}, [x9], #16        // v2 = r0|r1, v3 = r2|r3
        ld1             {v6.8b},       [x9], #8         // v6 = r4|r5
        ld1r            {v28.2s},      [x9]             // v28 = r6|r6
        sub             x9,  x9,  #16
        ld1             {v4.8b,v5.8b}, [x9], #16        // v4 = r2|r3, v5 = r4|r5
        ld1             {v7.8b},       [x9], #8         // v7 = r6|r7
        ld1             {v28.s}[1],    [x9]             // v28 = r6|r8
        sub             x9,  x9,  #16                   // net advance: four rows
        trn1            v1.2s, v2.2s, v4.2s             // r0|r2
        trn2            v4.2s, v2.2s, v4.2s             // r1|r3
        trn1            v2.2s, v3.2s, v5.2s             // r2|r4
        trn2            v5.2s, v3.2s, v5.2s             // r3|r5
        trn1            v3.2s, v6.2s, v7.2s             // r4|r6
        trn2            v7.2s, v6.2s, v7.2s             // r5|r7
        vp8_epel8_v6_y2 v2,  v3,  v1,  v4,  v2,  v5,  v3,  v7,  v28
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #64
        ret
endfunc
|
|
|
|
// ff_put_vp8_epel4_h4v6_neon: two-pass filter for 4-byte-wide rows.
// Pass 1 runs vp8_epel8_h4 over h + 5 source rows and keeps the low 4 bytes
// of each result in a stack scratch buffer; pass 2 runs vp8_epel8_v6_y2 over
// that buffer and stores four rows per iteration.
//
// Registers as used below (presumably the usual VP8 MC arguments dst,
// dststride, src, srcstride, h, mx, my -- confirm against the C prototype):
//   x0  store pointer, post-incremented by x1 for every output row
//   x2  load pointer, post-incremented by x3 for every input row
//   w4  h, number of output rows; must be a non-zero multiple of 4
//   w5  selects the 16-byte subpel_filters entry loaded for pass 1
//   w6  selects the 16-byte subpel_filters entry loaded for pass 2
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1           // start two rows above ...
        sub             x2,  x2,  #1                    // ... and one byte to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},       [x5]

        // Pass 1 writes (h + 5) * 4 bytes, i.e. 52 bytes for h = 8. The frame
        // is rounded up to 64 so that sp stays 16-byte aligned for the whole
        // function. NOTE(review): assumes h <= 8.
        sub             sp,  sp,  #64
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b},       [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},       [x6]
        mov             x9,  sp
2:
        // rN = 4-byte scratch row N relative to x9 at loop entry
        ld1             {v2.8b,v3.8b}, [x9], #16        // v2 = r0|r1, v3 = r2|r3
        ld1             {v6.8b},       [x9], #8         // v6 = r4|r5
        ld1r            {v28.2s},      [x9]             // v28 = r6|r6
        sub             x9,  x9,  #16
        ld1             {v4.8b,v5.8b}, [x9], #16        // v4 = r2|r3, v5 = r4|r5
        ld1             {v7.8b},       [x9], #8         // v7 = r6|r7
        ld1             {v28.s}[1],    [x9]             // v28 = r6|r8
        sub             x9,  x9,  #16                   // net advance: four rows
        trn1            v1.2s, v2.2s, v4.2s             // r0|r2
        trn2            v4.2s, v2.2s, v4.2s             // r1|r3
        trn1            v2.2s, v3.2s, v5.2s             // r2|r4
        trn2            v5.2s, v3.2s, v5.2s             // r3|r5
        trn1            v3.2s, v6.2s, v7.2s             // r4|r6
        trn2            v7.2s, v6.2s, v7.2s             // r5|r7
        vp8_epel8_v6_y2 v2,  v3,  v1,  v4,  v2,  v5,  v3,  v7,  v28
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #64
        ret
endfunc
|
|
|
|
// ff_put_vp8_epel4_h6v4_neon: two-pass filter for 4-byte-wide rows.
// Pass 1 runs vp8_epel8_h6 over h + 3 source rows and keeps the low 4 bytes
// of each result in a stack scratch buffer; pass 2 runs vp8_epel8_v4_y2 over
// that buffer and stores four rows per iteration.
//
// Registers as used below (presumably the usual VP8 MC arguments dst,
// dststride, src, srcstride, h, mx, my -- confirm against the C prototype):
//   x0  store pointer, post-incremented by x1 for every output row
//   x2  load pointer, post-incremented by x3 for every input row
//   w4  h, number of output rows; must be a non-zero multiple of 4
//   w5  selects the 16-byte subpel_filters entry loaded for pass 1
//   w6  selects the 16-byte subpel_filters entry loaded for pass 2
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             x2,  x2,  x3                    // start one row above ...
        sub             x2,  x2,  #2                    // ... and two bytes to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},       [x5]

        // Pass 1 writes (h + 3) * 4 bytes, i.e. 44 bytes for h = 8. The frame
        // is rounded up to 48 so that sp stays 16-byte aligned for the whole
        // function. NOTE(review): assumes h <= 8.
        sub             sp,  sp,  #48
        add             w8,  w4,  #3
        mov             x9,  sp
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},       [x6]
        mov             x9,  sp
2:
        // rN = 4-byte scratch row N relative to x9 at loop entry
        ld1             {v2.8b,v3.8b}, [x9], #16        // v2 = r0|r1, v3 = r2|r3
        ld1r            {v6.2s},       [x9]             // v6 = r4|r4
        sub             x9,  x9,  #8
        ld1             {v4.8b,v5.8b}, [x9], #16        // v4 = r2|r3, v5 = r4|r5
        ld1             {v6.s}[1],     [x9]             // v6 = r4|r6
        sub             x9,  x9,  #8                    // net advance: four rows
        trn1            v1.2s, v2.2s, v4.2s             // r0|r2
        trn2            v4.2s, v2.2s, v4.2s             // r1|r3
        trn1            v2.2s, v3.2s, v5.2s             // r2|r4
        trn2            v5.2s, v3.2s, v5.2s             // r3|r5
        vp8_epel8_v4_y2 v1,  v1,  v4,  v2,  v5,  v6
        // the 32-bit lanes of v1 are written out in the order 0, 2, 1, 3
        st1             {v1.s}[0],  [x0], x1
        st1             {v1.s}[2],  [x0], x1
        st1             {v1.s}[1],  [x0], x1
        st1             {v1.s}[3],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #48
        ret
endfunc
|
|
|
|
// ff_put_vp8_epel4_h4_neon: horizontal-only filter, one 4-byte output row per
// loop iteration via vp8_epel8_h4 (only the low 32 bits of its result are
// stored).
//
// Registers as used below (presumably dst, dststride, src, srcstride, h and
// mx of the usual VP8 MC prototype -- confirm against the C declaration):
//   x0  store pointer, post-incremented by x1 per row
//   x2  load pointer, post-incremented by x3 per row (8 bytes are loaded)
//   w4  row count, decremented by 1 per iteration; must be non-zero
//   w5  selects the 16-byte subpel_filters entry that is loaded into v0
function ff_put_vp8_epel4_h4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},       [x5]

        sub             x2,  x2,  #1                    // start one byte to the left
1:
        ld1             {v2.8b},       [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],     [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_epel4_v4_neon: vertical-only filter for 4-byte-wide rows,
// producing four output rows per loop iteration via vp8_epel8_v4_y2.
//
// Registers as used below (presumably dst, dststride, src, srcstride, h and
// my of the usual VP8 MC prototype -- confirm against the C declaration):
//   x0  store pointer, post-incremented by x1 after each 4-byte row
//   x2  load pointer, moved in steps of x3
//   w4  row count, decremented by 4 per iteration (must be a non-zero
//       multiple of 4 for the b.ne loop to terminate)
//   w6  selects the 16-byte subpel_filters entry that is loaded into v0
function ff_put_vp8_epel4_v4_neon, export=1
        sub             x2,  x2,  x3                    // start one row above

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},    [x6]
1:
        // 32-bit lane 0 of v2..v6 (ld1r also fills lane 1): rows n .. n+4
        ld1r            {v2.2s},    [x2], x3
        ld1r            {v3.2s},    [x2], x3
        ld1r            {v4.2s},    [x2], x3
        ld1r            {v5.2s},    [x2], x3
        ld1r            {v6.2s},    [x2]
        sub             x2,  x2,  x3,  lsl #1           // x2: row n+4 -> row n+2
        // 32-bit lane 1 of the same registers: rows n+2 .. n+6
        ld1             {v2.s}[1],  [x2], x3
        ld1             {v3.s}[1],  [x2], x3
        ld1             {v4.s}[1],  [x2], x3
        ld1             {v5.s}[1],  [x2], x3
        ld1             {v6.s}[1],  [x2]
        sub             x2,  x2,  x3,  lsl #1           // x2: row n+6 -> row n+4

        vp8_epel8_v4_y2 v2,  v2,  v3,  v4,  v5,  v6

        // the 32-bit lanes of v2 are written out in the order 0, 2, 1, 3
        st1             {v2.s}[0],  [x0], x1
        st1             {v2.s}[2],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v2.s}[3],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_epel4_h4v4_neon: two-pass filter for 4-byte-wide rows.
// Pass 1 runs vp8_epel8_h4 over h + 3 source rows and keeps the low 4 bytes
// of each result in a stack scratch buffer; pass 2 runs vp8_epel8_v4_y2 over
// that buffer and stores four rows per iteration.
//
// Registers as used below (presumably the usual VP8 MC arguments dst,
// dststride, src, srcstride, h, mx, my -- confirm against the C prototype):
//   x0  store pointer, post-incremented by x1 for every output row
//   x2  load pointer, post-incremented by x3 for every input row
//   w4  h, number of output rows; must be a non-zero multiple of 4
//   w5  selects the 16-byte subpel_filters entry loaded for pass 1
//   w6  selects the 16-byte subpel_filters entry loaded for pass 2
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             x2,  x2,  x3                    // start one row above ...
        sub             x2,  x2,  #1                    // ... and one byte to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},       [x5]

        // Pass 1 writes (h + 3) * 4 bytes, i.e. 44 bytes for h = 8. The frame
        // is rounded up to 48 so that sp stays 16-byte aligned for the whole
        // function. NOTE(review): assumes h <= 8.
        sub             sp,  sp,  #48
        add             w8,  w4,  #3
        mov             x9,  sp
1:
        ld1             {v2.8b},       [x2], x3
        // Only v2 has been loaded, so it is passed for both source operands
        // (same call shape as ff_put_vp8_epel4_h4_neon); no register that
        // this function has not written is read.
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},       [x6]
        mov             x9,  sp
2:
        // rN = 4-byte scratch row N relative to x9 at loop entry
        ld1             {v2.8b,v3.8b}, [x9], #16        // v2 = r0|r1, v3 = r2|r3
        ld1r            {v6.2s},       [x9]             // v6 = r4|r4
        sub             x9,  x9,  #8
        ld1             {v4.8b,v5.8b}, [x9], #16        // v4 = r2|r3, v5 = r4|r5
        ld1             {v6.s}[1],     [x9]             // v6 = r4|r6
        sub             x9,  x9,  #8                    // net advance: four rows
        trn1            v1.2s, v2.2s, v4.2s             // r0|r2
        trn2            v4.2s, v2.2s, v4.2s             // r1|r3
        trn1            v2.2s, v3.2s, v5.2s             // r2|r4
        trn2            v5.2s, v3.2s, v5.2s             // r3|r5
        vp8_epel8_v4_y2 v1,  v1,  v4,  v2,  v5,  v6
        // the 32-bit lanes of v1 are written out in the order 0, 2, 1, 3
        st1             {v1.s}[0],  [x0], x1
        st1             {v1.s}[2],  [x0], x1
        st1             {v1.s}[1],  [x0], x1
        st1             {v1.s}[3],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #48
        ret
endfunc
|
|
|
|
/* Bilinear MC */
|
|
|
|
// ff_put_vp8_bilin16_h_neon: horizontal bilinear filter, 16 bytes per row,
// two rows per loop iteration.
// Each output byte is (a * (8 - w5) + b * w5 + 4) >> 3, with a the source
// byte at the same position and b its right-hand neighbour.
//
// Registers as used below (presumably dst, dststride, src, srcstride, h and
// mx of the usual VP8 MC prototype -- confirm against the C declaration):
//   x0/x1  store pointer and its per-row post-increment
//   x2/x3  load pointer and its per-row post-increment (24 bytes are loaded)
//   w4     row count, decremented by 2 per iteration; loops while > 0
//   w5     weight of the right-hand neighbour
function ff_put_vp8_bilin16_h_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5                      // v0 = weight of b
        sub             w5,  w7,  w5
        dup             v1.8b,  w5                      // v1 = weight of a
1:
        ld1             {v2.8b,v3.8b,v4.8b},    [x2], x3
        ext             v5.8b,  v3.8b,  v4.8b,  #1      // bytes 9..16
        ext             v4.8b,  v2.8b,  v3.8b,  #1      // bytes 1..8
        umull           v16.8h, v2.8b,  v1.8b
        umlal           v16.8h, v4.8b,  v0.8b
        ld1             {v18.8b,v19.8b,v20.8b}, [x2], x3
        umull           v6.8h,  v3.8b,  v1.8b
        umlal           v6.8h,  v5.8b,  v0.8b
        ext             v21.8b, v19.8b, v20.8b, #1
        ext             v20.8b, v18.8b, v19.8b, #1
        umull           v22.8h, v18.8b, v1.8b
        umlal           v22.8h, v20.8b, v0.8b
        umull           v24.8h, v19.8b, v1.8b
        umlal           v24.8h, v21.8b, v0.8b
        rshrn           v4.8b,  v16.8h, #3              // first row, low half
        rshrn2          v4.16b, v6.8h,  #3              // first row, high half
        rshrn           v6.8b,  v22.8h, #3              // second row, low half
        rshrn2          v6.16b, v24.8h, #3              // second row, high half
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_bilin16_v_neon: vertical bilinear filter, 16 bytes per row, two
// rows per loop iteration.
// Each output byte is (a * (8 - w6) + b * w6 + 4) >> 3, with a the source
// byte of the current row and b the byte directly below it.
//
// Registers as used below (presumably dst, dststride, src, srcstride, h and
// my of the usual VP8 MC prototype -- confirm against the C declaration):
//   x0/x1  store pointer and its per-row post-increment
//   x2/x3  load pointer and its per-row post-increment
//   w4     row count, decremented by 2 per iteration; loops while > 0
//   w6     weight of the lower row
function ff_put_vp8_bilin16_v_neon, export=1
        mov             w7,  #8
        dup             v0.16b, w6                      // v0 = weight of b
        sub             w6,  w7,  w6
        dup             v1.16b, w6                      // v1 = weight of a

        ld1             {v2.16b}, [x2], x3              // first row, reused as "a"
1:
        ld1             {v4.16b}, [x2], x3
        umull           v6.8h,  v2.8b,  v1.8b
        umlal           v6.8h,  v4.8b,  v0.8b
        umull2          v16.8h, v2.16b, v1.16b
        umlal2          v16.8h, v4.16b, v0.16b
        ld1             {v2.16b}, [x2], x3              // also "a" of the next iteration
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v2.8b,  v0.8b
        umull2          v20.8h, v4.16b, v1.16b
        umlal2          v20.8h, v2.16b, v0.16b
        rshrn           v4.8b,  v6.8h,  #3
        rshrn2          v4.16b, v16.8h, #3
        rshrn           v6.8b,  v18.8h, #3
        rshrn2          v6.16b, v20.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_bilin16_hv_neon: horizontal followed by vertical bilinear
// filter, 16 bytes per row, two rows per loop iteration.
// Every source row is first filtered horizontally with the weights
// (8 - w5, w5) and rounded back to bytes; two consecutive filtered rows are
// then blended with the weights (8 - w6, w6) and rounded again.
//
// Registers as used below (presumably the usual VP8 MC arguments dst,
// dststride, src, srcstride, h, mx, my -- confirm against the C prototype):
//   x0/x1  store pointer and its per-row post-increment
//   x2/x3  load pointer and its per-row post-increment (24 bytes are loaded)
//   w4     row count, decremented by 2 per iteration; loops while > 0
//   w5/w6  horizontal / vertical weight
function ff_put_vp8_bilin16_hv_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5                      // mx
        sub             w5,  w7,  w5
        dup             v1.8b,  w5
        dup             v2.16b, w6                      // my
        sub             w6,  w7,  w6
        dup             v3.16b, w6

        // Horizontal filter of the first row; the result stays in v4.
        ld1             {v4.8b,v5.8b,v6.8b},    [x2], x3

        ext             v7.8b,  v5.8b,  v6.8b,  #1
        ext             v6.8b,  v4.8b,  v5.8b,  #1
        umull           v16.8h, v4.8b,  v1.8b
        umlal           v16.8h, v6.8b,  v0.8b
        umull           v18.8h, v5.8b,  v1.8b
        umlal           v18.8h, v7.8b,  v0.8b
        rshrn           v4.8b,  v16.8h, #3
        rshrn2          v4.16b, v18.8h, #3
1:
        ld1             {v18.8b,v19.8b,v20.8b}, [x2], x3
        ext             v21.8b, v19.8b, v20.8b, #1
        ext             v20.8b, v18.8b, v19.8b, #1
        umull           v22.8h, v18.8b, v1.8b
        umlal           v22.8h, v20.8b, v0.8b
        ld1             {v26.8b,v27.8b,v28.8b}, [x2], x3
        umull           v24.8h, v19.8b, v1.8b
        umlal           v24.8h, v21.8b, v0.8b
        ext             v29.8b, v27.8b, v28.8b, #1
        ext             v28.8b, v26.8b, v27.8b, #1
        umull           v16.8h, v26.8b, v1.8b
        umlal           v16.8h, v28.8b, v0.8b
        umull           v18.8h, v27.8b, v1.8b
        umlal           v18.8h, v29.8b, v0.8b
        rshrn           v6.8b,  v22.8h, #3              // v6 = filtered middle row
        rshrn2          v6.16b, v24.8h, #3
        umull           v24.8h, v4.8b,  v3.8b           // blend previous row with v6
        umlal           v24.8h, v6.8b,  v2.8b
        umull2          v30.8h, v4.16b, v3.16b
        umlal2          v30.8h, v6.16b, v2.16b
        rshrn           v4.8b,  v16.8h, #3              // v4 = filtered last row,
        rshrn2          v4.16b, v18.8h, #3              // kept for the next iteration
        umull           v20.8h, v6.8b,  v3.8b           // blend v6 with the new v4
        umlal           v20.8h, v4.8b,  v2.8b
        umull2          v22.8h, v6.16b, v3.16b
        umlal2          v22.8h, v4.16b, v2.16b
        rshrn           v24.8b, v24.8h, #3
        rshrn2          v24.16b, v30.8h, #3
        st1             {v24.16b}, [x0], x1
        rshrn           v20.8b, v20.8h, #3
        rshrn2          v20.16b, v22.8h, #3
        st1             {v20.16b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_bilin8_h_neon: horizontal bilinear filter, 8 bytes per row, two
// rows per loop iteration.
// Each output byte is (a * (8 - w5) + b * w5 + 4) >> 3, with a the source
// byte at the same position and b its right-hand neighbour.
//
// Registers as used below (presumably dst, dststride, src, srcstride, h and
// mx of the usual VP8 MC prototype -- confirm against the C declaration):
//   x0/x1  store pointer and its per-row post-increment
//   x2/x3  load pointer and its per-row post-increment (16 bytes are loaded)
//   w4     row count, decremented by 2 per iteration; loops while > 0
//   w5     weight of the right-hand neighbour
function ff_put_vp8_bilin8_h_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5                      // v0 = weight of b
        sub             w5,  w7,  w5
        dup             v1.8b,  w5                      // v1 = weight of a
1:
        ld1             {v2.8b,v3.8b},  [x2], x3
        ext             v3.8b,  v2.8b,  v3.8b,  #1      // bytes 1..8
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        ld1             {v6.8b,v7.8b},  [x2], x3
        ext             v7.8b,  v6.8b,  v7.8b,  #1
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v16.8h, v7.8b,  v0.8b
        rshrn           v4.8b,  v4.8h,  #3
        rshrn           v16.8b, v16.8h, #3
        st1             {v4.8b},  [x0], x1
        st1             {v16.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_bilin8_v_neon: vertical bilinear filter, 8 bytes per row, two
// rows per loop iteration.
// Each output byte is (a * (8 - w6) + b * w6 + 4) >> 3, with a the source
// byte of the current row and b the byte directly below it.
//
// Registers as used below (presumably dst, dststride, src, srcstride, h and
// my of the usual VP8 MC prototype -- confirm against the C declaration):
//   x0/x1  store pointer and its per-row post-increment
//   x2/x3  load pointer and its per-row post-increment
//   w4     row count, decremented by 2 per iteration; loops while > 0
//   w6     weight of the lower row
function ff_put_vp8_bilin8_v_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w6                      // v0 = weight of b
        sub             w6,  w7,  w6
        dup             v1.8b,  w6                      // v1 = weight of a

        ld1             {v2.8b}, [x2], x3               // first row, reused as "a"
1:
        ld1             {v3.8b}, [x2], x3
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        ld1             {v2.8b}, [x2], x3               // also "a" of the next iteration
        umull           v6.8h,  v3.8b,  v1.8b
        umlal           v6.8h,  v2.8b,  v0.8b
        rshrn           v4.8b,  v4.8h,  #3
        rshrn           v6.8b,  v6.8h,  #3
        st1             {v4.8b}, [x0], x1
        st1             {v6.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_bilin8_hv_neon: horizontal followed by vertical bilinear filter,
// 8 bytes per row, two rows per loop iteration.
// Every source row is first filtered horizontally with the weights
// (8 - w5, w5) and rounded back to bytes; two consecutive filtered rows are
// then blended with the weights (8 - w6, w6) and rounded again.
//
// Registers as used below (presumably the usual VP8 MC arguments dst,
// dststride, src, srcstride, h, mx, my -- confirm against the C prototype):
//   x0/x1  store pointer and its per-row post-increment
//   x2/x3  load pointer and its per-row post-increment (16 bytes are loaded)
//   w4     row count, decremented by 2 per iteration; loops while > 0
//   w5/w6  horizontal / vertical weight
function ff_put_vp8_bilin8_hv_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5                      // mx
        sub             w5,  w7,  w5
        dup             v1.8b,  w5
        dup             v2.8b,  w6                      // my
        sub             w6,  w7,  w6
        dup             v3.8b,  w6

        // Horizontal filter of the first row; v22 carries the previous
        // filtered row from one iteration to the next.
        ld1             {v4.8b,v5.8b},  [x2], x3
        ext             v5.8b,  v4.8b,  v5.8b,  #1
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v5.8b,  v0.8b
        rshrn           v22.8b, v18.8h, #3
1:
        ld1             {v6.8b,v7.8b},  [x2], x3
        ext             v7.8b,  v6.8b,  v7.8b,  #1
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v16.8h, v7.8b,  v0.8b
        ld1             {v4.8b,v5.8b},  [x2], x3
        ext             v5.8b,  v4.8b,  v5.8b,  #1
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v5.8b,  v0.8b
        rshrn           v16.8b, v16.8h, #3              // filtered middle row
        umull           v20.8h, v22.8b, v3.8b           // previous row * (8 - my)
        umlal           v20.8h, v16.8b, v2.8b
        rshrn           v22.8b, v18.8h, #3              // filtered last row
        umull           v24.8h, v16.8b, v3.8b
        umlal           v24.8h, v22.8b, v2.8b
        rshrn           v20.8b, v20.8h, #3
        st1             {v20.8b}, [x0], x1
        rshrn           v23.8b, v24.8h, #3
        st1             {v23.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_bilin4_h_neon: horizontal bilinear filter, 4 bytes per row, two
// rows per loop iteration (both rows share one 8-byte vector).
// Each output byte is (a * (8 - w5) + b * w5 + 4) >> 3, with a the source
// byte at the same position and b its right-hand neighbour.
//
// Registers as used below (presumably dst, dststride, src, srcstride, h and
// mx of the usual VP8 MC prototype -- confirm against the C declaration):
//   x0/x1  store pointer and its per-row post-increment
//   x2/x3  load pointer and its per-row post-increment (8 bytes are loaded)
//   w4     row count, decremented by 2 per iteration; loops while > 0
//   w5     weight of the right-hand neighbour
function ff_put_vp8_bilin4_h_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5                      // v0 = weight of b
        sub             w5,  w7,  w5
        dup             v1.8b,  w5                      // v1 = weight of a
1:
        subs            w4,  w4,  #2
        ld1             {v2.8b},        [x2], x3
        // Only the low four bytes of each ext result are used (trn1 below
        // picks 32-bit lane 0), so the row itself serves as second operand;
        // nothing is read from a register this function has not written.
        ext             v3.8b,  v2.8b,  v2.8b,  #1
        ld1             {v6.8b},        [x2], x3
        ext             v7.8b,  v6.8b,  v6.8b,  #1
        trn1            v2.2s,  v2.2s,  v6.2s           // first row | second row
        trn1            v3.2s,  v3.2s,  v7.2s           // the same, shifted by one byte
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        rshrn           v4.8b,  v4.8h,  #3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        b.gt            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_bilin4_v_neon: vertical bilinear filter, 4 bytes per row, two
// rows per loop iteration (both rows share one 8-byte vector).
// Each output byte is (a * (8 - w6) + b * w6 + 4) >> 3, with a the source
// byte of the current row and b the byte directly below it.
//
// Registers as used below (presumably dst, dststride, src, srcstride, h and
// my of the usual VP8 MC prototype -- confirm against the C declaration):
//   x0/x1  store pointer and its per-row post-increment
//   x2/x3  load pointer and its per-row post-increment
//   w4     row count, decremented by 2 per iteration; loops while > 0
//   w6     weight of the lower row
function ff_put_vp8_bilin4_v_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w6                      // v0 = weight of b
        sub             w6,  w7,  w6
        dup             v1.8b,  w6                      // v1 = weight of a

        ld1r            {v2.2s},    [x2], x3            // lane 0 = first row
1:
        subs            w4,  w4,  #2
        // With rK the row held in v2 lane 0: v2 = rK|rK+1, v3 = rK+1|rK+2
        ld1r            {v3.2s},    [x2]
        ld1             {v2.s}[1],  [x2], x3
        ld1             {v3.s}[1],  [x2], x3
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        trn2            v2.2s,  v3.2s,  v2.2s           // lane 0 = rK+2 for the next pass
        rshrn           v4.8b,  v4.8h,  #3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        b.gt            1b

        ret
endfunc
|
|
|
|
// ff_put_vp8_bilin4_hv_neon: horizontal followed by vertical bilinear filter,
// 4 bytes per row, two rows per loop iteration (both rows share one 8-byte
// vector).
// Every source row is first filtered horizontally with the weights
// (8 - w5, w5) and rounded back to bytes; two consecutive filtered rows are
// then blended with the weights (8 - w6, w6) and rounded again.
//
// Registers as used below (presumably the usual VP8 MC arguments dst,
// dststride, src, srcstride, h, mx, my -- confirm against the C prototype):
//   x0/x1  store pointer and its per-row post-increment
//   x2/x3  load pointer and its per-row post-increment (8 bytes are loaded)
//   w4     row count, decremented by 2 per iteration; loops while > 0
//   w5/w6  horizontal / vertical weight
function ff_put_vp8_bilin4_hv_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5                      // mx
        sub             w5,  w7,  w5
        dup             v1.8b,  w5
        dup             v2.8b,  w6                      // my
        sub             w6,  w7,  w6
        dup             v3.8b,  w6

        // Horizontal filter of the first row; lane 0 of v22 carries the
        // previous filtered row from one iteration to the next.
        ld1             {v4.8b},        [x2], x3
        ext             v5.8b,  v4.8b,  v4.8b,  #1
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v5.8b,  v0.8b
        rshrn           v22.8b, v18.8h, #3
1:
        ld1             {v6.8b},        [x2], x3
        ext             v7.8b,  v6.8b,  v6.8b,  #1
        ld1             {v4.8b},        [x2], x3
        ext             v5.8b,  v4.8b,  v4.8b,  #1
        trn1            v6.2s,  v6.2s,  v4.2s           // two new rows side by side
        trn1            v7.2s,  v7.2s,  v5.2s           // the same, shifted by one byte
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v16.8h, v7.8b,  v0.8b
        rshrn           v16.8b, v16.8h, #3              // v16 = filtered new rows
        umull           v20.8h, v16.8b, v2.8b
        trn1            v22.2s, v22.2s, v16.2s          // previous row | first new row
        umlal           v20.8h, v22.8b, v3.8b
        rev64           v22.2s, v16.2s                  // lane 0 = second new row
        rshrn           v20.8b, v20.8h, #3
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
|