mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
fd3bd5c492
This gives rather big speedups on these functions: Before: put_h264_qpel_8_mc01_8_neon: 241.0 131.5 138.7 put_h264_qpel_8_mc02_8_neon: 214.7 121.2 127.5 put_h264_qpel_8_mc03_8_neon: 242.5 131.2 135.7 put_h264_qpel_8_mc11_8_neon: 421.2 218.7 251.0 put_h264_qpel_8_mc12_8_neon: 878.0 509.5 537.5 put_h264_qpel_8_mc13_8_neon: 423.7 217.0 252.0 put_h264_qpel_8_mc21_8_neon: 858.2 479.5 514.0 put_h264_qpel_8_mc22_8_neon: 649.7 385.2 403.0 put_h264_qpel_8_mc23_8_neon: 860.2 476.5 517.7 put_h264_qpel_8_mc31_8_neon: 437.2 219.5 252.5 put_h264_qpel_8_mc32_8_neon: 892.5 510.5 546.0 put_h264_qpel_8_mc33_8_neon: 438.2 218.5 257.0 put_h264_qpel_16_mc01_8_neon: 944.2 509.7 546.7 put_h264_qpel_16_mc02_8_neon: 878.7 469.5 509.7 put_h264_qpel_16_mc03_8_neon: 945.7 510.7 557.0 put_h264_qpel_16_mc11_8_neon: 1663.2 858.5 979.5 put_h264_qpel_16_mc12_8_neon: 3510.2 2027.7 2112.7 put_h264_qpel_16_mc13_8_neon: 1664.7 857.5 980.5 put_h264_qpel_16_mc21_8_neon: 3366.2 1928.5 2030.5 put_h264_qpel_16_mc22_8_neon: 2584.7 1514.7 1590.2 put_h264_qpel_16_mc23_8_neon: 3367.7 1927.7 2035.0 put_h264_qpel_16_mc31_8_neon: 1716.7 849.7 997.0 put_h264_qpel_16_mc32_8_neon: 3564.0 2044.2 3835.2 put_h264_qpel_16_mc33_8_neon: 1717.7 863.0 989.5 After: put_h264_qpel_8_mc01_8_neon: 136.0 73.7 76.0 put_h264_qpel_8_mc02_8_neon: 108.7 65.0 64.0 put_h264_qpel_8_mc03_8_neon: 137.5 72.7 73.0 put_h264_qpel_8_mc11_8_neon: 316.2 159.0 188.5 put_h264_qpel_8_mc12_8_neon: 653.0 375.5 384.7 put_h264_qpel_8_mc13_8_neon: 318.7 165.5 189.5 put_h264_qpel_8_mc21_8_neon: 739.2 385.7 432.5 put_h264_qpel_8_mc22_8_neon: 530.7 295.5 309.5 put_h264_qpel_8_mc23_8_neon: 741.2 393.7 421.0 put_h264_qpel_8_mc31_8_neon: 332.2 162.5 190.0 put_h264_qpel_8_mc32_8_neon: 667.5 378.2 390.5 put_h264_qpel_8_mc33_8_neon: 332.7 166.5 195.5 put_h264_qpel_16_mc01_8_neon: 524.2 285.2 294.0 put_h264_qpel_16_mc02_8_neon: 454.7 252.2 250.2 put_h264_qpel_16_mc03_8_neon: 525.7 286.0 283.0 put_h264_qpel_16_mc11_8_neon: 1243.2 630.7 726.7 
put_h264_qpel_16_mc12_8_neon: 2610.2 1479.7 1481.2 put_h264_qpel_16_mc13_8_neon: 1250.5 631.7 727.7 put_h264_qpel_16_mc21_8_neon: 2890.2 1571.2 1679.7 put_h264_qpel_16_mc22_8_neon: 2108.7 1177.5 1223.5 put_h264_qpel_16_mc23_8_neon: 2891.7 1578.7 1667.7 put_h264_qpel_16_mc31_8_neon: 1296.7 630.5 752.5 put_h264_qpel_16_mc32_8_neon: 2664.0 1483.2 1503.5 put_h264_qpel_16_mc33_8_neon: 1297.7 632.5 747.2 I.e. overall a 20%-60% reduction in runtime of these functions. Signed-off-by: Martin Storsjö <martin@martin.st>
936 lines
33 KiB
ArmAsm
936 lines
33 KiB
ArmAsm
/*
|
|
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "libavutil/aarch64/asm.S"
|
|
#include "neon.S"
|
|
|
|
/* H.264 qpel MC */
|
|
|
|
// Load the filter coefficients used by the lowpass macros into v6.
// The scratch register r receives (20 << 16) | 5, so after the move
// v6.h[0] = 5 and v6.h[1] = 20. r must be a 32-bit GPR; it is clobbered.
.macro  lowpass_const   r
        movz            \r, #20, lsl #16
        movk            \r, #5
        mov             v6.s[0], \r
.endm
|
|
|
|
// Horizontal 6-tap filter over two rows of 8 output pixels.
//   r0:r1 - first input row  (two consecutive 8-byte registers)
//   r2:r3 - second input row (two consecutive 8-byte registers)
//   d0/d1 - results for the first/second row
//   narrow - when non-zero the 16-bit sums are rounded, shifted right by 5
//            and saturated to unsigned bytes; otherwise d0/d1 stay as .8h
// With p[k] being byte k of a row, each output lane is
//   (p[0] + p[5]) + v6.h[1] * (p[2] + p[3]) - v6.h[0] * (p[1] + p[4])
// v6 must have been prepared with lowpass_const.
// The two rows are interleaved; d0 is written only after the last read of
// r0/r1 and d1 only after the last read of r2/r3, which is what allows the
// callers in this file to alias destinations with sources.
// Clobbers v0-v5.
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,      v2.8b,     v3.8b
        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,      v4.8b,     v5.8b
        ext             v1.8b,      \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h,  \r0\().8b, v1.8b
        ext             v0.8b,      \r2\().8b, \r3\().8b, #2
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        ext             v1.8b,      \r2\().8b, \r3\().8b, #3
        uaddl           v0.8h,      v0.8b,     v1.8b
        ext             v1.8b,      \r2\().8b, \r3\().8b, #1
        mls             \d0\().8h,  v4.8h,     v6.h[0]
        ext             v3.8b,      \r2\().8b, \r3\().8b, #4
        uaddl           v1.8h,      v1.8b,     v3.8b
        ext             v2.8b,      \r2\().8b, \r3\().8b, #5
        uaddl           \d1\().8h,  \r2\().8b, v2.8b
        mla             \d1\().8h,  v0.8h,     v6.h[1]
        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
        sqrshrun        \d1\().8b,  \d1\().8h, #5
  .endif
.endm
|
|
|
|
// Vertical 6-tap filter producing two adjacent output rows.
//   r0..r6 - seven consecutive input rows (8 bytes each)
//   d0     - output row built from r0..r5
//   d1     - output row built from r1..r6
//   narrow - when non-zero, round/shift by 5 and saturate to unsigned bytes
// d0 = (r0 + r5) + v6.h[1] * (r2 + r3) - v6.h[0] * (r1 + r4), and d1 is the
// same expression shifted down by one row. v6 comes from lowpass_const.
// All reads of the input rows happen in the uaddl group, before any
// multiply-accumulate, so d0/d1 may be the same registers as r0/r1.
// Clobbers v0, v1, v2 and v4.
.macro  lowpass_8_v     r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1,  narrow=1
        uaddl           v2.8h,      \r2\().8b, \r3\().8b
        uaddl           v0.8h,      \r3\().8b, \r4\().8b
        uaddl           v4.8h,      \r1\().8b, \r4\().8b
        uaddl           v1.8h,      \r2\().8b, \r5\().8b
        uaddl           \d0\().8h,  \r0\().8b, \r5\().8b
        uaddl           \d1\().8h,  \r1\().8b, \r6\().8b
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        mls             \d0\().8h,  v4.8h,     v6.h[0]
        mla             \d1\().8h,  v0.8h,     v6.h[1]
        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
        sqrshrun        \d1\().8b,  \d1\().8h, #5
  .endif
.endm
|
|
|
|
// Horizontal 6-tap filter on two rows, keeping full 16-bit precision.
//   r0, r1 - each holds one 16-byte input row on entry and is replaced by
//            the eight unnormalised 16-bit filter sums (.8h) on exit
// Every ext rotates a row against itself, so only the low 8 bytes of each
// rotated copy are consumed by the following uaddl.
// Uses the coefficients in v6 (see lowpass_const).
// Clobbers v0-v5, v7, v30 and v31.
.macro  lowpass_8H      r0,  r1
        ext             v0.16b,     \r0\().16b, \r0\().16b, #2
        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
        uaddl           v0.8h,      v0.8b,      v1.8b
        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
        uaddl           v2.8h,      v2.8b,      v3.8b
        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
        uaddl           \r0\().8h,  \r0\().8b,  v30.8b
        ext             v4.16b,     \r1\().16b, \r1\().16b, #2
        mla             \r0\().8h,  v0.8h,      v6.h[1]
        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
        uaddl           v4.8h,      v4.8b,      v5.8b
        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
        mls             \r0\().8h,  v2.8h,      v6.h[0]
        ext             v0.16b,     \r1\().16b, \r1\().16b, #4
        uaddl           v7.8h,      v7.8b,      v0.8b
        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
        uaddl           \r1\().8h,  \r1\().8b,  v31.8b
        mla             \r1\().8h,  v4.8h,      v6.h[1]
        mls             \r1\().8h,  v7.8h,      v6.h[0]
.endm
|
|
|
|
// Single-row variant of the horizontal 6-tap filter.
//   r0:r1  - input row (two consecutive 8-byte registers)
//   d0     - result
//   narrow - when non-zero, round/shift by 5 and saturate to unsigned bytes
// Computes the same expression as one half of lowpass_8, with the
// coefficients taken from v6 (see lowpass_const).
// Clobbers v2-v5 and v30.
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,      v2.8b,     v3.8b
        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,      v4.8b,     v5.8b
        ext             v30.8b,     \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h,  \r0\().8b, v30.8b
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        mls             \d0\().8h,  v4.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
  .endif
.endm
|
|
|
|
// Vertical 6-tap filter over six rows of signed 16-bit intermediates.
//   r0..r5 - six consecutive rows (.8h each); r0 is overwritten and receives
//            the eight result bytes in its low half
// The arithmetic is widened to 32 bits and the multiplications are done with
// shifts: 20 * x = (x << 4) + (x << 2) and 5 * x = x + (x << 2).
// The result is (r0 + r5) + 20 * (r2 + r3) - 5 * (r1 + r4), rounded and
// shifted right by 10, then saturated to unsigned bytes.
// Trashes v0-v7. Note that this includes v6, so the coefficients written by
// lowpass_const are gone once this macro has run.
.macro  lowpass_8.16    r0,  r1,  r2,  r3,  r4,  r5
        // widen the three symmetric tap pairs (low halves / high halves)
        saddl           v5.4s,      \r2\().4h, \r3\().4h
        saddl2          v1.4s,      \r2\().8h, \r3\().8h
        saddl           v6.4s,      \r1\().4h, \r4\().4h
        saddl2          v2.4s,      \r1\().8h, \r4\().8h
        saddl           v0.4s,      \r0\().4h, \r5\().4h
        saddl2          v4.4s,      \r0\().8h, \r5\().8h

        // low half: v5 = 20 * (r2 + r3), v6 = 5 * (r1 + r4)
        shl             v3.4s,      v5.4s,     #4
        shl             v5.4s,      v5.4s,     #2
        shl             v7.4s,      v6.4s,     #2
        add             v5.4s,      v5.4s,     v3.4s
        add             v6.4s,      v6.4s,     v7.4s

        // high half: v1 = 20 * (r2 + r3), v2 = 5 * (r1 + r4)
        shl             v3.4s,      v1.4s,     #4
        shl             v1.4s,      v1.4s,     #2
        shl             v7.4s,      v2.4s,     #2
        add             v1.4s,      v1.4s,     v3.4s
        add             v2.4s,      v2.4s,     v7.4s

        add             v5.4s,      v5.4s,     v0.4s
        sub             v5.4s,      v5.4s,     v6.4s

        add             v1.4s,      v1.4s,     v4.4s
        sub             v1.4s,      v1.4s,     v2.4s

        // rounding shift back to 16 bits, both halves packed into v5
        rshrn           v5.4h,      v5.4s,     #10
        rshrn2          v5.8h,      v1.4s,     #10

        sqxtun          \r0\().8b,  v5.8h
.endm
|
|
|
|
// 16x16 horizontal lowpass written as two 8-byte-wide column strips.
// The store step x3 is forced to 8 and x0 is never rewound between the two
// passes, so the right strip lands directly behind the 16 rows of the left
// strip in the output buffer.
// In:  x0 = output pointer, x1 = input pointer, x2 = input row step,
//      v6 = coefficients from lowpass_const
// Clobbers x3, x4 (holds the return address) and x12 (row counter), plus
// whatever put_h264_qpel8_h_lowpass_neon clobbers.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30
        mov             x12, #16                // 16 rows, left strip
        mov             x3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             x1,  x1,  x2, lsl #4    // back up 16 input rows
        add             x1,  x1,  #8            // move to the right strip
        mov             x12, #16
        mov             x30, x4
        b               put_h264_qpel8_h_lowpass_neon   // tail call
endfunc
|
|
|
|
// Horizontal lowpass, instantiated once for "put" and once for "avg".
// Register use, as read from the code:
//   x0  - store pointer, stepped by x3 per row
//   x1  - load pointer, stepped by x2 per row (16 bytes fetched per row)
//   x12 - rows left to do in the 8-wide loop (two rows per iteration)
//   v6  - coefficients from lowpass_const
// The avg flavour additionally loads the two rows currently at x0 and
// stores the rounded average (urhadd) of those and the filter output.
.macro  h264_qpel_h_lowpass type
// 16x16: filter the left 8 columns for 16 rows, rewind both pointers by 16
// rows, step 8 bytes to the right and continue straight into the 8-wide
// function that follows (there is no ret or branch before endfunc), whose
// ret then returns to the original caller through the restored x30.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0,  x0,  x3, lsl #4
        sub             x1,  x1,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8 columns wide, x12 rows high.
function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8b, v29.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        subs            x12, x12, #2            // flags are consumed by b.ne below
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        ld1             {v2.8b},  [x0], x3
        ld1             {v3.8b},  [x0]
        urhadd          v28.8b, v28.8b, v2.8b
        urhadd          v16.8b, v16.8b, v3.8b
        sub             x0,  x0,  x3            // undo the step made by the load
  .endif
        st1             {v28.8b}, [x0], x3
        st1             {v16.8b}, [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
|
|
|
|
// Horizontal lowpass whose output is averaged with a second 8-byte-wide
// source before being stored ("l2").
// Register use, as read from the code:
//   x0  - store pointer
//   x1  - filter input pointer (16 bytes fetched per row)
//   x3  - second source pointer (8 bytes fetched per row)
//   x2  - row step shared by x0, x1 and x3
//   x12 - rows left to do in the 8-wide loop (two rows per iteration)
//   v6  - coefficients from lowpass_const
// The avg flavour averages once more with the rows currently at x0.
.macro  h264_qpel_h_lowpass_l2 type
// 16x16: do the left 8 columns, rewind all three pointers by 16 rows, step
// 8 bytes right and continue into the 8-wide function that follows (no ret
// or branch before endfunc).
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2, lsl #4
        sub             x1,  x1,  x2, lsl #4
        sub             x3,  x3,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        add             x3,  x3,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8b, v27.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        ld1             {v28.8b}, [x3], x2
        ld1             {v29.8b}, [x3], x2
        subs            x12, x12, #2            // flags are consumed by b.ne below
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8b, v26.8b, v28.8b
        urhadd          v27.8b, v27.8b, v29.8b
  .ifc \type,avg
        ld1             {v2.8b},  [x0], x2
        ld1             {v3.8b},  [x0]
        urhadd          v26.8b, v26.8b, v2.8b
        urhadd          v27.8b, v27.8b, v3.8b
        sub             x0,  x0,  x2            // undo the step made by the load
  .endif
        st1             {v26.8b}, [x0], x2
        st1             {v27.8b}, [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
|
|
|
|
// 16x16 vertical lowpass written as four consecutive 8x8 blocks.
// The store step x2 is forced to 8 and x0 is never rewound, so the blocks
// are laid out back to back: upper left, lower left, upper right, lower
// right.
// Each 8x8 call leaves x1 twelve rows past where it started (twelve
// post-incremented loads plus one plain load), hence the 4-row and 20-row
// corrections below.
// In:  x0 = output pointer, x1 = input pointer, x3 = input row step,
//      v6 = coefficients from lowpass_const
// Clobbers x2 and x4 (holds the return address), plus whatever
// put_h264_qpel8_v_lowpass_neon clobbers.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30
        mov             x2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // net +8 rows: lower left
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #4    // 16 + 4 = 20 rows back to the top
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8            // right half
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // net +8 rows: lower right
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon   // tail call
endfunc
|
|
|
|
// Vertical lowpass, instantiated once for "put" and once for "avg".
// Register use, as read from the code:
//   x0 - store pointer, stepped by x2 per row
//   x1 - load pointer, stepped by x3 per row
//   v6 - coefficients from lowpass_const
.macro  h264_qpel_v_lowpass type
// 16x16 as four 8x8 blocks: upper left, lower left, upper right and then,
// by continuing into the 8x8 function that follows (no ret or branch before
// endfunc), lower right. x4 holds the return address meanwhile.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // x1 ended 12 rows down; net +8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x0,  x0,  x2, lsl #4    // output back to row 0 ...
        add             x0,  x0,  #8            // ... of the right half
        sub             x1,  x1,  x3, lsl #4    // input back by 16 + 4 rows
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8 block: 13 input rows are loaded into v16-v28, 8 output rows are
// produced in v16-v23 (output row i is built from input rows i..i+5).
function \type\()_h264_qpel8_v_lowpass_neon
        ld1             {v16.8b}, [x1], x3
        ld1             {v17.8b}, [x1], x3
        ld1             {v18.8b}, [x1], x3
        ld1             {v19.8b}, [x1], x3
        ld1             {v20.8b}, [x1], x3
        ld1             {v21.8b}, [x1], x3
        ld1             {v22.8b}, [x1], x3
        ld1             {v23.8b}, [x1], x3
        ld1             {v24.8b}, [x1], x3
        ld1             {v25.8b}, [x1], x3
        ld1             {v26.8b}, [x1], x3
        ld1             {v27.8b}, [x1], x3
        ld1             {v28.8b}, [x1]

        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
  .ifc \type,avg
        // average with the 8 rows currently at x0 (v24-v31 are free again)
        ld1             {v24.8b}, [x0], x2
        ld1             {v25.8b}, [x0], x2
        ld1             {v26.8b}, [x0], x2
        urhadd          v16.8b, v16.8b, v24.8b
        ld1             {v27.8b}, [x0], x2
        urhadd          v17.8b, v17.8b, v25.8b
        ld1             {v28.8b}, [x0], x2
        urhadd          v18.8b, v18.8b, v26.8b
        ld1             {v29.8b}, [x0], x2
        urhadd          v19.8b, v19.8b, v27.8b
        ld1             {v30.8b}, [x0], x2
        urhadd          v20.8b, v20.8b, v28.8b
        ld1             {v31.8b}, [x0], x2
        urhadd          v21.8b, v21.8b, v29.8b
        urhadd          v22.8b, v22.8b, v30.8b
        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x2, lsl #3    // back to the first of the 8 rows
  .endif

        st1             {v16.8b}, [x0], x2
        st1             {v17.8b}, [x0], x2
        st1             {v18.8b}, [x0], x2
        st1             {v19.8b}, [x0], x2
        st1             {v20.8b}, [x0], x2
        st1             {v21.8b}, [x0], x2
        st1             {v22.8b}, [x0], x2
        st1             {v23.8b}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
|
|
|
|
// Vertical lowpass whose output is averaged with a second 8-byte-wide
// source before being stored ("l2").
// Register use, as read from the code:
//   x0  - store pointer, stepped by x3 per row
//   x1  - filter input pointer, stepped by x3 per row
//   x12 - second source pointer, stepped by x2 per row
//   v6  - coefficients from lowpass_const
// The avg flavour averages once more with the rows currently at x0.
.macro  h264_qpel_v_lowpass_l2 type
// 16x16 as four 8x8 blocks: upper left, lower left, upper right and then,
// by continuing into the 8x8 function that follows (no ret or branch before
// endfunc), lower right. x4 holds the return address meanwhile.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2    // x1 ended 12 rows down; net +8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x0,  x0,  x3, lsl #4    // output and second source go
        sub             x12, x12, x2, lsl #4    // back 16 rows ...
        add             x0,  x0,  #8            // ... and 8 bytes to the right
        add             x12, x12, #8
        sub             x1,  x1,  x3, lsl #4    // input back by 16 + 4 rows
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8 block: 13 input rows in v16-v28, 8 filtered rows in v16-v23.
function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1             {v16.8b}, [x1], x3
        ld1             {v17.8b}, [x1], x3
        ld1             {v18.8b}, [x1], x3
        ld1             {v19.8b}, [x1], x3
        ld1             {v20.8b}, [x1], x3
        ld1             {v21.8b}, [x1], x3
        ld1             {v22.8b}, [x1], x3
        ld1             {v23.8b}, [x1], x3
        ld1             {v24.8b}, [x1], x3
        ld1             {v25.8b}, [x1], x3
        ld1             {v26.8b}, [x1], x3
        ld1             {v27.8b}, [x1], x3
        ld1             {v28.8b}, [x1]

        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23

        // average with the 8 rows of the second source
        ld1             {v24.8b}, [x12], x2
        ld1             {v25.8b}, [x12], x2
        ld1             {v26.8b}, [x12], x2
        ld1             {v27.8b}, [x12], x2
        ld1             {v28.8b}, [x12], x2
        urhadd          v16.8b, v24.8b, v16.8b
        urhadd          v17.8b, v25.8b, v17.8b
        ld1             {v29.8b}, [x12], x2
        urhadd          v18.8b, v26.8b, v18.8b
        urhadd          v19.8b, v27.8b, v19.8b
        ld1             {v30.8b}, [x12], x2
        urhadd          v20.8b, v28.8b, v20.8b
        urhadd          v21.8b, v29.8b, v21.8b
        ld1             {v31.8b}, [x12], x2
        urhadd          v22.8b, v30.8b, v22.8b
        urhadd          v23.8b, v31.8b, v23.8b

  .ifc \type,avg
        // and once more with the 8 rows currently at x0
        ld1             {v24.8b}, [x0], x3
        ld1             {v25.8b}, [x0], x3
        ld1             {v26.8b}, [x0], x3
        urhadd          v16.8b, v16.8b, v24.8b
        ld1             {v27.8b}, [x0], x3
        urhadd          v17.8b, v17.8b, v25.8b
        ld1             {v28.8b}, [x0], x3
        urhadd          v18.8b, v18.8b, v26.8b
        ld1             {v29.8b}, [x0], x3
        urhadd          v19.8b, v19.8b, v27.8b
        ld1             {v30.8b}, [x0], x3
        urhadd          v20.8b, v20.8b, v28.8b
        ld1             {v31.8b}, [x0], x3
        urhadd          v21.8b, v21.8b, v29.8b
        urhadd          v22.8b, v22.8b, v30.8b
        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x3, lsl #3    // back to the first of the 8 rows
  .endif

        st1             {v16.8b}, [x0], x3
        st1             {v17.8b}, [x0], x3
        st1             {v18.8b}, [x0], x3
        st1             {v19.8b}, [x0], x3
        st1             {v20.8b}, [x0], x3
        st1             {v21.8b}, [x0], x3
        st1             {v22.8b}, [x0], x3
        st1             {v23.8b}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
|
|
|
|
// Shared core of the 8x8 horizontal+vertical lowpass.
// Loads 13 rows of 16 bytes through x1 (row step x3), runs the horizontal
// filter at 16-bit precision on each row and then the vertical filter over
// the intermediates. On return v16-v23 hold the 8 result rows in their low
// 8 bytes; nothing is stored here.
// x1 is left 12 rows past its entry value. The coefficients are loaded
// locally, which clobbers w12; v6 no longer holds them on return because
// lowpass_8.16 uses v6 as scratch.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        ld1             {v16.8h}, [x1], x3
        ld1             {v17.8h}, [x1], x3
        ld1             {v18.8h}, [x1], x3
        ld1             {v19.8h}, [x1], x3
        ld1             {v20.8h}, [x1], x3
        ld1             {v21.8h}, [x1], x3
        ld1             {v22.8h}, [x1], x3
        ld1             {v23.8h}, [x1], x3
        ld1             {v24.8h}, [x1], x3
        ld1             {v25.8h}, [x1], x3
        ld1             {v26.8h}, [x1], x3
        ld1             {v27.8h}, [x1], x3
        ld1             {v28.8h}, [x1]
        // horizontal pass, two rows per invocation
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 was not loaded above and its result is not read below; it only
        // fills the second slot of the two-row macro
        lowpass_8H      v28, v29

        // vertical pass, each result replaces the first row of its window
        lowpass_8.16    v16, v17, v18, v19, v20, v21
        lowpass_8.16    v17, v18, v19, v20, v21, v22

        lowpass_8.16    v18, v19, v20, v21, v22, v23
        lowpass_8.16    v19, v20, v21, v22, v23, v24

        lowpass_8.16    v20, v21, v22, v23, v24, v25
        lowpass_8.16    v21, v22, v23, v24, v25, v26

        lowpass_8.16    v22, v23, v24, v25, v26, v27
        lowpass_8.16    v23, v24, v25, v26, v27, v28

        ret
endfunc
|
|
|
|
// 8x8 horizontal+vertical lowpass with store, "put" or "avg".
// Calls put_h264_qpel8_hv_lowpass_neon_top for the filtering (x1 = input
// pointer, x3 = input row step) and writes v16-v23 through x0 with row
// step x2. The avg flavour first averages with the 8 rows currently at x0.
// The return address is parked in x10 and used directly by the final ret.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        ld1             {v0.8b},  [x0], x2
        ld1             {v1.8b},  [x0], x2
        ld1             {v2.8b},  [x0], x2
        urhadd          v16.8b, v16.8b, v0.8b
        ld1             {v3.8b},  [x0], x2
        urhadd          v17.8b, v17.8b, v1.8b
        ld1             {v4.8b},  [x0], x2
        urhadd          v18.8b, v18.8b, v2.8b
        ld1             {v5.8b},  [x0], x2
        urhadd          v19.8b, v19.8b, v3.8b
        ld1             {v6.8b},  [x0], x2
        urhadd          v20.8b, v20.8b, v4.8b
        ld1             {v7.8b},  [x0], x2
        urhadd          v21.8b, v21.8b, v5.8b
        urhadd          v22.8b, v22.8b, v6.8b
        urhadd          v23.8b, v23.8b, v7.8b
        sub             x0,  x0,  x2, lsl #3    // back to the first of the 8 rows
  .endif

        st1             {v16.8b}, [x0], x2
        st1             {v17.8b}, [x0], x2
        st1             {v18.8b}, [x0], x2
        st1             {v19.8b}, [x0], x2
        st1             {v20.8b}, [x0], x2
        st1             {v21.8b}, [x0], x2
        st1             {v22.8b}, [x0], x2
        st1             {v23.8b}, [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
|
|
|
|
// 8x8 horizontal+vertical lowpass averaged with a second source ("l2").
// Register use, as read from the code:
//   x0 - store pointer, stepped by x3 per row
//   x1 - filter input pointer, x3 its row step (consumed by the _top helper)
//   x2 - second source: 8 rows of 8 bytes stored back to back; x2 is
//        advanced by 64 bytes in total
// The avg flavour averages once more with the 8 rows currently at x0.
// The return address is parked in x10 and used directly by the final ret.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

        ld1             {v0.8b, v1.8b}, [x2], #16
        ld1             {v2.8b, v3.8b}, [x2], #16
        urhadd          v0.8b,  v0.8b,  v16.8b
        urhadd          v1.8b,  v1.8b,  v17.8b
        ld1             {v4.8b, v5.8b}, [x2], #16
        urhadd          v2.8b,  v2.8b,  v18.8b
        urhadd          v3.8b,  v3.8b,  v19.8b
        ld1             {v6.8b, v7.8b}, [x2], #16
        urhadd          v4.8b,  v4.8b,  v20.8b
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
  .ifc \type,avg
        ld1             {v16.8b}, [x0], x3
        ld1             {v17.8b}, [x0], x3
        ld1             {v18.8b}, [x0], x3
        urhadd          v0.8b,  v0.8b,  v16.8b
        ld1             {v19.8b}, [x0], x3
        urhadd          v1.8b,  v1.8b,  v17.8b
        ld1             {v20.8b}, [x0], x3
        urhadd          v2.8b,  v2.8b,  v18.8b
        ld1             {v21.8b}, [x0], x3
        urhadd          v3.8b,  v3.8b,  v19.8b
        ld1             {v22.8b}, [x0], x3
        urhadd          v4.8b,  v4.8b,  v20.8b
        ld1             {v23.8b}, [x0], x3
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
        sub             x0,  x0,  x3, lsl #3    // back to the first of the 8 rows
  .endif
        st1             {v0.8b},  [x0], x3
        st1             {v1.8b},  [x0], x3
        st1             {v2.8b},  [x0], x3
        st1             {v3.8b},  [x0], x3
        st1             {v4.8b},  [x0], x3
        st1             {v5.8b},  [x0], x3
        st1             {v6.8b},  [x0], x3
        st1             {v7.8b},  [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
|
|
|
|
// 16x16 horizontal+vertical lowpass built from four 8x8 calls, in the order
// upper left, lower left, upper right, lower right. The last block is
// reached with a tail call after x30 has been restored from x13.
// Each 8x8 call leaves x1 twelve rows past its start, hence the 4-row and
// 20-row corrections between calls.
.macro  h264_qpel16_hv  type
// x0 = store pointer with row step x2, x1 = input pointer with row step x3.
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

// x0 = store pointer and x1 = input pointer, both with row step x3.
// x4 must point 256 bytes past the start of the second source; x2 is
// derived from it and then walks through that buffer 64 bytes per 8x8 call.
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2,  x4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x3, lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv  put
        h264_qpel16_hv  avg
|
|
|
|
// Exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon.
// On entry the code stores through x0, loads through x1 and uses x2 as the
// row step - presumably dst, src and stride of the C prototype (confirm
// against the C side).
// Conventions shared by the functions below:
//   x14     - return address of the exported function
//   x8, x9  - copies of the entry x0 / x1 for the two-stage variants
//   x11     - entry sp while a temporary buffer is carved from the stack
// Several entry points only adjust pointers and then branch to the local
// label inside a sibling function (mc01, mc11, mc12, mc21).
.macro  h264_qpel8      type
// horizontal lowpass averaged with the source pixels at x1
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// horizontal lowpass only
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// horizontal lowpass averaged with the source pixels at x1 + 1
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// vertical lowpass averaged with the pixels at x12 (here: the source)
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // start two rows above
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// horizontal lowpass into a 64-byte stack buffer, then vertical lowpass
// (input taken from the saved x9) averaged with that buffer
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #64
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// horizontal lowpass into a stack buffer, then the hv lowpass averaged
// with that buffer
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x0,  sp
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0                 // x0 now points past the 64 bytes written
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        sub             x2,  x4,  #64           // start of the buffer again
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// like mc11, with the vertical stage reading one pixel to the right
// (x9 = entry x1 + 1) while the horizontal stage uses the entry x1
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

// vertical lowpass only
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// vertical lowpass into a stack buffer, then the hv lowpass averaged with
// that buffer
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        mov             x2,  #8
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0                 // x0 now points past the 64 bytes written
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        sub             x2,  x4,  #64           // start of the buffer again
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// hv lowpass only; the coefficients are loaded inside the hv helper
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp,  x11
        ret             x14
endfunc

// like mc12, with the vertical stage reading one pixel to the right
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

// like mc01, averaging with the source one row further down
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel8_mc01
endfunc

// like mc11, with the horizontal stage reading one row further down
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

// like mc21, with the horizontal stage reading one row further down
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

// like mc11, with the vertical stage one pixel to the right and the
// horizontal stage one row further down
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8      put
        h264_qpel8      avg
|
|
|
|
// Exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon.
// On entry the code stores through x0, loads through x1 and uses x2 as the
// row step - presumably dst, src and stride of the C prototype (confirm
// against the C side).
// Conventions shared by the functions below:
//   x14     - return address of the exported function
//   x8, x9  - copies of the entry x0 / x1 for the two-stage variants
//   x11     - entry sp while a temporary buffer is carved from the stack
// Several entry points only adjust pointers and then branch to the local
// label inside a sibling function (mc01, mc11, mc12, mc21).
.macro  h264_qpel16     type
// horizontal lowpass averaged with the source pixels at x1
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// horizontal lowpass only
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// horizontal lowpass averaged with the source pixels at x1 + 1
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// vertical lowpass averaged with the pixels at x12 (here: the source)
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel16_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // start two rows above
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc

// horizontal lowpass into a 256-byte stack buffer (row step 16), then
// vertical lowpass (input taken from the saved x9) averaged with it
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #256
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #16
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// packed horizontal lowpass into a stack buffer, then the hv lowpass
// averaged with that buffer (x4 = end of the packed data)
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  #2
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x0
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// like mc11, with the vertical stage reading one pixel to the right
// (x9 = entry x1 + 1) while the horizontal stage uses the entry x1
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

// vertical lowpass only
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc

// packed vertical lowpass into a stack buffer, then the hv lowpass
// averaged with that buffer (x4 = end of the packed data)
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x0,  sp
        mov             x3,  x2
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x0
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        mov             x2,  x3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// hv lowpass only
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp,  x11                // restore stack
        ret             x14
endfunc

// like mc12, with the vertical stage reading one pixel to the right
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

// like mc01, averaging with the source one row further down
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel16_mc01
endfunc

// like mc11, with the horizontal stage reading one row further down
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11
endfunc

// like mc21, with the horizontal stage reading one row further down
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc21
endfunc

// like mc11, with the vertical stage one pixel to the right and the
// horizontal stage one row further down
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16     put
        h264_qpel16     avg
|