mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
0f745b74ec
Benchmarks A53 A55 A72 A76 avg_h264_qpel_8_mc01_10_c: 936.5 924.0 656.0 504.7 avg_h264_qpel_8_mc01_10_neon: 234.7 202.0 120.7 63.2 avg_h264_qpel_8_mc02_10_c: 921.0 920.0 669.2 493.7 avg_h264_qpel_8_mc02_10_neon: 202.0 173.2 102.7 58.5 avg_h264_qpel_8_mc03_10_c: 936.5 924.0 656.0 509.5 avg_h264_qpel_8_mc03_10_neon: 236.2 203.7 120.0 63.2 avg_h264_qpel_8_mc10_10_c: 1441.0 1437.7 806.7 478.5 avg_h264_qpel_8_mc10_10_neon: 325.7 324.0 153.7 94.2 avg_h264_qpel_8_mc11_10_c: 2160.7 2148.2 1366.7 906.7 avg_h264_qpel_8_mc11_10_neon: 492.0 464.0 242.5 134.5 avg_h264_qpel_8_mc13_10_c: 2157.0 2138.2 1357.0 908.2 avg_h264_qpel_8_mc13_10_neon: 494.0 467.2 242.0 140.0 avg_h264_qpel_8_mc20_10_c: 1433.5 1410.0 785.2 486.0 avg_h264_qpel_8_mc20_10_neon: 293.7 289.7 138.0 91.5 avg_h264_qpel_8_mc30_10_c: 1458.5 1461.7 813.7 483.2 avg_h264_qpel_8_mc30_10_neon: 341.7 339.2 154.0 95.2 avg_h264_qpel_8_mc31_10_c: 2194.7 2197.2 1358.7 928.0 avg_h264_qpel_8_mc31_10_neon: 520.0 495.0 245.5 142.5 avg_h264_qpel_8_mc33_10_c: 2188.0 2205.5 1356.7 910.7 avg_h264_qpel_8_mc33_10_neon: 521.0 494.5 245.7 145.7 avg_h264_qpel_16_mc01_10_c: 3717.2 3595.0 2610.0 2012.0 avg_h264_qpel_16_mc01_10_neon: 920.5 791.5 483.2 240.5 avg_h264_qpel_16_mc02_10_c: 3684.0 3633.0 2659.0 1919.7 avg_h264_qpel_16_mc02_10_neon: 790.7 678.2 409.2 217.0 avg_h264_qpel_16_mc03_10_c: 3726.5 3596.0 2606.7 2010.0 avg_h264_qpel_16_mc03_10_neon: 922.0 792.5 483.2 239.7 avg_h264_qpel_16_mc10_10_c: 5912.0 5803.2 3241.5 1916.7 avg_h264_qpel_16_mc10_10_neon: 1267.5 1277.2 616.5 365.0 avg_h264_qpel_16_mc11_10_c: 8599.2 8482.5 5338.0 3616.2 avg_h264_qpel_16_mc11_10_neon: 1913.0 1827.0 956.2 542.2 avg_h264_qpel_16_mc13_10_c: 8643.7 8488.5 5388.0 3628.5 avg_h264_qpel_16_mc13_10_neon: 1914.7 1828.7 969.2 530.5 avg_h264_qpel_16_mc20_10_c: 5719.5 5641.0 3147.0 1946.2 avg_h264_qpel_16_mc20_10_neon: 1139.5 1150.0 539.5 344.0 avg_h264_qpel_16_mc30_10_c: 5930.0 5872.5 3267.5 1918.0 avg_h264_qpel_16_mc30_10_neon: 1331.5 1341.2 616.5 369.5 
avg_h264_qpel_16_mc31_10_c: 8758.7 8697.7 5353.0 3630.7 avg_h264_qpel_16_mc31_10_neon: 2018.7 1941.7 982.2 574.7 avg_h264_qpel_16_mc33_10_c: 8683.2 8675.2 5339.2 3634.7 avg_h264_qpel_16_mc33_10_neon: 2019.7 1940.2 994.5 566.0 put_h264_qpel_8_mc01_10_c: 854.2 843.0 599.2 478.0 put_h264_qpel_8_mc01_10_neon: 192.7 168.0 101.7 56.7 put_h264_qpel_8_mc02_10_c: 766.5 760.0 550.2 441.0 put_h264_qpel_8_mc02_10_neon: 160.0 139.2 88.7 53.0 put_h264_qpel_8_mc03_10_c: 854.2 843.0 599.2 479.0 put_h264_qpel_8_mc03_10_neon: 194.2 169.7 102.0 56.2 put_h264_qpel_8_mc10_10_c: 1352.7 1353.7 749.7 446.7 put_h264_qpel_8_mc10_10_neon: 289.7 294.2 135.5 88.5 put_h264_qpel_8_mc11_10_c: 2080.0 2066.2 1309.5 876.7 put_h264_qpel_8_mc11_10_neon: 450.0 429.7 229.7 131.2 put_h264_qpel_8_mc13_10_c: 2074.7 2060.2 1294.5 870.5 put_h264_qpel_8_mc13_10_neon: 452.5 434.5 226.5 130.0 put_h264_qpel_8_mc20_10_c: 1221.5 1216.0 684.5 399.7 put_h264_qpel_8_mc20_10_neon: 257.7 262.5 121.2 78.7 put_h264_qpel_8_mc30_10_c: 1379.0 1374.7 757.2 449.5 put_h264_qpel_8_mc30_10_neon: 305.7 310.2 135.5 86.5 put_h264_qpel_8_mc31_10_c: 2109.2 2119.7 1299.5 878.0 put_h264_qpel_8_mc31_10_neon: 478.0 458.5 226.0 137.2 put_h264_qpel_8_mc33_10_c: 2101.5 2115.2 1306.5 887.0 put_h264_qpel_8_mc33_10_neon: 479.0 458.7 229.7 141.7 put_h264_qpel_16_mc01_10_c: 3485.7 3396.7 2460.5 1914.5 put_h264_qpel_16_mc01_10_neon: 752.5 665.5 397.0 213.2 put_h264_qpel_16_mc02_10_c: 3103.5 3023.2 2154.7 1720.7 put_h264_qpel_16_mc02_10_neon: 622.7 551.2 347.7 196.2 put_h264_qpel_16_mc03_10_c: 3486.2 3394.0 2436.5 1917.7 put_h264_qpel_16_mc03_10_neon: 754.0 666.5 397.0 215.7 put_h264_qpel_16_mc10_10_c: 5533.0 5488.5 2989.0 1783.0 put_h264_qpel_16_mc10_10_neon: 1123.5 1165.2 535.2 334.7 put_h264_qpel_16_mc11_10_c: 8437.7 8281.2 5209.0 3510.7 put_h264_qpel_16_mc11_10_neon: 1745.0 1697.0 878.5 513.5 put_h264_qpel_16_mc13_10_c: 8567.7 8468.0 5221.5 3528.0 put_h264_qpel_16_mc13_10_neon: 1751.7 1698.2 889.2 507.0 put_h264_qpel_16_mc20_10_c: 4907.5 
4885.0 2786.2 1607.5 put_h264_qpel_16_mc20_10_neon: 995.5 1034.5 475.5 307.0 put_h264_qpel_16_mc30_10_c: 5579.7 5537.7 3045.2 1789.5 put_h264_qpel_16_mc30_10_neon: 1187.5 1231.2 532.5 334.5 put_h264_qpel_16_mc31_10_c: 8677.2 8672.5 5204.2 3516.0 put_h264_qpel_16_mc31_10_neon: 1850.7 1813.2 893.0 545.2 put_h264_qpel_16_mc33_10_c: 8688.7 8671.2 5223.2 3512.0 put_h264_qpel_16_mc33_10_neon: 1851.7 1814.2 908.5 535.2 Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com> Signed-off-by: Martin Storsjö <martin@martin.st>
1468 lines
52 KiB
ArmAsm
1468 lines
52 KiB
ArmAsm
/*
|
|
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "libavutil/aarch64/asm.S"
|
|
#include "neon.S"
|
|
|
|
/* H.264 qpel MC */
|
|
|
|
// Load the filter tap magnitudes into v6.
// The scalar register given as r receives 0x00140005, which is then copied
// to v6.s[0], giving v6.h[0] = 5 and v6.h[1] = 20.
// Clobbers: the register passed as r, v6.s[0].
.macro  lowpass_const   r
        movz            \r,  #20, lsl #16
        movk            \r,  #5
        mov             v6.s[0], \r
.endm
|
|
|
|
// Horizontal 6-tap filter (1, -5, 20, 20, -5, 1) on two rows of 8-bit pixels.
//   r0:r1  first row  (r0 = bytes 0-7, r1 = bytes 8-15)
//   r2:r3  second row (same layout)
//   d0/d1  results for the first/second row
//   narrow when non-zero the 16-bit sums are rounded, shifted right by 5
//          and saturated to unsigned 8 bit; otherwise they stay as .8h
// Expects v6.h[0] = 5 and v6.h[1] = 20 (see lowpass_const).
// d0/d1 may alias the inputs; the instruction order below relies on every
// read of an input row happening before the matching destination is written.
// trashes v0-v5
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,      v2.8b,     v3.8b            // centre pair (x20)
        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,      v4.8b,     v5.8b            // inner pair (x5, subtracted)
        ext             v1.8b,      \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h,  \r0\().8b, v1.8b            // outer pair (x1)
        ext             v0.8b,      \r2\().8b, \r3\().8b, #2
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        ext             v1.8b,      \r2\().8b, \r3\().8b, #3
        uaddl           v0.8h,      v0.8b,     v1.8b
        ext             v1.8b,      \r2\().8b, \r3\().8b, #1
        mls             \d0\().8h,  v4.8h,     v6.h[0]
        ext             v3.8b,      \r2\().8b, \r3\().8b, #4
        uaddl           v1.8h,      v1.8b,     v3.8b
        ext             v2.8b,      \r2\().8b, \r3\().8b, #5
        uaddl           \d1\().8h,  \r2\().8b, v2.8b
        mla             \d1\().8h,  v0.8h,     v6.h[1]
        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
        sqrshrun        \d1\().8b,  \d1\().8h, #5
  .endif
.endm
|
|
|
|
// Vertical 6-tap filter (1, -5, 20, 20, -5, 1) producing two output rows
// from seven consecutive input rows of 8-bit pixels.
//   r0-r6  input rows; d0 is computed from r0-r5, d1 from r1-r6
//   d0/d1  output rows
//   narrow when non-zero the 16-bit sums are rounded, shifted right by 5
//          and saturated to unsigned 8 bit; otherwise they stay as .8h
// Expects v6.h[0] = 5 and v6.h[1] = 20 (see lowpass_const).
// trashes v0-v4
.macro  lowpass_8_v     r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1,  narrow=1
        uaddl           v2.8h,      \r2\().8b, \r3\().8b        // d0 centre pair
        uaddl           v0.8h,      \r3\().8b, \r4\().8b        // d1 centre pair
        uaddl           v4.8h,      \r1\().8b, \r4\().8b        // d0 inner pair
        uaddl           v1.8h,      \r2\().8b, \r5\().8b        // d1 inner pair
        uaddl           \d0\().8h,  \r0\().8b, \r5\().8b        // d0 outer pair
        uaddl           \d1\().8h,  \r1\().8b, \r6\().8b        // d1 outer pair
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        mls             \d0\().8h,  v4.8h,     v6.h[0]
        mla             \d1\().8h,  v0.8h,     v6.h[1]
        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
        sqrshrun        \d1\().8b,  \d1\().8h, #5
  .endif
.endm
|
|
|
|
// In-place horizontal 6-tap filter on two rows held in full 128-bit registers.
// Each of r0 and r1 holds 16 bytes of one row on entry and the eight
// unrounded, unshifted 16-bit filter sums for that row on exit.
// Expects v6.h[0] = 5 and v6.h[1] = 20 (see lowpass_const).
// trashes v0-v5, v7, v30-v31
.macro  lowpass_8H      r0,  r1
        ext             v0.16b,     \r0\().16b, \r0\().16b, #2
        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
        uaddl           v0.8h,      v0.8b,      v1.8b
        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
        uaddl           v2.8h,      v2.8b,      v3.8b
        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
        uaddl           \r0\().8h,  \r0\().8b,  v30.8b
        ext             v4.16b,     \r1\().16b, \r1\().16b, #2
        mla             \r0\().8h,  v0.8h,      v6.h[1]
        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
        uaddl           v4.8h,      v4.8b,      v5.8b
        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
        mls             \r0\().8h,  v2.8h,      v6.h[0]
        ext             v0.16b,     \r1\().16b, \r1\().16b, #4
        uaddl           v7.8h,      v7.8b,      v0.8b
        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
        uaddl           \r1\().8h,  \r1\().8b,  v31.8b
        mla             \r1\().8h,  v4.8h,      v6.h[1]
        mls             \r1\().8h,  v7.8h,      v6.h[0]
.endm
|
|
|
|
// Single-row variant of lowpass_8: horizontal 6-tap filter on one row.
//   r0:r1  input row (r0 = bytes 0-7, r1 = bytes 8-15)
//   d0     result
//   narrow when non-zero the 16-bit sums are rounded, shifted right by 5
//          and saturated to unsigned 8 bit; otherwise they stay as .8h
// Expects v6.h[0] = 5 and v6.h[1] = 20 (see lowpass_const).
// trashes v2-v5, v30
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,      v2.8b,     v3.8b
        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,      v4.8b,     v5.8b
        ext             v30.8b,     \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h,  \r0\().8b, v30.8b
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        mls             \d0\().8h,  v4.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
  .endif
.endm
|
|
|
|
// Vertical 6-tap filter on six rows of signed 16-bit intermediates,
// evaluated in 32-bit precision.
//   r0-r5  input rows (.8h each)
// Result: (r0 + r5) + 20 * (r2 + r3) - 5 * (r1 + r4), rounded and shifted
// right by 10, then saturated to unsigned 8 bit and written to r0 (.8b).
// The multiplications are done with shifts and adds, so the constants in v6
// are not needed; v6 is used as scratch and its previous contents are lost.
// trashes v0-v7
.macro  lowpass_8.16    r0,  r1,  r2,  r3,  r4,  r5
        saddl           v5.4s,  \r2\().4h, \r3\().4h            // centre pair, low half
        saddl2          v1.4s,  \r2\().8h, \r3\().8h            // centre pair, high half
        saddl           v6.4s,  \r1\().4h, \r4\().4h            // inner pair, low half
        saddl2          v2.4s,  \r1\().8h, \r4\().8h            // inner pair, high half
        saddl           v0.4s,  \r0\().4h, \r5\().4h            // outer pair, low half
        saddl2          v4.4s,  \r0\().8h, \r5\().8h            // outer pair, high half

        // low half: v5 = 16 * c + 4 * c = 20 * c, v6 = b + 4 * b = 5 * b
        shl             v3.4s,  v5.4s,  #4
        shl             v5.4s,  v5.4s,  #2
        shl             v7.4s,  v6.4s,  #2
        add             v5.4s,  v5.4s,  v3.4s
        add             v6.4s,  v6.4s,  v7.4s

        // high half: same scaling for v1 (x20) and v2 (x5)
        shl             v3.4s,  v1.4s,  #4
        shl             v1.4s,  v1.4s,  #2
        shl             v7.4s,  v2.4s,  #2
        add             v1.4s,  v1.4s,  v3.4s
        add             v2.4s,  v2.4s,  v7.4s

        add             v5.4s,  v5.4s,  v0.4s
        sub             v5.4s,  v5.4s,  v6.4s

        add             v1.4s,  v1.4s,  v4.4s
        sub             v1.4s,  v1.4s,  v2.4s

        rshrn           v5.4h,  v5.4s,  #10
        rshrn2          v5.8h,  v1.4s,  #10

        sqxtun          \r0\().8b, v5.8h
.endm
|
|
|
|
// Horizontal lowpass of a 16-pixel-wide block, written as two consecutive
// 8-byte-wide column blocks (left half first, then right half).
// In:  x0 = dst, x1 = src, x2 = src stride
// Each pass runs put_h264_qpel8_h_lowpass_neon with x12 = 16 rows and
// x3 = 8 as the destination stride. x0 is not rewound between the passes,
// so the second half lands directly after the first.
// x4 holds the return address across the first call; the second call is a
// tail call.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30
        mov             x12, #16
        mov             x3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             x1,  x1,  x2, lsl #4            // rewind src by 16 rows
        add             x1,  x1,  #8                    // step to the right half
        mov             x12, #16
        mov             x30, x4
        b               put_h264_qpel8_h_lowpass_neon
endfunc
|
|
|
|
// Horizontal lowpass kernels, instantiated for type = put and type = avg.
// Registers: x0 = dst, x1 = src, x2 = src stride, x3 = dst stride,
//            x12 = number of rows (two rows are handled per iteration).
// Expects v6 to hold the filter constants (see lowpass_const).
.macro  h264_qpel_h_lowpass type
// 16-wide version: runs the 8-wide kernel over the left half, rewinds both
// pointers by 16 rows, steps 8 bytes right and then FALLS THROUGH into the
// 8-wide kernel below for the right half. There is deliberately no ret here.
// x13 holds the return address across the call.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0,  x0,  x3, lsl #4
        sub             x1,  x1,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8-wide kernel: loops until x12 reaches zero.
function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8b, v29.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        subs            x12, x12, #2
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        // average with the two rows already present in dst
        ld1             {v2.8b},  [x0], x3
        ld1             {v3.8b},  [x0]
        urhadd          v28.8b, v28.8b, v2.8b
        urhadd          v16.8b, v16.8b, v3.8b
        sub             x0,  x0,  x3                    // back to the first row
  .endif
        st1             {v28.8b}, [x0], x3
        st1             {v16.8b}, [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
|
|
|
|
// Horizontal lowpass averaged with a second source, instantiated for
// type = put and type = avg.
// Registers: x0 = dst, x1 = src to filter, x3 = second source,
//            x2 = stride used for all three pointers,
//            x12 = number of rows (two rows are handled per iteration).
// Expects v6 to hold the filter constants (see lowpass_const).
.macro  h264_qpel_h_lowpass_l2 type
// 16-wide version: left half via a call, then rewinds the three pointers by
// 16 rows, steps 8 bytes right and FALLS THROUGH into the 8-wide kernel.
// x13 holds the return address across the call.
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2, lsl #4
        sub             x1,  x1,  x2, lsl #4
        sub             x3,  x3,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        add             x3,  x3,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8-wide kernel: loops until x12 reaches zero.
function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8b, v27.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        ld1             {v28.8b},         [x3], x2
        ld1             {v29.8b},         [x3], x2
        subs            x12, x12, #2
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8b, v26.8b, v28.8b          // average with second source
        urhadd          v27.8b, v27.8b, v29.8b
  .ifc \type,avg
        // average once more with the two rows already present in dst
        ld1             {v2.8b},  [x0], x2
        ld1             {v3.8b},  [x0]
        urhadd          v26.8b, v26.8b, v2.8b
        urhadd          v27.8b, v27.8b, v3.8b
        sub             x0,  x0,  x2
  .endif
        st1             {v26.8b}, [x0], x2
        st1             {v27.8b}, [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
|
|
|
|
// Vertical lowpass of a 16x16 block, written as four consecutive 8x8 tiles
// with an 8-byte row stride (x2 = 8).
// In:  x0 = dst, x1 = src, x3 = src stride
// Tile order: left rows 0-7, left rows 8-15, right rows 0-7, right rows 8-15.
// Each 8x8 kernel call leaves x1 twelve rows further down, so x1 is pulled
// back by four rows to reach the next group of eight. x0 is never rewound.
// x4 holds the return address; the last kernel call is a tail call.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30
        mov             x2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #4            // back to the first row ...
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8                    // ... of the right half
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon
endfunc
|
|
|
|
// Vertical lowpass kernels, instantiated for type = put and type = avg.
// Registers: x0 = dst, x2 = dst stride, x1 = src, x3 = src stride.
// Expects v6 to hold the filter constants (see lowpass_const).
.macro  h264_qpel_v_lowpass type
// 16x16 version: three calls to the 8x8 kernel (left top, left bottom,
// right top), then FALLS THROUGH into the kernel for the right bottom tile.
// x4 holds the return address across the calls.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x0,  x0,  x2, lsl #4            // dst back to row 0 ...
        add             x0,  x0,  #8                    // ... of the right half
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8 kernel: reads 13 source rows and writes 8 destination rows.
// On exit x1 points at the 13th source row and x0 eight rows past its
// entry value.
function \type\()_h264_qpel8_v_lowpass_neon
        ld1             {v16.8b}, [x1], x3
        ld1             {v17.8b}, [x1], x3
        ld1             {v18.8b}, [x1], x3
        ld1             {v19.8b}, [x1], x3
        ld1             {v20.8b}, [x1], x3
        ld1             {v21.8b}, [x1], x3
        ld1             {v22.8b}, [x1], x3
        ld1             {v23.8b}, [x1], x3
        ld1             {v24.8b}, [x1], x3
        ld1             {v25.8b}, [x1], x3
        ld1             {v26.8b}, [x1], x3
        ld1             {v27.8b}, [x1], x3
        ld1             {v28.8b}, [x1]

        // two output rows per invocation, results in v16-v23
        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
  .ifc \type,avg
        // average with the eight rows already present in dst
        ld1             {v24.8b}, [x0], x2
        ld1             {v25.8b}, [x0], x2
        ld1             {v26.8b}, [x0], x2
        urhadd          v16.8b, v16.8b, v24.8b
        ld1             {v27.8b}, [x0], x2
        urhadd          v17.8b, v17.8b, v25.8b
        ld1             {v28.8b}, [x0], x2
        urhadd          v18.8b, v18.8b, v26.8b
        ld1             {v29.8b}, [x0], x2
        urhadd          v19.8b, v19.8b, v27.8b
        ld1             {v30.8b}, [x0], x2
        urhadd          v20.8b, v20.8b, v28.8b
        ld1             {v31.8b}, [x0], x2
        urhadd          v21.8b, v21.8b, v29.8b
        urhadd          v22.8b, v22.8b, v30.8b
        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x2, lsl #3            // back to the first dst row
  .endif

        st1             {v16.8b}, [x0], x2
        st1             {v17.8b}, [x0], x2
        st1             {v18.8b}, [x0], x2
        st1             {v19.8b}, [x0], x2
        st1             {v20.8b}, [x0], x2
        st1             {v21.8b}, [x0], x2
        st1             {v22.8b}, [x0], x2
        st1             {v23.8b}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
|
|
|
|
// Vertical lowpass averaged with a second source, instantiated for
// type = put and type = avg.
// Registers: x0 = dst, x3 = stride for dst and for the filtered source x1,
//            x12 = second source, x2 = stride of the second source.
// Expects v6 to hold the filter constants (see lowpass_const).
.macro  h264_qpel_v_lowpass_l2 type
// 16x16 version: three calls to the 8x8 kernel (left top, left bottom,
// right top), then FALLS THROUGH into the kernel for the right bottom tile.
// x4 holds the return address across the calls.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x0,  x0,  x3,  lsl #4           // dst and second source back
        sub             x12, x12, x2,  lsl #4           // to row 0 ...
        add             x0,  x0,  #8                    // ... of the right half
        add             x12, x12, #8
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8 kernel: reads 13 rows from x1 and 8 rows from x12, writes 8 rows.
function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1             {v16.8b}, [x1], x3
        ld1             {v17.8b}, [x1], x3
        ld1             {v18.8b}, [x1], x3
        ld1             {v19.8b}, [x1], x3
        ld1             {v20.8b}, [x1], x3
        ld1             {v21.8b}, [x1], x3
        ld1             {v22.8b}, [x1], x3
        ld1             {v23.8b}, [x1], x3
        ld1             {v24.8b}, [x1], x3
        ld1             {v25.8b}, [x1], x3
        ld1             {v26.8b}, [x1], x3
        ld1             {v27.8b}, [x1], x3
        ld1             {v28.8b}, [x1]

        // two output rows per invocation, results in v16-v23
        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23

        // average the filtered rows with the second source
        ld1             {v24.8b}, [x12], x2
        ld1             {v25.8b}, [x12], x2
        ld1             {v26.8b}, [x12], x2
        ld1             {v27.8b}, [x12], x2
        ld1             {v28.8b}, [x12], x2
        urhadd          v16.8b, v24.8b, v16.8b
        urhadd          v17.8b, v25.8b, v17.8b
        ld1             {v29.8b}, [x12], x2
        urhadd          v18.8b, v26.8b, v18.8b
        urhadd          v19.8b, v27.8b, v19.8b
        ld1             {v30.8b}, [x12], x2
        urhadd          v20.8b, v28.8b, v20.8b
        urhadd          v21.8b, v29.8b, v21.8b
        ld1             {v31.8b}, [x12], x2
        urhadd          v22.8b, v30.8b, v22.8b
        urhadd          v23.8b, v31.8b, v23.8b

  .ifc \type,avg
        // average once more with the eight rows already present in dst
        ld1             {v24.8b}, [x0], x3
        ld1             {v25.8b}, [x0], x3
        ld1             {v26.8b}, [x0], x3
        urhadd          v16.8b, v16.8b, v24.8b
        ld1             {v27.8b}, [x0], x3
        urhadd          v17.8b, v17.8b, v25.8b
        ld1             {v28.8b}, [x0], x3
        urhadd          v18.8b, v18.8b, v26.8b
        ld1             {v29.8b}, [x0], x3
        urhadd          v19.8b, v19.8b, v27.8b
        ld1             {v30.8b}, [x0], x3
        urhadd          v20.8b, v20.8b, v28.8b
        ld1             {v31.8b}, [x0], x3
        urhadd          v21.8b, v21.8b, v29.8b
        urhadd          v22.8b, v22.8b, v30.8b
        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x3, lsl #3            // back to the first dst row
  .endif

        st1             {v16.8b}, [x0], x3
        st1             {v17.8b}, [x0], x3
        st1             {v18.8b}, [x0], x3
        st1             {v19.8b}, [x0], x3
        st1             {v20.8b}, [x0], x3
        st1             {v21.8b}, [x0], x3
        st1             {v22.8b}, [x0], x3
        st1             {v23.8b}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
|
|
|
|
// Shared core of the 8x8 horizontal+vertical (centre) lowpass.
// In:  x1 = src, x3 = src stride
// Out: v16-v23 (.8b) = the eight filtered rows; nothing is stored here.
// Loads 13 rows of 16 bytes, filters each row horizontally into 16-bit sums
// (lowpass_8H) and then filters those vertically in 32-bit precision
// (lowpass_8.16). Loads the filter constants itself through w12.
// On exit x1 points at the 13th source row.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        ld1             {v16.8h}, [x1], x3
        ld1             {v17.8h}, [x1], x3
        ld1             {v18.8h}, [x1], x3
        ld1             {v19.8h}, [x1], x3
        ld1             {v20.8h}, [x1], x3
        ld1             {v21.8h}, [x1], x3
        ld1             {v22.8h}, [x1], x3
        ld1             {v23.8h}, [x1], x3
        ld1             {v24.8h}, [x1], x3
        ld1             {v25.8h}, [x1], x3
        ld1             {v26.8h}, [x1], x3
        ld1             {v27.8h}, [x1], x3
        ld1             {v28.8h}, [x1]
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 was never loaded; it only fills the second slot of the macro
        // and its result is not read by the vertical passes below.
        lowpass_8H      v28, v29

        lowpass_8.16    v16, v17, v18, v19, v20, v21
        lowpass_8.16    v17, v18, v19, v20, v21, v22

        lowpass_8.16    v18, v19, v20, v21, v22, v23
        lowpass_8.16    v19, v20, v21, v22, v23, v24

        lowpass_8.16    v20, v21, v22, v23, v24, v25
        lowpass_8.16    v21, v22, v23, v24, v25, v26

        lowpass_8.16    v22, v23, v24, v25, v26, v27
        lowpass_8.16    v23, v24, v25, v26, v27, v28

        ret
endfunc
|
|
|
|
// 8x8 horizontal+vertical lowpass with store, instantiated for type = put
// and type = avg.
// Registers: x0 = dst, x2 = dst stride, x1 = src, x3 = src stride.
// x10 holds the return address across the call to the shared core.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top      // result in v16-v23
  .ifc \type,avg
        // average with the eight rows already present in dst
        ld1             {v0.8b},  [x0], x2
        ld1             {v1.8b},  [x0], x2
        ld1             {v2.8b},  [x0], x2
        urhadd          v16.8b, v16.8b, v0.8b
        ld1             {v3.8b},  [x0], x2
        urhadd          v17.8b, v17.8b, v1.8b
        ld1             {v4.8b},  [x0], x2
        urhadd          v18.8b, v18.8b, v2.8b
        ld1             {v5.8b},  [x0], x2
        urhadd          v19.8b, v19.8b, v3.8b
        ld1             {v6.8b},  [x0], x2
        urhadd          v20.8b, v20.8b, v4.8b
        ld1             {v7.8b},  [x0], x2
        urhadd          v21.8b, v21.8b, v5.8b
        urhadd          v22.8b, v22.8b, v6.8b
        urhadd          v23.8b, v23.8b, v7.8b
        sub             x0,  x0,  x2, lsl #3            // back to the first dst row
  .endif

        st1             {v16.8b}, [x0], x2
        st1             {v17.8b}, [x0], x2
        st1             {v18.8b}, [x0], x2
        st1             {v19.8b}, [x0], x2
        st1             {v20.8b}, [x0], x2
        st1             {v21.8b}, [x0], x2
        st1             {v22.8b}, [x0], x2
        st1             {v23.8b}, [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
|
|
|
|
// 8x8 horizontal+vertical lowpass averaged with a second source,
// instantiated for type = put and type = avg.
// Registers: x0 = dst, x3 = stride for dst and src, x1 = src,
//            x2 = second source, read as 64 contiguous bytes (8 bytes per
//                 row) and advanced by 64.
// x10 holds the return address across the call to the shared core.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top      // result in v16-v23

        // average with the second source, two rows per load
        ld1             {v0.8b, v1.8b}, [x2], #16
        ld1             {v2.8b, v3.8b}, [x2], #16
        urhadd          v0.8b,  v0.8b,  v16.8b
        urhadd          v1.8b,  v1.8b,  v17.8b
        ld1             {v4.8b, v5.8b}, [x2], #16
        urhadd          v2.8b,  v2.8b,  v18.8b
        urhadd          v3.8b,  v3.8b,  v19.8b
        ld1             {v6.8b, v7.8b}, [x2], #16
        urhadd          v4.8b,  v4.8b,  v20.8b
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
  .ifc \type,avg
        // average once more with the eight rows already present in dst
        ld1             {v16.8b}, [x0], x3
        ld1             {v17.8b}, [x0], x3
        ld1             {v18.8b}, [x0], x3
        urhadd          v0.8b,  v0.8b,  v16.8b
        ld1             {v19.8b}, [x0], x3
        urhadd          v1.8b,  v1.8b,  v17.8b
        ld1             {v20.8b}, [x0], x3
        urhadd          v2.8b,  v2.8b,  v18.8b
        ld1             {v21.8b}, [x0], x3
        urhadd          v3.8b,  v3.8b,  v19.8b
        ld1             {v22.8b}, [x0], x3
        urhadd          v4.8b,  v4.8b,  v20.8b
        ld1             {v23.8b}, [x0], x3
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
        sub             x0,  x0,  x3, lsl #3            // back to the first dst row
  .endif
        st1             {v0.8b},  [x0], x3
        st1             {v1.8b},  [x0], x3
        st1             {v2.8b},  [x0], x3
        st1             {v3.8b},  [x0], x3
        st1             {v4.8b},  [x0], x3
        st1             {v5.8b},  [x0], x3
        st1             {v6.8b},  [x0], x3
        st1             {v7.8b},  [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
|
|
|
|
// 16x16 horizontal+vertical lowpass wrappers built from four 8x8 tiles
// (left top, left bottom, right top, right bottom), instantiated for
// type = put and type = avg. x13 holds the return address across the first
// three calls; the fourth tile is a tail call.
.macro  h264_qpel16_hv  type
// Registers: x0 = dst, x2 = dst stride, x1 = src, x3 = src stride.
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #4            // src back to row 0 ...
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8                    // ... of the right half
        sub             x0,  x0,  x2, lsl #4            // same for dst
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

// Registers: x0 = dst, x3 = stride for dst and src, x1 = src,
//            x4 = address 256 bytes past the start of the packed second
//                 source. x2 is set to x4 - 256 and is then advanced by the
//                 8x8 kernel across all four tiles.
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2,  x4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x3, lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
|
|
|
|
// Exported 8x8 quarter-pel entry points, instantiated for type = put and
// type = avg. The mcXY suffix names the horizontal (X) and vertical (Y)
// quarter-sample position.
// On entry: x0 = dst, x1 = src, x2 = stride, as consumed by the helpers.
// Conventions used throughout:
//   x14      saved return address (entries return with ret x14)
//   x8, x9   copies of dst and src for the second filter pass
//   x11      saved sp while a scratch buffer is live on the stack
// Several entries only adjust pointers and branch to the local label inside
// a sibling entry (mc01, mc11, mc21, mc12) to share its body.
.macro  h264_qpel8      type
// h half-pel averaged with the integer sample at src
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1                         // second source = src
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// h half-pel
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// h half-pel averaged with the integer sample at src + 1
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1                    // second source = src + 1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// v half-pel averaged with the integer sample at x12 (= src here)
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1            // start two rows above
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// h half-pel (into a 64-byte stack buffer) averaged with v half-pel
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #64
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8                         // packed buffer stride
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp                         // second source = buffer
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// h half-pel (into the stack buffer) averaged with the hv centre sample
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x0,  sp
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0                         // end of the 64 bytes written
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        sub             x2,  x4,  #64                   // start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc11, with the vertical pass taken from src + 1
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

// v half-pel
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// v half-pel (into the stack buffer) averaged with the hv centre sample
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        mov             x2,  #8
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0                         // end of the 64 bytes written
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        sub             x2,  x4,  #64                   // start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// hv centre sample
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc12, with the vertical pass taken from src + 1
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

// as mc01, averaging with the sample one row below (src + stride)
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel8_mc01
endfunc

// as mc11, with the horizontal pass taken one row below
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

// as mc21, with the horizontal pass taken one row below
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

// as mc11, with the vertical pass from src + 1 and the horizontal pass
// taken one row below
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg
|
|
|
|
// Exported 16x16 quarter-pel entry points, instantiated for type = put and
// type = avg. The mcXY suffix names the horizontal (X) and vertical (Y)
// quarter-sample position.
// On entry: x0 = dst, x1 = src, x2 = stride, as consumed by the helpers.
// Conventions used throughout:
//   x14      saved return address (entries return with ret x14)
//   x8, x9   copies of dst and src for the second filter pass
//   x11      saved sp while a scratch buffer is live on the stack
// Several entries only adjust pointers and branch to the local label inside
// a sibling entry (mc01, mc11, mc21, mc12) to share its body.
.macro  h264_qpel16     type
// h half-pel averaged with the integer sample at src
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1                         // second source = src
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// h half-pel
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// h half-pel averaged with the integer sample at src + 1
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1                    // second source = src + 1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// v half-pel averaged with the integer sample at x12 (= src here)
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel16_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1            // start two rows above
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc

// h half-pel (into a 256-byte stack buffer) averaged with v half-pel
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #256
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #16                        // buffer stride
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp                         // second source = buffer
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// h half-pel (packed into the stack buffer) averaged with the hv centre
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  #2
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x0                         // end of the packed data
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc11, with the vertical pass taken from src + 1
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

// v half-pel
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc

// v half-pel (packed into the stack buffer) averaged with the hv centre
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x0,  sp
        mov             x3,  x2
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x0                         // end of the packed data
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        mov             x2,  x3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// hv centre sample
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp,  x11                        // restore stack
        ret             x14
endfunc

// as mc12, with the vertical pass taken from src + 1
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

// as mc01, averaging with the sample one row below (src + stride)
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel16_mc01
endfunc

// as mc11, with the horizontal pass taken one row below
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11
endfunc

// as mc21, with the horizontal pass taken one row below
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc21
endfunc

// as mc11, with the vertical pass from src + 1 and the horizontal pass
// taken one row below
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg
|
|
|
|
// Horizontal 6-tap filter (1, -5, 20, 20, -5, 1) on two rows of 16-bit
// pixels, with the result clipped to 1023.
//   r0:r1  first row  (r0 = pixels 0-7, r1 = pixels 8-15)
//   r2:r3  second row (same layout)
//   d0/d1  results (.8h) for the first/second row
// The x5 term is removed with an unsigned saturating subtract, so a negative
// sum clamps to 0. The sums are then rounded, shifted right by 5 and capped
// at 1023 with umin.
// Expects v6.h[0] = 5 and v6.h[1] = 20 (see lowpass_const).
// trashes v0-v5
.macro  lowpass_8_10    r0,  r1,  r2,  r3,  d0,  d1
        ext             v2.16b,     \r0\().16b, \r1\().16b, #4
        ext             v3.16b,     \r0\().16b, \r1\().16b, #6
        add             v2.8h,      v2.8h,      v3.8h           // centre pair (x20)
        ext             v4.16b,     \r0\().16b, \r1\().16b, #2
        ext             v5.16b,     \r0\().16b, \r1\().16b, #8
        add             v4.8h,      v4.8h,      v5.8h           // inner pair (x5, subtracted)
        ext             v1.16b,     \r0\().16b, \r1\().16b, #10

        add             \d0\().8h,  \r0\().8h,  v1.8h           // outer pair (x1)
        ext             v0.16b,     \r2\().16b, \r3\().16b, #4
        mla             \d0\().8h,  v2.8h,      v6.h[1]
        ext             v1.16b,     \r2\().16b, \r3\().16b, #6
        add             v0.8h,      v0.8h,      v1.8h
        ext             v1.16b,     \r2\().16b, \r3\().16b, #2
        mul             v5.8h,      v4.8h,      v6.h[0]
        uqsub           \d0\().8h,  \d0\().8h,  v5.8h
        urshr           \d0\().8h,  \d0\().8h,  #5

        ext             v3.16b,     \r2\().16b, \r3\().16b, #8
        add             v1.8h,      v1.8h,      v3.8h
        ext             v2.16b,     \r2\().16b, \r3\().16b, #10

        add             \d1\().8h,  \r2\().8h,  v2.8h
        mla             \d1\().8h,  v0.8h,      v6.h[1]
        mul             v5.8h,      v1.8h,      v6.h[0]
        uqsub           \d1\().8h,  \d1\().8h,  v5.8h
        mvni            v5.8h,      #0xFC, lsl #8               // 1023 for clipping
        urshr           \d1\().8h,  \d1\().8h,  #5

        umin            \d0\().8h,  \d0\().8h,  v5.8h
        umin            \d1\().8h,  \d1\().8h,  v5.8h
.endm
|
|
|
|
// Vertical 6-tap filter (1, -5, 20, 20, -5, 1) producing two output rows
// from seven consecutive rows of 16-bit pixels, clipped to 1023.
//   r0-r6  input rows (.8h); d0 is computed from r0-r5, d1 from r1-r6
//   d0/d1  output rows (.8h)
// The x5 term is removed with an unsigned saturating subtract, so a negative
// sum clamps to 0. The sums are then rounded, shifted right by 5 and capped
// at 1023 with umin.
// Expects v6.h[0] = 5 and v6.h[1] = 20 (see lowpass_const).
// trashes v0-v4
.macro  lowpass_8_10_v  r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1
        add             v2.8h,      \r2\().8h,  \r3\().8h       // d0 centre pair
        add             v0.8h,      \r3\().8h,  \r4\().8h       // d1 centre pair
        add             v4.8h,      \r1\().8h,  \r4\().8h       // d0 inner pair
        add             v1.8h,      \r2\().8h,  \r5\().8h       // d1 inner pair

        add             \d0\().8h,  \r0\().8h,  \r5\().8h       // d0 outer pair
        add             \d1\().8h,  \r1\().8h,  \r6\().8h       // d1 outer pair
        mla             \d0\().8h,  v2.8h,      v6.h[1]
        mla             \d1\().8h,  v0.8h,      v6.h[1]
        mul             v2.8h,      v4.8h,      v6.h[0]
        mul             v0.8h,      v1.8h,      v6.h[0]
        uqsub           \d0\().8h,  \d0\().8h,  v2.8h
        uqsub           \d1\().8h,  \d1\().8h,  v0.8h

        mvni            v0.8h,      #0xFC, lsl #8               // 1023 for clipping

        urshr           \d0\().8h,  \d0\().8h,  #5
        urshr           \d1\().8h,  \d1\().8h,  #5

        umin            \d0\().8h,  \d0\().8h,  v0.8h
        umin            \d1\().8h,  \d1\().8h,  v0.8h
.endm
|
|
|
|
// 16-bit-sample variant of the packed horizontal lowpass: a 16-pixel-wide
// block is written as two consecutive column blocks of 16 bytes per row
// (left half first, then right half).
// In:  x0 = dst, x1 = src, x2 = src stride
// Each pass runs put_h264_qpel8_h_lowpass_neon_10 with x12 = 32 (that
// kernel subtracts 4 per pair of rows, i.e. 16 rows) and x3 = 16 as the
// destination stride. x0 is not rewound between the passes.
// x4 holds the return address across the first call; the second call is a
// tail call.
function put_h264_qpel16_h_lowpass_neon_packed_10
        mov             x4,  x30
        mov             x12, #32
        mov             x3,  #16
        bl              put_h264_qpel8_h_lowpass_neon_10
        sub             x1,  x1,  x2, lsl #4            // rewind src by 16 rows
        add             x1,  x1,  #16                   // step 16 bytes to the right half
        mov             x12, #32
        mov             x30, x4
        b               put_h264_qpel8_h_lowpass_neon_10
endfunc
|
|
|
|
// Horizontal lowpass kernels for 16-bit samples, instantiated for
// type = put and type = avg.
// Registers: x0 = dst, x1 = src, x2 = src stride, x3 = dst stride,
//            x12 = loop counter, decremented by 4 for every two rows.
// Expects v6 to hold the filter constants (see lowpass_const).
.macro  h264_qpel_h_lowpass_10 type
// 16-wide version: runs the 8-wide kernel over the left half, rewinds both
// pointers by 16 rows, steps 16 bytes right and then FALLS THROUGH into the
// 8-wide kernel below for the right half. There is deliberately no ret here.
// x13 holds the return address across the call.
function \type\()_h264_qpel16_h_lowpass_neon_10
        mov             x13, x30
        mov             x12, #32
        bl              \type\()_h264_qpel8_h_lowpass_neon_10
        sub             x0,  x0,  x3, lsl #4
        sub             x1,  x1,  x2, lsl #4
        add             x0,  x0,  #16
        add             x1,  x1,  #16
        mov             x12, #32
        mov             x30, x13
endfunc

// 8-wide kernel: loops until x12 reaches zero.
function \type\()_h264_qpel8_h_lowpass_neon_10
1:      ld1             {v28.8h, v29.8h}, [x1], x2
        ld1             {v16.8h, v17.8h}, [x1], x2
        subs            x12, x12, #4
        lowpass_8_10    v28, v29, v16, v17, v28, v20
  .ifc \type,avg
        // average with the two rows already present in dst
        ld1             {v2.8h},  [x0], x3
        ld1             {v3.8h},  [x0]
        urhadd          v28.8h, v28.8h, v2.8h
        urhadd          v20.8h, v20.8h, v3.8h
        sub             x0,  x0,  x3                    // back to the first row
  .endif
        st1             {v28.8h}, [x0], x3
        st1             {v20.8h}, [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_10 put
        h264_qpel_h_lowpass_10 avg
|
|
|
|
// Horizontal lowpass for 16-bit samples averaged with a second source,
// instantiated for type = put and type = avg.
// Registers: x0 = dst, x1 = src to filter, x3 = second source,
//            x2 = stride used for all three pointers,
//            x12 = loop counter, decremented by 4 for every two rows.
// Expects v6 to hold the filter constants (see lowpass_const).
.macro  h264_qpel_h_lowpass_l2_10 type
// 16-wide version: left half via a call, then rewinds the three pointers by
// 16 rows, steps 16 bytes right and FALLS THROUGH into the 8-wide kernel.
// x13 holds the return address across the call.
function \type\()_h264_qpel16_h_lowpass_l2_neon_10
        mov             x13, x30
        mov             x12, #32
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon_10
        sub             x0,  x0,  x2, lsl #4
        sub             x1,  x1,  x2, lsl #4
        sub             x3,  x3,  x2, lsl #4
        add             x0,  x0,  #16
        add             x1,  x1,  #16
        add             x3,  x3,  #16
        mov             x12, #32
        mov             x30, x13
endfunc

// 8-wide kernel: loops until x12 reaches zero.
function \type\()_h264_qpel8_h_lowpass_l2_neon_10
1:      ld1             {v26.8h, v27.8h}, [x1], x2
        ld1             {v16.8h, v17.8h}, [x1], x2
        ld1             {v28.8h},         [x3], x2
        ld1             {v29.8h},         [x3], x2
        subs            x12, x12, #4
        lowpass_8_10    v26, v27, v16, v17, v26, v27
        urhadd          v26.8h, v26.8h, v28.8h          // average with second source
        urhadd          v27.8h, v27.8h, v29.8h
  .ifc \type,avg
        // average once more with the two rows already present in dst
        ld1             {v2.8h},  [x0], x2
        ld1             {v3.8h},  [x0]
        urhadd          v26.8h, v26.8h, v2.8h
        urhadd          v27.8h, v27.8h, v3.8h
        sub             x0,  x0,  x2
  .endif
        st1             {v26.8h}, [x0], x2
        st1             {v27.8h}, [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2_10 put
        h264_qpel_h_lowpass_l2_10 avg
|
|
|
|
// put_h264_qpel16_v_lowpass_neon_packed_10
//
// Vertical 16x16 lowpass into a packed buffer, built from four calls of the
// 8x8 helper put_h264_qpel8_v_lowpass_neon_10.
//   x0  destination; never rewound here, so the four 8x8 results are stored
//       back to back in the order top-left, bottom-left, top-right,
//       bottom-right
//   x1  source pointer, x3 source stride in bytes
//   x2  overwritten with 16, the destination stride given to the helper
//       (one stored row is a single .8h vector)
//   x4  keeps the return address across the bl calls
//
// NOTE(review): the previous body called put_h264_qpel8_v_lowpass_neon (no _10
// suffix) with an 8 byte destination stride and an 8 byte column step. All
// other _10 routines in this file operate on .8h rows of 16 bytes, and
// put_h264_qpel16_h_lowpass_neon_packed_10 uses 16 for both values, so the
// helper, stride and column step were brought in line with them. No caller of
// this symbol is visible in this part of the file - confirm the packed layout
// against its consumer.
function put_h264_qpel16_v_lowpass_neon_packed_10
        mov             x4,  x30
        mov             x2,  #16
        bl              put_h264_qpel8_v_lowpass_neon_10
        // The helper leaves x1 12 rows below where it started; going back
        // 4 rows puts it at the start of the bottom-left block.
        sub             x1,  x1,  x3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon_10
        // x1 is now 20 rows below the original position: rewind 16 + 4 rows
        // and move 16 bytes (eight 16-bit samples) to the right.
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #16
        bl              put_h264_qpel8_v_lowpass_neon_10
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        // Tail call for the bottom-right block; its ret returns to our caller.
        b               put_h264_qpel8_v_lowpass_neon_10
endfunc
|
|
|
|
// h264_qpel_v_lowpass_10 <type>   (<type> is put or avg)
//
// Emits:
//   <type>_h264_qpel16_v_lowpass_neon_10  four 8x8 passes covering 16x16
//   <type>_h264_qpel8_v_lowpass_neon_10   one 8x8 pass
// Registers as used by the 8x8 pass:
//   x0  destination pointer, x2 destination stride in bytes
//   x1  source pointer, x3 source stride in bytes
// The 8x8 pass reads 13 source rows and leaves x1 12 rows below its entry
// value (the last load has no post-increment); x0 ends 8 rows below.
// lowpass_8_10_v is defined outside this part of the file; each use here
// passes seven row registers followed by two output registers.
.macro h264_qpel_v_lowpass_10 type
|
|
function \type\()_h264_qpel16_v_lowpass_neon_10
|
|
mov x4, x30 // keep the return address across the bl calls
|
|
bl \type\()_h264_qpel8_v_lowpass_neon_10
|
|
// x1 is 12 rows down; back up 4 so the next pass starts 8 rows below the first.
sub x1, x1, x3, lsl #2
|
|
bl \type\()_h264_qpel8_v_lowpass_neon_10
|
|
// Destination: back up the 16 rows written so far, then 16 bytes right.
sub x0, x0, x2, lsl #4
|
|
add x0, x0, #16
|
|
// Source: x1 is 20 rows below its original value; rewind 16 + 4 rows,
// then 16 bytes right.
sub x1, x1, x3, lsl #4
|
|
sub x1, x1, x3, lsl #2
|
|
add x1, x1, #16
|
|
bl \type\()_h264_qpel8_v_lowpass_neon_10
|
|
sub x1, x1, x3, lsl #2
|
|
mov x30, x4
|
|
// Falls through into the 8x8 routine below for the last block; its ret
// returns to the original caller.
// NOTE(review): relies on endfunc emitting no instructions - confirm.
endfunc
|
|
|
|
function \type\()_h264_qpel8_v_lowpass_neon_10
|
|
// Load 13 consecutive source rows into v16..v28.
ld1 {v16.8h}, [x1], x3
|
|
ld1 {v17.8h}, [x1], x3
|
|
ld1 {v18.8h}, [x1], x3
|
|
ld1 {v19.8h}, [x1], x3
|
|
ld1 {v20.8h}, [x1], x3
|
|
ld1 {v21.8h}, [x1], x3
|
|
ld1 {v22.8h}, [x1], x3
|
|
ld1 {v23.8h}, [x1], x3
|
|
ld1 {v24.8h}, [x1], x3
|
|
ld1 {v25.8h}, [x1], x3
|
|
ld1 {v26.8h}, [x1], x3
|
|
ld1 {v27.8h}, [x1], x3
|
|
ld1 {v28.8h}, [x1]
|
|
|
|
// Each step writes two output rows over the first two of its seven input
// rows; the following steps start two rows later and no longer read them.
lowpass_8_10_v v16, v17, v18, v19, v20, v21, v22, v16, v17
|
|
lowpass_8_10_v v18, v19, v20, v21, v22, v23, v24, v18, v19
|
|
lowpass_8_10_v v20, v21, v22, v23, v24, v25, v26, v20, v21
|
|
lowpass_8_10_v v22, v23, v24, v25, v26, v27, v28, v22, v23
|
|
|
|
.ifc \type,avg
|
|
// avg only: read the 8 destination rows into v24..v31 (free now that the
// filter is done) and take the rounding halving add with the results.
// Loads and urhadd are interleaved.
ld1 {v24.8h}, [x0], x2
|
|
ld1 {v25.8h}, [x0], x2
|
|
ld1 {v26.8h}, [x0], x2
|
|
urhadd v16.8h, v16.8h, v24.8h
|
|
ld1 {v27.8h}, [x0], x2
|
|
urhadd v17.8h, v17.8h, v25.8h
|
|
ld1 {v28.8h}, [x0], x2
|
|
urhadd v18.8h, v18.8h, v26.8h
|
|
ld1 {v29.8h}, [x0], x2
|
|
urhadd v19.8h, v19.8h, v27.8h
|
|
ld1 {v30.8h}, [x0], x2
|
|
urhadd v20.8h, v20.8h, v28.8h
|
|
ld1 {v31.8h}, [x0], x2
|
|
urhadd v21.8h, v21.8h, v29.8h
|
|
urhadd v22.8h, v22.8h, v30.8h
|
|
urhadd v23.8h, v23.8h, v31.8h
|
|
// The 8 loads moved x0 down 8 rows; bring it back for the stores.
sub x0, x0, x2, lsl #3
|
|
.endif
|
|
|
|
// Store the 8 result rows.
st1 {v16.8h}, [x0], x2
|
|
st1 {v17.8h}, [x0], x2
|
|
st1 {v18.8h}, [x0], x2
|
|
st1 {v19.8h}, [x0], x2
|
|
st1 {v20.8h}, [x0], x2
|
|
st1 {v21.8h}, [x0], x2
|
|
st1 {v22.8h}, [x0], x2
|
|
st1 {v23.8h}, [x0], x2
|
|
|
|
ret
|
|
endfunc
|
|
.endm
|
|
|
|
// Instantiate the put and avg variants.
h264_qpel_v_lowpass_10 put
|
|
h264_qpel_v_lowpass_10 avg
|
|
|
|
// h264_qpel_v_lowpass_l2_10 <type>   (<type> is put or avg)
//
// Vertical lowpass whose result rows are additionally combined (urhadd) with
// rows read from a second pointer. Emits:
//   <type>_h264_qpel16_v_lowpass_l2_neon_10  four 8x8 passes covering 16x16
//   <type>_h264_qpel8_v_lowpass_l2_neon_10   one 8x8 pass
// Registers as used by the 8x8 pass (note x2 and x3 differ from the plain
// vertical routine):
//   x0  destination pointer, stepped with x3
//   x1  source pointer for the filter, stepped with x3
//   x12 pointer to the second set of rows, stepped with x2
// The 8x8 pass reads 13 source rows and leaves x1 12 rows below its entry
// value; x0 and x12 each end 8 rows below.
// lowpass_8_10_v is defined outside this part of the file; each use here
// passes seven row registers followed by two output registers.
.macro h264_qpel_v_lowpass_l2_10 type
|
|
function \type\()_h264_qpel16_v_lowpass_l2_neon_10
|
|
mov x4, x30 // keep the return address across the bl calls
|
|
bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
|
|
// x1 is 12 rows down; back up 4 so the next pass starts 8 rows below the first.
sub x1, x1, x3, lsl #2
|
|
bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
|
|
// Destination and second source: back up 16 rows (each with its own
// stride), then 16 bytes right.
sub x0, x0, x3, lsl #4
|
|
sub x12, x12, x2, lsl #4
|
|
add x0, x0, #16
|
|
add x12, x12, #16
|
|
// Filter source: 20 rows below its original value; rewind 16 + 4 rows,
// then 16 bytes right.
sub x1, x1, x3, lsl #4
|
|
sub x1, x1, x3, lsl #2
|
|
add x1, x1, #16
|
|
bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
|
|
sub x1, x1, x3, lsl #2
|
|
mov x30, x4
|
|
// Falls through into the 8x8 routine below for the last block; its ret
// returns to the original caller.
// NOTE(review): relies on endfunc emitting no instructions - confirm.
endfunc
|
|
|
|
function \type\()_h264_qpel8_v_lowpass_l2_neon_10
|
|
// Load 13 consecutive source rows into v16..v28.
ld1 {v16.8h}, [x1], x3
|
|
ld1 {v17.8h}, [x1], x3
|
|
ld1 {v18.8h}, [x1], x3
|
|
ld1 {v19.8h}, [x1], x3
|
|
ld1 {v20.8h}, [x1], x3
|
|
ld1 {v21.8h}, [x1], x3
|
|
ld1 {v22.8h}, [x1], x3
|
|
ld1 {v23.8h}, [x1], x3
|
|
ld1 {v24.8h}, [x1], x3
|
|
ld1 {v25.8h}, [x1], x3
|
|
ld1 {v26.8h}, [x1], x3
|
|
ld1 {v27.8h}, [x1], x3
|
|
ld1 {v28.8h}, [x1]
|
|
|
|
// Each step writes two output rows over the first two of its seven input
// rows; the following steps start two rows later and no longer read them.
lowpass_8_10_v v16, v17, v18, v19, v20, v21, v22, v16, v17
|
|
lowpass_8_10_v v18, v19, v20, v21, v22, v23, v24, v18, v19
|
|
lowpass_8_10_v v20, v21, v22, v23, v24, v25, v26, v20, v21
|
|
lowpass_8_10_v v22, v23, v24, v25, v26, v27, v28, v22, v23
|
|
|
|
// Read 8 rows from the second pointer into v24..v31 and take the rounding
// halving add with the filter results. Loads and urhadd are interleaved.
ld1 {v24.8h}, [x12], x2
|
|
ld1 {v25.8h}, [x12], x2
|
|
ld1 {v26.8h}, [x12], x2
|
|
ld1 {v27.8h}, [x12], x2
|
|
ld1 {v28.8h}, [x12], x2
|
|
urhadd v16.8h, v24.8h, v16.8h
|
|
urhadd v17.8h, v25.8h, v17.8h
|
|
ld1 {v29.8h}, [x12], x2
|
|
urhadd v18.8h, v26.8h, v18.8h
|
|
urhadd v19.8h, v27.8h, v19.8h
|
|
ld1 {v30.8h}, [x12], x2
|
|
urhadd v20.8h, v28.8h, v20.8h
|
|
urhadd v21.8h, v29.8h, v21.8h
|
|
ld1 {v31.8h}, [x12], x2
|
|
urhadd v22.8h, v30.8h, v22.8h
|
|
urhadd v23.8h, v31.8h, v23.8h
|
|
|
|
.ifc \type,avg
|
|
// avg only: combine once more with the 8 rows currently stored at x0.
ld1 {v24.8h}, [x0], x3
|
|
ld1 {v25.8h}, [x0], x3
|
|
ld1 {v26.8h}, [x0], x3
|
|
urhadd v16.8h, v16.8h, v24.8h
|
|
ld1 {v27.8h}, [x0], x3
|
|
urhadd v17.8h, v17.8h, v25.8h
|
|
ld1 {v28.8h}, [x0], x3
|
|
urhadd v18.8h, v18.8h, v26.8h
|
|
ld1 {v29.8h}, [x0], x3
|
|
urhadd v19.8h, v19.8h, v27.8h
|
|
ld1 {v30.8h}, [x0], x3
|
|
urhadd v20.8h, v20.8h, v28.8h
|
|
ld1 {v31.8h}, [x0], x3
|
|
urhadd v21.8h, v21.8h, v29.8h
|
|
urhadd v22.8h, v22.8h, v30.8h
|
|
urhadd v23.8h, v23.8h, v31.8h
|
|
// The 8 loads moved x0 down 8 rows; bring it back for the stores.
sub x0, x0, x3, lsl #3
|
|
.endif
|
|
|
|
// Store the 8 result rows.
st1 {v16.8h}, [x0], x3
|
|
st1 {v17.8h}, [x0], x3
|
|
st1 {v18.8h}, [x0], x3
|
|
st1 {v19.8h}, [x0], x3
|
|
st1 {v20.8h}, [x0], x3
|
|
st1 {v21.8h}, [x0], x3
|
|
st1 {v22.8h}, [x0], x3
|
|
st1 {v23.8h}, [x0], x3
|
|
|
|
ret
|
|
endfunc
|
|
.endm
|
|
|
|
// Instantiate the put and avg variants.
h264_qpel_v_lowpass_l2_10 put
|
|
h264_qpel_v_lowpass_l2_10 avg
|
|
|
|
// h264_qpel8_10 <type>   (<type> is put or avg)
//
// Emits the exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon_10.
// From the way the registers are used, the entry points take x0 as the
// destination pointer, x1 as the source pointer and x2 as the stride in
// bytes - presumably the C arguments (dst, src, stride); confirm against the
// C prototype.
// lowpass_const is defined outside this part of the file. It is always
// invoked with w3, and x3 is overwritten before its next use in every routine
// here, so w3 appears to be a scratch register for it - confirm there.
// Register conventions shared by the routines below:
//   x14 return address of the exported entry point
//   x8, x9 saved destination / vertical-pass source (mc11 family)
//   x11 stack pointer value to restore (mc11 family)
//   x12 row counter for horizontal helpers, second-source pointer for the
//       vertical l2 helper
.macro h264_qpel8_10 type
|
|
function ff_\type\()_h264_qpel8_mc10_neon_10, export=1
|
|
lowpass_const w3
|
|
mov x3, x1 // second source for the l2 helper: the unshifted source
|
|
sub x1, x1, #4 // filter input starts 4 bytes (two 16-bit samples) to the left
|
|
mov x12, #16 // 8 rows in the helper loop
|
|
b \type\()_h264_qpel8_h_lowpass_l2_neon_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel8_mc20_neon_10, export=1
|
|
lowpass_const w3
|
|
sub x1, x1, #4 // filter input starts 4 bytes to the left
|
|
mov x3, x2 // helper takes the destination stride in x3
|
|
mov x12, #16 // 8 rows in the helper loop
|
|
b \type\()_h264_qpel8_h_lowpass_neon_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel8_mc30_neon_10, export=1
|
|
lowpass_const w3
|
|
add x3, x1, #2 // second source: one 16-bit sample to the right
|
|
sub x1, x1, #4 // filter input starts 4 bytes to the left
|
|
mov x12, #16 // 8 rows in the helper loop
|
|
b \type\()_h264_qpel8_h_lowpass_l2_neon_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel8_mc01_neon_10, export=1
|
|
mov x14, x30
|
|
mov x12, x1 // second source: the unshifted source
|
|
// Shared entry, also reached from mc03. Expects x14 = return address and
// x12 = second-source pointer.
\type\()_h264_qpel8_mc01_10:
|
|
lowpass_const w3
|
|
mov x3, x2 // helper steps x0 and x1 with x3, x12 with x2
|
|
sub x1, x1, x2, lsl #1 // filter input starts two rows above
|
|
bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
|
|
ret x14
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel8_mc11_neon_10, export=1
|
|
mov x14, x30
|
|
mov x8, x0
|
|
mov x9, x1
|
|
// Shared entry, also reached from mc31, mc13 and mc33. Expects
// x14 = return address, x8 = destination, x9 = source for the vertical pass,
// x1 = source for the horizontal pass, x2 = stride.
\type\()_h264_qpel8_mc11_10:
|
|
lowpass_const w3
|
|
mov x11, sp
|
|
sub sp, sp, #128 // temporary 8 rows of 16 bytes
|
|
mov x0, sp
|
|
sub x1, x1, #4
|
|
mov x3, #16 // temporary buffer stride
|
|
mov x12, #16 // 8 rows in the helper loop
|
|
// Horizontal pass, always the put flavour, into the stack buffer.
bl put_h264_qpel8_h_lowpass_neon_10
|
|
// Vertical pass from x9 (two rows up), combined with the stack buffer,
// written to the real destination with the original stride.
mov x0, x8
|
|
mov x3, x2
|
|
mov x12, sp
|
|
sub x1, x9, x2, lsl #1
|
|
mov x2, #16 // stride of the stack buffer read through x12
|
|
bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
|
|
mov sp, x11
|
|
ret x14
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel8_mc31_neon_10, export=1
|
|
// Vertical pass source (x9) is one sample to the right; the horizontal
// pass source (x1) ends up at the original position.
add x1, x1, #2
|
|
mov x14, x30
|
|
mov x8, x0
|
|
mov x9, x1
|
|
sub x1, x1, #2
|
|
b \type\()_h264_qpel8_mc11_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel8_mc02_neon_10, export=1
|
|
mov x14, x30
|
|
lowpass_const w3
|
|
sub x1, x1, x2, lsl #1 // filter input starts two rows above
|
|
mov x3, x2 // helper takes the source stride in x3
|
|
bl \type\()_h264_qpel8_v_lowpass_neon_10
|
|
ret x14
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel8_mc03_neon_10, export=1
|
|
mov x14, x30
|
|
add x12, x1, x2 // second source: one row below
|
|
b \type\()_h264_qpel8_mc01_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel8_mc13_neon_10, export=1
|
|
mov x14, x30
|
|
mov x8, x0
|
|
mov x9, x1
|
|
add x1, x1, x2 // horizontal pass one row below
|
|
b \type\()_h264_qpel8_mc11_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel8_mc33_neon_10, export=1
|
|
// Vertical pass source (x9) is one sample to the right; the horizontal
// pass source (x1) ends up one row below the original position.
add x1, x1, #2
|
|
mov x14, x30
|
|
mov x8, x0
|
|
mov x9, x1
|
|
add x1, x1, x2
|
|
sub x1, x1, #2
|
|
b \type\()_h264_qpel8_mc11_10
|
|
endfunc
|
|
.endm
|
|
|
|
// Instantiate the put and avg variants.
h264_qpel8_10 put
|
|
h264_qpel8_10 avg
|
|
|
|
// h264_qpel16_10 <type>   (<type> is put or avg)
//
// Emits the exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon_10.
// From the way the registers are used, the entry points take x0 as the
// destination pointer, x1 as the source pointer and x2 as the stride in
// bytes - presumably the C arguments (dst, src, stride); confirm against the
// C prototype.
// lowpass_const is defined outside this part of the file. It is always
// invoked with w3, and x3 is overwritten before its next use in every routine
// here, so w3 appears to be a scratch register for it - confirm there.
// Unlike the 8x8 entry points, x12 is not set for the horizontal helpers:
// the 16-wide helpers load their own row counter.
// x14 holds the return address of the exported entry point; the 16-wide
// helpers stash their own return address in x13 (horizontal) or x4
// (vertical), so the three do not collide.
.macro h264_qpel16_10 type
|
|
function ff_\type\()_h264_qpel16_mc10_neon_10, export=1
|
|
lowpass_const w3
|
|
mov x3, x1 // second source for the l2 helper: the unshifted source
|
|
sub x1, x1, #4 // filter input starts 4 bytes (two 16-bit samples) to the left
|
|
b \type\()_h264_qpel16_h_lowpass_l2_neon_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel16_mc20_neon_10, export=1
|
|
lowpass_const w3
|
|
sub x1, x1, #4 // filter input starts 4 bytes to the left
|
|
mov x3, x2 // helper takes the destination stride in x3
|
|
b \type\()_h264_qpel16_h_lowpass_neon_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel16_mc30_neon_10, export=1
|
|
lowpass_const w3
|
|
add x3, x1, #2 // second source: one 16-bit sample to the right
|
|
sub x1, x1, #4 // filter input starts 4 bytes to the left
|
|
b \type\()_h264_qpel16_h_lowpass_l2_neon_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel16_mc01_neon_10, export=1
|
|
mov x14, x30
|
|
mov x12, x1 // second source: the unshifted source
|
|
// Shared entry, also reached from mc03. Expects x14 = return address and
// x12 = second-source pointer.
\type\()_h264_qpel16_mc01_10:
|
|
lowpass_const w3
|
|
mov x3, x2 // helper steps x0 and x1 with x3, x12 with x2
|
|
sub x1, x1, x2, lsl #1 // filter input starts two rows above
|
|
bl \type\()_h264_qpel16_v_lowpass_l2_neon_10
|
|
ret x14
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel16_mc11_neon_10, export=1
|
|
mov x14, x30
|
|
mov x8, x0
|
|
mov x9, x1
|
|
// Shared entry, also reached from mc31, mc13 and mc33. Expects
// x14 = return address, x8 = destination, x9 = source for the vertical pass,
// x1 = source for the horizontal pass, x2 = stride.
\type\()_h264_qpel16_mc11_10:
|
|
lowpass_const w3
|
|
mov x11, sp
|
|
sub sp, sp, #512 // temporary 16 rows of 32 bytes
|
|
mov x0, sp
|
|
sub x1, x1, #4
|
|
mov x3, #32 // temporary buffer stride
|
|
// Horizontal pass, always the put flavour, into the stack buffer.
bl put_h264_qpel16_h_lowpass_neon_10
|
|
// Vertical pass from x9 (two rows up), combined with the stack buffer,
// written to the real destination with the original stride.
mov x0, x8
|
|
mov x3, x2
|
|
mov x12, sp
|
|
sub x1, x9, x2, lsl #1
|
|
mov x2, #32 // stride of the stack buffer read through x12
|
|
bl \type\()_h264_qpel16_v_lowpass_l2_neon_10
|
|
mov sp, x11
|
|
ret x14
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel16_mc31_neon_10, export=1
|
|
// Vertical pass source (x9) is one sample to the right; the horizontal
// pass source (x1) ends up at the original position.
add x1, x1, #2
|
|
mov x14, x30
|
|
mov x8, x0
|
|
mov x9, x1
|
|
sub x1, x1, #2
|
|
b \type\()_h264_qpel16_mc11_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel16_mc02_neon_10, export=1
|
|
mov x14, x30
|
|
lowpass_const w3
|
|
sub x1, x1, x2, lsl #1 // filter input starts two rows above
|
|
mov x3, x2 // helper takes the source stride in x3
|
|
bl \type\()_h264_qpel16_v_lowpass_neon_10
|
|
ret x14
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel16_mc03_neon_10, export=1
|
|
mov x14, x30
|
|
add x12, x1, x2 // second source: one row below
|
|
b \type\()_h264_qpel16_mc01_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel16_mc13_neon_10, export=1
|
|
mov x14, x30
|
|
mov x8, x0
|
|
mov x9, x1
|
|
add x1, x1, x2 // horizontal pass one row below
|
|
b \type\()_h264_qpel16_mc11_10
|
|
endfunc
|
|
|
|
function ff_\type\()_h264_qpel16_mc33_neon_10, export=1
|
|
// Vertical pass source (x9) is one sample to the right; the horizontal
// pass source (x1) ends up one row below the original position.
add x1, x1, #2
|
|
mov x14, x30
|
|
mov x8, x0
|
|
mov x9, x1
|
|
add x1, x1, x2
|
|
sub x1, x1, #2
|
|
b \type\()_h264_qpel16_mc11_10
|
|
endfunc
|
|
.endm
|
|
|
|
// Instantiate the put and avg variants.
h264_qpel16_10 put
|
|
h264_qpel16_10 avg
|