mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-13 21:28:01 +02:00
870bfe16a1
Move the loop counter decrement further from the branch instruction, this hides the latency of the decrement. In loops that first load, then store (the horizontal prediction cases), do the decrement after the load (where the next instruction would stall a bit anyway, waiting for the result of the load). In loops that store twice using the same destination register, also do the decrement between the two stores (as the second store would need to wait for the updated destination register from the first instruction). In loops that store twice to two different destination registers, do the decrement before both stores, to do it as soon before the branch as possible. This gives minor (1-2 cycle) speedups in most cases (modulo measurement noise), but the horizontal prediction functions get a rather notable speedup on the Cortex A53. Before: Cortex A53 A72 A73 pred8x8_dc_8_neon: 60.7 46.2 39.2 pred8x8_dc_128_8_neon: 30.7 18.0 14.0 pred8x8_horizontal_8_neon: 42.2 29.2 18.5 pred8x8_left_dc_8_neon: 52.7 36.2 32.2 pred8x8_mad_cow_dc_0l0_8_neon: 48.2 27.7 25.7 pred8x8_mad_cow_dc_0lt_8_neon: 52.5 33.2 34.7 pred8x8_mad_cow_dc_l0t_8_neon: 52.5 31.7 33.2 pred8x8_mad_cow_dc_l00_8_neon: 43.2 27.0 25.5 pred8x8_plane_8_neon: 112.2 86.2 88.2 pred8x8_top_dc_8_neon: 40.7 23.0 21.2 pred8x8_vertical_8_neon: 27.2 15.5 14.0 pred16x16_dc_8_neon: 91.0 73.2 70.5 pred16x16_dc_128_8_neon: 43.0 34.7 30.7 pred16x16_horizontal_8_neon: 86.0 49.7 44.7 pred16x16_left_dc_8_neon: 87.0 67.2 67.5 pred16x16_plane_8_neon: 236.0 175.7 173.0 pred16x16_top_dc_8_neon: 53.2 39.0 41.7 pred16x16_vertical_8_neon: 41.7 29.7 31.0 After: pred8x8_dc_8_neon: 59.0 46.7 42.5 pred8x8_dc_128_8_neon: 28.2 18.0 14.0 pred8x8_horizontal_8_neon: 34.2 29.2 18.5 pred8x8_left_dc_8_neon: 51.0 38.2 32.7 pred8x8_mad_cow_dc_0l0_8_neon: 46.7 28.2 26.2 pred8x8_mad_cow_dc_0lt_8_neon: 55.2 33.7 37.5 pred8x8_mad_cow_dc_l0t_8_neon: 51.2 31.7 37.2 pred8x8_mad_cow_dc_l00_8_neon: 41.7 27.5 26.0 pred8x8_plane_8_neon: 111.5 86.5 89.5 
pred8x8_top_dc_8_neon: 39.0 23.2 21.0 pred8x8_vertical_8_neon: 27.2 16.0 14.0 pred16x16_dc_8_neon: 85.0 70.2 70.5 pred16x16_dc_128_8_neon: 42.0 30.0 30.7 pred16x16_horizontal_8_neon: 66.5 49.5 42.5 pred16x16_left_dc_8_neon: 81.0 66.5 67.5 pred16x16_plane_8_neon: 235.0 175.7 173.0 pred16x16_top_dc_8_neon: 52.0 39.0 41.7 pred16x16_vertical_8_neon: 40.2 33.2 31.0 Despite this, a number of these functions still are slower than what e.g. GCC 7 generates - this shows the relative speedup of the neon codepaths over the compiler generated ones: Cortex A53 A72 A73 pred8x8_dc_8_neon: 0.86 0.65 1.04 pred8x8_dc_128_8_neon: 0.59 0.44 0.62 pred8x8_horizontal_8_neon: 1.51 0.58 1.30 pred8x8_left_dc_8_neon: 0.72 0.56 0.89 pred8x8_mad_cow_dc_0l0_8_neon: 0.93 0.93 1.37 pred8x8_mad_cow_dc_0lt_8_neon: 1.37 1.41 1.68 pred8x8_mad_cow_dc_l0t_8_neon: 1.21 1.17 1.32 pred8x8_mad_cow_dc_l00_8_neon: 1.24 1.19 1.60 pred8x8_plane_8_neon: 3.36 3.58 3.76 pred8x8_top_dc_8_neon: 0.97 0.99 1.43 pred8x8_vertical_8_neon: 0.86 0.78 1.18 pred16x16_dc_8_neon: 1.20 1.06 1.49 pred16x16_dc_128_8_neon: 0.83 0.95 0.99 pred16x16_horizontal_8_neon: 1.78 0.96 1.59 pred16x16_left_dc_8_neon: 1.06 0.96 1.32 pred16x16_plane_8_neon: 5.78 6.49 7.19 pred16x16_top_dc_8_neon: 1.48 1.53 1.94 pred16x16_vertical_8_neon: 1.39 1.34 1.98 In particular, on Cortex A72, many of these functions are slower than the compiler generated code, while they're more beneficial on e.g. the Cortex A73. Signed-off-by: Martin Storsjö <martin@martin.st>
362 lines
13 KiB
ArmAsm
362 lines
13 KiB
ArmAsm
/*
|
|
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "libavutil/aarch64/asm.S"
|
|
|
|
// ldcol.8 rd, rs, rt, n=8, hi=0
//
// Gather bytes, one per ld1, into consecutive byte lanes of vector register rd.
// Every ld1 reads a single byte from [rs] and then post-increments rs by rt,
// so on exit rs has advanced by (number of lanes loaded) * rt.
// Lanes that are not loaded keep whatever rd held before.
//
// Lane selection:
//   n >= 8           : lanes 0-7 (and additionally lanes 8-15 when n == 16)
//   n <  8, hi == 0  : lanes 0-3 only
//   n <  8, hi == 1  : lanes 4-7 only
//
// Operands: rd = vector register name without arrangement (e.g. v0),
//           rs = address register (modified), rt = post-increment register.
.macro ldcol.8 rd, rs, rt, n=8, hi=0
|
|
// Low group, lanes 0-3.
.if \n >= 8 || \hi == 0
|
|
ld1 {\rd\().b}[0], [\rs], \rt
|
|
ld1 {\rd\().b}[1], [\rs], \rt
|
|
ld1 {\rd\().b}[2], [\rs], \rt
|
|
ld1 {\rd\().b}[3], [\rs], \rt
|
|
.endif
|
|
// High group, lanes 4-7.
.if \n >= 8 || \hi == 1
|
|
ld1 {\rd\().b}[4], [\rs], \rt
|
|
ld1 {\rd\().b}[5], [\rs], \rt
|
|
ld1 {\rd\().b}[6], [\rs], \rt
|
|
ld1 {\rd\().b}[7], [\rs], \rt
|
|
.endif
|
|
// Upper half of a 128-bit register, lanes 8-15, only for the 16-byte variant.
.if \n == 16
|
|
ld1 {\rd\().b}[8], [\rs], \rt
|
|
ld1 {\rd\().b}[9], [\rs], \rt
|
|
ld1 {\rd\().b}[10], [\rs], \rt
|
|
ld1 {\rd\().b}[11], [\rs], \rt
|
|
ld1 {\rd\().b}[12], [\rs], \rt
|
|
ld1 {\rd\().b}[13], [\rs], \rt
|
|
ld1 {\rd\().b}[14], [\rs], \rt
|
|
ld1 {\rd\().b}[15], [\rs], \rt
|
|
.endif
|
|
.endm
|
|
|
|
// ff_pred16x16_128_dc_neon
// Fill a 16x16 byte block with the constant 128.
// In:  x0 = address of the first output row, x1 = byte offset between rows
//      (both are consumed by the shared store loop this function jumps to).
// The function has no ret of its own: it branches to .L_pred16x16_dc_end,
// the store loop located inside ff_pred16x16_dc_neon, which returns.
function ff_pred16x16_128_dc_neon, export=1
|
|
movi v0.16b, #128 // every byte of the row pattern = 128
|
|
b .L_pred16x16_dc_end // tail: store v0 to 16 rows and return
|
|
endfunc
|
|
|
|
// ff_pred16x16_top_dc_neon
// Fill a 16x16 byte block with the rounded mean of the 16 bytes in the row
// directly above it: dc = (sum + 8) >> 4.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers x2 and v0 here; the shared tail at .L_pred16x16_dc_end (inside
// ff_pred16x16_dc_neon) performs the stores and the return.
function ff_pred16x16_top_dc_neon, export=1
|
|
sub x2, x0, x1 // x2 = one row above the block
|
|
ld1 {v0.16b}, [x2] // 16 bytes of the row above
|
|
uaddlv h0, v0.16b // h0 = sum of the 16 bytes (rest of v0 zeroed)
|
|
rshrn v0.8b, v0.8h, #4 // byte 0 = (sum + 8) >> 4
|
|
dup v0.16b, v0.b[0] // broadcast the dc value to all 16 bytes
|
|
b .L_pred16x16_dc_end // tail: store v0 to 16 rows and return
|
|
endfunc
|
|
|
|
// ff_pred16x16_left_dc_neon
// Fill a 16x16 byte block with the rounded mean of the 16 bytes in the column
// immediately to its left: dc = (sum + 8) >> 4.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers x2 and v0 here; the shared tail at .L_pred16x16_dc_end (inside
// ff_pred16x16_dc_neon) performs the stores and the return.
function ff_pred16x16_left_dc_neon, export=1
|
|
sub x2, x0, #1 // x2 = byte left of row 0
|
|
ldcol.8 v0, x2, x1, 16 // gather the 16 left-column bytes, stepping x2 by x1
|
|
uaddlv h0, v0.16b // h0 = sum of the 16 bytes (rest of v0 zeroed)
|
|
rshrn v0.8b, v0.8h, #4 // byte 0 = (sum + 8) >> 4
|
|
dup v0.16b, v0.b[0] // broadcast the dc value to all 16 bytes
|
|
b .L_pred16x16_dc_end // tail: store v0 to 16 rows and return
|
|
endfunc
|
|
|
|
// ff_pred16x16_dc_neon
// Fill a 16x16 byte block with the rounded mean of the 16 bytes above it and
// the 16 bytes to its left: dc = (sum_top + sum_left + 16) >> 5.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers: x0 (advanced by 16 rows), x2, x3, w3, v0, v1, flags.
//
// The label .L_pred16x16_dc_end is a shared tail that the other 16x16 dc
// variants in this file branch to. Its contract: v0.16b holds the row pattern,
// x0/x1 are as on entry; it writes 16 rows and returns to the caller.
function ff_pred16x16_dc_neon, export=1
|
|
sub x2, x0, x1 // x2 = one row above the block
|
|
sub x3, x0, #1 // x3 = byte left of row 0
|
|
ld1 {v0.16b}, [x2] // 16 bytes of the row above
|
|
ldcol.8 v1, x3, x1, 16 // gather the 16 left-column bytes
|
|
uaddlv h0, v0.16b // h0 = sum of top bytes
|
|
uaddlv h1, v1.16b // h1 = sum of left bytes
|
|
add v0.4h, v0.4h, v1.4h // lane 0 = sum_top + sum_left
|
|
rshrn v0.8b, v0.8h, #5 // byte 0 = (total + 16) >> 5
|
|
dup v0.16b, v0.b[0] // broadcast the dc value to all 16 bytes
|
|
.L_pred16x16_dc_end:
|
|
mov w3, #8 // 8 iterations, two rows per iteration
|
|
6: st1 {v0.16b}, [x0], x1 // store a row, advance x0 by one row
|
|
subs w3, w3, #1 // decrement between the stores, away from the branch
|
|
st1 {v0.16b}, [x0], x1 // store the next row
|
|
b.ne 6b
|
|
ret
|
|
endfunc
|
|
|
|
// ff_pred16x16_hor_neon
// Horizontal fill of a 16x16 byte block: every output row is filled with the
// single byte that sits immediately to the left of that row.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers: x0 (advanced by 16 rows), x2, w3, v0, flags.
function ff_pred16x16_hor_neon, export=1
|
|
sub x2, x0, #1 // x2 = byte left of row 0
|
|
mov w3, #16 // one iteration per row
|
|
1: ld1r {v0.16b}, [x2], x1 // load one byte, replicate to 16 lanes, step to next row
|
|
subs w3, w3, #1 // decrement placed after the load, away from the branch
|
|
st1 {v0.16b}, [x0], x1 // write the row, step to next row
|
|
b.ne 1b
|
|
ret
|
|
endfunc
|
|
|
|
// ff_pred16x16_vert_neon
// Vertical fill of a 16x16 byte block: the 16 bytes of the row directly above
// the block are copied into each of the 16 output rows.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers: x0, x1 (doubled), x2, w3, v0, flags.
// Two pointers one row apart are each advanced by two rows per store, so the
// two stores in the loop body write even and odd rows respectively.
function ff_pred16x16_vert_neon, export=1
|
|
sub x2, x0, x1 // x2 = one row above the block
|
|
add x1, x1, x1 // x1 = two rows
|
|
ld1 {v0.16b}, [x2], x1 // load the row above; x2 now points at row 1
|
|
mov w3, #8 // 8 iterations, two rows per iteration
|
|
1: subs w3, w3, #1 // decrement before both stores, away from the branch
|
|
st1 {v0.16b}, [x0], x1 // rows 0, 2, 4, ...
|
|
st1 {v0.16b}, [x2], x1 // rows 1, 3, 5, ...
|
|
b.ne 1b
|
|
ret
|
|
endfunc
|
|
|
|
// ff_pred16x16_plane_neon
// Fill a 16x16 byte block with a linear gradient (plane) fitted to the row
// above the block and the column to its left.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers: x0 (advanced by 16 rows), x2, x3, x4, w3, v0-v7, flags.
//
// With top[k] / left[k] denoting the byte at column k of the row above / row k
// of the left column (index -1 is the shared corner byte), the code computes,
// using the b/c naming of the existing "7 * (b + c)" comment:
//   H = sum over i = 0..7 of (i + 1) * (top[8 + i]  - top[6 - i])
//   V = sum over i = 0..7 of (i + 1) * (left[8 + i] - left[6 - i])
//   b = (5 * H + 32) >> 6
//   c = (5 * V + 32) >> 6
//   a = 16 * (top[15] + left[15] + 1) - 7 * (b + c)
//   out[y][x] = saturate_to_u8((a + b * x + c * y) >> 5),  x, y = 0..15
function ff_pred16x16_plane_neon, export=1
|
|
sub x3, x0, x1 // x3 = row above, column 0
|
|
movrel x4, p16weight // x4 = address of the weights 1..8
|
|
add x2, x3, #8 // x2 = row above, column 8
|
|
sub x3, x3, #1 // x3 = row above, column -1 (corner)
|
|
ld1 {v0.8b}, [x3] // v0 = top[-1..6]
|
|
ld1 {v2.8b}, [x2], x1 // v2 = top[8..15]
|
|
ldcol.8 v1, x3, x1 // v1 = left[-1..6]; x3 ends on row 7
|
|
add x3, x3, x1 // skip to row 8
|
|
ldcol.8 v3, x3, x1 // v3 = left[8..15]
|
|
rev64 v0.8b, v0.8b // v0 = top[6..-1]
|
|
rev64 v1.8b, v1.8b // v1 = left[6..-1]
|
|
uaddl v7.8h, v2.8b, v3.8b // lane i = top[8+i] + left[8+i]; lane 7 feeds a
|
|
usubl v2.8h, v2.8b, v0.8b // lane i = top[8+i] - top[6-i]
|
|
usubl v3.8h, v3.8b, v1.8b // lane i = left[8+i] - left[6-i]
|
|
ld1 {v0.8h}, [x4] // v0 = 1,2,...,8
|
|
mul v2.8h, v2.8h, v0.8h // weight the horizontal differences
|
|
mul v3.8h, v3.8h, v0.8h // weight the vertical differences
|
|
addp v2.8h, v2.8h, v3.8h // pairwise reduce: low half from v2, high half from v3
|
|
addp v2.8h, v2.8h, v2.8h
|
|
addp v2.4h, v2.4h, v2.4h // v2.4h = H, V, H, V
|
|
sshll v3.4s, v2.4h, #2 // 4*H, 4*V (widened to 32 bit)
|
|
saddw v2.4s, v3.4s, v2.4h // 5*H, 5*V
|
|
rshrn v4.4h, v2.4s, #6 // v4.4h = b, c, b, c
|
|
trn2 v5.4h, v4.4h, v4.4h // v5.4h = c in every lane
|
|
add v2.4h, v4.4h, v5.4h // lane 0 = b + c
|
|
shl v3.4h, v2.4h, #3 // lane 0 = 8 * (b + c)
|
|
ext v7.16b, v7.16b, v7.16b, #14 // rotate so lane 0 = top[15] + left[15]
|
|
sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
|
|
add v7.4h, v7.4h, v0.4h // lane 0 += 1 (first weight)
|
|
shl v2.4h, v7.4h, #4 // lane 0 = 16 * (top[15] + left[15] + 1)
|
|
sub v2.4h, v2.4h, v3.4h // lane 0 = a
|
|
shl v3.4h, v4.4h, #4 // lane 0 = 16 * b
|
|
ext v0.16b, v0.16b, v0.16b, #14 // weights rotated to 8,1,2,...,7
|
|
sub v6.4h, v5.4h, v3.4h // lane 0 = c - 16 * b
|
|
mov v0.h[0], wzr // v0 = 0,1,2,...,7 (column index x)
|
|
mul v0.8h, v0.8h, v4.h[0] // v0 = b * x for x = 0..7
|
|
dup v1.8h, v2.h[0] // v1 = a in every lane
|
|
dup v2.8h, v4.h[0] // v2 = b in every lane
|
|
dup v3.8h, v6.h[0] // v3 = c - 16 * b in every lane
|
|
shl v2.8h, v2.8h, #3 // v2 = 8 * b: step from columns 0-7 to 8-15
|
|
add v1.8h, v1.8h, v0.8h // v1 = a + b * x, row 0, columns 0-7
|
|
add v3.8h, v3.8h, v2.8h // v3 = c - 8 * b: step back to columns 0-7 of next row
|
|
mov w3, #16 // one iteration per row
|
|
1:
|
|
sqshrun v0.8b, v1.8h, #5 // columns 0-7: >> 5 with unsigned saturation
|
|
add v1.8h, v1.8h, v2.8h // advance to columns 8-15
|
|
sqshrun2 v0.16b, v1.8h, #5 // columns 8-15 into the upper half of v0
|
|
add v1.8h, v1.8h, v3.8h // advance to columns 0-7 of the next row
|
|
subs w3, w3, #1
|
|
st1 {v0.16b}, [x0], x1 // write the row, step to next row
|
|
b.ne 1b
|
|
ret
|
|
endfunc
|
|
|
|
// Eight 16-bit weights 1..8. Loaded as one .8h vector by
// ff_pred16x16_plane_neon (difference weights and column indices) and by
// ff_pred8x8_plane_neon (column indices and the +1 rounding term).
// NOTE(review): the unit of align=4 is defined by the const macro in asm.S,
// which is not visible here - confirm before changing it.
const p16weight, align=4
|
|
.short 1,2,3,4,5,6,7,8
|
|
endconst
|
|
// Eight 16-bit weights 1..4 repeated twice. Loaded as one .8h vector by
// ff_pred8x8_plane_neon, where lanes 0-3 weight the horizontal differences and
// lanes 4-7 weight the vertical differences.
// NOTE(review): the unit of align=4 is defined by the const macro in asm.S,
// which is not visible here - confirm before changing it.
const p8weight, align=4
|
|
.short 1,2,3,4,1,2,3,4
|
|
endconst
|
|
|
|
// ff_pred8x8_hor_neon
// Horizontal fill of an 8x8 byte block: every output row is filled with the
// single byte that sits immediately to the left of that row.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers: x0 (advanced by 8 rows), x2, w3, v0, flags.
function ff_pred8x8_hor_neon, export=1
|
|
sub x2, x0, #1 // x2 = byte left of row 0
|
|
mov w3, #8 // one iteration per row
|
|
1: ld1r {v0.8b}, [x2], x1 // load one byte, replicate to 8 lanes, step to next row
|
|
subs w3, w3, #1 // decrement placed after the load, away from the branch
|
|
st1 {v0.8b}, [x0], x1 // write the row, step to next row
|
|
b.ne 1b
|
|
ret
|
|
endfunc
|
|
|
|
// ff_pred8x8_vert_neon
// Vertical fill of an 8x8 byte block: the 8 bytes of the row directly above
// the block are copied into each of the 8 output rows.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers: x0, x1 (doubled), x2, w3, v0, flags.
// Two pointers one row apart are each advanced by two rows per store, so the
// two stores in the loop body write even and odd rows respectively.
function ff_pred8x8_vert_neon, export=1
|
|
sub x2, x0, x1 // x2 = one row above the block
|
|
lsl x1, x1, #1 // x1 = two rows
|
|
ld1 {v0.8b}, [x2], x1 // load the row above; x2 now points at row 1
|
|
mov w3, #4 // 4 iterations, two rows per iteration
|
|
1: subs w3, w3, #1 // decrement before both stores, away from the branch
|
|
st1 {v0.8b}, [x0], x1 // rows 0, 2, 4, 6
|
|
st1 {v0.8b}, [x2], x1 // rows 1, 3, 5, 7
|
|
b.ne 1b
|
|
ret
|
|
endfunc
|
|
|
|
// ff_pred8x8_plane_neon
// Fill an 8x8 byte block with a linear gradient (plane) fitted to the row
// above the block and the column to its left.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers: x0 (advanced by 8 rows), x2, x3, x4, x5, w3, v0-v3, v5-v7, flags.
//
// With top[k] / left[k] denoting the byte at column k of the row above / row k
// of the left column (index -1 is the shared corner byte), the code computes:
//   H = sum over i = 0..3 of (i + 1) * (top[4 + i]  - top[2 - i])
//   V = sum over i = 0..3 of (i + 1) * (left[4 + i] - left[2 - i])
//   b = (17 * H + 16) >> 5
//   c = (17 * V + 16) >> 5
//   a = 16 * (top[7] + left[7] + 1) - 3 * (b + c)
//   out[y][x] = saturate_to_u8((a + b * x + c * y) >> 5),  x, y = 0..7
//
// Several registers are only partially loaded (single-lane ld1 keeps the other
// lanes); the stale lanes flow through some intermediate results but only the
// lanes named in the comments below are consumed.
function ff_pred8x8_plane_neon, export=1
|
|
sub x3, x0, x1 // x3 = row above, column 0
|
|
movrel x4, p8weight // x4 = address of weights 1,2,3,4,1,2,3,4
|
|
movrel x5, p16weight // x5 = address of weights 1..8
|
|
add x2, x3, #4 // x2 = row above, column 4
|
|
sub x3, x3, #1 // x3 = row above, column -1 (corner)
|
|
ld1 {v0.s}[0], [x3] // v0 bytes 0-3 = top[-1..2]
|
|
ld1 {v2.s}[0], [x2], x1 // v2 bytes 0-3 = top[4..7]
|
|
ldcol.8 v0, x3, x1, 4, hi=1 // v0 bytes 4-7 = left[-1..2]; x3 ends on row 3
|
|
add x3, x3, x1 // skip to row 4
|
|
ldcol.8 v3, x3, x1, 4 // v3 bytes 0-3 = left[4..7]
|
|
uaddl v7.8h, v2.8b, v3.8b // lane 3 = top[7] + left[7] (feeds a)
|
|
rev32 v0.8b, v0.8b // v0 = top[2..-1], left[2..-1]
|
|
trn1 v2.2s, v2.2s, v3.2s // v2 = top[4..7], left[4..7]
|
|
usubl v2.8h, v2.8b, v0.8b // lanes 0-3: top diffs, lanes 4-7: left diffs
|
|
ld1 {v6.8h}, [x4] // v6 = 1,2,3,4,1,2,3,4
|
|
mul v2.8h, v2.8h, v6.8h // weight the differences
|
|
ld1 {v0.8h}, [x5] // v0 = 1,2,...,8
|
|
saddlp v2.4s, v2.8h // pairwise add, widened to 32 bit
|
|
addp v2.4s, v2.4s, v2.4s // v2.4s = H, V, H, V
|
|
shl v3.4s, v2.4s, #4 // 16*H, 16*V
|
|
add v2.4s, v3.4s, v2.4s // 17*H, 17*V
|
|
rshrn v5.4h, v2.4s, #5 // v5.4h = b, c, b, c
|
|
addp v2.4h, v5.4h, v5.4h // every lane = b + c
|
|
shl v3.4h, v2.4h, #1 // 2 * (b + c)
|
|
add v3.4h, v3.4h, v2.4h // 3 * (b + c)
|
|
rev64 v7.4h, v7.4h // move old lane 3 to lane 0: top[7] + left[7]
|
|
add v7.4h, v7.4h, v0.4h // lane 0 += 1 (first weight)
|
|
shl v2.4h, v7.4h, #4 // lane 0 = 16 * (top[7] + left[7] + 1)
|
|
sub v2.4h, v2.4h, v3.4h // lane 0 = a
|
|
ext v0.16b, v0.16b, v0.16b, #14 // weights rotated to 8,1,2,...,7
|
|
mov v0.h[0], wzr // v0 = 0,1,2,...,7 (column index x)
|
|
mul v0.8h, v0.8h, v5.h[0] // v0 = b * x for x = 0..7
|
|
dup v1.8h, v2.h[0] // v1 = a in every lane
|
|
dup v2.8h, v5.h[1] // v2 = c in every lane (per-row step)
|
|
add v1.8h, v1.8h, v0.8h // v1 = a + b * x, row 0
|
|
mov w3, #8 // one iteration per row
|
|
1:
|
|
sqshrun v0.8b, v1.8h, #5 // >> 5 with unsigned saturation to bytes
|
|
subs w3, w3, #1
|
|
add v1.8h, v1.8h, v2.8h // advance to the next row: += c
|
|
st1 {v0.8b}, [x0], x1 // write the row, step to next row
|
|
b.ne 1b
|
|
ret
|
|
endfunc
|
|
|
|
// ff_pred8x8_128_dc_neon
// Fill an 8x8 byte block with the constant 128.
// In:  x0 = address of the first output row, x1 = byte offset between rows
//      (both are consumed by the shared store loop this function jumps to).
// The function has no ret of its own: it branches to .L_pred8x8_dc_end, the
// store loop located inside ff_pred8x8_dc_neon, which writes v0 to rows 0-3
// and v1 to rows 4-7 and then returns.
function ff_pred8x8_128_dc_neon, export=1
|
|
movi v0.8b, #128 // pattern for rows 0-3
|
|
movi v1.8b, #128 // pattern for rows 4-7
|
|
b .L_pred8x8_dc_end // tail: store both halves and return
|
|
endfunc
|
|
|
|
// ff_pred8x8_top_dc_neon
// Fill an 8x8 byte block from the row above it only. Columns 0-3 of every row
// get the rounded mean of top[0..3], columns 4-7 get the rounded mean of
// top[4..7]; each mean is (sum + 2) >> 2.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers x2, v0, v1, v2 here; the shared tail at .L_pred8x8_dc_end (inside
// ff_pred8x8_dc_neon) stores v0 to rows 0-3, v1 to rows 4-7 and returns.
function ff_pred8x8_top_dc_neon, export=1
|
|
sub x2, x0, x1 // x2 = one row above the block
|
|
ld1 {v0.8b}, [x2] // 8 bytes of the row above
|
|
uaddlp v0.4h, v0.8b // four pair sums
|
|
addp v0.4h, v0.4h, v0.4h // lanes: sum[0..3], sum[4..7], sum[0..3], sum[4..7]
|
|
zip1 v0.8h, v0.8h, v0.8h // duplicate each lane: s0,s0,s1,s1,...
|
|
rshrn v2.8b, v0.8h, #2 // bytes: d0,d0,d1,d1,... with d = (s + 2) >> 2
|
|
zip1 v0.8b, v2.8b, v2.8b // rows 0-3 pattern: d0 x4, d1 x4
|
|
zip1 v1.8b, v2.8b, v2.8b // rows 4-7 pattern: identical
|
|
b .L_pred8x8_dc_end // tail: store both halves and return
|
|
endfunc
|
|
|
|
// ff_pred8x8_left_dc_neon
// Fill an 8x8 byte block from the column to its left only. Rows 0-3 are filled
// with the rounded mean of left[0..3], rows 4-7 with the rounded mean of
// left[4..7]; each mean is (sum + 2) >> 2.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers x2, v0, v1, v2 here; the shared tail at .L_pred8x8_dc_end (inside
// ff_pred8x8_dc_neon) stores v0 to rows 0-3, v1 to rows 4-7 and returns.
function ff_pred8x8_left_dc_neon, export=1
|
|
sub x2, x0, #1 // x2 = byte left of row 0
|
|
ldcol.8 v0, x2, x1 // gather the 8 left-column bytes
|
|
uaddlp v0.4h, v0.8b // four pair sums
|
|
addp v0.4h, v0.4h, v0.4h // lanes: sum[0..3], sum[4..7], ...
|
|
rshrn v2.8b, v0.8h, #2 // byte 0 = upper mean, byte 1 = lower mean
|
|
dup v1.8b, v2.b[1] // rows 4-7 pattern
|
|
dup v0.8b, v2.b[0] // rows 0-3 pattern
|
|
b .L_pred8x8_dc_end // tail: store both halves and return
|
|
endfunc
|
|
|
|
// ff_pred8x8_dc_neon
// Fill an 8x8 byte block per 4x4 quadrant from the row above (top[0..7]) and
// the column to the left (left[0..7]). With T0/T1 = sums of top[0..3] /
// top[4..7] and L0/L1 = sums of left[0..3] / left[4..7]:
//   rows 0-3, columns 0-3 : (T0 + L0 + 4) >> 3
//   rows 0-3, columns 4-7 : (T1 + 2) >> 2
//   rows 4-7, columns 0-3 : (L1 + 2) >> 2
//   rows 4-7, columns 4-7 : (T1 + L1 + 4) >> 3
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers: x0, x2 (each advanced by 4 rows), x3, w3, v0-v7, flags.
//
// The label .L_pred8x8_dc_end is a shared tail that the other 8x8 dc variants
// in this file branch to. Its contract: v0.8b holds the pattern for rows 0-3,
// v1.8b the pattern for rows 4-7, x0/x1 are as on entry; it writes 8 rows and
// returns to the caller.
function ff_pred8x8_dc_neon, export=1
|
|
sub x2, x0, x1 // x2 = one row above the block
|
|
sub x3, x0, #1 // x3 = byte left of row 0
|
|
ld1 {v0.8b}, [x2] // 8 bytes of the row above
|
|
ldcol.8 v1, x3, x1 // gather the 8 left-column bytes
|
|
uaddlp v0.4h, v0.8b // pair sums of top
|
|
uaddlp v1.4h, v1.8b // pair sums of left
|
|
trn1 v2.2s, v0.2s, v1.2s // pair sums of top[0..3], left[0..3]
|
|
trn2 v3.2s, v0.2s, v1.2s // pair sums of top[4..7], left[4..7]
|
|
addp v4.4h, v2.4h, v3.4h // v4.4h = T0, L0, T1, L1
|
|
addp v5.4h, v4.4h, v4.4h // v5.4h = T0+L0, T1+L1, ...
|
|
rshrn v6.8b, v5.8h, #3 // byte 0 = (T0+L0+4)>>3, byte 1 = (T1+L1+4)>>3
|
|
rshrn v7.8b, v4.8h, #2 // bytes 0-3 = rounded means of T0, L0, T1, L1
|
|
dup v0.8b, v6.b[0] // upper-left quadrant value
|
|
dup v2.8b, v7.b[2] // upper-right quadrant value
|
|
dup v1.8b, v7.b[3] // lower-left quadrant value
|
|
dup v3.8b, v6.b[1] // lower-right quadrant value
|
|
zip1 v0.2s, v0.2s, v2.2s // rows 0-3 pattern: 4 bytes left, 4 bytes right
|
|
zip1 v1.2s, v1.2s, v3.2s // rows 4-7 pattern
|
|
.L_pred8x8_dc_end:
|
|
mov w3, #4 // 4 iterations, one upper and one lower row each
|
|
add x2, x0, x1, lsl #2 // x2 = address of row 4
|
|
6: subs w3, w3, #1 // decrement before both stores, away from the branch
|
|
st1 {v0.8b}, [x0], x1 // rows 0..3
|
|
st1 {v1.8b}, [x2], x1 // rows 4..7
|
|
b.ne 6b
|
|
ret
|
|
endfunc
|
|
|
|
// ff_pred8x8_l0t_dc_neon
// 8x8 dc fill that reads the row above (top[0..7]) and only the upper four
// bytes of the left column (left[0..3]). With T0/T1 = sums of top[0..3] /
// top[4..7] and L0 = sum of left[0..3]:
//   rows 0-3, columns 0-3 : (T0 + L0 + 4) >> 3
//   rows 0-3, columns 4-7 : (T1 + 2) >> 2
//   rows 4-7, columns 0-3 : (T0 + 2) >> 2
//   rows 4-7, columns 4-7 : (T1 + 2) >> 2
// NOTE(review): the l0t suffix presumably encodes which neighbours are
// available - confirm against the C-side dispatch table.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers x2, x3, v0-v6 here; the shared tail at .L_pred8x8_dc_end (inside
// ff_pred8x8_dc_neon) stores v0 to rows 0-3, v1 to rows 4-7 and returns.
// v1 is only partially loaded; the stale lanes end up in result lanes that
// are never selected by the dup instructions below.
function ff_pred8x8_l0t_dc_neon, export=1
|
|
sub x2, x0, x1 // x2 = one row above the block
|
|
sub x3, x0, #1 // x3 = byte left of row 0
|
|
ld1 {v0.8b}, [x2] // 8 bytes of the row above
|
|
ldcol.8 v1, x3, x1, 4 // v1 bytes 0-3 = left[0..3]
|
|
zip1 v0.4s, v0.4s, v1.4s // words: top[0..3], left[0..3], top[4..7], stale
|
|
uaddlp v0.8h, v0.16b // pair sums
|
|
addp v0.8h, v0.8h, v0.8h // lanes 0-3 = T0, L0, T1, stale
|
|
addp v1.4h, v0.4h, v0.4h // lane 0 = T0 + L0
|
|
rshrn v2.8b, v0.8h, #2 // byte 0 = mean of T0, byte 2 = mean of T1
|
|
rshrn v3.8b, v1.8h, #3 // byte 0 = (T0 + L0 + 4) >> 3
|
|
dup v4.8b, v3.b[0] // upper-left quadrant value
|
|
dup v6.8b, v2.b[2] // right quadrants value (both halves)
|
|
dup v5.8b, v2.b[0] // lower-left quadrant value
|
|
zip1 v0.2s, v4.2s, v6.2s // rows 0-3 pattern
|
|
zip1 v1.2s, v5.2s, v6.2s // rows 4-7 pattern
|
|
b .L_pred8x8_dc_end // tail: store both halves and return
|
|
endfunc
|
|
|
|
// ff_pred8x8_l00_dc_neon
// 8x8 dc fill that reads only the upper four bytes of the left column.
// Rows 0-3 are filled with (sum of left[0..3] + 2) >> 2, rows 4-7 with the
// constant 128.
// NOTE(review): the l00 suffix presumably encodes which neighbours are
// available - confirm against the C-side dispatch table.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers x2, v0, v1 here; the shared tail at .L_pred8x8_dc_end (inside
// ff_pred8x8_dc_neon) stores v0 to rows 0-3, v1 to rows 4-7 and returns.
function ff_pred8x8_l00_dc_neon, export=1
|
|
sub x2, x0, #1 // x2 = byte left of row 0
|
|
ldcol.8 v0, x2, x1, 4 // v0 bytes 0-3 = left[0..3]; bytes 4-7 stale
|
|
uaddlp v0.4h, v0.8b // lanes 0-1 = pair sums of left[0..3]
|
|
addp v0.4h, v0.4h, v0.4h // lane 0 = sum of left[0..3]
|
|
rshrn v0.8b, v0.8h, #2 // byte 0 = (sum + 2) >> 2
|
|
movi v1.8b, #128 // rows 4-7 pattern
|
|
dup v0.8b, v0.b[0] // rows 0-3 pattern
|
|
b .L_pred8x8_dc_end // tail: store both halves and return
|
|
endfunc
|
|
|
|
// ff_pred8x8_0lt_dc_neon
// 8x8 dc fill that reads the row above (top[0..7]) and only the lower four
// bytes of the left column (left[4..7]). With T0/T1 = sums of top[0..3] /
// top[4..7] and L1 = sum of left[4..7]:
//   rows 0-3, columns 0-3 : (T0 + 2) >> 2
//   rows 0-3, columns 4-7 : (T1 + 2) >> 2
//   rows 4-7, columns 0-3 : (L1 + 2) >> 2
//   rows 4-7, columns 4-7 : (T1 + L1 + 4) >> 3
// NOTE(review): the 0lt suffix presumably encodes which neighbours are
// available - confirm against the C-side dispatch table.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers x2, x3, v0-v7 here; the shared tail at .L_pred8x8_dc_end (inside
// ff_pred8x8_dc_neon) stores v0 to rows 0-3, v1 to rows 4-7 and returns.
// v1 is only partially loaded; the stale lanes end up in result lanes that
// are never selected by the dup instructions below.
function ff_pred8x8_0lt_dc_neon, export=1
|
|
add x3, x0, x1, lsl #2 // x3 = address of row 4
|
|
sub x2, x0, x1 // x2 = one row above the block
|
|
sub x3, x3, #1 // x3 = byte left of row 4
|
|
ld1 {v0.8b}, [x2] // 8 bytes of the row above
|
|
ldcol.8 v1, x3, x1, 4, hi=1 // v1 bytes 4-7 = left[4..7]
|
|
zip1 v0.4s, v0.4s, v1.4s // words: top[0..3], stale, top[4..7], left[4..7]
|
|
uaddlp v0.8h, v0.16b // pair sums
|
|
addp v0.8h, v0.8h, v0.8h // lanes 0-3 = T0, stale, T1, L1
|
|
addp v1.4h, v0.4h, v0.4h // lane 1 = T1 + L1
|
|
rshrn v2.8b, v0.8h, #2 // bytes 0, 2, 3 = means of T0, T1, L1
|
|
rshrn v3.8b, v1.8h, #3 // byte 1 = (T1 + L1 + 4) >> 3
|
|
dup v4.8b, v2.b[0] // upper-left quadrant value
|
|
dup v5.8b, v2.b[3] // lower-left quadrant value
|
|
dup v6.8b, v2.b[2] // upper-right quadrant value
|
|
dup v7.8b, v3.b[1] // lower-right quadrant value
|
|
zip1 v0.2s, v4.2s, v6.2s // rows 0-3 pattern
|
|
zip1 v1.2s, v5.2s, v7.2s // rows 4-7 pattern
|
|
b .L_pred8x8_dc_end // tail: store both halves and return
|
|
endfunc
|
|
|
|
// ff_pred8x8_0l0_dc_neon
// 8x8 dc fill that reads only the lower four bytes of the left column.
// Rows 0-3 are filled with the constant 128, rows 4-7 with
// (sum of left[4..7] + 2) >> 2.
// NOTE(review): the 0l0 suffix presumably encodes which neighbours are
// available - confirm against the C-side dispatch table.
// In:  x0 = address of the first output row, x1 = byte offset between rows.
// Clobbers x2, v0, v1, v2 here; the shared tail at .L_pred8x8_dc_end (inside
// ff_pred8x8_dc_neon) stores v0 to rows 0-3, v1 to rows 4-7 and returns.
function ff_pred8x8_0l0_dc_neon, export=1
|
|
add x2, x0, x1, lsl #2 // x2 = address of row 4
|
|
sub x2, x2, #1 // x2 = byte left of row 4
|
|
ldcol.8 v1, x2, x1, 4 // v1 bytes 0-3 = left[4..7]; bytes 4-7 stale
|
|
uaddlp v2.4h, v1.8b // lanes 0-1 = pair sums of left[4..7]
|
|
addp v2.4h, v2.4h, v2.4h // lane 0 = sum of left[4..7]
|
|
rshrn v1.8b, v2.8h, #2 // byte 0 = (sum + 2) >> 2
|
|
movi v0.8b, #128 // rows 0-3 pattern
|
|
dup v1.8b, v1.b[0] // rows 4-7 pattern
|
|
b .L_pred8x8_dc_end // tail: store both halves and return
|
|
endfunc
|