You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-10 06:10:52 +02:00
swscale/aarch64: Refactor hscale_16_to_15__fs_4
This patch removes the use of stack for temporary state and replaces interleaved ld4 loads with ld1.

Benchmark results, listed per CPU as "before" followed by "after":

A78, before:
hscale_16_to_15__fs_4_dstW_8_neon: 86.8 ( 1.72x)
hscale_16_to_15__fs_4_dstW_24_neon: 147.5 ( 2.73x)
hscale_16_to_15__fs_4_dstW_128_neon: 614.0 ( 3.14x)
hscale_16_to_15__fs_4_dstW_144_neon: 680.5 ( 3.18x)
hscale_16_to_15__fs_4_dstW_256_neon: 1193.2 ( 3.19x)
hscale_16_to_15__fs_4_dstW_512_neon: 2305.0 ( 3.27x)

A78, after:
hscale_16_to_15__fs_4_dstW_8_neon: 86.0 ( 1.74x)
hscale_16_to_15__fs_4_dstW_24_neon: 106.8 ( 3.78x)
hscale_16_to_15__fs_4_dstW_128_neon: 404.0 ( 4.81x)
hscale_16_to_15__fs_4_dstW_144_neon: 451.8 ( 4.80x)
hscale_16_to_15__fs_4_dstW_256_neon: 760.5 ( 5.06x)
hscale_16_to_15__fs_4_dstW_512_neon: 1520.0 ( 5.01x)

A72, before:
hscale_16_to_15__fs_4_dstW_8_neon: 156.8 ( 1.52x)
hscale_16_to_15__fs_4_dstW_24_neon: 217.8 ( 2.52x)
hscale_16_to_15__fs_4_dstW_128_neon: 906.8 ( 2.90x)
hscale_16_to_15__fs_4_dstW_144_neon: 1014.5 ( 2.91x)
hscale_16_to_15__fs_4_dstW_256_neon: 1751.5 ( 2.96x)
hscale_16_to_15__fs_4_dstW_512_neon: 3469.3 ( 2.97x)

A72, after:
hscale_16_to_15__fs_4_dstW_8_neon: 151.2 ( 1.54x)
hscale_16_to_15__fs_4_dstW_24_neon: 173.4 ( 3.15x)
hscale_16_to_15__fs_4_dstW_128_neon: 660.0 ( 3.98x)
hscale_16_to_15__fs_4_dstW_144_neon: 735.7 ( 4.00x)
hscale_16_to_15__fs_4_dstW_256_neon: 1273.5 ( 4.09x)
hscale_16_to_15__fs_4_dstW_512_neon: 2488.2 ( 4.16x)

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
committed by
Martin Storsjö
parent
76b1810017
commit
38929b824b
@@ -638,6 +638,16 @@ function ff_hscale8to19_X4_neon, export=1
|
|||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// hscale_iter: widen two source vectors and one filter vector from 16-bit to
// 32-bit lanes and multiply them lane by lane. No accumulation or horizontal
// add is done here; the two product vectors are returned separately.
//
//   src, src2 - in:  four 16-bit values each, in the low 64 bits of the
//                    register; treated as unsigned. Both registers are
//                    overwritten with their zero-extended .4s form.
//   filter    - in:  eight signed 16-bit values. The register is overwritten
//                    with the sign-extended .4s form of its upper four lanes.
//   dst1      - out: src[i]  * filter[i]     for i = 0..3 (32-bit lanes)
//   dst2      - out: src2[i] * filter[4 + i] for i = 0..3 (32-bit lanes)
//
// Clobbers v19, which is used as scratch for the sign-extended lower four
// filter lanes; v19 must therefore not be passed as any of the arguments.
.macro hscale_iter src, src2, filter, dst1, dst2
|
||||||
|
// zero-extend the first four source values to 32 bits, in place
uxtl \src\().4s, \src\().4h
|
||||||
|
// sign-extend filter lanes 0..3 into the v19 scratch register
sxtl v19.4s, \filter\().4h
|
||||||
|
// dst1 = src * filter[0..3]
mul \dst1\().4s, \src\().4s, v19.4s
|
||||||
|
// zero-extend the second four source values to 32 bits, in place
uxtl \src2\().4s, \src2\().4h
|
||||||
|
// sign-extend filter lanes 4..7 in place; the 16-bit filter data is lost
sxtl2 \filter\().4s, \filter\().8h
|
||||||
|
// dst2 = src2 * filter[4..7]
mul \dst2\().4s, \src2\().4s, \filter\().4s
|
||||||
|
.endm
|
||||||
|
|
||||||
function ff_hscale16to15_4_neon_asm, export=1
|
function ff_hscale16to15_4_neon_asm, export=1
|
||||||
// w0 int shift
|
// w0 int shift
|
||||||
// x1 int32_t *dst
|
// x1 int32_t *dst
|
||||||
@@ -664,6 +674,7 @@ function ff_hscale16to15_4_neon_asm, export=1
|
|||||||
add x5, x5, #32
|
add x5, x5, #32
|
||||||
|
|
||||||
// shift all filterPos left by one, as uint16_t will be read
|
// shift all filterPos left by one, as uint16_t will be read
|
||||||
|
ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
|
||||||
lsl x8, x8, #1
|
lsl x8, x8, #1
|
||||||
lsl x9, x9, #1
|
lsl x9, x9, #1
|
||||||
lsl x10, x10, #1
|
lsl x10, x10, #1
|
||||||
@@ -674,154 +685,101 @@ function ff_hscale16to15_4_neon_asm, export=1
|
|||||||
lsl x15, x15, #1
|
lsl x15, x15, #1
|
||||||
|
|
||||||
// load src with given offset
|
// load src with given offset
|
||||||
ldr x8, [x3, w8, uxtw]
|
ldr d0, [x3, w8, uxtw]
|
||||||
ldr x9, [x3, w9, uxtw]
|
ldr d1, [x3, w9, uxtw]
|
||||||
ldr x10, [x3, w10, uxtw]
|
ldr d2, [x3, w10, uxtw]
|
||||||
ldr x11, [x3, w11, uxtw]
|
ldr d3, [x3, w11, uxtw]
|
||||||
ldr x12, [x3, w12, uxtw]
|
ldr d4, [x3, w12, uxtw]
|
||||||
ldr x13, [x3, w13, uxtw]
|
ldr d5, [x3, w13, uxtw]
|
||||||
ldr x14, [x3, w14, uxtw]
|
ldr d6, [x3, w14, uxtw]
|
||||||
ldr x15, [x3, w15, uxtw]
|
ldr d7, [x3, w15, uxtw]
|
||||||
|
|
||||||
sub sp, sp, #64
|
|
||||||
// push src on stack so it can be loaded into vectors later
|
|
||||||
stp x8, x9, [sp]
|
|
||||||
stp x10, x11, [sp, #16]
|
|
||||||
stp x12, x13, [sp, #32]
|
|
||||||
stp x14, x15, [sp, #48]
|
|
||||||
|
|
||||||
1:
|
1:
|
||||||
ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
|
|
||||||
ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
|
|
||||||
|
|
||||||
// Each of blocks does the following:
|
|
||||||
// Extend src and filter to 32 bits with uxtl and sxtl
|
|
||||||
// multiply or multiply and accumulate results
|
|
||||||
// Extending to 32 bits is necessary, as uint16_t values can't
|
|
||||||
// be represented as int16_t without type promotion.
|
|
||||||
uxtl v26.4s, v0.4h
|
|
||||||
sxtl v27.4s, v28.4h
|
|
||||||
uxtl2 v0.4s, v0.8h
|
|
||||||
mul v5.4s, v26.4s, v27.4s
|
|
||||||
sxtl2 v28.4s, v28.8h
|
|
||||||
uxtl v26.4s, v1.4h
|
|
||||||
mul v6.4s, v0.4s, v28.4s
|
|
||||||
|
|
||||||
sxtl v27.4s, v29.4h
|
|
||||||
uxtl2 v0.4s, v1.8h
|
|
||||||
mla v5.4s, v27.4s, v26.4s
|
|
||||||
sxtl2 v28.4s, v29.8h
|
|
||||||
uxtl v26.4s, v2.4h
|
|
||||||
mla v6.4s, v28.4s, v0.4s
|
|
||||||
|
|
||||||
sxtl v27.4s, v30.4h
|
|
||||||
uxtl2 v0.4s, v2.8h
|
|
||||||
mla v5.4s, v27.4s, v26.4s
|
|
||||||
sxtl2 v28.4s, v30.8h
|
|
||||||
uxtl v26.4s, v3.4h
|
|
||||||
mla v6.4s, v28.4s, v0.4s
|
|
||||||
|
|
||||||
sxtl v27.4s, v31.4h
|
|
||||||
uxtl2 v0.4s, v3.8h
|
|
||||||
mla v5.4s, v27.4s, v26.4s
|
|
||||||
sxtl2 v28.4s, v31.8h
|
|
||||||
sub w2, w2, #8
|
|
||||||
mla v6.4s, v28.4s, v0.4s
|
|
||||||
|
|
||||||
sshl v5.4s, v5.4s, v17.4s
|
|
||||||
sshl v6.4s, v6.4s, v17.4s
|
|
||||||
smin v5.4s, v5.4s, v18.4s
|
|
||||||
smin v6.4s, v6.4s, v18.4s
|
|
||||||
xtn v5.4h, v5.4s
|
|
||||||
xtn2 v5.8h, v6.4s
|
|
||||||
|
|
||||||
st1 {v5.8h}, [x1], #16
|
|
||||||
cmp w2, #16
|
|
||||||
|
|
||||||
// load filterPositions into registers for next iteration
|
// load filterPositions into registers for next iteration
|
||||||
|
|
||||||
|
hscale_iter v0, v1, v28, v20, v21
|
||||||
ldp w8, w9, [x5] // filterPos[0], filterPos[1]
|
ldp w8, w9, [x5] // filterPos[0], filterPos[1]
|
||||||
|
hscale_iter v2, v3, v29, v22, v23
|
||||||
ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
|
ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
|
||||||
|
hscale_iter v4, v5, v30, v24, v25
|
||||||
ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
|
ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
|
||||||
|
hscale_iter v6, v7, v31, v26, v27
|
||||||
ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
|
ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
|
||||||
|
subs w2, w2, #8
|
||||||
add x5, x5, #32
|
add x5, x5, #32
|
||||||
|
|
||||||
|
ldp q28, q29, [x4], #32 // filter[0..7]
|
||||||
lsl x8, x8, #1
|
lsl x8, x8, #1
|
||||||
lsl x9, x9, #1
|
lsl x9, x9, #1
|
||||||
lsl x10, x10, #1
|
lsl x10, x10, #1
|
||||||
lsl x11, x11, #1
|
lsl x11, x11, #1
|
||||||
|
ldp q30, q31, [x4], #32 // filter[0..7]
|
||||||
lsl x12, x12, #1
|
lsl x12, x12, #1
|
||||||
lsl x13, x13, #1
|
lsl x13, x13, #1
|
||||||
lsl x14, x14, #1
|
lsl x14, x14, #1
|
||||||
lsl x15, x15, #1
|
lsl x15, x15, #1
|
||||||
|
|
||||||
ldr x8, [x3, w8, uxtw]
|
addp v20.4s, v20.4s, v21.4s
|
||||||
ldr x9, [x3, w9, uxtw]
|
ldr d0, [x3, w8, uxtw]
|
||||||
ldr x10, [x3, w10, uxtw]
|
addp v22.4s, v22.4s, v23.4s
|
||||||
ldr x11, [x3, w11, uxtw]
|
ldr d1, [x3, w9, uxtw]
|
||||||
ldr x12, [x3, w12, uxtw]
|
addp v24.4s, v24.4s, v25.4s
|
||||||
ldr x13, [x3, w13, uxtw]
|
ldr d2, [x3, w10, uxtw]
|
||||||
ldr x14, [x3, w14, uxtw]
|
addp v26.4s, v26.4s, v27.4s
|
||||||
ldr x15, [x3, w15, uxtw]
|
ldr d3, [x3, w11, uxtw]
|
||||||
|
addp v20.4s, v20.4s, v22.4s
|
||||||
|
ldr d4, [x3, w12, uxtw]
|
||||||
|
addp v21.4s, v24.4s, v26.4s
|
||||||
|
ldr d5, [x3, w13, uxtw]
|
||||||
|
cmp w2, #16
|
||||||
|
|
||||||
stp x8, x9, [sp]
|
sshl v20.4s, v20.4s, v17.4s
|
||||||
stp x10, x11, [sp, #16]
|
ldr d6, [x3, w14, uxtw]
|
||||||
stp x12, x13, [sp, #32]
|
sshl v21.4s, v21.4s, v17.4s
|
||||||
stp x14, x15, [sp, #48]
|
ldr d7, [x3, w15, uxtw]
|
||||||
|
smin v20.4s, v20.4s, v18.4s
|
||||||
|
smin v21.4s, v21.4s, v18.4s
|
||||||
|
xtn v20.4h, v20.4s
|
||||||
|
xtn2 v20.8h, v21.4s
|
||||||
|
|
||||||
|
st1 {v20.8h}, [x1], #16
|
||||||
|
|
||||||
b.ge 1b
|
b.ge 1b
|
||||||
|
|
||||||
// here we make last iteration, without updating the registers
|
// here we make last iteration, without updating the registers
|
||||||
ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
|
|
||||||
ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
|
|
||||||
|
|
||||||
uxtl v26.4s, v0.4h
|
hscale_iter v0, v1, v28, v20, v21
|
||||||
sxtl v27.4s, v28.4h
|
hscale_iter v2, v3, v29, v22, v23
|
||||||
uxtl2 v0.4s, v0.8h
|
hscale_iter v4, v5, v30, v24, v25
|
||||||
mul v5.4s, v26.4s, v27.4s
|
hscale_iter v6, v7, v31, v26, v27
|
||||||
sxtl2 v28.4s, v28.8h
|
|
||||||
uxtl v26.4s, v1.4h
|
|
||||||
mul v6.4s, v0.4s, v28.4s
|
|
||||||
|
|
||||||
sxtl v27.4s, v29.4h
|
|
||||||
uxtl2 v0.4s, v1.8h
|
|
||||||
mla v5.4s, v26.4s, v27.4s
|
|
||||||
sxtl2 v28.4s, v29.8h
|
|
||||||
uxtl v26.4s, v2.4h
|
|
||||||
mla v6.4s, v0.4s, v28.4s
|
|
||||||
|
|
||||||
sxtl v27.4s, v30.4h
|
|
||||||
uxtl2 v0.4s, v2.8h
|
|
||||||
mla v5.4s, v26.4s, v27.4s
|
|
||||||
sxtl2 v28.4s, v30.8h
|
|
||||||
uxtl v26.4s, v3.4h
|
|
||||||
mla v6.4s, v0.4s, v28.4s
|
|
||||||
|
|
||||||
sxtl v27.4s, v31.4h
|
|
||||||
uxtl2 v0.4s, v3.8h
|
|
||||||
mla v5.4s, v26.4s, v27.4s
|
|
||||||
sxtl2 v28.4s, v31.8h
|
|
||||||
subs w2, w2, #8
|
subs w2, w2, #8
|
||||||
mla v6.4s, v0.4s, v28.4s
|
|
||||||
|
|
||||||
sshl v5.4s, v5.4s, v17.4s
|
addp v20.4s, v20.4s, v21.4s
|
||||||
sshl v6.4s, v6.4s, v17.4s
|
addp v22.4s, v22.4s, v23.4s
|
||||||
smin v5.4s, v5.4s, v18.4s
|
addp v24.4s, v24.4s, v25.4s
|
||||||
smin v6.4s, v6.4s, v18.4s
|
addp v26.4s, v26.4s, v27.4s
|
||||||
xtn v5.4h, v5.4s
|
addp v0.4s, v20.4s, v22.4s
|
||||||
xtn2 v5.8h, v6.4s
|
addp v1.4s, v24.4s, v26.4s
|
||||||
|
|
||||||
st1 {v5.8h}, [x1], #16
|
sshl v0.4s, v0.4s, v17.4s
|
||||||
add sp, sp, #64 // restore stack
|
sshl v1.4s, v1.4s, v17.4s
|
||||||
|
smin v0.4s, v0.4s, v18.4s
|
||||||
|
smin v1.4s, v1.4s, v18.4s
|
||||||
|
xtn v0.4h, v0.4s
|
||||||
|
xtn2 v0.8h, v1.4s
|
||||||
|
|
||||||
|
st1 {v0.8h}, [x1], #16
|
||||||
cbnz w2, 2f
|
cbnz w2, 2f
|
||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
2:
|
2:
|
||||||
ldr w8, [x5], #4 // load filterPos
|
ldr w8, [x5], #4 // load filterPos
|
||||||
lsl w8, w8, #1
|
add x9, x3, w8, uxtw #1 // src + filterPos
|
||||||
add x9, x3, w8, uxtw // src + filterPos
|
|
||||||
ld1 {v0.4h}, [x9] // load 4 * uint16_t
|
ld1 {v0.4h}, [x9] // load 4 * uint16_t
|
||||||
ld1 {v31.4h}, [x4], #8
|
ld1 {v31.4h}, [x4], #8
|
||||||
|
sub w2, w2, #1
|
||||||
|
|
||||||
uxtl v0.4s, v0.4h
|
uxtl v0.4s, v0.4h
|
||||||
sxtl v31.4s, v31.4h
|
sxtl v31.4s, v31.4h
|
||||||
@@ -830,7 +788,6 @@ function ff_hscale16to15_4_neon_asm, export=1
|
|||||||
sshl v0.4s, v0.4s, v17.4s
|
sshl v0.4s, v0.4s, v17.4s
|
||||||
smin v0.4s, v0.4s, v18.4s
|
smin v0.4s, v0.4s, v18.4s
|
||||||
st1 {v0.h}[0], [x1], #2
|
st1 {v0.h}[0], [x1], #2
|
||||||
sub w2, w2, #1
|
|
||||||
cbnz w2, 2b // if iterations remain jump to beginning
|
cbnz w2, 2b // if iterations remain jump to beginning
|
||||||
|
|
||||||
ret
|
ret
|
||||||
|
Reference in New Issue
Block a user