aarch64: Reindent all assembly to 8/24 column indentation
libavcodec/aarch64/vc1dsp_neon.S is skipped here, as it intentionally uses a
layered indentation style to visually show how different unrolled/interleaved
phases fit together.

Signed-off-by: Martin Storsjö <martin@martin.st>
Parent: cada4597ca
Commit: a76b409dd0
@@ -19,130 +19,130 @@

#include "libavutil/aarch64/asm.S"

function ff_ps_add_squares_neon, export=1
1:      ld1             {v0.4s,v1.4s}, [x1], #32
        fmul            v0.4s, v0.4s, v0.4s
        fmul            v1.4s, v1.4s, v1.4s
        faddp           v2.4s, v0.4s, v1.4s
        ld1             {v3.4s}, [x0]
        fadd            v3.4s, v3.4s, v2.4s
        st1             {v3.4s}, [x0], #16
        subs            w2, w2, #4
        b.gt            1b
        ret
endfunc
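
For readers cross-checking against the C reference: the loop above squares four
complex samples per iteration and accumulates their magnitudes into dst. A
scalar sketch (the prototype is modelled on the AAC PS DSP C template and
should be treated as an assumption):

    static void ps_add_squares_c(float *dst, const float (*src)[2], int n)
    {
        /* each src[i] is a complex sample: src[i][0] = re, src[i][1] = im */
        for (int i = 0; i < n; i++)
            dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
    }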

function ff_ps_mul_pair_single_neon, export=1
1:      ld1             {v0.4s,v1.4s}, [x1], #32
        ld1             {v2.4s}, [x2], #16
        zip1            v3.4s, v2.4s, v2.4s
        zip2            v4.4s, v2.4s, v2.4s
        fmul            v0.4s, v0.4s, v3.4s
        fmul            v1.4s, v1.4s, v4.4s
        st1             {v0.4s,v1.4s}, [x0], #32
        subs            w3, w3, #4
        b.gt            1b
        ret
endfunc
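
Here the zip1/zip2 pair duplicates each real gain across the re/im lanes. A
scalar sketch, with the same caveat about the exact prototype:

    static void ps_mul_pair_single_c(float (*dst)[2], const float (*src0)[2],
                                     const float *src1, int n)
    {
        /* multiply each complex sample by its matching real-valued gain */
        for (int i = 0; i < n; i++) {
            dst[i][0] = src0[i][0] * src1[i];
            dst[i][1] = src0[i][1] * src1[i];
        }
    }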

function ff_ps_stereo_interpolate_neon, export=1
        ld1             {v0.4s}, [x2]
        ld1             {v1.4s}, [x3]
        zip1            v4.4s, v0.4s, v0.4s
        zip2            v5.4s, v0.4s, v0.4s
        zip1            v6.4s, v1.4s, v1.4s
        zip2            v7.4s, v1.4s, v1.4s
1:      ld1             {v2.2s}, [x0]
        ld1             {v3.2s}, [x1]
        fadd            v4.4s, v4.4s, v6.4s
        fadd            v5.4s, v5.4s, v7.4s
        mov             v2.d[1], v2.d[0]
        mov             v3.d[1], v3.d[0]
        fmul            v2.4s, v2.4s, v4.4s
        fmla            v2.4s, v3.4s, v5.4s
        st1             {v2.d}[0], [x0], #8
        st1             {v2.d}[1], [x1], #8
        subs            w4, w4, #1
        b.gt            1b
        ret
endfunc
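
The function applies a 2x2 real mixing matrix to each (l, r) complex pair,
ramping the four coefficients by a per-sample step: v4/v5 hold the duplicated
coefficients, v6/v7 the steps. A scalar sketch (assumed prototype; only the
real row h[0][] is used by this variant):

    static void ps_stereo_interpolate_c(float (*l)[2], float (*r)[2],
                                        float h[2][4], float h_step[2][4],
                                        int len)
    {
        float h0 = h[0][0], h1 = h[0][1], h2 = h[0][2], h3 = h[0][3];
        float s0 = h_step[0][0], s1 = h_step[0][1],
              s2 = h_step[0][2], s3 = h_step[0][3];

        for (int n = 0; n < len; n++) {
            float l_re = l[n][0], l_im = l[n][1];
            float r_re = r[n][0], r_im = r[n][1];
            h0 += s0; h1 += s1; h2 += s2; h3 += s3;   /* ramp coefficients */
            l[n][0] = h0 * l_re + h2 * r_re;
            l[n][1] = h0 * l_im + h2 * r_im;
            r[n][0] = h1 * l_re + h3 * r_re;
            r[n][1] = h1 * l_im + h3 * r_im;
        }
    }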

function ff_ps_stereo_interpolate_ipdopd_neon, export=1
        ld1             {v0.4s,v1.4s}, [x2]
        ld1             {v6.4s,v7.4s}, [x3]
        fneg            v2.4s, v1.4s
        fneg            v3.4s, v7.4s
        zip1            v16.4s, v0.4s, v0.4s
        zip2            v17.4s, v0.4s, v0.4s
        zip1            v18.4s, v2.4s, v1.4s
        zip2            v19.4s, v2.4s, v1.4s
        zip1            v20.4s, v6.4s, v6.4s
        zip2            v21.4s, v6.4s, v6.4s
        zip1            v22.4s, v3.4s, v7.4s
        zip2            v23.4s, v3.4s, v7.4s
1:      ld1             {v2.2s}, [x0]
        ld1             {v3.2s}, [x1]
        fadd            v16.4s, v16.4s, v20.4s
        fadd            v17.4s, v17.4s, v21.4s
        mov             v2.d[1], v2.d[0]
        mov             v3.d[1], v3.d[0]
        fmul            v4.4s, v2.4s, v16.4s
        fmla            v4.4s, v3.4s, v17.4s
        fadd            v18.4s, v18.4s, v22.4s
        fadd            v19.4s, v19.4s, v23.4s
        ext             v2.16b, v2.16b, v2.16b, #4
        ext             v3.16b, v3.16b, v3.16b, #4
        fmla            v4.4s, v2.4s, v18.4s
        fmla            v4.4s, v3.4s, v19.4s
        st1             {v4.d}[0], [x0], #8
        st1             {v4.d}[1], [x1], #8
        subs            w4, w4, #1
        b.gt            1b
        ret
endfunc
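
The ipdopd variant extends the plain interpolation with the imaginary parts of
the coefficients: the ext #4 instructions swap re/im within each sample so a
single fmla adds the cross terms of a complex multiply, and v18/v19 hold the
imaginary coefficients with alternating negation (the fneg above). A scalar
sketch, same caveat on the prototype:

    static void ps_stereo_interpolate_ipdopd_c(float (*l)[2], float (*r)[2],
                                               float h[2][4],
                                               float h_step[2][4], int len)
    {
        /* h[0][] holds the real parts, h[1][] the imaginary parts */
        float re0 = h[0][0], re1 = h[0][1], re2 = h[0][2], re3 = h[0][3];
        float im0 = h[1][0], im1 = h[1][1], im2 = h[1][2], im3 = h[1][3];

        for (int n = 0; n < len; n++) {
            float l_re = l[n][0], l_im = l[n][1];
            float r_re = r[n][0], r_im = r[n][1];
            re0 += h_step[0][0]; re1 += h_step[0][1];
            re2 += h_step[0][2]; re3 += h_step[0][3];
            im0 += h_step[1][0]; im1 += h_step[1][1];
            im2 += h_step[1][2]; im3 += h_step[1][3];
            l[n][0] = re0 * l_re + re2 * r_re - im0 * l_im - im2 * r_im;
            l[n][1] = re0 * l_im + re2 * r_im + im0 * l_re + im2 * r_re;
            r[n][0] = re1 * l_re + re3 * r_re - im1 * l_im - im3 * r_im;
            r[n][1] = re1 * l_im + re3 * r_im + im1 * l_re + im3 * r_re;
        }
    }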

function ff_ps_hybrid_analysis_neon, export=1
        lsl             x3, x3, #3
        ld2             {v0.4s,v1.4s}, [x1], #32
        ld2             {v2.2s,v3.2s}, [x1], #16
        ld1             {v24.2s}, [x1], #8
        ld2             {v4.2s,v5.2s}, [x1], #16
        ld2             {v6.4s,v7.4s}, [x1]
        rev64           v6.4s, v6.4s
        rev64           v7.4s, v7.4s
        ext             v6.16b, v6.16b, v6.16b, #8
        ext             v7.16b, v7.16b, v7.16b, #8
        rev64           v4.2s, v4.2s
        rev64           v5.2s, v5.2s
        mov             v2.d[1], v3.d[0]
        mov             v4.d[1], v5.d[0]
        mov             v5.d[1], v2.d[0]
        mov             v3.d[1], v4.d[0]
        fadd            v16.4s, v0.4s, v6.4s
        fadd            v17.4s, v1.4s, v7.4s
        fsub            v18.4s, v1.4s, v7.4s
        fsub            v19.4s, v0.4s, v6.4s
        fadd            v22.4s, v2.4s, v4.4s
        fsub            v23.4s, v5.4s, v3.4s
        trn1            v20.2d, v22.2d, v23.2d          // {re4+re8, re5+re7, im8-im4, im7-im5}
        trn2            v21.2d, v22.2d, v23.2d          // {im4+im8, im5+im7, re4-re8, re5-re7}
1:      ld2             {v2.4s,v3.4s}, [x2], #32
        ld2             {v4.2s,v5.2s}, [x2], #16
        ld1             {v6.2s}, [x2], #8
        add             x2, x2, #8
        mov             v4.d[1], v5.d[0]
        mov             v6.s[1], v6.s[0]
        fmul            v6.2s, v6.2s, v24.2s
        fmul            v0.4s, v2.4s, v16.4s
        fmul            v1.4s, v2.4s, v17.4s
        fmls            v0.4s, v3.4s, v18.4s
        fmla            v1.4s, v3.4s, v19.4s
        fmla            v0.4s, v4.4s, v20.4s
        fmla            v1.4s, v4.4s, v21.4s
        faddp           v0.4s, v0.4s, v1.4s
        faddp           v0.4s, v0.4s, v0.4s
        fadd            v0.2s, v0.2s, v6.2s
        st1             {v0.2s}, [x0], x3
        subs            w4, w4, #1
        b.gt            1b
        ret
endfunc
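
The hybrid analysis kernel is a 13-tap complex FIR that exploits the filter's
symmetry, which is what the trn1/trn2 comments about re4+re8 etc. refer to:
taps j and 12-j are folded together before the per-channel multiplies. A
scalar sketch along the lines of FFmpeg's C template (prototype is an
assumption):

    #include <stddef.h>

    static void ps_hybrid_analysis_c(float (*out)[2], float (*in)[2],
                                     const float (*filter)[8][2],
                                     ptrdiff_t stride, int n)
    {
        for (int i = 0; i < n; i++) {
            /* the center tap (j == 6) has a purely real coefficient */
            float sum_re = filter[i][6][0] * in[6][0];
            float sum_im = filter[i][6][0] * in[6][1];

            for (int j = 0; j < 6; j++) {  /* fold symmetric taps j, 12-j */
                float in0_re = in[j][0],      in0_im = in[j][1];
                float in1_re = in[12 - j][0], in1_im = in[12 - j][1];
                sum_re += filter[i][j][0] * (in0_re + in1_re) -
                          filter[i][j][1] * (in0_im - in1_im);
                sum_im += filter[i][j][0] * (in0_im + in1_im) +
                          filter[i][j][1] * (in0_re - in1_re);
            }
            out[i * stride][0] = sum_re;  /* asm: st1 {v0.2s}, [x0], x3 */
            out[i * stride][1] = sum_im;
        }
    }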

@@ -33,81 +33,81 @@ const tab_x2, align=4
endconst

function ff_opus_deemphasis_neon, export=1
        movrel          x4, tab_st
        ld1             {v4.4s}, [x4]
        movrel          x4, tab_x0
        ld1             {v5.4s}, [x4]
        movrel          x4, tab_x1
        ld1             {v6.4s}, [x4]
        movrel          x4, tab_x2
        ld1             {v7.4s}, [x4]

        fmul            v0.4s, v4.4s, v0.s[0]

1:      ld1             {v1.4s, v2.4s}, [x1], #32

        fmla            v0.4s, v5.4s, v1.s[0]
        fmul            v3.4s, v7.4s, v2.s[2]

        fmla            v0.4s, v6.4s, v1.s[1]
        fmla            v3.4s, v6.4s, v2.s[1]

        fmla            v0.4s, v7.4s, v1.s[2]
        fmla            v3.4s, v5.4s, v2.s[0]

        fadd            v1.4s, v1.4s, v0.4s
        fadd            v2.4s, v2.4s, v3.4s

        fmla            v2.4s, v4.4s, v1.s[3]

        st1             {v1.4s, v2.4s}, [x0], #32
        fmul            v0.4s, v4.4s, v2.s[3]

        subs            w2, w2, #8
        b.gt            1b

        mov             s0, v2.s[3]

        ret
endfunc
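
For orientation: Opus/CELT deemphasis is a one-pole IIR filter, and the four
coefficient tables (tab_st, tab_x0..tab_x2) hold precomputed powers of the
filter coefficient so eight outputs can be produced per loop iteration. A
scalar sketch (the exact constant lives in the Opus decoder headers; 0.85f
here is an assumption):

    #define CELT_EMPH_COEFF 0.85f  /* assumption: see Opus/CELT for the exact value */

    static float deemphasis_c(float *y, const float *x, float coeff, int len)
    {
        /* coeff carries the previous output sample across calls */
        for (int i = 0; i < len; i++)
            coeff = y[i] = x[i] + coeff * CELT_EMPH_COEFF;
        return coeff;  /* the NEON version returns this via mov s0, v2.s[3] */
    }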

function ff_opus_postfilter_neon, export=1
        ld1             {v0.4s}, [x2]
        dup             v1.4s, v0.s[1]
        dup             v2.4s, v0.s[2]
        dup             v0.4s, v0.s[0]

        add             w1, w1, #2
        sub             x1, x0, x1, lsl #2

        ld1             {v3.4s}, [x1]
        fmul            v3.4s, v3.4s, v2.4s

1:      add             x1, x1, #4
        ld1             {v4.4s}, [x1]
        add             x1, x1, #4
        ld1             {v5.4s}, [x1]
        add             x1, x1, #4
        ld1             {v6.4s}, [x1]
        add             x1, x1, #4
        ld1             {v7.4s}, [x1]

        fmla            v3.4s, v7.4s, v2.4s
        fadd            v6.4s, v6.4s, v4.4s

        ld1             {v4.4s}, [x0]
        fmla            v4.4s, v5.4s, v0.4s

        fmul            v6.4s, v6.4s, v1.4s
        fadd            v6.4s, v6.4s, v3.4s

        fadd            v4.4s, v4.4s, v6.4s
        fmul            v3.4s, v7.4s, v2.4s

        st1             {v4.4s}, [x0], #16

        subs            w3, w3, #4
        b.gt            1b

        ret
endfunc
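
The postfilter is a 5-tap comb filter centred on a pitch lag: the address
arithmetic at the top (add #2 then sub x1, x0, x1, lsl #2) points x1 at
data[-period-2], and the four staggered loads form the shifted windows. A
scalar sketch with an assumed prototype:

    static void postfilter_c(float *data, int period, const float *gains,
                             int len)
    {
        const float g0 = gains[0], g1 = gains[1], g2 = gains[2];
        const float *x = data - period;  /* delayed signal */

        for (int i = 0; i < len; i++)
            data[i] += g0 *  x[i] +
                       g1 * (x[i - 1] + x[i + 1]) +
                       g2 * (x[i - 2] + x[i + 2]);
    }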

@@ -21,57 +21,57 @@

#include "libavutil/aarch64/asm.S"

function ff_resample_common_apply_filter_x4_float_neon, export=1
        movi            v0.4s, #0                       // accumulator
1:      ld1             {v1.4s}, [x1], #16              // src[0..3]
        ld1             {v2.4s}, [x2], #16              // filter[0..3]
        fmla            v0.4s, v1.4s, v2.4s             // accumulator += src[0..3] * filter[0..3]
        subs            w3, w3, #4                      // filter_length -= 4
        b.gt            1b                              // loop until filter_length
        faddp           v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        faddp           v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        st1             {v0.s}[0], [x0], #4             // write accumulator
        ret
endfunc
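
All four resample kernels in this file are the same dot product at different
unroll widths and sample types; a scalar sketch of the float case (assumed
prototype):

    static void resample_apply_filter_float_c(float *dst, const float *src,
                                              const float *filter,
                                              int filter_length)
    {
        float acc = 0.0f;
        for (int i = 0; i < filter_length; i++)
            acc += src[i] * filter[i];  /* the NEON fmla does 4 per step */
        *dst = acc;
    }

The x8 variants below simply consume 8 taps per iteration, and the s16
variants widen 16-bit products into 32-bit lanes via smlal/smlal2.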

function ff_resample_common_apply_filter_x8_float_neon, export=1
        movi            v0.4s, #0                       // accumulator
1:      ld1             {v1.4s}, [x1], #16              // src[0..3]
        ld1             {v2.4s}, [x2], #16              // filter[0..3]
        ld1             {v3.4s}, [x1], #16              // src[4..7]
        ld1             {v4.4s}, [x2], #16              // filter[4..7]
        fmla            v0.4s, v1.4s, v2.4s             // accumulator += src[0..3] * filter[0..3]
        fmla            v0.4s, v3.4s, v4.4s             // accumulator += src[4..7] * filter[4..7]
        subs            w3, w3, #8                      // filter_length -= 8
        b.gt            1b                              // loop until filter_length
        faddp           v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        faddp           v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        st1             {v0.s}[0], [x0], #4             // write accumulator
        ret
endfunc

function ff_resample_common_apply_filter_x4_s16_neon, export=1
        movi            v0.4s, #0                       // accumulator
1:      ld1             {v1.4h}, [x1], #8               // src[0..3]
        ld1             {v2.4h}, [x2], #8               // filter[0..3]
        smlal           v0.4s, v1.4h, v2.4h             // accumulator += src[0..3] * filter[0..3]
        subs            w3, w3, #4                      // filter_length -= 4
        b.gt            1b                              // loop until filter_length
        addp            v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        addp            v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        st1             {v0.s}[0], [x0], #4             // write accumulator
        ret
endfunc

function ff_resample_common_apply_filter_x8_s16_neon, export=1
        movi            v0.4s, #0                       // accumulator
1:      ld1             {v1.8h}, [x1], #16              // src[0..7]
        ld1             {v2.8h}, [x2], #16              // filter[0..7]
        smlal           v0.4s, v1.4h, v2.4h             // accumulator += src[0..3] * filter[0..3]
        smlal2          v0.4s, v1.8h, v2.8h             // accumulator += src[4..7] * filter[4..7]
        subs            w3, w3, #8                      // filter_length -= 8
        b.gt            1b                              // loop until filter_length
        addp            v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        addp            v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        st1             {v0.s}[0], [x0], #4             // write accumulator
        ret
endfunc

@@ -29,178 +29,178 @@ function ff_yuv2planeX_8_neon, export=1
// x5 - const uint8_t *dither,
// w6 - int offset

        ld1             {v0.8b}, [x5]                   // load 8x8-bit dither
        and             w6, w6, #7
        cbz             w6, 1f                          // check if offsetting present
        ext             v0.8b, v0.8b, v0.8b, #3         // honor offsetting which can be 0 or 3 only
1:      uxtl            v0.8h, v0.8b                    // extend dither to 16-bit
        ushll           v1.4s, v0.4h, #12               // extend dither to 32-bit with left shift by 12 (part 1)
        ushll2          v2.4s, v0.8h, #12               // extend dither to 32-bit with left shift by 12 (part 2)
        cmp             w1, #8                          // if filterSize == 8, branch to specialized version
        b.eq            6f
        cmp             w1, #4                          // if filterSize == 4, branch to specialized version
        b.eq            8f
        cmp             w1, #2                          // if filterSize == 2, branch to specialized version
        b.eq            10f

// The filter size does not match any of the specialized implementations. It is either even or odd.
// If it is even, use the first section below.
        mov             x7, #0                          // i = 0
        tbnz            w1, #0, 4f                      // if filterSize % 2 != 0 branch to specialized version
// fs % 2 == 0
2:      mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value
        mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value
        mov             w8, w1                          // tmpfilterSize = filterSize
        mov             x9, x2                          // srcp = src
        mov             x10, x0                         // filterp = filter
3:      ldp             x11, x12, [x9], #16             // get 2 pointers: src[j] and src[j+1]
        ldr             s7, [x10], #4                   // read 2x16-bit coeff X and Y at filter[j] and filter[j+1]
        add             x11, x11, x7, lsl #1            // &src[j  ][i]
        add             x12, x12, x7, lsl #1            // &src[j+1][i]
        ld1             {v5.8h}, [x11]                  // read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
        ld1             {v6.8h}, [x12]                  // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
        smlal           v3.4s, v5.4h, v7.h[0]           // val0 += {A,B,C,D} * X
        smlal2          v4.4s, v5.8h, v7.h[0]           // val1 += {E,F,G,H} * X
        smlal           v3.4s, v6.4h, v7.h[1]           // val0 += {I,J,K,L} * Y
        smlal2          v4.4s, v6.8h, v7.h[1]           // val1 += {M,N,O,P} * Y
        subs            w8, w8, #2                      // tmpfilterSize -= 2
        b.gt            3b                              // loop until filterSize consumed

        sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16)
        sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16)
        uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19)
        st1             {v3.8b}, [x3], #8               // write to destination
        subs            w4, w4, #8                      // dstW -= 8
        add             x7, x7, #8                      // i += 8
        b.gt            2b                              // loop until width consumed
        ret

// If filter size is odd (most likely == 1), then use this section.
// fs % 2 != 0
4:      mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value
        mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value
        mov             w8, w1                          // tmpfilterSize = filterSize
        mov             x9, x2                          // srcp = src
        mov             x10, x0                         // filterp = filter
5:      ldr             x11, [x9], #8                   // get 1 pointer: src[j]
        ldr             h6, [x10], #2                   // read 1x16-bit coeff X at filter[j]
        add             x11, x11, x7, lsl #1            // &src[j  ][i]
        ld1             {v5.8h}, [x11]                  // read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
        smlal           v3.4s, v5.4h, v6.h[0]           // val0 += {A,B,C,D} * X
        smlal2          v4.4s, v5.8h, v6.h[0]           // val1 += {E,F,G,H} * X
        subs            w8, w8, #1                      // tmpfilterSize -= 1
        b.gt            5b                              // loop until filterSize consumed

        sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16)
        sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16)
        uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19)
        st1             {v3.8b}, [x3], #8               // write to destination
        subs            w4, w4, #8                      // dstW -= 8
        add             x7, x7, #8                      // i += 8
        b.gt            4b                              // loop until width consumed
        ret

6:      // fs=8
        ldp             x5, x6, [x2]                    // load 2 pointers: src[j  ] and src[j+1]
        ldp             x7, x9, [x2, #16]               // load 2 pointers: src[j+2] and src[j+3]
        ldp             x10, x11, [x2, #32]             // load 2 pointers: src[j+4] and src[j+5]
        ldp             x12, x13, [x2, #48]             // load 2 pointers: src[j+6] and src[j+7]

        // load 8x16-bit values for filter[j], where j=0..7
        ld1             {v6.8h}, [x0]
7:
        mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value
        mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value

        ld1             {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}]
        ld1             {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}]
        ld1             {v26.8h}, [x7], #16             // load 8x16-bit values for src[j + 2][i + {0..7}]
        ld1             {v27.8h}, [x9], #16             // load 8x16-bit values for src[j + 3][i + {0..7}]
        ld1             {v28.8h}, [x10], #16            // load 8x16-bit values for src[j + 4][i + {0..7}]
        ld1             {v29.8h}, [x11], #16            // load 8x16-bit values for src[j + 5][i + {0..7}]
        ld1             {v30.8h}, [x12], #16            // load 8x16-bit values for src[j + 6][i + {0..7}]
        ld1             {v31.8h}, [x13], #16            // load 8x16-bit values for src[j + 7][i + {0..7}]

        smlal           v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0]
        smlal2          v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0]
        smlal           v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1]
        smlal2          v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1]
        smlal           v3.4s, v26.4h, v6.h[2]          // val0 += src[2][i + {0..3}] * filter[2]
        smlal2          v4.4s, v26.8h, v6.h[2]          // val1 += src[2][i + {4..7}] * filter[2]
        smlal           v3.4s, v27.4h, v6.h[3]          // val0 += src[3][i + {0..3}] * filter[3]
        smlal2          v4.4s, v27.8h, v6.h[3]          // val1 += src[3][i + {4..7}] * filter[3]
        smlal           v3.4s, v28.4h, v6.h[4]          // val0 += src[4][i + {0..3}] * filter[4]
        smlal2          v4.4s, v28.8h, v6.h[4]          // val1 += src[4][i + {4..7}] * filter[4]
        smlal           v3.4s, v29.4h, v6.h[5]          // val0 += src[5][i + {0..3}] * filter[5]
        smlal2          v4.4s, v29.8h, v6.h[5]          // val1 += src[5][i + {4..7}] * filter[5]
        smlal           v3.4s, v30.4h, v6.h[6]          // val0 += src[6][i + {0..3}] * filter[6]
        smlal2          v4.4s, v30.8h, v6.h[6]          // val1 += src[6][i + {4..7}] * filter[6]
        smlal           v3.4s, v31.4h, v6.h[7]          // val0 += src[7][i + {0..3}] * filter[7]
        smlal2          v4.4s, v31.8h, v6.h[7]          // val1 += src[7][i + {4..7}] * filter[7]

        sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16)
        sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16)
        uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19)
        subs            w4, w4, #8                      // dstW -= 8
        st1             {v3.8b}, [x3], #8               // write to destination
        b.gt            7b                              // loop until width consumed
        ret

8:      // fs=4
        ldp             x5, x6, [x2]                    // load 2 pointers: src[j  ] and src[j+1]
        ldp             x7, x9, [x2, #16]               // load 2 pointers: src[j+2] and src[j+3]

        // load 4x16-bit values for filter[j], where j=0..3
        ld1             {v6.4h}, [x0]
9:
        mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value
        mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value

        ld1             {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}]
        ld1             {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}]
        ld1             {v26.8h}, [x7], #16             // load 8x16-bit values for src[j + 2][i + {0..7}]
        ld1             {v27.8h}, [x9], #16             // load 8x16-bit values for src[j + 3][i + {0..7}]

        smlal           v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0]
        smlal2          v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0]
        smlal           v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1]
        smlal2          v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1]
        smlal           v3.4s, v26.4h, v6.h[2]          // val0 += src[2][i + {0..3}] * filter[2]
        smlal2          v4.4s, v26.8h, v6.h[2]          // val1 += src[2][i + {4..7}] * filter[2]
        smlal           v3.4s, v27.4h, v6.h[3]          // val0 += src[3][i + {0..3}] * filter[3]
        smlal2          v4.4s, v27.8h, v6.h[3]          // val1 += src[3][i + {4..7}] * filter[3]

        sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16)
        sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16)
        uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19)
        st1             {v3.8b}, [x3], #8               // write to destination
        subs            w4, w4, #8                      // dstW -= 8
        b.gt            9b                              // loop until width consumed
        ret

10:     // fs=2
        ldp             x5, x6, [x2]                    // load 2 pointers: src[j  ] and src[j+1]

        // load 2x16-bit values for filter[j], where j=0..1
        ldr             s6, [x0]
11:
        mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value
        mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value

        ld1             {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}]
        ld1             {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}]

        smlal           v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0]
        smlal2          v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0]
        smlal           v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1]
        smlal2          v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1]

        sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16)
        sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16)
        uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19)
        st1             {v3.8b}, [x3], #8               // write to destination
        subs            w4, w4, #8                      // dstW -= 8
        b.gt            11b                             // loop until width consumed
        ret
endfunc
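
In C terms, each output byte of yuv2planeX is a dithered, filtered sum over
filterSize source rows kept at 19 bits of intermediate precision; the asm
splits the final >>19 into a saturating >>16 followed by >>3. A sketch with
an assumed prototype, modelled on the C path:

    #include <stdint.h>

    static void yuv2planeX_8_sketch(const int16_t *filter, int filterSize,
                                    const int16_t **src, uint8_t *dest,
                                    int dstW, const uint8_t *dither,
                                    int offset)
    {
        for (int i = 0; i < dstW; i++) {
            int val = dither[(i + offset) & 7] << 12;  /* the ushll #12 above */
            for (int j = 0; j < filterSize; j++)
                val += src[j][i] * filter[j];
            val >>= 19;                                /* sqshrun #16 + uqshrn #3 */
            dest[i] = val < 0 ? 0 : val > 255 ? 255 : val;
        }
    }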

@@ -210,25 +210,25 @@ function ff_yuv2plane1_8_neon, export=1
// w2 - int dstW,
// x3 - const uint8_t *dither,
// w4 - int offset
        ld1             {v0.8b}, [x3]                   // load 8x8-bit dither
        and             w4, w4, #7
        cbz             w4, 1f                          // check if offsetting present
        ext             v0.8b, v0.8b, v0.8b, #3         // honor offsetting which can be 0 or 3 only
1:      uxtl            v0.8h, v0.8b                    // extend dither to 16-bit
        uxtl            v1.4s, v0.4h                    // extend dither to 32-bit (part 1)
        uxtl2           v2.4s, v0.8h                    // extend dither to 32-bit (part 2)
2:
        ld1             {v3.8h}, [x0], #16              // read 8x16-bit @ src[i + {0..7}]: A,B,C,D,E,F,G,H
        sxtl            v4.4s, v3.4h
        sxtl2           v5.4s, v3.8h
        add             v4.4s, v4.4s, v1.4s
        add             v5.4s, v5.4s, v2.4s
        sqshrun         v4.4h, v4.4s, #6
        sqshrun2        v4.8h, v5.4s, #6

        uqshrn          v3.8b, v4.8h, #1                // clip8(val>>7)
        subs            w2, w2, #8                      // dstW -= 8
        st1             {v3.8b}, [x1], #8               // write to destination
        b.gt            2b                              // loop until width consumed
        ret
endfunc
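
The single-source variant is the same idea without the filter sum: add the
dither and scale down by 7 bits with clipping (sqshrun #6 followed by
uqshrn #1). A sketch with an assumed prototype:

    #include <stdint.h>

    static void yuv2plane1_8_sketch(const int16_t *src, uint8_t *dest,
                                    int dstW, const uint8_t *dither,
                                    int offset)
    {
        for (int i = 0; i < dstW; i++) {
            int val = (src[i] + dither[(i + offset) & 7]) >> 7;
            dest[i] = val < 0 ? 0 : val > 255 ? 255 : val;
        }
    }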

@@ -23,23 +23,23 @@

.macro load_yoff_ycoeff yoff ycoeff
#if defined(__APPLE__)
        ldp             w9, w10, [sp, #\yoff]
#else
        ldr             w9, [sp, #\yoff]
        ldr             w10, [sp, #\ycoeff]
#endif
.endm

.macro load_args_nv12
        ldr             x8, [sp]                        // table
        load_yoff_ycoeff 8, 16                          // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
        sub             w3, w3, w0, lsl #2              // w3 = linesize - width * 4 (padding)
        sub             w5, w5, w0                      // w5 = linesizeY - width (paddingY)
        sub             w7, w7, w0                      // w7 = linesizeC - width (paddingC)
        neg             w11, w0
.endm

.macro load_args_nv21
@@ -47,52 +47,52 @@
.endm

.macro load_args_yuv420p
        ldr             x13, [sp]                       // srcV
        ldr             w14, [sp, #8]                   // linesizeV
        ldr             x8, [sp, #16]                   // table
        load_yoff_ycoeff 24, 32                         // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
        sub             w3, w3, w0, lsl #2              // w3 = linesize - width * 4 (padding)
        sub             w5, w5, w0                      // w5 = linesizeY - width (paddingY)
        sub             w7, w7, w0, lsr #1              // w7 = linesizeU - width / 2 (paddingU)
        sub             w14, w14, w0, lsr #1            // w14 = linesizeV - width / 2 (paddingV)
        lsr             w11, w0, #1
        neg             w11, w11
.endm

.macro load_args_yuv422p
        ldr             x13, [sp]                       // srcV
        ldr             w14, [sp, #8]                   // linesizeV
        ldr             x8, [sp, #16]                   // table
        load_yoff_ycoeff 24, 32                         // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
        sub             w3, w3, w0, lsl #2              // w3 = linesize - width * 4 (padding)
        sub             w5, w5, w0                      // w5 = linesizeY - width (paddingY)
        sub             w7, w7, w0, lsr #1              // w7 = linesizeU - width / 2 (paddingU)
        sub             w14, w14, w0, lsr #1            // w14 = linesizeV - width / 2 (paddingV)
.endm

.macro load_chroma_nv12
        ld2             {v16.8b, v17.8b}, [x6], #16
        ushll           v18.8h, v16.8b, #3
        ushll           v19.8h, v17.8b, #3
.endm

.macro load_chroma_nv21
        ld2             {v16.8b, v17.8b}, [x6], #16
        ushll           v19.8h, v16.8b, #3
        ushll           v18.8h, v17.8b, #3
.endm

.macro load_chroma_yuv420p
        ld1             {v16.8b}, [ x6], #8
        ld1             {v17.8b}, [x13], #8
        ushll           v18.8h, v16.8b, #3
        ushll           v19.8h, v17.8b, #3
.endm

.macro load_chroma_yuv422p
@@ -100,9 +100,9 @@
.endm

.macro increment_nv12
        ands            w15, w1, #1
        csel            w16, w7, w11, ne                // incC = (h & 1) ? paddingC : -width
        add             x6, x6, w16, sxtw               // srcC += incC
.endm

.macro increment_nv21
@@ -110,100 +110,100 @@
.endm

.macro increment_yuv420p
        ands            w15, w1, #1
        csel            w16, w7, w11, ne                // incU = (h & 1) ? paddingU : -width/2
        csel            w17, w14, w11, ne               // incV = (h & 1) ? paddingV : -width/2
        add             x6, x6, w16, sxtw               // srcU += incU
        add             x13, x13, w17, sxtw             // srcV += incV
.endm

.macro increment_yuv422p
        add             x6, x6, w7, sxtw                // srcU += incU
        add             x13, x13, w14, sxtw             // srcV += incV
.endm

.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
        add             v20.8h, v26.8h, v20.8h          // Y1 + R1
        add             v21.8h, v27.8h, v21.8h          // Y2 + R2
        add             v22.8h, v26.8h, v22.8h          // Y1 + G1
        add             v23.8h, v27.8h, v23.8h          // Y2 + G2
        add             v24.8h, v26.8h, v24.8h          // Y1 + B1
        add             v25.8h, v27.8h, v25.8h          // Y2 + B2
        sqrshrun        \r1, v20.8h, #1                 // clip_u8((Y1 + R1) >> 1)
        sqrshrun        \r2, v21.8h, #1                 // clip_u8((Y2 + R2) >> 1)
        sqrshrun        \g1, v22.8h, #1                 // clip_u8((Y1 + G1) >> 1)
        sqrshrun        \g2, v23.8h, #1                 // clip_u8((Y2 + G2) >> 1)
        sqrshrun        \b1, v24.8h, #1                 // clip_u8((Y1 + B1) >> 1)
        sqrshrun        \b2, v25.8h, #1                 // clip_u8((Y2 + B2) >> 1)
        movi            \a1, #255
        movi            \a2, #255
.endm
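
Per channel, compute_rgba adds the scaled luma term to the precomputed chroma
term and narrows with rounding saturation: sqrshrun #1 is a saturating
(v + 1) >> 1 into unsigned 8 bits. A one-line sketch of the channel math:

    static inline unsigned char rgba_channel(int y_term, int chroma_term)
    {
        int v = (y_term + chroma_term + 1) >> 1;  /* sqrshrun ..., #1 */
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }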

.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
        load_args_\ifmt
        mov             w9, w1
1:
        mov             w8, w0                          // w8 = width
2:
        movi            v5.8h, #4, lsl #8               // 128 * (1<<3)
        load_chroma_\ifmt
        sub             v18.8h, v18.8h, v5.8h           // U*(1<<3) - 128*(1<<3)
        sub             v19.8h, v19.8h, v5.8h           // V*(1<<3) - 128*(1<<3)
        sqdmulh         v20.8h, v19.8h, v1.h[0]         // V * v2r            (R)
        sqdmulh         v22.8h, v18.8h, v1.h[1]         // U * u2g
        sqdmulh         v19.8h, v19.8h, v1.h[2]         // V * v2g
        add             v22.8h, v22.8h, v19.8h          // U * u2g + V * v2g  (G)
        sqdmulh         v24.8h, v18.8h, v1.h[3]         // U * u2b            (B)
        zip2            v21.8h, v20.8h, v20.8h          // R2
        zip1            v20.8h, v20.8h, v20.8h          // R1
        zip2            v23.8h, v22.8h, v22.8h          // G2
        zip1            v22.8h, v22.8h, v22.8h          // G1
        zip2            v25.8h, v24.8h, v24.8h          // B2
        zip1            v24.8h, v24.8h, v24.8h          // B1
        ld1             {v2.16b}, [x4], #16             // load luma
        ushll           v26.8h, v2.8b, #3               // Y1*(1<<3)
        ushll2          v27.8h, v2.16b, #3              // Y2*(1<<3)
        sub             v26.8h, v26.8h, v3.8h           // Y1*(1<<3) - y_offset
        sub             v27.8h, v27.8h, v3.8h           // Y2*(1<<3) - y_offset
        sqdmulh         v26.8h, v26.8h, v0.8h           // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
        sqdmulh         v27.8h, v27.8h, v0.8h           // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15

.ifc \ofmt,argb // 1 2 3 0
        compute_rgba    v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif

.ifc \ofmt,rgba // 0 1 2 3
        compute_rgba    v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif

.ifc \ofmt,abgr // 3 2 1 0
        compute_rgba    v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif

.ifc \ofmt,bgra // 2 1 0 3
        compute_rgba    v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif

        st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
        st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
        subs            w8, w8, #16                     // width -= 16
        b.gt            2b
        add             x2, x2, w3, sxtw                // dst += padding
        add             x4, x4, w5, sxtw                // srcY += paddingY
        increment_\ifmt
        subs            w1, w1, #1                      // height -= 1
        b.gt            1b
        mov             w0, w9
        ret
endfunc
.endm

.macro declare_rgb_funcs ifmt
        declare_func    \ifmt, argb
        declare_func    \ifmt, rgba
        declare_func    \ifmt, abgr
        declare_func    \ifmt, bgra
.endm

declare_rgb_funcs nv12
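
This single invocation expands declare_func four times, emitting
ff_nv12_to_argb_neon, ff_nv12_to_rgba_neon, ff_nv12_to_abgr_neon and
ff_nv12_to_bgra_neon. Judging from the register use in declare_func
(w0=width, w1=height, x2/w3=dst/linesize, x4/w5=srcY/linesizeY,
x6/w7=srcC/linesizeC, with table, y_offset and y_coeff on the stack), the
C-side prototype looks like the following sketch; the parameter list is an
assumption inferred from the assembly, not a copy of the header:

    /* assumed prototype, reconstructed from the register usage above */
    int ff_nv12_to_argb_neon(int width, int height,
                             unsigned char *dst, int linesize,
                             const unsigned char *srcY, int linesizeY,
                             const unsigned char *srcC, int linesizeC,
                             const short *table, int y_offset, int y_coeff);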