aarch64: Consistently use lowercase for vector element specifiers
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent 393d1ee541
commit 184103b310
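For context on why this patch is purely cosmetic: GNU as and LLVM's integrated assembler accept NEON vector element specifiers in either case, so `v0.4S` and `v0.4s` assemble to identical encodings. A minimal standalone sketch (assuming a GNU-as-compatible AArch64 toolchain; the `square4` label and operands are illustrative, not taken from the patch):

// square4.S - squares four floats at [x0], in place.
// Both element-specifier spellings below produce the same opcode;
// the commit standardizes on the lowercase form.
        .text
        .global square4
square4:
        ld1     {v0.4S}, [x0]           // uppercase specifier (pre-patch style)
        fmul    v0.4s, v0.4s, v0.4s     // lowercase specifier (post-patch style)
        st1     {v0.4s}, [x0]
        ret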
@@ -19,82 +19,82 @@
 #include "libavutil/aarch64/asm.S"

 function ff_ps_add_squares_neon, export=1
-1:      ld1     {v0.4S,v1.4S}, [x1], #32
-        fmul    v0.4S, v0.4S, v0.4S
-        fmul    v1.4S, v1.4S, v1.4S
-        faddp   v2.4S, v0.4S, v1.4S
-        ld1     {v3.4S}, [x0]
-        fadd    v3.4S, v3.4S, v2.4S
-        st1     {v3.4S}, [x0], #16
+1:      ld1     {v0.4s,v1.4s}, [x1], #32
+        fmul    v0.4s, v0.4s, v0.4s
+        fmul    v1.4s, v1.4s, v1.4s
+        faddp   v2.4s, v0.4s, v1.4s
+        ld1     {v3.4s}, [x0]
+        fadd    v3.4s, v3.4s, v2.4s
+        st1     {v3.4s}, [x0], #16
         subs    w2, w2, #4
         b.gt    1b
         ret
 endfunc

 function ff_ps_mul_pair_single_neon, export=1
-1:      ld1     {v0.4S,v1.4S}, [x1], #32
-        ld1     {v2.4S}, [x2], #16
-        zip1    v3.4S, v2.4S, v2.4S
-        zip2    v4.4S, v2.4S, v2.4S
-        fmul    v0.4S, v0.4S, v3.4S
-        fmul    v1.4S, v1.4S, v4.4S
-        st1     {v0.4S,v1.4S}, [x0], #32
+1:      ld1     {v0.4s,v1.4s}, [x1], #32
+        ld1     {v2.4s}, [x2], #16
+        zip1    v3.4s, v2.4s, v2.4s
+        zip2    v4.4s, v2.4s, v2.4s
+        fmul    v0.4s, v0.4s, v3.4s
+        fmul    v1.4s, v1.4s, v4.4s
+        st1     {v0.4s,v1.4s}, [x0], #32
         subs    w3, w3, #4
         b.gt    1b
         ret
 endfunc

 function ff_ps_stereo_interpolate_neon, export=1
-        ld1     {v0.4S}, [x2]
-        ld1     {v1.4S}, [x3]
-        zip1    v4.4S, v0.4S, v0.4S
-        zip2    v5.4S, v0.4S, v0.4S
-        zip1    v6.4S, v1.4S, v1.4S
-        zip2    v7.4S, v1.4S, v1.4S
-1:      ld1     {v2.2S}, [x0]
-        ld1     {v3.2S}, [x1]
-        fadd    v4.4S, v4.4S, v6.4S
-        fadd    v5.4S, v5.4S, v7.4S
-        mov     v2.D[1], v2.D[0]
-        mov     v3.D[1], v3.D[0]
-        fmul    v2.4S, v2.4S, v4.4S
-        fmla    v2.4S, v3.4S, v5.4S
-        st1     {v2.D}[0], [x0], #8
-        st1     {v2.D}[1], [x1], #8
+        ld1     {v0.4s}, [x2]
+        ld1     {v1.4s}, [x3]
+        zip1    v4.4s, v0.4s, v0.4s
+        zip2    v5.4s, v0.4s, v0.4s
+        zip1    v6.4s, v1.4s, v1.4s
+        zip2    v7.4s, v1.4s, v1.4s
+1:      ld1     {v2.2s}, [x0]
+        ld1     {v3.2s}, [x1]
+        fadd    v4.4s, v4.4s, v6.4s
+        fadd    v5.4s, v5.4s, v7.4s
+        mov     v2.d[1], v2.d[0]
+        mov     v3.d[1], v3.d[0]
+        fmul    v2.4s, v2.4s, v4.4s
+        fmla    v2.4s, v3.4s, v5.4s
+        st1     {v2.d}[0], [x0], #8
+        st1     {v2.d}[1], [x1], #8
         subs    w4, w4, #1
         b.gt    1b
         ret
 endfunc

 function ff_ps_stereo_interpolate_ipdopd_neon, export=1
-        ld1     {v0.4S,v1.4S}, [x2]
-        ld1     {v6.4S,v7.4S}, [x3]
-        fneg    v2.4S, v1.4S
-        fneg    v3.4S, v7.4S
-        zip1    v16.4S, v0.4S, v0.4S
-        zip2    v17.4S, v0.4S, v0.4S
-        zip1    v18.4S, v2.4S, v1.4S
-        zip2    v19.4S, v2.4S, v1.4S
-        zip1    v20.4S, v6.4S, v6.4S
-        zip2    v21.4S, v6.4S, v6.4S
-        zip1    v22.4S, v3.4S, v7.4S
-        zip2    v23.4S, v3.4S, v7.4S
-1:      ld1     {v2.2S}, [x0]
-        ld1     {v3.2S}, [x1]
-        fadd    v16.4S, v16.4S, v20.4S
-        fadd    v17.4S, v17.4S, v21.4S
-        mov     v2.D[1], v2.D[0]
-        mov     v3.D[1], v3.D[0]
-        fmul    v4.4S, v2.4S, v16.4S
-        fmla    v4.4S, v3.4S, v17.4S
-        fadd    v18.4S, v18.4S, v22.4S
-        fadd    v19.4S, v19.4S, v23.4S
-        ext     v2.16B, v2.16B, v2.16B, #4
-        ext     v3.16B, v3.16B, v3.16B, #4
-        fmla    v4.4S, v2.4S, v18.4S
-        fmla    v4.4S, v3.4S, v19.4S
-        st1     {v4.D}[0], [x0], #8
-        st1     {v4.D}[1], [x1], #8
+        ld1     {v0.4s,v1.4s}, [x2]
+        ld1     {v6.4s,v7.4s}, [x3]
+        fneg    v2.4s, v1.4s
+        fneg    v3.4s, v7.4s
+        zip1    v16.4s, v0.4s, v0.4s
+        zip2    v17.4s, v0.4s, v0.4s
+        zip1    v18.4s, v2.4s, v1.4s
+        zip2    v19.4s, v2.4s, v1.4s
+        zip1    v20.4s, v6.4s, v6.4s
+        zip2    v21.4s, v6.4s, v6.4s
+        zip1    v22.4s, v3.4s, v7.4s
+        zip2    v23.4s, v3.4s, v7.4s
+1:      ld1     {v2.2s}, [x0]
+        ld1     {v3.2s}, [x1]
+        fadd    v16.4s, v16.4s, v20.4s
+        fadd    v17.4s, v17.4s, v21.4s
+        mov     v2.d[1], v2.d[0]
+        mov     v3.d[1], v3.d[0]
+        fmul    v4.4s, v2.4s, v16.4s
+        fmla    v4.4s, v3.4s, v17.4s
+        fadd    v18.4s, v18.4s, v22.4s
+        fadd    v19.4s, v19.4s, v23.4s
+        ext     v2.16b, v2.16b, v2.16b, #4
+        ext     v3.16b, v3.16b, v3.16b, #4
+        fmla    v4.4s, v2.4s, v18.4s
+        fmla    v4.4s, v3.4s, v19.4s
+        st1     {v4.d}[0], [x0], #8
+        st1     {v4.d}[1], [x1], #8
         subs    w4, w4, #1
         b.gt    1b
         ret
@@ -102,46 +102,46 @@ endfunc

 function ff_ps_hybrid_analysis_neon, export=1
         lsl     x3, x3, #3
-        ld2     {v0.4S,v1.4S}, [x1], #32
-        ld2     {v2.2S,v3.2S}, [x1], #16
-        ld1     {v24.2S}, [x1], #8
-        ld2     {v4.2S,v5.2S}, [x1], #16
-        ld2     {v6.4S,v7.4S}, [x1]
-        rev64   v6.4S, v6.4S
-        rev64   v7.4S, v7.4S
-        ext     v6.16B, v6.16B, v6.16B, #8
-        ext     v7.16B, v7.16B, v7.16B, #8
-        rev64   v4.2S, v4.2S
-        rev64   v5.2S, v5.2S
-        mov     v2.D[1], v3.D[0]
-        mov     v4.D[1], v5.D[0]
-        mov     v5.D[1], v2.D[0]
-        mov     v3.D[1], v4.D[0]
-        fadd    v16.4S, v0.4S, v6.4S
-        fadd    v17.4S, v1.4S, v7.4S
-        fsub    v18.4S, v1.4S, v7.4S
-        fsub    v19.4S, v0.4S, v6.4S
-        fadd    v22.4S, v2.4S, v4.4S
-        fsub    v23.4S, v5.4S, v3.4S
-        trn1    v20.2D, v22.2D, v23.2D  // {re4+re8, re5+re7, im8-im4, im7-im5}
-        trn2    v21.2D, v22.2D, v23.2D  // {im4+im8, im5+im7, re4-re8, re5-re7}
-1:      ld2     {v2.4S,v3.4S}, [x2], #32
-        ld2     {v4.2S,v5.2S}, [x2], #16
-        ld1     {v6.2S}, [x2], #8
+        ld2     {v0.4s,v1.4s}, [x1], #32
+        ld2     {v2.2s,v3.2s}, [x1], #16
+        ld1     {v24.2s}, [x1], #8
+        ld2     {v4.2s,v5.2s}, [x1], #16
+        ld2     {v6.4s,v7.4s}, [x1]
+        rev64   v6.4s, v6.4s
+        rev64   v7.4s, v7.4s
+        ext     v6.16b, v6.16b, v6.16b, #8
+        ext     v7.16b, v7.16b, v7.16b, #8
+        rev64   v4.2s, v4.2s
+        rev64   v5.2s, v5.2s
+        mov     v2.d[1], v3.d[0]
+        mov     v4.d[1], v5.d[0]
+        mov     v5.d[1], v2.d[0]
+        mov     v3.d[1], v4.d[0]
+        fadd    v16.4s, v0.4s, v6.4s
+        fadd    v17.4s, v1.4s, v7.4s
+        fsub    v18.4s, v1.4s, v7.4s
+        fsub    v19.4s, v0.4s, v6.4s
+        fadd    v22.4s, v2.4s, v4.4s
+        fsub    v23.4s, v5.4s, v3.4s
+        trn1    v20.2d, v22.2d, v23.2d  // {re4+re8, re5+re7, im8-im4, im7-im5}
+        trn2    v21.2d, v22.2d, v23.2d  // {im4+im8, im5+im7, re4-re8, re5-re7}
+1:      ld2     {v2.4s,v3.4s}, [x2], #32
+        ld2     {v4.2s,v5.2s}, [x2], #16
+        ld1     {v6.2s}, [x2], #8
         add     x2, x2, #8
-        mov     v4.D[1], v5.D[0]
-        mov     v6.S[1], v6.S[0]
-        fmul    v6.2S, v6.2S, v24.2S
-        fmul    v0.4S, v2.4S, v16.4S
-        fmul    v1.4S, v2.4S, v17.4S
-        fmls    v0.4S, v3.4S, v18.4S
-        fmla    v1.4S, v3.4S, v19.4S
-        fmla    v0.4S, v4.4S, v20.4S
-        fmla    v1.4S, v4.4S, v21.4S
-        faddp   v0.4S, v0.4S, v1.4S
-        faddp   v0.4S, v0.4S, v0.4S
-        fadd    v0.2S, v0.2S, v6.2S
-        st1     {v0.2S}, [x0], x3
+        mov     v4.d[1], v5.d[0]
+        mov     v6.s[1], v6.s[0]
+        fmul    v6.2s, v6.2s, v24.2s
+        fmul    v0.4s, v2.4s, v16.4s
+        fmul    v1.4s, v2.4s, v17.4s
+        fmls    v0.4s, v3.4s, v18.4s
+        fmla    v1.4s, v3.4s, v19.4s
+        fmla    v0.4s, v4.4s, v20.4s
+        fmla    v1.4s, v4.4s, v21.4s
+        faddp   v0.4s, v0.4s, v1.4s
+        faddp   v0.4s, v0.4s, v0.4s
+        fadd    v0.2s, v0.2s, v6.2s
+        st1     {v0.2s}, [x0], x3
         subs    w4, w4, #1
         b.gt    1b
         ret
@@ -39,10 +39,10 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
         lsl     w10, w10, #1
         add     w9, w9, w10
         add     x6, x6, w9, UXTW
-        ld1r    {v22.8H}, [x6]
+        ld1r    {v22.8h}, [x6]
 .endif
 .ifc \codec,vc1
-        movi    v22.8H, #28
+        movi    v22.8h, #28
 .endif
         mul     w7, w4, w5
         lsl     w14, w5, #3
@@ -55,139 +55,139 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
         add     w4, w4, #64
         b.eq    2f

-        dup     v0.8B, w4
-        dup     v1.8B, w12
-        ld1     {v4.8B, v5.8B}, [x1], x2
-        dup     v2.8B, w6
-        dup     v3.8B, w7
-        ext     v5.8B, v4.8B, v5.8B, #1
-1:      ld1     {v6.8B, v7.8B}, [x1], x2
-        umull   v16.8H, v4.8B, v0.8B
-        umlal   v16.8H, v5.8B, v1.8B
-        ext     v7.8B, v6.8B, v7.8B, #1
-        ld1     {v4.8B, v5.8B}, [x1], x2
-        umlal   v16.8H, v6.8B, v2.8B
+        dup     v0.8b, w4
+        dup     v1.8b, w12
+        ld1     {v4.8b, v5.8b}, [x1], x2
+        dup     v2.8b, w6
+        dup     v3.8b, w7
+        ext     v5.8b, v4.8b, v5.8b, #1
+1:      ld1     {v6.8b, v7.8b}, [x1], x2
+        umull   v16.8h, v4.8b, v0.8b
+        umlal   v16.8h, v5.8b, v1.8b
+        ext     v7.8b, v6.8b, v7.8b, #1
+        ld1     {v4.8b, v5.8b}, [x1], x2
+        umlal   v16.8h, v6.8b, v2.8b
         prfm    pldl1strm, [x1]
-        ext     v5.8B, v4.8B, v5.8B, #1
-        umlal   v16.8H, v7.8B, v3.8B
-        umull   v17.8H, v6.8B, v0.8B
+        ext     v5.8b, v4.8b, v5.8b, #1
+        umlal   v16.8h, v7.8b, v3.8b
+        umull   v17.8h, v6.8b, v0.8b
         subs    w3, w3, #2
-        umlal   v17.8H, v7.8B, v1.8B
-        umlal   v17.8H, v4.8B, v2.8B
-        umlal   v17.8H, v5.8B, v3.8B
+        umlal   v17.8h, v7.8b, v1.8b
+        umlal   v17.8h, v4.8b, v2.8b
+        umlal   v17.8h, v5.8b, v3.8b
         prfm    pldl1strm, [x1, x2]
 .ifc \codec,h264
-        rshrn   v16.8B, v16.8H, #6
-        rshrn   v17.8B, v17.8H, #6
+        rshrn   v16.8b, v16.8h, #6
+        rshrn   v17.8b, v17.8h, #6
 .else
-        add     v16.8H, v16.8H, v22.8H
-        add     v17.8H, v17.8H, v22.8H
-        shrn    v16.8B, v16.8H, #6
-        shrn    v17.8B, v17.8H, #6
+        add     v16.8h, v16.8h, v22.8h
+        add     v17.8h, v17.8h, v22.8h
+        shrn    v16.8b, v16.8h, #6
+        shrn    v17.8b, v17.8h, #6
 .endif
 .ifc \type,avg
-        ld1     {v20.8B}, [x8], x2
-        ld1     {v21.8B}, [x8], x2
-        urhadd  v16.8B, v16.8B, v20.8B
-        urhadd  v17.8B, v17.8B, v21.8B
+        ld1     {v20.8b}, [x8], x2
+        ld1     {v21.8b}, [x8], x2
+        urhadd  v16.8b, v16.8b, v20.8b
+        urhadd  v17.8b, v17.8b, v21.8b
 .endif
-        st1     {v16.8B}, [x0], x2
-        st1     {v17.8B}, [x0], x2
+        st1     {v16.8b}, [x0], x2
+        st1     {v17.8b}, [x0], x2
         b.gt    1b
         ret

 2:      adds    w12, w12, w6
-        dup     v0.8B, w4
+        dup     v0.8b, w4
         b.eq    5f
         tst     w6, w6
-        dup     v1.8B, w12
+        dup     v1.8b, w12
         b.eq    4f

-        ld1     {v4.8B}, [x1], x2
-3:      ld1     {v6.8B}, [x1], x2
-        umull   v16.8H, v4.8B, v0.8B
-        umlal   v16.8H, v6.8B, v1.8B
-        ld1     {v4.8B}, [x1], x2
-        umull   v17.8H, v6.8B, v0.8B
-        umlal   v17.8H, v4.8B, v1.8B
+        ld1     {v4.8b}, [x1], x2
+3:      ld1     {v6.8b}, [x1], x2
+        umull   v16.8h, v4.8b, v0.8b
+        umlal   v16.8h, v6.8b, v1.8b
+        ld1     {v4.8b}, [x1], x2
+        umull   v17.8h, v6.8b, v0.8b
+        umlal   v17.8h, v4.8b, v1.8b
         prfm    pldl1strm, [x1]
 .ifc \codec,h264
-        rshrn   v16.8B, v16.8H, #6
-        rshrn   v17.8B, v17.8H, #6
+        rshrn   v16.8b, v16.8h, #6
+        rshrn   v17.8b, v17.8h, #6
 .else
-        add     v16.8H, v16.8H, v22.8H
-        add     v17.8H, v17.8H, v22.8H
-        shrn    v16.8B, v16.8H, #6
-        shrn    v17.8B, v17.8H, #6
+        add     v16.8h, v16.8h, v22.8h
+        add     v17.8h, v17.8h, v22.8h
+        shrn    v16.8b, v16.8h, #6
+        shrn    v17.8b, v17.8h, #6
 .endif
         prfm    pldl1strm, [x1, x2]
 .ifc \type,avg
-        ld1     {v20.8B}, [x8], x2
-        ld1     {v21.8B}, [x8], x2
-        urhadd  v16.8B, v16.8B, v20.8B
-        urhadd  v17.8B, v17.8B, v21.8B
+        ld1     {v20.8b}, [x8], x2
+        ld1     {v21.8b}, [x8], x2
+        urhadd  v16.8b, v16.8b, v20.8b
+        urhadd  v17.8b, v17.8b, v21.8b
 .endif
         subs    w3, w3, #2
-        st1     {v16.8B}, [x0], x2
-        st1     {v17.8B}, [x0], x2
+        st1     {v16.8b}, [x0], x2
+        st1     {v17.8b}, [x0], x2
         b.gt    3b
         ret

-4:      ld1     {v4.8B, v5.8B}, [x1], x2
-        ld1     {v6.8B, v7.8B}, [x1], x2
-        ext     v5.8B, v4.8B, v5.8B, #1
-        ext     v7.8B, v6.8B, v7.8B, #1
+4:      ld1     {v4.8b, v5.8b}, [x1], x2
+        ld1     {v6.8b, v7.8b}, [x1], x2
+        ext     v5.8b, v4.8b, v5.8b, #1
+        ext     v7.8b, v6.8b, v7.8b, #1
         prfm    pldl1strm, [x1]
         subs    w3, w3, #2
-        umull   v16.8H, v4.8B, v0.8B
-        umlal   v16.8H, v5.8B, v1.8B
-        umull   v17.8H, v6.8B, v0.8B
-        umlal   v17.8H, v7.8B, v1.8B
+        umull   v16.8h, v4.8b, v0.8b
+        umlal   v16.8h, v5.8b, v1.8b
+        umull   v17.8h, v6.8b, v0.8b
+        umlal   v17.8h, v7.8b, v1.8b
         prfm    pldl1strm, [x1, x2]
 .ifc \codec,h264
-        rshrn   v16.8B, v16.8H, #6
-        rshrn   v17.8B, v17.8H, #6
+        rshrn   v16.8b, v16.8h, #6
+        rshrn   v17.8b, v17.8h, #6
 .else
-        add     v16.8H, v16.8H, v22.8H
-        add     v17.8H, v17.8H, v22.8H
-        shrn    v16.8B, v16.8H, #6
-        shrn    v17.8B, v17.8H, #6
+        add     v16.8h, v16.8h, v22.8h
+        add     v17.8h, v17.8h, v22.8h
+        shrn    v16.8b, v16.8h, #6
+        shrn    v17.8b, v17.8h, #6
 .endif
 .ifc \type,avg
-        ld1     {v20.8B}, [x8], x2
-        ld1     {v21.8B}, [x8], x2
-        urhadd  v16.8B, v16.8B, v20.8B
-        urhadd  v17.8B, v17.8B, v21.8B
+        ld1     {v20.8b}, [x8], x2
+        ld1     {v21.8b}, [x8], x2
+        urhadd  v16.8b, v16.8b, v20.8b
+        urhadd  v17.8b, v17.8b, v21.8b
 .endif
-        st1     {v16.8B}, [x0], x2
-        st1     {v17.8B}, [x0], x2
+        st1     {v16.8b}, [x0], x2
+        st1     {v17.8b}, [x0], x2
         b.gt    4b
         ret

-5:      ld1     {v4.8B}, [x1], x2
-        ld1     {v5.8B}, [x1], x2
+5:      ld1     {v4.8b}, [x1], x2
+        ld1     {v5.8b}, [x1], x2
         prfm    pldl1strm, [x1]
         subs    w3, w3, #2
-        umull   v16.8H, v4.8B, v0.8B
-        umull   v17.8H, v5.8B, v0.8B
+        umull   v16.8h, v4.8b, v0.8b
+        umull   v17.8h, v5.8b, v0.8b
         prfm    pldl1strm, [x1, x2]
 .ifc \codec,h264
-        rshrn   v16.8B, v16.8H, #6
-        rshrn   v17.8B, v17.8H, #6
+        rshrn   v16.8b, v16.8h, #6
+        rshrn   v17.8b, v17.8h, #6
 .else
-        add     v16.8H, v16.8H, v22.8H
-        add     v17.8H, v17.8H, v22.8H
-        shrn    v16.8B, v16.8H, #6
-        shrn    v17.8B, v17.8H, #6
+        add     v16.8h, v16.8h, v22.8h
+        add     v17.8h, v17.8h, v22.8h
+        shrn    v16.8b, v16.8h, #6
+        shrn    v17.8b, v17.8h, #6
 .endif
 .ifc \type,avg
-        ld1     {v20.8B}, [x8], x2
-        ld1     {v21.8B}, [x8], x2
-        urhadd  v16.8B, v16.8B, v20.8B
-        urhadd  v17.8B, v17.8B, v21.8B
+        ld1     {v20.8b}, [x8], x2
+        ld1     {v21.8b}, [x8], x2
+        urhadd  v16.8b, v16.8b, v20.8b
+        urhadd  v17.8b, v17.8b, v21.8b
 .endif
-        st1     {v16.8B}, [x0], x2
-        st1     {v17.8B}, [x0], x2
+        st1     {v16.8b}, [x0], x2
+        st1     {v17.8b}, [x0], x2
         b.gt    5b
         ret
 endfunc
@@ -209,10 +209,10 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
         lsl     w10, w10, #1
         add     w9, w9, w10
         add     x6, x6, w9, UXTW
-        ld1r    {v22.8H}, [x6]
+        ld1r    {v22.8h}, [x6]
 .endif
 .ifc \codec,vc1
-        movi    v22.8H, #28
+        movi    v22.8h, #28
 .endif
         mul     w7, w4, w5
         lsl     w14, w5, #3
@@ -225,133 +225,133 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
         add     w4, w4, #64
         b.eq    2f

-        dup     v24.8B, w4
-        dup     v25.8B, w12
-        ld1     {v4.8B}, [x1], x2
-        dup     v26.8B, w6
-        dup     v27.8B, w7
-        ext     v5.8B, v4.8B, v5.8B, #1
-        trn1    v0.2S, v24.2S, v25.2S
-        trn1    v2.2S, v26.2S, v27.2S
-        trn1    v4.2S, v4.2S, v5.2S
-1:      ld1     {v6.8B}, [x1], x2
-        ext     v7.8B, v6.8B, v7.8B, #1
-        trn1    v6.2S, v6.2S, v7.2S
-        umull   v18.8H, v4.8B, v0.8B
-        umlal   v18.8H, v6.8B, v2.8B
-        ld1     {v4.8B}, [x1], x2
-        ext     v5.8B, v4.8B, v5.8B, #1
-        trn1    v4.2S, v4.2S, v5.2S
+        dup     v24.8b, w4
+        dup     v25.8b, w12
+        ld1     {v4.8b}, [x1], x2
+        dup     v26.8b, w6
+        dup     v27.8b, w7
+        ext     v5.8b, v4.8b, v5.8b, #1
+        trn1    v0.2s, v24.2s, v25.2s
+        trn1    v2.2s, v26.2s, v27.2s
+        trn1    v4.2s, v4.2s, v5.2s
+1:      ld1     {v6.8b}, [x1], x2
+        ext     v7.8b, v6.8b, v7.8b, #1
+        trn1    v6.2s, v6.2s, v7.2s
+        umull   v18.8h, v4.8b, v0.8b
+        umlal   v18.8h, v6.8b, v2.8b
+        ld1     {v4.8b}, [x1], x2
+        ext     v5.8b, v4.8b, v5.8b, #1
+        trn1    v4.2s, v4.2s, v5.2s
         prfm    pldl1strm, [x1]
-        umull   v19.8H, v6.8B, v0.8B
-        umlal   v19.8H, v4.8B, v2.8B
-        trn1    v30.2D, v18.2D, v19.2D
-        trn2    v31.2D, v18.2D, v19.2D
-        add     v18.8H, v30.8H, v31.8H
+        umull   v19.8h, v6.8b, v0.8b
+        umlal   v19.8h, v4.8b, v2.8b
+        trn1    v30.2d, v18.2d, v19.2d
+        trn2    v31.2d, v18.2d, v19.2d
+        add     v18.8h, v30.8h, v31.8h
 .ifc \codec,h264
-        rshrn   v16.8B, v18.8H, #6
+        rshrn   v16.8b, v18.8h, #6
 .else
-        add     v18.8H, v18.8H, v22.8H
-        shrn    v16.8B, v18.8H, #6
+        add     v18.8h, v18.8h, v22.8h
+        shrn    v16.8b, v18.8h, #6
 .endif
         subs    w3, w3, #2
         prfm    pldl1strm, [x1, x2]
 .ifc \type,avg
-        ld1     {v20.S}[0], [x8], x2
-        ld1     {v20.S}[1], [x8], x2
-        urhadd  v16.8B, v16.8B, v20.8B
+        ld1     {v20.s}[0], [x8], x2
+        ld1     {v20.s}[1], [x8], x2
+        urhadd  v16.8b, v16.8b, v20.8b
 .endif
-        st1     {v16.S}[0], [x0], x2
-        st1     {v16.S}[1], [x0], x2
+        st1     {v16.s}[0], [x0], x2
+        st1     {v16.s}[1], [x0], x2
         b.gt    1b
         ret

 2:      adds    w12, w12, w6
-        dup     v30.8B, w4
+        dup     v30.8b, w4
         b.eq    5f
         tst     w6, w6
-        dup     v31.8B, w12
-        trn1    v0.2S, v30.2S, v31.2S
-        trn2    v1.2S, v30.2S, v31.2S
+        dup     v31.8b, w12
+        trn1    v0.2s, v30.2s, v31.2s
+        trn2    v1.2s, v30.2s, v31.2s
         b.eq    4f

-        ext     v1.8B, v0.8B, v1.8B, #4
-        ld1     {v4.S}[0], [x1], x2
-3:      ld1     {v4.S}[1], [x1], x2
-        umull   v18.8H, v4.8B, v0.8B
-        ld1     {v4.S}[0], [x1], x2
-        umull   v19.8H, v4.8B, v1.8B
-        trn1    v30.2D, v18.2D, v19.2D
-        trn2    v31.2D, v18.2D, v19.2D
-        add     v18.8H, v30.8H, v31.8H
+        ext     v1.8b, v0.8b, v1.8b, #4
+        ld1     {v4.s}[0], [x1], x2
+3:      ld1     {v4.s}[1], [x1], x2
+        umull   v18.8h, v4.8b, v0.8b
+        ld1     {v4.s}[0], [x1], x2
+        umull   v19.8h, v4.8b, v1.8b
+        trn1    v30.2d, v18.2d, v19.2d
+        trn2    v31.2d, v18.2d, v19.2d
+        add     v18.8h, v30.8h, v31.8h
         prfm    pldl1strm, [x1]
 .ifc \codec,h264
-        rshrn   v16.8B, v18.8H, #6
+        rshrn   v16.8b, v18.8h, #6
 .else
-        add     v18.8H, v18.8H, v22.8H
-        shrn    v16.8B, v18.8H, #6
+        add     v18.8h, v18.8h, v22.8h
+        shrn    v16.8b, v18.8h, #6
 .endif
 .ifc \type,avg
-        ld1     {v20.S}[0], [x8], x2
-        ld1     {v20.S}[1], [x8], x2
-        urhadd  v16.8B, v16.8B, v20.8B
+        ld1     {v20.s}[0], [x8], x2
+        ld1     {v20.s}[1], [x8], x2
+        urhadd  v16.8b, v16.8b, v20.8b
 .endif
         subs    w3, w3, #2
         prfm    pldl1strm, [x1, x2]
-        st1     {v16.S}[0], [x0], x2
-        st1     {v16.S}[1], [x0], x2
+        st1     {v16.s}[0], [x0], x2
+        st1     {v16.s}[1], [x0], x2
         b.gt    3b
         ret

-4:      ld1     {v4.8B}, [x1], x2
-        ld1     {v6.8B}, [x1], x2
-        ext     v5.8B, v4.8B, v5.8B, #1
-        ext     v7.8B, v6.8B, v7.8B, #1
-        trn1    v4.2S, v4.2S, v5.2S
-        trn1    v6.2S, v6.2S, v7.2S
-        umull   v18.8H, v4.8B, v0.8B
-        umull   v19.8H, v6.8B, v0.8B
+4:      ld1     {v4.8b}, [x1], x2
+        ld1     {v6.8b}, [x1], x2
+        ext     v5.8b, v4.8b, v5.8b, #1
+        ext     v7.8b, v6.8b, v7.8b, #1
+        trn1    v4.2s, v4.2s, v5.2s
+        trn1    v6.2s, v6.2s, v7.2s
+        umull   v18.8h, v4.8b, v0.8b
+        umull   v19.8h, v6.8b, v0.8b
         subs    w3, w3, #2
-        trn1    v30.2D, v18.2D, v19.2D
-        trn2    v31.2D, v18.2D, v19.2D
-        add     v18.8H, v30.8H, v31.8H
+        trn1    v30.2d, v18.2d, v19.2d
+        trn2    v31.2d, v18.2d, v19.2d
+        add     v18.8h, v30.8h, v31.8h
         prfm    pldl1strm, [x1]
 .ifc \codec,h264
-        rshrn   v16.8B, v18.8H, #6
+        rshrn   v16.8b, v18.8h, #6
 .else
-        add     v18.8H, v18.8H, v22.8H
-        shrn    v16.8B, v18.8H, #6
+        add     v18.8h, v18.8h, v22.8h
+        shrn    v16.8b, v18.8h, #6
 .endif
 .ifc \type,avg
-        ld1     {v20.S}[0], [x8], x2
-        ld1     {v20.S}[1], [x8], x2
-        urhadd  v16.8B, v16.8B, v20.8B
+        ld1     {v20.s}[0], [x8], x2
+        ld1     {v20.s}[1], [x8], x2
+        urhadd  v16.8b, v16.8b, v20.8b
 .endif
         prfm    pldl1strm, [x1]
-        st1     {v16.S}[0], [x0], x2
-        st1     {v16.S}[1], [x0], x2
+        st1     {v16.s}[0], [x0], x2
+        st1     {v16.s}[1], [x0], x2
         b.gt    4b
         ret

-5:      ld1     {v4.S}[0], [x1], x2
-        ld1     {v4.S}[1], [x1], x2
-        umull   v18.8H, v4.8B, v30.8B
+5:      ld1     {v4.s}[0], [x1], x2
+        ld1     {v4.s}[1], [x1], x2
+        umull   v18.8h, v4.8b, v30.8b
         subs    w3, w3, #2
         prfm    pldl1strm, [x1]
 .ifc \codec,h264
-        rshrn   v16.8B, v18.8H, #6
+        rshrn   v16.8b, v18.8h, #6
 .else
-        add     v18.8H, v18.8H, v22.8H
-        shrn    v16.8B, v18.8H, #6
+        add     v18.8h, v18.8h, v22.8h
+        shrn    v16.8b, v18.8h, #6
 .endif
 .ifc \type,avg
-        ld1     {v20.S}[0], [x8], x2
-        ld1     {v20.S}[1], [x8], x2
-        urhadd  v16.8B, v16.8B, v20.8B
+        ld1     {v20.s}[0], [x8], x2
+        ld1     {v20.s}[1], [x8], x2
+        urhadd  v16.8b, v16.8b, v20.8b
 .endif
         prfm    pldl1strm, [x1]
-        st1     {v16.S}[0], [x0], x2
-        st1     {v16.S}[1], [x0], x2
+        st1     {v16.s}[0], [x0], x2
+        st1     {v16.s}[1], [x0], x2
         b.gt    5b
         ret
 endfunc
@@ -372,51 +372,51 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
         sub     w4, w7, w13
         sub     w4, w4, w14
         add     w4, w4, #64
-        dup     v0.8B, w4
-        dup     v2.8B, w12
-        dup     v1.8B, w6
-        dup     v3.8B, w7
-        trn1    v0.4H, v0.4H, v2.4H
-        trn1    v1.4H, v1.4H, v3.4H
+        dup     v0.8b, w4
+        dup     v2.8b, w12
+        dup     v1.8b, w6
+        dup     v3.8b, w7
+        trn1    v0.4h, v0.4h, v2.4h
+        trn1    v1.4h, v1.4h, v3.4h
 1:
-        ld1     {v4.S}[0], [x1], x2
-        ld1     {v4.S}[1], [x1], x2
-        rev64   v5.2S, v4.2S
-        ld1     {v5.S}[1], [x1]
-        ext     v6.8B, v4.8B, v5.8B, #1
-        ext     v7.8B, v5.8B, v4.8B, #1
-        trn1    v4.4H, v4.4H, v6.4H
-        trn1    v5.4H, v5.4H, v7.4H
-        umull   v16.8H, v4.8B, v0.8B
-        umlal   v16.8H, v5.8B, v1.8B
+        ld1     {v4.s}[0], [x1], x2
+        ld1     {v4.s}[1], [x1], x2
+        rev64   v5.2s, v4.2s
+        ld1     {v5.s}[1], [x1]
+        ext     v6.8b, v4.8b, v5.8b, #1
+        ext     v7.8b, v5.8b, v4.8b, #1
+        trn1    v4.4h, v4.4h, v6.4h
+        trn1    v5.4h, v5.4h, v7.4h
+        umull   v16.8h, v4.8b, v0.8b
+        umlal   v16.8h, v5.8b, v1.8b
 .ifc \type,avg
-        ld1     {v18.H}[0], [x0], x2
-        ld1     {v18.H}[2], [x0]
+        ld1     {v18.h}[0], [x0], x2
+        ld1     {v18.h}[2], [x0]
         sub     x0, x0, x2
 .endif
-        rev64   v17.4S, v16.4S
-        add     v16.8H, v16.8H, v17.8H
-        rshrn   v16.8B, v16.8H, #6
+        rev64   v17.4s, v16.4s
+        add     v16.8h, v16.8h, v17.8h
+        rshrn   v16.8b, v16.8h, #6
 .ifc \type,avg
-        urhadd  v16.8B, v16.8B, v18.8B
+        urhadd  v16.8b, v16.8b, v18.8b
 .endif
-        st1     {v16.H}[0], [x0], x2
-        st1     {v16.H}[2], [x0], x2
+        st1     {v16.h}[0], [x0], x2
+        st1     {v16.h}[2], [x0], x2
         subs    w3, w3, #2
         b.gt    1b
         ret

 2:
-        ld1     {v16.H}[0], [x1], x2
-        ld1     {v16.H}[1], [x1], x2
+        ld1     {v16.h}[0], [x1], x2
+        ld1     {v16.h}[1], [x1], x2
 .ifc \type,avg
-        ld1     {v18.H}[0], [x0], x2
-        ld1     {v18.H}[1], [x0]
+        ld1     {v18.h}[0], [x0], x2
+        ld1     {v18.h}[1], [x0]
         sub     x0, x0, x2
-        urhadd  v16.8B, v16.8B, v18.8B
+        urhadd  v16.8b, v16.8b, v18.8b
 .endif
-        st1     {v16.H}[0], [x0], x2
-        st1     {v16.H}[1], [x0], x2
+        st1     {v16.h}[0], [x0], x2
+        st1     {v16.h}[1], [x0], x2
         subs    w3, w3, #2
         b.gt    2b
         ret
@@ -27,7 +27,7 @@
         cmp     w2, #0
         ldr     w6, [x4]
         ccmp    w3, #0, #0, ne
-        mov     v24.S[0], w6
+        mov     v24.s[0], w6
         and     w8, w6, w6, lsl #16
         b.eq    1f
         ands    w8, w8, w8, lsl #8
@@ -38,95 +38,95 @@
 .endm

 .macro h264_loop_filter_luma
-        dup     v22.16B, w2             // alpha
-        uxtl    v24.8H, v24.8B
-        uabd    v21.16B, v16.16B, v0.16B // abs(p0 - q0)
-        uxtl    v24.4S, v24.4H
-        uabd    v28.16B, v18.16B, v16.16B // abs(p1 - p0)
-        sli     v24.8H, v24.8H, #8
-        uabd    v30.16B, v2.16B, v0.16B  // abs(q1 - q0)
-        sli     v24.4S, v24.4S, #16
-        cmhi    v21.16B, v22.16B, v21.16B // < alpha
-        dup     v22.16B, w3              // beta
-        cmlt    v23.16B, v24.16B, #0
-        cmhi    v28.16B, v22.16B, v28.16B // < beta
-        cmhi    v30.16B, v22.16B, v30.16B // < beta
-        bic     v21.16B, v21.16B, v23.16B
-        uabd    v17.16B, v20.16B, v16.16B // abs(p2 - p0)
-        and     v21.16B, v21.16B, v28.16B
-        uabd    v19.16B, v4.16B, v0.16B  // abs(q2 - q0)
-        and     v21.16B, v21.16B, v30.16B // < beta
+        dup     v22.16b, w2             // alpha
+        uxtl    v24.8h, v24.8b
+        uabd    v21.16b, v16.16b, v0.16b // abs(p0 - q0)
+        uxtl    v24.4s, v24.4h
+        uabd    v28.16b, v18.16b, v16.16b // abs(p1 - p0)
+        sli     v24.8h, v24.8h, #8
+        uabd    v30.16b, v2.16b, v0.16b  // abs(q1 - q0)
+        sli     v24.4s, v24.4s, #16
+        cmhi    v21.16b, v22.16b, v21.16b // < alpha
+        dup     v22.16b, w3              // beta
+        cmlt    v23.16b, v24.16b, #0
+        cmhi    v28.16b, v22.16b, v28.16b // < beta
+        cmhi    v30.16b, v22.16b, v30.16b // < beta
+        bic     v21.16b, v21.16b, v23.16b
+        uabd    v17.16b, v20.16b, v16.16b // abs(p2 - p0)
+        and     v21.16b, v21.16b, v28.16b
+        uabd    v19.16b, v4.16b, v0.16b  // abs(q2 - q0)
+        and     v21.16b, v21.16b, v30.16b // < beta
         shrn    v30.8b, v21.8h, #4
         mov     x7, v30.d[0]
-        cmhi    v17.16B, v22.16B, v17.16B // < beta
-        cmhi    v19.16B, v22.16B, v19.16B // < beta
+        cmhi    v17.16b, v22.16b, v17.16b // < beta
+        cmhi    v19.16b, v22.16b, v19.16b // < beta
         cbz     x7, 9f
-        and     v17.16B, v17.16B, v21.16B
-        and     v19.16B, v19.16B, v21.16B
-        and     v24.16B, v24.16B, v21.16B
-        urhadd  v28.16B, v16.16B, v0.16B
-        sub     v21.16B, v24.16B, v17.16B
-        uqadd   v23.16B, v18.16B, v24.16B
-        uhadd   v20.16B, v20.16B, v28.16B
-        sub     v21.16B, v21.16B, v19.16B
-        uhadd   v28.16B, v4.16B, v28.16B
-        umin    v23.16B, v23.16B, v20.16B
-        uqsub   v22.16B, v18.16B, v24.16B
-        uqadd   v4.16B, v2.16B, v24.16B
-        umax    v23.16B, v23.16B, v22.16B
-        uqsub   v22.16B, v2.16B, v24.16B
-        umin    v28.16B, v4.16B, v28.16B
-        uxtl    v4.8H, v0.8B
-        umax    v28.16B, v28.16B, v22.16B
-        uxtl2   v20.8H, v0.16B
-        usubw   v4.8H, v4.8H, v16.8B
-        usubw2  v20.8H, v20.8H, v16.16B
-        shl     v4.8H, v4.8H, #2
-        shl     v20.8H, v20.8H, #2
-        uaddw   v4.8H, v4.8H, v18.8B
-        uaddw2  v20.8H, v20.8H, v18.16B
-        usubw   v4.8H, v4.8H, v2.8B
-        usubw2  v20.8H, v20.8H, v2.16B
-        rshrn   v4.8B, v4.8H, #3
-        rshrn2  v4.16B, v20.8H, #3
-        bsl     v17.16B, v23.16B, v18.16B
-        bsl     v19.16B, v28.16B, v2.16B
-        neg     v23.16B, v21.16B
-        uxtl    v28.8H, v16.8B
-        smin    v4.16B, v4.16B, v21.16B
-        uxtl2   v21.8H, v16.16B
-        smax    v4.16B, v4.16B, v23.16B
-        uxtl    v22.8H, v0.8B
-        uxtl2   v24.8H, v0.16B
-        saddw   v28.8H, v28.8H, v4.8B
-        saddw2  v21.8H, v21.8H, v4.16B
-        ssubw   v22.8H, v22.8H, v4.8B
-        ssubw2  v24.8H, v24.8H, v4.16B
-        sqxtun  v16.8B, v28.8H
-        sqxtun2 v16.16B, v21.8H
-        sqxtun  v0.8B, v22.8H
-        sqxtun2 v0.16B, v24.8H
+        and     v17.16b, v17.16b, v21.16b
+        and     v19.16b, v19.16b, v21.16b
+        and     v24.16b, v24.16b, v21.16b
+        urhadd  v28.16b, v16.16b, v0.16b
+        sub     v21.16b, v24.16b, v17.16b
+        uqadd   v23.16b, v18.16b, v24.16b
+        uhadd   v20.16b, v20.16b, v28.16b
+        sub     v21.16b, v21.16b, v19.16b
+        uhadd   v28.16b, v4.16b, v28.16b
+        umin    v23.16b, v23.16b, v20.16b
+        uqsub   v22.16b, v18.16b, v24.16b
+        uqadd   v4.16b, v2.16b, v24.16b
+        umax    v23.16b, v23.16b, v22.16b
+        uqsub   v22.16b, v2.16b, v24.16b
+        umin    v28.16b, v4.16b, v28.16b
+        uxtl    v4.8h, v0.8b
+        umax    v28.16b, v28.16b, v22.16b
+        uxtl2   v20.8h, v0.16b
+        usubw   v4.8h, v4.8h, v16.8b
+        usubw2  v20.8h, v20.8h, v16.16b
+        shl     v4.8h, v4.8h, #2
+        shl     v20.8h, v20.8h, #2
+        uaddw   v4.8h, v4.8h, v18.8b
+        uaddw2  v20.8h, v20.8h, v18.16b
+        usubw   v4.8h, v4.8h, v2.8b
+        usubw2  v20.8h, v20.8h, v2.16b
+        rshrn   v4.8b, v4.8h, #3
+        rshrn2  v4.16b, v20.8h, #3
+        bsl     v17.16b, v23.16b, v18.16b
+        bsl     v19.16b, v28.16b, v2.16b
+        neg     v23.16b, v21.16b
+        uxtl    v28.8h, v16.8b
+        smin    v4.16b, v4.16b, v21.16b
+        uxtl2   v21.8h, v16.16b
+        smax    v4.16b, v4.16b, v23.16b
+        uxtl    v22.8h, v0.8b
+        uxtl2   v24.8h, v0.16b
+        saddw   v28.8h, v28.8h, v4.8b
+        saddw2  v21.8h, v21.8h, v4.16b
+        ssubw   v22.8h, v22.8h, v4.8b
+        ssubw2  v24.8h, v24.8h, v4.16b
+        sqxtun  v16.8b, v28.8h
+        sqxtun2 v16.16b, v21.8h
+        sqxtun  v0.8b, v22.8h
+        sqxtun2 v0.16b, v24.8h
 .endm

 function ff_h264_v_loop_filter_luma_neon, export=1
         h264_loop_filter_start

-        ld1     {v0.16B}, [x0], x1
-        ld1     {v2.16B}, [x0], x1
-        ld1     {v4.16B}, [x0], x1
+        ld1     {v0.16b}, [x0], x1
+        ld1     {v2.16b}, [x0], x1
+        ld1     {v4.16b}, [x0], x1
         sub     x0, x0, x1, lsl #2
         sub     x0, x0, x1, lsl #1
-        ld1     {v20.16B}, [x0], x1
-        ld1     {v18.16B}, [x0], x1
-        ld1     {v16.16B}, [x0], x1
+        ld1     {v20.16b}, [x0], x1
+        ld1     {v18.16b}, [x0], x1
+        ld1     {v16.16b}, [x0], x1

         h264_loop_filter_luma

         sub     x0, x0, x1, lsl #1
-        st1     {v17.16B}, [x0], x1
-        st1     {v16.16B}, [x0], x1
-        st1     {v0.16B}, [x0], x1
-        st1     {v19.16B}, [x0]
+        st1     {v17.16b}, [x0], x1
+        st1     {v16.16b}, [x0], x1
+        st1     {v0.16b}, [x0], x1
+        st1     {v19.16b}, [x0]
 9:
         ret
 endfunc
@@ -135,22 +135,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1
         h264_loop_filter_start

         sub     x0, x0, #4
-        ld1     {v6.8B}, [x0], x1
-        ld1     {v20.8B}, [x0], x1
-        ld1     {v18.8B}, [x0], x1
-        ld1     {v16.8B}, [x0], x1
-        ld1     {v0.8B}, [x0], x1
-        ld1     {v2.8B}, [x0], x1
-        ld1     {v4.8B}, [x0], x1
-        ld1     {v26.8B}, [x0], x1
-        ld1     {v6.D}[1], [x0], x1
-        ld1     {v20.D}[1], [x0], x1
-        ld1     {v18.D}[1], [x0], x1
-        ld1     {v16.D}[1], [x0], x1
-        ld1     {v0.D}[1], [x0], x1
-        ld1     {v2.D}[1], [x0], x1
-        ld1     {v4.D}[1], [x0], x1
-        ld1     {v26.D}[1], [x0], x1
+        ld1     {v6.8b}, [x0], x1
+        ld1     {v20.8b}, [x0], x1
+        ld1     {v18.8b}, [x0], x1
+        ld1     {v16.8b}, [x0], x1
+        ld1     {v0.8b}, [x0], x1
+        ld1     {v2.8b}, [x0], x1
+        ld1     {v4.8b}, [x0], x1
+        ld1     {v26.8b}, [x0], x1
+        ld1     {v6.d}[1], [x0], x1
+        ld1     {v20.d}[1], [x0], x1
+        ld1     {v18.d}[1], [x0], x1
+        ld1     {v16.d}[1], [x0], x1
+        ld1     {v0.d}[1], [x0], x1
+        ld1     {v2.d}[1], [x0], x1
+        ld1     {v4.d}[1], [x0], x1
+        ld1     {v26.d}[1], [x0], x1

         transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

@@ -160,22 +160,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1

         sub     x0, x0, x1, lsl #4
         add     x0, x0, #2
-        st1     {v17.S}[0], [x0], x1
-        st1     {v16.S}[0], [x0], x1
-        st1     {v0.S}[0], [x0], x1
-        st1     {v19.S}[0], [x0], x1
-        st1     {v17.S}[1], [x0], x1
-        st1     {v16.S}[1], [x0], x1
-        st1     {v0.S}[1], [x0], x1
-        st1     {v19.S}[1], [x0], x1
-        st1     {v17.S}[2], [x0], x1
-        st1     {v16.S}[2], [x0], x1
-        st1     {v0.S}[2], [x0], x1
-        st1     {v19.S}[2], [x0], x1
-        st1     {v17.S}[3], [x0], x1
-        st1     {v16.S}[3], [x0], x1
-        st1     {v0.S}[3], [x0], x1
-        st1     {v19.S}[3], [x0], x1
+        st1     {v17.s}[0], [x0], x1
+        st1     {v16.s}[0], [x0], x1
+        st1     {v0.s}[0], [x0], x1
+        st1     {v19.s}[0], [x0], x1
+        st1     {v17.s}[1], [x0], x1
+        st1     {v16.s}[1], [x0], x1
+        st1     {v0.s}[1], [x0], x1
+        st1     {v19.s}[1], [x0], x1
+        st1     {v17.s}[2], [x0], x1
+        st1     {v16.s}[2], [x0], x1
+        st1     {v0.s}[2], [x0], x1
+        st1     {v19.s}[2], [x0], x1
+        st1     {v17.s}[3], [x0], x1
+        st1     {v16.s}[3], [x0], x1
+        st1     {v0.s}[3], [x0], x1
+        st1     {v19.s}[3], [x0], x1
 9:
         ret
 endfunc
@@ -377,52 +377,52 @@ function ff_h264_h_loop_filter_luma_intra_neon, export=1
 endfunc

 .macro h264_loop_filter_chroma
-        dup     v22.8B, w2              // alpha
-        dup     v23.8B, w3              // beta
-        uxtl    v24.8H, v24.8B
-        uabd    v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
-        uabd    v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
-        uabd    v30.8B, v2.8B, v0.8B    // abs(q1 - q0)
-        cmhi    v26.8B, v22.8B, v26.8B  // < alpha
-        cmhi    v28.8B, v23.8B, v28.8B  // < beta
-        cmhi    v30.8B, v23.8B, v30.8B  // < beta
-        uxtl    v4.8H, v0.8B
-        and     v26.8B, v26.8B, v28.8B
-        usubw   v4.8H, v4.8H, v16.8B
-        and     v26.8B, v26.8B, v30.8B
-        shl     v4.8H, v4.8H, #2
+        dup     v22.8b, w2              // alpha
+        dup     v23.8b, w3              // beta
+        uxtl    v24.8h, v24.8b
+        uabd    v26.8b, v16.8b, v0.8b   // abs(p0 - q0)
+        uabd    v28.8b, v18.8b, v16.8b  // abs(p1 - p0)
+        uabd    v30.8b, v2.8b, v0.8b    // abs(q1 - q0)
+        cmhi    v26.8b, v22.8b, v26.8b  // < alpha
+        cmhi    v28.8b, v23.8b, v28.8b  // < beta
+        cmhi    v30.8b, v23.8b, v30.8b  // < beta
+        uxtl    v4.8h, v0.8b
+        and     v26.8b, v26.8b, v28.8b
+        usubw   v4.8h, v4.8h, v16.8b
+        and     v26.8b, v26.8b, v30.8b
+        shl     v4.8h, v4.8h, #2
         mov     x8, v26.d[0]
-        sli     v24.8H, v24.8H, #8
-        uaddw   v4.8H, v4.8H, v18.8B
+        sli     v24.8h, v24.8h, #8
+        uaddw   v4.8h, v4.8h, v18.8b
         cbz     x8, 9f
-        usubw   v4.8H, v4.8H, v2.8B
-        rshrn   v4.8B, v4.8H, #3
-        smin    v4.8B, v4.8B, v24.8B
-        neg     v25.8B, v24.8B
-        smax    v4.8B, v4.8B, v25.8B
-        uxtl    v22.8H, v0.8B
-        and     v4.8B, v4.8B, v26.8B
-        uxtl    v28.8H, v16.8B
-        saddw   v28.8H, v28.8H, v4.8B
-        ssubw   v22.8H, v22.8H, v4.8B
-        sqxtun  v16.8B, v28.8H
-        sqxtun  v0.8B, v22.8H
+        usubw   v4.8h, v4.8h, v2.8b
+        rshrn   v4.8b, v4.8h, #3
+        smin    v4.8b, v4.8b, v24.8b
+        neg     v25.8b, v24.8b
+        smax    v4.8b, v4.8b, v25.8b
+        uxtl    v22.8h, v0.8b
+        and     v4.8b, v4.8b, v26.8b
+        uxtl    v28.8h, v16.8b
+        saddw   v28.8h, v28.8h, v4.8b
+        ssubw   v22.8h, v22.8h, v4.8b
+        sqxtun  v16.8b, v28.8h
+        sqxtun  v0.8b, v22.8h
 .endm

 function ff_h264_v_loop_filter_chroma_neon, export=1
         h264_loop_filter_start

         sub     x0, x0, x1, lsl #1
-        ld1     {v18.8B}, [x0], x1
-        ld1     {v16.8B}, [x0], x1
-        ld1     {v0.8B}, [x0], x1
-        ld1     {v2.8B}, [x0]
+        ld1     {v18.8b}, [x0], x1
+        ld1     {v16.8b}, [x0], x1
+        ld1     {v0.8b}, [x0], x1
+        ld1     {v2.8b}, [x0]

         h264_loop_filter_chroma

         sub     x0, x0, x1, lsl #1
-        st1     {v16.8B}, [x0], x1
-        st1     {v0.8B}, [x0], x1
+        st1     {v16.8b}, [x0], x1
+        st1     {v0.8b}, [x0], x1
 9:
         ret
 endfunc
@@ -432,14 +432,14 @@ function ff_h264_h_loop_filter_chroma_neon, export=1

         sub     x0, x0, #2
 h_loop_filter_chroma420:
-        ld1     {v18.S}[0], [x0], x1
-        ld1     {v16.S}[0], [x0], x1
-        ld1     {v0.S}[0], [x0], x1
-        ld1     {v2.S}[0], [x0], x1
-        ld1     {v18.S}[1], [x0], x1
-        ld1     {v16.S}[1], [x0], x1
-        ld1     {v0.S}[1], [x0], x1
-        ld1     {v2.S}[1], [x0], x1
+        ld1     {v18.s}[0], [x0], x1
+        ld1     {v16.s}[0], [x0], x1
+        ld1     {v0.s}[0], [x0], x1
+        ld1     {v2.s}[0], [x0], x1
+        ld1     {v18.s}[1], [x0], x1
+        ld1     {v16.s}[1], [x0], x1
+        ld1     {v0.s}[1], [x0], x1
+        ld1     {v2.s}[1], [x0], x1

         transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31

@@ -448,14 +448,14 @@ h_loop_filter_chroma420:
         transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31

         sub     x0, x0, x1, lsl #3
-        st1     {v18.S}[0], [x0], x1
-        st1     {v16.S}[0], [x0], x1
-        st1     {v0.S}[0], [x0], x1
-        st1     {v2.S}[0], [x0], x1
-        st1     {v18.S}[1], [x0], x1
-        st1     {v16.S}[1], [x0], x1
-        st1     {v0.S}[1], [x0], x1
-        st1     {v2.S}[1], [x0], x1
+        st1     {v18.s}[0], [x0], x1
+        st1     {v16.s}[0], [x0], x1
+        st1     {v0.s}[0], [x0], x1
+        st1     {v2.s}[0], [x0], x1
+        st1     {v18.s}[1], [x0], x1
+        st1     {v16.s}[1], [x0], x1
+        st1     {v0.s}[1], [x0], x1
+        st1     {v2.s}[1], [x0], x1
 9:
         ret
 endfunc
@@ -584,102 +584,102 @@ function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
 endfunc

 .macro biweight_16 macs, macd
-        dup     v0.16B, w5
-        dup     v1.16B, w6
-        mov     v4.16B, v16.16B
-        mov     v6.16B, v16.16B
+        dup     v0.16b, w5
+        dup     v1.16b, w6
+        mov     v4.16b, v16.16b
+        mov     v6.16b, v16.16b
 1:      subs    w3, w3, #2
-        ld1     {v20.16B}, [x0], x2
-        \macd   v4.8H, v0.8B, v20.8B
+        ld1     {v20.16b}, [x0], x2
+        \macd   v4.8h, v0.8b, v20.8b
-        \macd\()2 v6.8H, v0.16B, v20.16B
+        \macd\()2 v6.8h, v0.16b, v20.16b
-        ld1     {v22.16B}, [x1], x2
-        \macs   v4.8H, v1.8B, v22.8B
+        ld1     {v22.16b}, [x1], x2
+        \macs   v4.8h, v1.8b, v22.8b
-        \macs\()2 v6.8H, v1.16B, v22.16B
+        \macs\()2 v6.8h, v1.16b, v22.16b
-        mov     v24.16B, v16.16B
-        ld1     {v28.16B}, [x0], x2
-        mov     v26.16B, v16.16B
-        \macd   v24.8H, v0.8B, v28.8B
+        mov     v24.16b, v16.16b
+        ld1     {v28.16b}, [x0], x2
+        mov     v26.16b, v16.16b
+        \macd   v24.8h, v0.8b, v28.8b
-        \macd\()2 v26.8H, v0.16B, v28.16B
+        \macd\()2 v26.8h, v0.16b, v28.16b
-        ld1     {v30.16B}, [x1], x2
-        \macs   v24.8H, v1.8B, v30.8B
+        ld1     {v30.16b}, [x1], x2
+        \macs   v24.8h, v1.8b, v30.8b
-        \macs\()2 v26.8H, v1.16B, v30.16B
+        \macs\()2 v26.8h, v1.16b, v30.16b
-        sshl    v4.8H, v4.8H, v18.8H
-        sshl    v6.8H, v6.8H, v18.8H
-        sqxtun  v4.8B, v4.8H
-        sqxtun2 v4.16B, v6.8H
-        sshl    v24.8H, v24.8H, v18.8H
-        sshl    v26.8H, v26.8H, v18.8H
-        sqxtun  v24.8B, v24.8H
-        sqxtun2 v24.16B, v26.8H
-        mov     v6.16B, v16.16B
-        st1     {v4.16B}, [x7], x2
-        mov     v4.16B, v16.16B
-        st1     {v24.16B}, [x7], x2
+        sshl    v4.8h, v4.8h, v18.8h
+        sshl    v6.8h, v6.8h, v18.8h
+        sqxtun  v4.8b, v4.8h
+        sqxtun2 v4.16b, v6.8h
+        sshl    v24.8h, v24.8h, v18.8h
+        sshl    v26.8h, v26.8h, v18.8h
+        sqxtun  v24.8b, v24.8h
+        sqxtun2 v24.16b, v26.8h
+        mov     v6.16b, v16.16b
+        st1     {v4.16b}, [x7], x2
+        mov     v4.16b, v16.16b
+        st1     {v24.16b}, [x7], x2
         b.ne    1b
         ret
 .endm

 .macro biweight_8 macs, macd
-        dup     v0.8B, w5
-        dup     v1.8B, w6
-        mov     v2.16B, v16.16B
-        mov     v20.16B, v16.16B
+        dup     v0.8b, w5
+        dup     v1.8b, w6
+        mov     v2.16b, v16.16b
+        mov     v20.16b, v16.16b
 1:      subs    w3, w3, #2
-        ld1     {v4.8B}, [x0], x2
-        \macd   v2.8H, v0.8B, v4.8B
-        ld1     {v5.8B}, [x1], x2
-        \macs   v2.8H, v1.8B, v5.8B
-        ld1     {v6.8B}, [x0], x2
-        \macd   v20.8H, v0.8B, v6.8B
-        ld1     {v7.8B}, [x1], x2
-        \macs   v20.8H, v1.8B, v7.8B
-        sshl    v2.8H, v2.8H, v18.8H
-        sqxtun  v2.8B, v2.8H
-        sshl    v20.8H, v20.8H, v18.8H
-        sqxtun  v4.8B, v20.8H
-        mov     v20.16B, v16.16B
-        st1     {v2.8B}, [x7], x2
-        mov     v2.16B, v16.16B
-        st1     {v4.8B}, [x7], x2
+        ld1     {v4.8b}, [x0], x2
+        \macd   v2.8h, v0.8b, v4.8b
+        ld1     {v5.8b}, [x1], x2
+        \macs   v2.8h, v1.8b, v5.8b
+        ld1     {v6.8b}, [x0], x2
+        \macd   v20.8h, v0.8b, v6.8b
+        ld1     {v7.8b}, [x1], x2
+        \macs   v20.8h, v1.8b, v7.8b
+        sshl    v2.8h, v2.8h, v18.8h
+        sqxtun  v2.8b, v2.8h
+        sshl    v20.8h, v20.8h, v18.8h
+        sqxtun  v4.8b, v20.8h
+        mov     v20.16b, v16.16b
+        st1     {v2.8b}, [x7], x2
+        mov     v2.16b, v16.16b
+        st1     {v4.8b}, [x7], x2
         b.ne    1b
         ret
 .endm

 .macro biweight_4 macs, macd
-        dup     v0.8B, w5
-        dup     v1.8B, w6
-        mov     v2.16B, v16.16B
-        mov     v20.16B,v16.16B
+        dup     v0.8b, w5
+        dup     v1.8b, w6
+        mov     v2.16b, v16.16b
+        mov     v20.16b,v16.16b
 1:      subs    w3, w3, #4
-        ld1     {v4.S}[0], [x0], x2
-        ld1     {v4.S}[1], [x0], x2
-        \macd   v2.8H, v0.8B, v4.8B
-        ld1     {v5.S}[0], [x1], x2
-        ld1     {v5.S}[1], [x1], x2
-        \macs   v2.8H, v1.8B, v5.8B
+        ld1     {v4.s}[0], [x0], x2
+        ld1     {v4.s}[1], [x0], x2
+        \macd   v2.8h, v0.8b, v4.8b
+        ld1     {v5.s}[0], [x1], x2
+        ld1     {v5.s}[1], [x1], x2
+        \macs   v2.8h, v1.8b, v5.8b
         b.lt    2f
-        ld1     {v6.S}[0], [x0], x2
-        ld1     {v6.S}[1], [x0], x2
-        \macd   v20.8H, v0.8B, v6.8B
-        ld1     {v7.S}[0], [x1], x2
-        ld1     {v7.S}[1], [x1], x2
-        \macs   v20.8H, v1.8B, v7.8B
-        sshl    v2.8H, v2.8H, v18.8H
-        sqxtun  v2.8B, v2.8H
-        sshl    v20.8H, v20.8H, v18.8H
-        sqxtun  v4.8B, v20.8H
-        mov     v20.16B, v16.16B
-        st1     {v2.S}[0], [x7], x2
-        st1     {v2.S}[1], [x7], x2
-        mov     v2.16B, v16.16B
-        st1     {v4.S}[0], [x7], x2
-        st1     {v4.S}[1], [x7], x2
+        ld1     {v6.s}[0], [x0], x2
+        ld1     {v6.s}[1], [x0], x2
+        \macd   v20.8h, v0.8b, v6.8b
+        ld1     {v7.s}[0], [x1], x2
+        ld1     {v7.s}[1], [x1], x2
+        \macs   v20.8h, v1.8b, v7.8b
+        sshl    v2.8h, v2.8h, v18.8h
+        sqxtun  v2.8b, v2.8h
+        sshl    v20.8h, v20.8h, v18.8h
+        sqxtun  v4.8b, v20.8h
+        mov     v20.16b, v16.16b
+        st1     {v2.s}[0], [x7], x2
+        st1     {v2.s}[1], [x7], x2
+        mov     v2.16b, v16.16b
+        st1     {v4.s}[0], [x7], x2
+        st1     {v4.s}[1], [x7], x2
         b.ne    1b
         ret
-2:      sshl    v2.8H, v2.8H, v18.8H
-        sqxtun  v2.8B, v2.8H
-        st1     {v2.S}[0], [x7], x2
-        st1     {v2.S}[1], [x7], x2
+2:      sshl    v2.8h, v2.8h, v18.8h
+        sqxtun  v2.8b, v2.8h
+        st1     {v2.s}[0], [x7], x2
+        st1     {v2.s}[1], [x7], x2
         ret
 .endm

@@ -689,10 +689,10 @@ function ff_biweight_h264_pixels_\w\()_neon, export=1
         add     w7, w7, #1
         eor     w8, w8, w6, lsr #30
         orr     w7, w7, #1
-        dup     v18.8H, w4
+        dup     v18.8h, w4
         lsl     w7, w7, w4
-        not     v18.16B, v18.16B
-        dup     v16.8H, w7
+        not     v18.16b, v18.16b
+        dup     v16.8h, w7
         mov     x7, x0
         cbz     w8, 10f
         subs    w8, w8, #1
@@ -716,78 +716,78 @@ endfunc
         biweight_func 4

 .macro weight_16 add
-        dup     v0.16B, w4
+        dup     v0.16b, w4
 1:      subs    w2, w2, #2
-        ld1     {v20.16B}, [x0], x1
-        umull   v4.8H, v0.8B, v20.8B
-        umull2  v6.8H, v0.16B, v20.16B
-        ld1     {v28.16B}, [x0], x1
-        umull   v24.8H, v0.8B, v28.8B
-        umull2  v26.8H, v0.16B, v28.16B
-        \add    v4.8H, v16.8H, v4.8H
-        srshl   v4.8H, v4.8H, v18.8H
-        \add    v6.8H, v16.8H, v6.8H
-        srshl   v6.8H, v6.8H, v18.8H
-        sqxtun  v4.8B, v4.8H
-        sqxtun2 v4.16B, v6.8H
-        \add    v24.8H, v16.8H, v24.8H
-        srshl   v24.8H, v24.8H, v18.8H
-        \add    v26.8H, v16.8H, v26.8H
-        srshl   v26.8H, v26.8H, v18.8H
-        sqxtun  v24.8B, v24.8H
-        sqxtun2 v24.16B, v26.8H
-        st1     {v4.16B}, [x5], x1
-        st1     {v24.16B}, [x5], x1
+        ld1     {v20.16b}, [x0], x1
+        umull   v4.8h, v0.8b, v20.8b
+        umull2  v6.8h, v0.16b, v20.16b
+        ld1     {v28.16b}, [x0], x1
+        umull   v24.8h, v0.8b, v28.8b
+        umull2  v26.8h, v0.16b, v28.16b
+        \add    v4.8h, v16.8h, v4.8h
+        srshl   v4.8h, v4.8h, v18.8h
+        \add    v6.8h, v16.8h, v6.8h
+        srshl   v6.8h, v6.8h, v18.8h
+        sqxtun  v4.8b, v4.8h
+        sqxtun2 v4.16b, v6.8h
+        \add    v24.8h, v16.8h, v24.8h
+        srshl   v24.8h, v24.8h, v18.8h
+        \add    v26.8h, v16.8h, v26.8h
+        srshl   v26.8h, v26.8h, v18.8h
+        sqxtun  v24.8b, v24.8h
+        sqxtun2 v24.16b, v26.8h
+        st1     {v4.16b}, [x5], x1
+        st1     {v24.16b}, [x5], x1
         b.ne    1b
         ret
 .endm

 .macro weight_8 add
-        dup     v0.8B, w4
+        dup     v0.8b, w4
 1:      subs    w2, w2, #2
-        ld1     {v4.8B}, [x0], x1
-        umull   v2.8H, v0.8B, v4.8B
-        ld1     {v6.8B}, [x0], x1
-        umull   v20.8H, v0.8B, v6.8B
-        \add    v2.8H, v16.8H, v2.8H
-        srshl   v2.8H, v2.8H, v18.8H
-        sqxtun  v2.8B, v2.8H
-        \add    v20.8H, v16.8H, v20.8H
-        srshl   v20.8H, v20.8H, v18.8H
-        sqxtun  v4.8B, v20.8H
-        st1     {v2.8B}, [x5], x1
-        st1     {v4.8B}, [x5], x1
+        ld1     {v4.8b}, [x0], x1
+        umull   v2.8h, v0.8b, v4.8b
+        ld1     {v6.8b}, [x0], x1
+        umull   v20.8h, v0.8b, v6.8b
+        \add    v2.8h, v16.8h, v2.8h
+        srshl   v2.8h, v2.8h, v18.8h
+        sqxtun  v2.8b, v2.8h
+        \add    v20.8h, v16.8h, v20.8h
+        srshl   v20.8h, v20.8h, v18.8h
+        sqxtun  v4.8b, v20.8h
+        st1     {v2.8b}, [x5], x1
+        st1     {v4.8b}, [x5], x1
         b.ne    1b
         ret
 .endm

 .macro weight_4 add
-        dup     v0.8B, w4
+        dup     v0.8b, w4
 1:      subs    w2, w2, #4
-        ld1     {v4.S}[0], [x0], x1
-        ld1     {v4.S}[1], [x0], x1
-        umull   v2.8H, v0.8B, v4.8B
+        ld1     {v4.s}[0], [x0], x1
+        ld1     {v4.s}[1], [x0], x1
+        umull   v2.8h, v0.8b, v4.8b
         b.lt    2f
-        ld1     {v6.S}[0], [x0], x1
-        ld1     {v6.S}[1], [x0], x1
-        umull   v20.8H, v0.8B, v6.8B
-        \add    v2.8H, v16.8H, v2.8H
-        srshl   v2.8H, v2.8H, v18.8H
-        sqxtun  v2.8B, v2.8H
-        \add    v20.8H, v16.8H, v20.8H
-        srshl   v20.8H, v20.8h, v18.8H
-        sqxtun  v4.8B, v20.8H
-        st1     {v2.S}[0], [x5], x1
-        st1     {v2.S}[1], [x5], x1
-        st1     {v4.S}[0], [x5], x1
-        st1     {v4.S}[1], [x5], x1
+        ld1     {v6.s}[0], [x0], x1
+        ld1     {v6.s}[1], [x0], x1
+        umull   v20.8h, v0.8b, v6.8b
+        \add    v2.8h, v16.8h, v2.8h
+        srshl   v2.8h, v2.8h, v18.8h
+        sqxtun  v2.8b, v2.8h
+        \add    v20.8h, v16.8h, v20.8h
+        srshl   v20.8h, v20.8h, v18.8h
+        sqxtun  v4.8b, v20.8h
+        st1     {v2.s}[0], [x5], x1
+        st1     {v2.s}[1], [x5], x1
+        st1     {v4.s}[0], [x5], x1
+        st1     {v4.s}[1], [x5], x1
         b.ne    1b
         ret
-2:      \add    v2.8H, v16.8H, v2.8H
-        srshl   v2.8H, v2.8H, v18.8H
-        sqxtun  v2.8B, v2.8H
-        st1     {v2.S}[0], [x5], x1
-        st1     {v2.S}[1], [x5], x1
+2:      \add    v2.8h, v16.8h, v2.8h
+        srshl   v2.8h, v2.8h, v18.8h
+        sqxtun  v2.8b, v2.8h
+        st1     {v2.s}[0], [x5], x1
+        st1     {v2.s}[1], [x5], x1
         ret
 .endm

@@ -796,18 +796,18 @@ function ff_weight_h264_pixels_\w\()_neon, export=1
         cmp     w3, #1
         mov     w6, #1
         lsl     w5, w5, w3
-        dup     v16.8H, w5
+        dup     v16.8h, w5
         mov     x5, x0
         b.le    20f
         sub     w6, w6, w3
-        dup     v18.8H, w6
+        dup     v18.8h, w6
         cmp     w4, #0
         b.lt    10f
         weight_\w shadd
 10:     neg     w4, w4
         weight_\w shsub
 20:     neg     w6, w3
-        dup     v18.8H, w6
+        dup     v18.8h, w6
         cmp     w4, #0
         b.lt    10f
         weight_\w add
@@ -825,7 +825,7 @@ endfunc
         ldr     w6, [x4]
         ccmp    w3, #0, #0, ne
         lsl     w2, w2, #2
-        mov     v24.S[0], w6
+        mov     v24.s[0], w6
         lsl     w3, w3, #2
         and     w8, w6, w6, lsl #16
         b.eq    1f
@@ -25,54 +25,54 @@
 function ff_h264_idct_add_neon, export=1
 .L_ff_h264_idct_add_neon:
         AARCH64_VALID_CALL_TARGET
-        ld1     {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
+        ld1     {v0.4h, v1.4h, v2.4h, v3.4h}, [x1]
         sxtw    x2, w2
-        movi    v30.8H, #0
+        movi    v30.8h, #0

-        add     v4.4H, v0.4H, v2.4H
-        sshr    v16.4H, v1.4H, #1
-        st1     {v30.8H}, [x1], #16
-        sshr    v17.4H, v3.4H, #1
-        st1     {v30.8H}, [x1], #16
-        sub     v5.4H, v0.4H, v2.4H
-        sub     v6.4H, v16.4H, v3.4H
-        add     v7.4H, v1.4H, v17.4H
-        add     v0.4H, v4.4H, v7.4H
-        add     v1.4H, v5.4H, v6.4H
-        sub     v2.4H, v5.4H, v6.4H
-        sub     v3.4H, v4.4H, v7.4H
+        add     v4.4h, v0.4h, v2.4h
+        sshr    v16.4h, v1.4h, #1
+        st1     {v30.8h}, [x1], #16
+        sshr    v17.4h, v3.4h, #1
+        st1     {v30.8h}, [x1], #16
+        sub     v5.4h, v0.4h, v2.4h
+        sub     v6.4h, v16.4h, v3.4h
+        add     v7.4h, v1.4h, v17.4h
+        add     v0.4h, v4.4h, v7.4h
+        add     v1.4h, v5.4h, v6.4h
+        sub     v2.4h, v5.4h, v6.4h
+        sub     v3.4h, v4.4h, v7.4h

         transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7

-        add     v4.4H, v0.4H, v2.4H
-        ld1     {v18.S}[0], [x0], x2
-        sshr    v16.4H, v3.4H, #1
-        sshr    v17.4H, v1.4H, #1
-        ld1     {v18.S}[1], [x0], x2
-        sub     v5.4H, v0.4H, v2.4H
-        ld1     {v19.S}[1], [x0], x2
-        add     v6.4H, v16.4H, v1.4H
-        ins     v4.D[1], v5.D[0]
-        sub     v7.4H, v17.4H, v3.4H
-        ld1     {v19.S}[0], [x0], x2
-        ins     v6.D[1], v7.D[0]
+        add     v4.4h, v0.4h, v2.4h
+        ld1     {v18.s}[0], [x0], x2
+        sshr    v16.4h, v3.4h, #1
+        sshr    v17.4h, v1.4h, #1
+        ld1     {v18.s}[1], [x0], x2
+        sub     v5.4h, v0.4h, v2.4h
+        ld1     {v19.s}[1], [x0], x2
+        add     v6.4h, v16.4h, v1.4h
+        ins     v4.d[1], v5.d[0]
+        sub     v7.4h, v17.4h, v3.4h
+        ld1     {v19.s}[0], [x0], x2
+        ins     v6.d[1], v7.d[0]
         sub     x0, x0, x2, lsl #2
-        add     v0.8H, v4.8H, v6.8H
-        sub     v1.8H, v4.8H, v6.8H
+        add     v0.8h, v4.8h, v6.8h
+        sub     v1.8h, v4.8h, v6.8h

-        srshr   v0.8H, v0.8H, #6
-        srshr   v1.8H, v1.8H, #6
+        srshr   v0.8h, v0.8h, #6
+        srshr   v1.8h, v1.8h, #6

-        uaddw   v0.8H, v0.8H, v18.8B
-        uaddw   v1.8H, v1.8H, v19.8B
+        uaddw   v0.8h, v0.8h, v18.8b
+        uaddw   v1.8h, v1.8h, v19.8b

-        sqxtun  v0.8B, v0.8H
-        sqxtun  v1.8B, v1.8H
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h

-        st1     {v0.S}[0], [x0], x2
-        st1     {v0.S}[1], [x0], x2
-        st1     {v1.S}[1], [x0], x2
-        st1     {v1.S}[0], [x0], x2
+        st1     {v0.s}[0], [x0], x2
+        st1     {v0.s}[1], [x0], x2
+        st1     {v1.s}[1], [x0], x2
+        st1     {v1.s}[0], [x0], x2

         sub     x1, x1, #32
         ret
@@ -83,22 +83,22 @@ function ff_h264_idct_dc_add_neon, export=1
         AARCH64_VALID_CALL_TARGET
         sxtw    x2, w2
         mov     w3, #0
-        ld1r    {v2.8H}, [x1]
+        ld1r    {v2.8h}, [x1]
         strh    w3, [x1]
-        srshr   v2.8H, v2.8H, #6
-        ld1     {v0.S}[0], [x0], x2
-        ld1     {v0.S}[1], [x0], x2
-        uaddw   v3.8H, v2.8H, v0.8B
-        ld1     {v1.S}[0], [x0], x2
-        ld1     {v1.S}[1], [x0], x2
-        uaddw   v4.8H, v2.8H, v1.8B
-        sqxtun  v0.8B, v3.8H
-        sqxtun  v1.8B, v4.8H
+        srshr   v2.8h, v2.8h, #6
+        ld1     {v0.s}[0], [x0], x2
+        ld1     {v0.s}[1], [x0], x2
+        uaddw   v3.8h, v2.8h, v0.8b
+        ld1     {v1.s}[0], [x0], x2
+        ld1     {v1.s}[1], [x0], x2
+        uaddw   v4.8h, v2.8h, v1.8b
+        sqxtun  v0.8b, v3.8h
+        sqxtun  v1.8b, v4.8h
         sub     x0, x0, x2, lsl #2
-        st1     {v0.S}[0], [x0], x2
-        st1     {v0.S}[1], [x0], x2
-        st1     {v1.S}[0], [x0], x2
-        st1     {v1.S}[1], [x0], x2
+        st1     {v0.s}[0], [x0], x2
+        st1     {v0.s}[1], [x0], x2
+        st1     {v1.s}[0], [x0], x2
+        st1     {v1.s}[1], [x0], x2
         ret
 endfunc

@@ -194,71 +194,71 @@ endfunc
 .if \pass == 0
 va .req v18
 vb .req v30
-        sshr    v18.8H, v26.8H, #1
-        add     v16.8H, v24.8H, v28.8H
-        ld1     {v30.8H, v31.8H}, [x1]
-        st1     {v19.8H}, [x1], #16
-        st1     {v19.8H}, [x1], #16
-        sub     v17.8H, v24.8H, v28.8H
-        sshr    v19.8H, v30.8H, #1
-        sub     v18.8H, v18.8H, v30.8H
-        add     v19.8H, v19.8H, v26.8H
+        sshr    v18.8h, v26.8h, #1
+        add     v16.8h, v24.8h, v28.8h
+        ld1     {v30.8h, v31.8h}, [x1]
+        st1     {v19.8h}, [x1], #16
+        st1     {v19.8h}, [x1], #16
+        sub     v17.8h, v24.8h, v28.8h
+        sshr    v19.8h, v30.8h, #1
+        sub     v18.8h, v18.8h, v30.8h
+        add     v19.8h, v19.8h, v26.8h
 .else
 va .req v30
 vb .req v18
-        sshr    v30.8H, v26.8H, #1
-        sshr    v19.8H, v18.8H, #1
-        add     v16.8H, v24.8H, v28.8H
-        sub     v17.8H, v24.8H, v28.8H
-        sub     v30.8H, v30.8H, v18.8H
-        add     v19.8H, v19.8H, v26.8H
+        sshr    v30.8h, v26.8h, #1
+        sshr    v19.8h, v18.8h, #1
+        add     v16.8h, v24.8h, v28.8h
+        sub     v17.8h, v24.8h, v28.8h
+        sub     v30.8h, v30.8h, v18.8h
+        add     v19.8h, v19.8h, v26.8h
 .endif
-        add     v26.8H, v17.8H, va.8H
-        sub     v28.8H, v17.8H, va.8H
-        add     v24.8H, v16.8H, v19.8H
-        sub     vb.8H, v16.8H, v19.8H
-        sub     v16.8H, v29.8H, v27.8H
-        add     v17.8H, v31.8H, v25.8H
-        sub     va.8H, v31.8H, v25.8H
-        add     v19.8H, v29.8H, v27.8H
-        sub     v16.8H, v16.8H, v31.8H
-        sub     v17.8H, v17.8H, v27.8H
-        add     va.8H, va.8H, v29.8H
-        add     v19.8H, v19.8H, v25.8H
-        sshr    v25.8H, v25.8H, #1
-        sshr    v27.8H, v27.8H, #1
-        sshr    v29.8H, v29.8H, #1
-        sshr    v31.8H, v31.8H, #1
-        sub     v16.8H, v16.8H, v31.8H
-        sub     v17.8H, v17.8H, v27.8H
-        add     va.8H, va.8H, v29.8H
-        add     v19.8H, v19.8H, v25.8H
-        sshr    v25.8H, v16.8H, #2
-        sshr    v27.8H, v17.8H, #2
-        sshr    v29.8H, va.8H, #2
-        sshr    v31.8H, v19.8H, #2
-        sub     v19.8H, v19.8H, v25.8H
-        sub     va.8H, v27.8H, va.8H
-        add     v17.8H, v17.8H, v29.8H
-        add     v16.8H, v16.8H, v31.8H
+        add     v26.8h, v17.8h, va.8h
+        sub     v28.8h, v17.8h, va.8h
+        add     v24.8h, v16.8h, v19.8h
+        sub     vb.8h, v16.8h, v19.8h
+        sub     v16.8h, v29.8h, v27.8h
+        add     v17.8h, v31.8h, v25.8h
+        sub     va.8h, v31.8h, v25.8h
+        add     v19.8h, v29.8h, v27.8h
+        sub     v16.8h, v16.8h, v31.8h
+        sub     v17.8h, v17.8h, v27.8h
+        add     va.8h, va.8h, v29.8h
+        add     v19.8h, v19.8h, v25.8h
+        sshr    v25.8h, v25.8h, #1
+        sshr    v27.8h, v27.8h, #1
+        sshr    v29.8h, v29.8h, #1
+        sshr    v31.8h, v31.8h, #1
+        sub     v16.8h, v16.8h, v31.8h
+        sub     v17.8h, v17.8h, v27.8h
+        add     va.8h, va.8h, v29.8h
+        add     v19.8h, v19.8h, v25.8h
+        sshr    v25.8h, v16.8h, #2
+        sshr    v27.8h, v17.8h, #2
+        sshr    v29.8h, va.8h, #2
+        sshr    v31.8h, v19.8h, #2
+        sub     v19.8h, v19.8h, v25.8h
+        sub     va.8h, v27.8h, va.8h
+        add     v17.8h, v17.8h, v29.8h
+        add     v16.8h, v16.8h, v31.8h
 .if \pass == 0
-        sub     v31.8H, v24.8H, v19.8H
-        add     v24.8H, v24.8H, v19.8H
-        add     v25.8H, v26.8H, v18.8H
-        sub     v18.8H, v26.8H, v18.8H
-        add     v26.8H, v28.8H, v17.8H
-        add     v27.8H, v30.8H, v16.8H
-        sub     v29.8H, v28.8H, v17.8H
-        sub     v28.8H, v30.8H, v16.8H
+        sub     v31.8h, v24.8h, v19.8h
+        add     v24.8h, v24.8h, v19.8h
+        add     v25.8h, v26.8h, v18.8h
+        sub     v18.8h, v26.8h, v18.8h
+        add     v26.8h, v28.8h, v17.8h
+        add     v27.8h, v30.8h, v16.8h
+        sub     v29.8h, v28.8h, v17.8h
+        sub     v28.8h, v30.8h, v16.8h
 .else
-        sub     v31.8H, v24.8H, v19.8H
-        add     v24.8H, v24.8H, v19.8H
-        add     v25.8H, v26.8H, v30.8H
-        sub     v30.8H, v26.8H, v30.8H
-        add     v26.8H, v28.8H, v17.8H
-        sub     v29.8H, v28.8H, v17.8H
-        add     v27.8H, v18.8H, v16.8H
-        sub     v28.8H, v18.8H, v16.8H
+        sub     v31.8h, v24.8h, v19.8h
+        add     v24.8h, v24.8h, v19.8h
+        add     v25.8h, v26.8h, v30.8h
+        sub     v30.8h, v26.8h, v30.8h
+        add     v26.8h, v28.8h, v17.8h
+        sub     v29.8h, v28.8h, v17.8h
+        add     v27.8h, v18.8h, v16.8h
+        sub     v28.8h, v18.8h, v16.8h
 .endif
 .unreq va
 .unreq vb
@@ -267,63 +267,63 @@ endfunc
 function ff_h264_idct8_add_neon, export=1
 .L_ff_h264_idct8_add_neon:
         AARCH64_VALID_CALL_TARGET
-        movi    v19.8H, #0
+        movi    v19.8h, #0
         sxtw    x2, w2
-        ld1     {v24.8H, v25.8H}, [x1]
-        st1     {v19.8H}, [x1], #16
-        st1     {v19.8H}, [x1], #16
-        ld1     {v26.8H, v27.8H}, [x1]
-        st1     {v19.8H}, [x1], #16
-        st1     {v19.8H}, [x1], #16
-        ld1     {v28.8H, v29.8H}, [x1]
-        st1     {v19.8H}, [x1], #16
-        st1     {v19.8H}, [x1], #16
+        ld1     {v24.8h, v25.8h}, [x1]
+        st1     {v19.8h}, [x1], #16
+        st1     {v19.8h}, [x1], #16
+        ld1     {v26.8h, v27.8h}, [x1]
+        st1     {v19.8h}, [x1], #16
+        st1     {v19.8h}, [x1], #16
+        ld1     {v28.8h, v29.8h}, [x1]
+        st1     {v19.8h}, [x1], #16
+        st1     {v19.8h}, [x1], #16

         idct8x8_cols 0
         transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
         idct8x8_cols 1

         mov     x3, x0
-        srshr   v24.8H, v24.8H, #6
-        ld1     {v0.8B}, [x0], x2
-        srshr   v25.8H, v25.8H, #6
-        ld1     {v1.8B}, [x0], x2
-        srshr   v26.8H, v26.8H, #6
-        ld1     {v2.8B}, [x0], x2
-        srshr   v27.8H, v27.8H, #6
-        ld1     {v3.8B}, [x0], x2
-        srshr   v28.8H, v28.8H, #6
-        ld1     {v4.8B}, [x0], x2
-        srshr   v29.8H, v29.8H, #6
-        ld1     {v5.8B}, [x0], x2
-        srshr   v30.8H, v30.8H, #6
-        ld1     {v6.8B}, [x0], x2
-        srshr   v31.8H, v31.8H, #6
-        ld1     {v7.8B}, [x0], x2
-        uaddw   v24.8H, v24.8H, v0.8B
-        uaddw   v25.8H, v25.8H, v1.8B
-        uaddw   v26.8H, v26.8H, v2.8B
-        sqxtun  v0.8B, v24.8H
-        uaddw   v27.8H, v27.8H, v3.8B
-        sqxtun  v1.8B, v25.8H
-        uaddw   v28.8H, v28.8H, v4.8B
-        sqxtun  v2.8B, v26.8H
-        st1     {v0.8B}, [x3], x2
-        uaddw   v29.8H, v29.8H, v5.8B
-        sqxtun  v3.8B, v27.8H
-        st1     {v1.8B}, [x3], x2
-        uaddw   v30.8H, v30.8H, v6.8B
-        sqxtun  v4.8B, v28.8H
-        st1     {v2.8B}, [x3], x2
-        uaddw   v31.8H, v31.8H, v7.8B
-        sqxtun  v5.8B, v29.8H
-        st1     {v3.8B}, [x3], x2
-        sqxtun  v6.8B, v30.8H
-        sqxtun  v7.8B, v31.8H
-        st1     {v4.8B}, [x3], x2
-        st1     {v5.8B}, [x3], x2
-        st1     {v6.8B}, [x3], x2
-        st1     {v7.8B}, [x3], x2
+        srshr   v24.8h, v24.8h, #6
+        ld1     {v0.8b}, [x0], x2
+        srshr   v25.8h, v25.8h, #6
+        ld1     {v1.8b}, [x0], x2
+        srshr   v26.8h, v26.8h, #6
+        ld1     {v2.8b}, [x0], x2
+        srshr   v27.8h, v27.8h, #6
+        ld1     {v3.8b}, [x0], x2
+        srshr   v28.8h, v28.8h, #6
+        ld1     {v4.8b}, [x0], x2
+        srshr   v29.8h, v29.8h, #6
+        ld1     {v5.8b}, [x0], x2
+        srshr   v30.8h, v30.8h, #6
+        ld1     {v6.8b}, [x0], x2
+        srshr   v31.8h, v31.8h, #6
+        ld1     {v7.8b}, [x0], x2
+        uaddw   v24.8h, v24.8h, v0.8b
+        uaddw   v25.8h, v25.8h, v1.8b
+        uaddw   v26.8h, v26.8h, v2.8b
+        sqxtun  v0.8b, v24.8h
+        uaddw   v27.8h, v27.8h, v3.8b
+        sqxtun  v1.8b, v25.8h
+        uaddw   v28.8h, v28.8h, v4.8b
+        sqxtun  v2.8b, v26.8h
+        st1     {v0.8b}, [x3], x2
+        uaddw   v29.8h, v29.8h, v5.8b
+        sqxtun  v3.8b, v27.8h
+        st1     {v1.8b}, [x3], x2
+        uaddw   v30.8h, v30.8h, v6.8b
+        sqxtun  v4.8b, v28.8h
+        st1     {v2.8b}, [x3], x2
+        uaddw   v31.8h, v31.8h, v7.8b
+        sqxtun  v5.8b, v29.8h
+        st1     {v3.8b}, [x3], x2
+        sqxtun  v6.8b, v30.8h
+        sqxtun  v7.8b, v31.8h
+        st1     {v4.8b}, [x3], x2
+        st1     {v5.8b}, [x3], x2
+        st1     {v6.8b}, [x3], x2
+        st1     {v7.8b}, [x3], x2

         sub     x1, x1, #128
         ret
@ -334,42 +334,42 @@ function ff_h264_idct8_dc_add_neon, export=1
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
mov w3, #0
|
||||
sxtw x2, w2
|
||||
ld1r {v31.8H}, [x1]
|
||||
ld1r {v31.8h}, [x1]
|
||||
strh w3, [x1]
|
||||
ld1 {v0.8B}, [x0], x2
|
||||
srshr v31.8H, v31.8H, #6
|
||||
ld1 {v1.8B}, [x0], x2
|
||||
ld1 {v2.8B}, [x0], x2
|
||||
uaddw v24.8H, v31.8H, v0.8B
|
||||
ld1 {v3.8B}, [x0], x2
|
||||
uaddw v25.8H, v31.8H, v1.8B
|
||||
ld1 {v4.8B}, [x0], x2
|
||||
uaddw v26.8H, v31.8H, v2.8B
|
||||
ld1 {v5.8B}, [x0], x2
|
||||
uaddw v27.8H, v31.8H, v3.8B
|
||||
ld1 {v6.8B}, [x0], x2
|
||||
uaddw v28.8H, v31.8H, v4.8B
|
||||
ld1 {v7.8B}, [x0], x2
|
||||
uaddw v29.8H, v31.8H, v5.8B
|
||||
uaddw v30.8H, v31.8H, v6.8B
|
||||
uaddw v31.8H, v31.8H, v7.8B
|
||||
sqxtun v0.8B, v24.8H
|
||||
sqxtun v1.8B, v25.8H
|
||||
sqxtun v2.8B, v26.8H
|
||||
sqxtun v3.8B, v27.8H
|
||||
ld1 {v0.8b}, [x0], x2
|
||||
srshr v31.8h, v31.8h, #6
|
||||
ld1 {v1.8b}, [x0], x2
|
||||
ld1 {v2.8b}, [x0], x2
|
||||
uaddw v24.8h, v31.8h, v0.8b
|
||||
ld1 {v3.8b}, [x0], x2
|
||||
uaddw v25.8h, v31.8h, v1.8b
|
||||
ld1 {v4.8b}, [x0], x2
|
||||
uaddw v26.8h, v31.8h, v2.8b
|
||||
ld1 {v5.8b}, [x0], x2
|
||||
uaddw v27.8h, v31.8h, v3.8b
|
||||
ld1 {v6.8b}, [x0], x2
|
||||
uaddw v28.8h, v31.8h, v4.8b
|
||||
ld1 {v7.8b}, [x0], x2
|
||||
uaddw v29.8h, v31.8h, v5.8b
|
||||
uaddw v30.8h, v31.8h, v6.8b
|
||||
uaddw v31.8h, v31.8h, v7.8b
|
||||
sqxtun v0.8b, v24.8h
|
||||
sqxtun v1.8b, v25.8h
|
||||
sqxtun v2.8b, v26.8h
|
||||
sqxtun v3.8b, v27.8h
|
||||
sub x0, x0, x2, lsl #3
|
||||
st1 {v0.8B}, [x0], x2
|
||||
sqxtun v4.8B, v28.8H
|
||||
st1 {v1.8B}, [x0], x2
|
||||
sqxtun v5.8B, v29.8H
|
||||
st1 {v2.8B}, [x0], x2
|
||||
sqxtun v6.8B, v30.8H
|
||||
st1 {v3.8B}, [x0], x2
|
||||
sqxtun v7.8B, v31.8H
|
||||
st1 {v4.8B}, [x0], x2
|
||||
st1 {v5.8B}, [x0], x2
|
||||
st1 {v6.8B}, [x0], x2
|
||||
st1 {v7.8B}, [x0], x2
|
||||
st1 {v0.8b}, [x0], x2
|
||||
sqxtun v4.8b, v28.8h
|
||||
st1 {v1.8b}, [x0], x2
|
||||
sqxtun v5.8b, v29.8h
|
||||
st1 {v2.8b}, [x0], x2
|
||||
sqxtun v6.8b, v30.8h
|
||||
st1 {v3.8b}, [x0], x2
|
||||
sqxtun v7.8b, v31.8h
|
||||
st1 {v4.8b}, [x0], x2
|
||||
st1 {v5.8b}, [x0], x2
|
||||
st1 {v6.8b}, [x0], x2
|
||||
st1 {v7.8b}, [x0], x2
|
||||
ret
|
||||
endfunc
|
||||
|
||||
|
@ -27,127 +27,127 @@
.macro lowpass_const r
movz \r, #20, lsl #16
movk \r, #5
mov v6.S[0], \r
mov v6.s[0], \r
.endm

//trashes v0-v5
.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
ext v2.8B, \r0\().8B, \r1\().8B, #2
ext v3.8B, \r0\().8B, \r1\().8B, #3
uaddl v2.8H, v2.8B, v3.8B
ext v4.8B, \r0\().8B, \r1\().8B, #1
ext v5.8B, \r0\().8B, \r1\().8B, #4
uaddl v4.8H, v4.8B, v5.8B
ext v1.8B, \r0\().8B, \r1\().8B, #5
uaddl \d0\().8H, \r0\().8B, v1.8B
ext v0.8B, \r2\().8B, \r3\().8B, #2
mla \d0\().8H, v2.8H, v6.H[1]
ext v1.8B, \r2\().8B, \r3\().8B, #3
uaddl v0.8H, v0.8B, v1.8B
ext v1.8B, \r2\().8B, \r3\().8B, #1
mls \d0\().8H, v4.8H, v6.H[0]
ext v3.8B, \r2\().8B, \r3\().8B, #4
uaddl v1.8H, v1.8B, v3.8B
ext v2.8B, \r2\().8B, \r3\().8B, #5
uaddl \d1\().8H, \r2\().8B, v2.8B
mla \d1\().8H, v0.8H, v6.H[1]
mls \d1\().8H, v1.8H, v6.H[0]
ext v2.8b, \r0\().8b, \r1\().8b, #2
ext v3.8b, \r0\().8b, \r1\().8b, #3
uaddl v2.8h, v2.8b, v3.8b
ext v4.8b, \r0\().8b, \r1\().8b, #1
ext v5.8b, \r0\().8b, \r1\().8b, #4
uaddl v4.8h, v4.8b, v5.8b
ext v1.8b, \r0\().8b, \r1\().8b, #5
uaddl \d0\().8h, \r0\().8b, v1.8b
ext v0.8b, \r2\().8b, \r3\().8b, #2
mla \d0\().8h, v2.8h, v6.h[1]
ext v1.8b, \r2\().8b, \r3\().8b, #3
uaddl v0.8h, v0.8b, v1.8b
ext v1.8b, \r2\().8b, \r3\().8b, #1
mls \d0\().8h, v4.8h, v6.h[0]
ext v3.8b, \r2\().8b, \r3\().8b, #4
uaddl v1.8h, v1.8b, v3.8b
ext v2.8b, \r2\().8b, \r3\().8b, #5
uaddl \d1\().8h, \r2\().8b, v2.8b
mla \d1\().8h, v0.8h, v6.h[1]
mls \d1\().8h, v1.8h, v6.h[0]
.if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5
sqrshrun \d1\().8B, \d1\().8H, #5
sqrshrun \d0\().8b, \d0\().8h, #5
sqrshrun \d1\().8b, \d1\().8h, #5
.endif
.endm

//trashes v0-v4
.macro lowpass_8_v r0, r1, r2, r3, r4, r5, r6, d0, d1, narrow=1
uaddl v2.8H, \r2\().8B, \r3\().8B
uaddl v0.8H, \r3\().8B, \r4\().8B
uaddl v4.8H, \r1\().8B, \r4\().8B
uaddl v1.8H, \r2\().8B, \r5\().8B
uaddl \d0\().8H, \r0\().8B, \r5\().8B
uaddl \d1\().8H, \r1\().8B, \r6\().8B
mla \d0\().8H, v2.8H, v6.H[1]
mls \d0\().8H, v4.8H, v6.H[0]
mla \d1\().8H, v0.8H, v6.H[1]
mls \d1\().8H, v1.8H, v6.H[0]
uaddl v2.8h, \r2\().8b, \r3\().8b
uaddl v0.8h, \r3\().8b, \r4\().8b
uaddl v4.8h, \r1\().8b, \r4\().8b
uaddl v1.8h, \r2\().8b, \r5\().8b
uaddl \d0\().8h, \r0\().8b, \r5\().8b
uaddl \d1\().8h, \r1\().8b, \r6\().8b
mla \d0\().8h, v2.8h, v6.h[1]
mls \d0\().8h, v4.8h, v6.h[0]
mla \d1\().8h, v0.8h, v6.h[1]
mls \d1\().8h, v1.8h, v6.h[0]
.if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5
sqrshrun \d1\().8B, \d1\().8H, #5
sqrshrun \d0\().8b, \d0\().8h, #5
sqrshrun \d1\().8b, \d1\().8h, #5
.endif
.endm

//trashes v0-v5, v7, v30-v31
.macro lowpass_8H r0, r1
ext v0.16B, \r0\().16B, \r0\().16B, #2
ext v1.16B, \r0\().16B, \r0\().16B, #3
uaddl v0.8H, v0.8B, v1.8B
ext v2.16B, \r0\().16B, \r0\().16B, #1
ext v3.16B, \r0\().16B, \r0\().16B, #4
uaddl v2.8H, v2.8B, v3.8B
ext v30.16B, \r0\().16B, \r0\().16B, #5
uaddl \r0\().8H, \r0\().8B, v30.8B
ext v4.16B, \r1\().16B, \r1\().16B, #2
mla \r0\().8H, v0.8H, v6.H[1]
ext v5.16B, \r1\().16B, \r1\().16B, #3
uaddl v4.8H, v4.8B, v5.8B
ext v7.16B, \r1\().16B, \r1\().16B, #1
mls \r0\().8H, v2.8H, v6.H[0]
ext v0.16B, \r1\().16B, \r1\().16B, #4
uaddl v7.8H, v7.8B, v0.8B
ext v31.16B, \r1\().16B, \r1\().16B, #5
uaddl \r1\().8H, \r1\().8B, v31.8B
mla \r1\().8H, v4.8H, v6.H[1]
mls \r1\().8H, v7.8H, v6.H[0]
ext v0.16b, \r0\().16b, \r0\().16b, #2
ext v1.16b, \r0\().16b, \r0\().16b, #3
uaddl v0.8h, v0.8b, v1.8b
ext v2.16b, \r0\().16b, \r0\().16b, #1
ext v3.16b, \r0\().16b, \r0\().16b, #4
uaddl v2.8h, v2.8b, v3.8b
ext v30.16b, \r0\().16b, \r0\().16b, #5
uaddl \r0\().8h, \r0\().8b, v30.8b
ext v4.16b, \r1\().16b, \r1\().16b, #2
mla \r0\().8h, v0.8h, v6.h[1]
ext v5.16b, \r1\().16b, \r1\().16b, #3
uaddl v4.8h, v4.8b, v5.8b
ext v7.16b, \r1\().16b, \r1\().16b, #1
mls \r0\().8h, v2.8h, v6.h[0]
ext v0.16b, \r1\().16b, \r1\().16b, #4
uaddl v7.8h, v7.8b, v0.8b
ext v31.16b, \r1\().16b, \r1\().16b, #5
uaddl \r1\().8h, \r1\().8b, v31.8b
mla \r1\().8h, v4.8h, v6.h[1]
mls \r1\().8h, v7.8h, v6.h[0]
.endm

// trashes v2-v5, v30
.macro lowpass_8_1 r0, r1, d0, narrow=1
ext v2.8B, \r0\().8B, \r1\().8B, #2
ext v3.8B, \r0\().8B, \r1\().8B, #3
uaddl v2.8H, v2.8B, v3.8B
ext v4.8B, \r0\().8B, \r1\().8B, #1
ext v5.8B, \r0\().8B, \r1\().8B, #4
uaddl v4.8H, v4.8B, v5.8B
ext v30.8B, \r0\().8B, \r1\().8B, #5
uaddl \d0\().8H, \r0\().8B, v30.8B
mla \d0\().8H, v2.8H, v6.H[1]
mls \d0\().8H, v4.8H, v6.H[0]
ext v2.8b, \r0\().8b, \r1\().8b, #2
ext v3.8b, \r0\().8b, \r1\().8b, #3
uaddl v2.8h, v2.8b, v3.8b
ext v4.8b, \r0\().8b, \r1\().8b, #1
ext v5.8b, \r0\().8b, \r1\().8b, #4
uaddl v4.8h, v4.8b, v5.8b
ext v30.8b, \r0\().8b, \r1\().8b, #5
uaddl \d0\().8h, \r0\().8b, v30.8b
mla \d0\().8h, v2.8h, v6.h[1]
mls \d0\().8h, v4.8h, v6.h[0]
.if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5
sqrshrun \d0\().8b, \d0\().8h, #5
.endif
.endm

// trashed v0-v7
.macro lowpass_8.16 r0, r1, r2, r3, r4, r5
saddl v5.4S, \r2\().4H, \r3\().4H
saddl2 v1.4S, \r2\().8H, \r3\().8H
saddl v6.4S, \r1\().4H, \r4\().4H
saddl2 v2.4S, \r1\().8H, \r4\().8H
saddl v0.4S, \r0\().4H, \r5\().4H
saddl2 v4.4S, \r0\().8H, \r5\().8H
saddl v5.4s, \r2\().4h, \r3\().4h
saddl2 v1.4s, \r2\().8h, \r3\().8h
saddl v6.4s, \r1\().4h, \r4\().4h
saddl2 v2.4s, \r1\().8h, \r4\().8h
saddl v0.4s, \r0\().4h, \r5\().4h
saddl2 v4.4s, \r0\().8h, \r5\().8h

shl v3.4S, v5.4S, #4
shl v5.4S, v5.4S, #2
shl v7.4S, v6.4S, #2
add v5.4S, v5.4S, v3.4S
add v6.4S, v6.4S, v7.4S
shl v3.4s, v5.4s, #4
shl v5.4s, v5.4s, #2
shl v7.4s, v6.4s, #2
add v5.4s, v5.4s, v3.4s
add v6.4s, v6.4s, v7.4s

shl v3.4S, v1.4S, #4
shl v1.4S, v1.4S, #2
shl v7.4S, v2.4S, #2
add v1.4S, v1.4S, v3.4S
add v2.4S, v2.4S, v7.4S
shl v3.4s, v1.4s, #4
shl v1.4s, v1.4s, #2
shl v7.4s, v2.4s, #2
add v1.4s, v1.4s, v3.4s
add v2.4s, v2.4s, v7.4s

add v5.4S, v5.4S, v0.4S
sub v5.4S, v5.4S, v6.4S
add v5.4s, v5.4s, v0.4s
sub v5.4s, v5.4s, v6.4s

add v1.4S, v1.4S, v4.4S
sub v1.4S, v1.4S, v2.4S
add v1.4s, v1.4s, v4.4s
sub v1.4s, v1.4s, v2.4s

rshrn v5.4H, v5.4S, #10
rshrn2 v5.8H, v1.4S, #10
rshrn v5.4h, v5.4s, #10
rshrn2 v5.8h, v1.4s, #10

sqxtun \r0\().8B, v5.8H
sqxtun \r0\().8b, v5.8h
.endm

function put_h264_qpel16_h_lowpass_neon_packed
@ -176,19 +176,19 @@ function \type\()_h264_qpel16_h_lowpass_neon
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1: ld1 {v28.8B, v29.8B}, [x1], x2
ld1 {v16.8B, v17.8B}, [x1], x2
1: ld1 {v28.8b, v29.8b}, [x1], x2
ld1 {v16.8b, v17.8b}, [x1], x2
subs x12, x12, #2
lowpass_8 v28, v29, v16, v17, v28, v16
.ifc \type,avg
ld1 {v2.8B}, [x0], x3
ld1 {v3.8B}, [x0]
urhadd v28.8B, v28.8B, v2.8B
urhadd v16.8B, v16.8B, v3.8B
ld1 {v2.8b}, [x0], x3
ld1 {v3.8b}, [x0]
urhadd v28.8b, v28.8b, v2.8b
urhadd v16.8b, v16.8b, v3.8b
sub x0, x0, x3
.endif
st1 {v28.8B}, [x0], x3
st1 {v16.8B}, [x0], x3
st1 {v28.8b}, [x0], x3
st1 {v16.8b}, [x0], x3
b.ne 1b
ret
endfunc
@ -213,23 +213,23 @@ function \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1: ld1 {v26.8B, v27.8B}, [x1], x2
ld1 {v16.8B, v17.8B}, [x1], x2
ld1 {v28.8B}, [x3], x2
ld1 {v29.8B}, [x3], x2
1: ld1 {v26.8b, v27.8b}, [x1], x2
ld1 {v16.8b, v17.8b}, [x1], x2
ld1 {v28.8b}, [x3], x2
ld1 {v29.8b}, [x3], x2
subs x12, x12, #2
lowpass_8 v26, v27, v16, v17, v26, v27
urhadd v26.8B, v26.8B, v28.8B
urhadd v27.8B, v27.8B, v29.8B
urhadd v26.8b, v26.8b, v28.8b
urhadd v27.8b, v27.8b, v29.8b
.ifc \type,avg
ld1 {v2.8B}, [x0], x2
ld1 {v3.8B}, [x0]
urhadd v26.8B, v26.8B, v2.8B
urhadd v27.8B, v27.8B, v3.8B
ld1 {v2.8b}, [x0], x2
ld1 {v3.8b}, [x0]
urhadd v26.8b, v26.8b, v2.8b
urhadd v27.8b, v27.8b, v3.8b
sub x0, x0, x2
.endif
st1 {v26.8B}, [x0], x2
st1 {v27.8B}, [x0], x2
st1 {v26.8b}, [x0], x2
st1 {v27.8b}, [x0], x2
b.ne 1b
ret
endfunc
@ -270,52 +270,52 @@ function \type\()_h264_qpel16_v_lowpass_neon
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
ld1 {v16.8B}, [x1], x3
ld1 {v17.8B}, [x1], x3
ld1 {v18.8B}, [x1], x3
ld1 {v19.8B}, [x1], x3
ld1 {v20.8B}, [x1], x3
ld1 {v21.8B}, [x1], x3
ld1 {v22.8B}, [x1], x3
ld1 {v23.8B}, [x1], x3
ld1 {v24.8B}, [x1], x3
ld1 {v25.8B}, [x1], x3
ld1 {v26.8B}, [x1], x3
ld1 {v27.8B}, [x1], x3
ld1 {v28.8B}, [x1]
ld1 {v16.8b}, [x1], x3
ld1 {v17.8b}, [x1], x3
ld1 {v18.8b}, [x1], x3
ld1 {v19.8b}, [x1], x3
ld1 {v20.8b}, [x1], x3
ld1 {v21.8b}, [x1], x3
ld1 {v22.8b}, [x1], x3
ld1 {v23.8b}, [x1], x3
ld1 {v24.8b}, [x1], x3
ld1 {v25.8b}, [x1], x3
ld1 {v26.8b}, [x1], x3
ld1 {v27.8b}, [x1], x3
ld1 {v28.8b}, [x1]

lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23
.ifc \type,avg
ld1 {v24.8B}, [x0], x2
ld1 {v25.8B}, [x0], x2
ld1 {v26.8B}, [x0], x2
urhadd v16.8B, v16.8B, v24.8B
ld1 {v27.8B}, [x0], x2
urhadd v17.8B, v17.8B, v25.8B
ld1 {v28.8B}, [x0], x2
urhadd v18.8B, v18.8B, v26.8B
ld1 {v29.8B}, [x0], x2
urhadd v19.8B, v19.8B, v27.8B
ld1 {v30.8B}, [x0], x2
urhadd v20.8B, v20.8B, v28.8B
ld1 {v31.8B}, [x0], x2
urhadd v21.8B, v21.8B, v29.8B
urhadd v22.8B, v22.8B, v30.8B
urhadd v23.8B, v23.8B, v31.8B
ld1 {v24.8b}, [x0], x2
ld1 {v25.8b}, [x0], x2
ld1 {v26.8b}, [x0], x2
urhadd v16.8b, v16.8b, v24.8b
ld1 {v27.8b}, [x0], x2
urhadd v17.8b, v17.8b, v25.8b
ld1 {v28.8b}, [x0], x2
urhadd v18.8b, v18.8b, v26.8b
ld1 {v29.8b}, [x0], x2
urhadd v19.8b, v19.8b, v27.8b
ld1 {v30.8b}, [x0], x2
urhadd v20.8b, v20.8b, v28.8b
ld1 {v31.8b}, [x0], x2
urhadd v21.8b, v21.8b, v29.8b
urhadd v22.8b, v22.8b, v30.8b
urhadd v23.8b, v23.8b, v31.8b
sub x0, x0, x2, lsl #3
.endif

st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
st1 {v18.8B}, [x0], x2
st1 {v19.8B}, [x0], x2
st1 {v20.8B}, [x0], x2
st1 {v21.8B}, [x0], x2
st1 {v22.8B}, [x0], x2
st1 {v23.8B}, [x0], x2
st1 {v16.8b}, [x0], x2
st1 {v17.8b}, [x0], x2
st1 {v18.8b}, [x0], x2
st1 {v19.8b}, [x0], x2
st1 {v20.8b}, [x0], x2
st1 {v21.8b}, [x0], x2
st1 {v22.8b}, [x0], x2
st1 {v23.8b}, [x0], x2

ret
endfunc
@ -343,70 +343,70 @@ function \type\()_h264_qpel16_v_lowpass_l2_neon
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
ld1 {v16.8B}, [x1], x3
ld1 {v17.8B}, [x1], x3
ld1 {v18.8B}, [x1], x3
ld1 {v19.8B}, [x1], x3
ld1 {v20.8B}, [x1], x3
ld1 {v21.8B}, [x1], x3
ld1 {v22.8B}, [x1], x3
ld1 {v23.8B}, [x1], x3
ld1 {v24.8B}, [x1], x3
ld1 {v25.8B}, [x1], x3
ld1 {v26.8B}, [x1], x3
ld1 {v27.8B}, [x1], x3
ld1 {v28.8B}, [x1]
ld1 {v16.8b}, [x1], x3
ld1 {v17.8b}, [x1], x3
ld1 {v18.8b}, [x1], x3
ld1 {v19.8b}, [x1], x3
ld1 {v20.8b}, [x1], x3
ld1 {v21.8b}, [x1], x3
ld1 {v22.8b}, [x1], x3
ld1 {v23.8b}, [x1], x3
ld1 {v24.8b}, [x1], x3
ld1 {v25.8b}, [x1], x3
ld1 {v26.8b}, [x1], x3
ld1 {v27.8b}, [x1], x3
ld1 {v28.8b}, [x1]

lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23

ld1 {v24.8B}, [x12], x2
ld1 {v25.8B}, [x12], x2
ld1 {v26.8B}, [x12], x2
ld1 {v27.8B}, [x12], x2
ld1 {v28.8B}, [x12], x2
urhadd v16.8B, v24.8B, v16.8B
urhadd v17.8B, v25.8B, v17.8B
ld1 {v29.8B}, [x12], x2
urhadd v18.8B, v26.8B, v18.8B
urhadd v19.8B, v27.8B, v19.8B
ld1 {v30.8B}, [x12], x2
urhadd v20.8B, v28.8B, v20.8B
urhadd v21.8B, v29.8B, v21.8B
ld1 {v31.8B}, [x12], x2
urhadd v22.8B, v30.8B, v22.8B
urhadd v23.8B, v31.8B, v23.8B
ld1 {v24.8b}, [x12], x2
ld1 {v25.8b}, [x12], x2
ld1 {v26.8b}, [x12], x2
ld1 {v27.8b}, [x12], x2
ld1 {v28.8b}, [x12], x2
urhadd v16.8b, v24.8b, v16.8b
urhadd v17.8b, v25.8b, v17.8b
ld1 {v29.8b}, [x12], x2
urhadd v18.8b, v26.8b, v18.8b
urhadd v19.8b, v27.8b, v19.8b
ld1 {v30.8b}, [x12], x2
urhadd v20.8b, v28.8b, v20.8b
urhadd v21.8b, v29.8b, v21.8b
ld1 {v31.8b}, [x12], x2
urhadd v22.8b, v30.8b, v22.8b
urhadd v23.8b, v31.8b, v23.8b

.ifc \type,avg
ld1 {v24.8B}, [x0], x3
ld1 {v25.8B}, [x0], x3
ld1 {v26.8B}, [x0], x3
urhadd v16.8B, v16.8B, v24.8B
ld1 {v27.8B}, [x0], x3
urhadd v17.8B, v17.8B, v25.8B
ld1 {v28.8B}, [x0], x3
urhadd v18.8B, v18.8B, v26.8B
ld1 {v29.8B}, [x0], x3
urhadd v19.8B, v19.8B, v27.8B
ld1 {v30.8B}, [x0], x3
urhadd v20.8B, v20.8B, v28.8B
ld1 {v31.8B}, [x0], x3
urhadd v21.8B, v21.8B, v29.8B
urhadd v22.8B, v22.8B, v30.8B
urhadd v23.8B, v23.8B, v31.8B
ld1 {v24.8b}, [x0], x3
ld1 {v25.8b}, [x0], x3
ld1 {v26.8b}, [x0], x3
urhadd v16.8b, v16.8b, v24.8b
ld1 {v27.8b}, [x0], x3
urhadd v17.8b, v17.8b, v25.8b
ld1 {v28.8b}, [x0], x3
urhadd v18.8b, v18.8b, v26.8b
ld1 {v29.8b}, [x0], x3
urhadd v19.8b, v19.8b, v27.8b
ld1 {v30.8b}, [x0], x3
urhadd v20.8b, v20.8b, v28.8b
ld1 {v31.8b}, [x0], x3
urhadd v21.8b, v21.8b, v29.8b
urhadd v22.8b, v22.8b, v30.8b
urhadd v23.8b, v23.8b, v31.8b
sub x0, x0, x3, lsl #3
.endif

st1 {v16.8B}, [x0], x3
st1 {v17.8B}, [x0], x3
st1 {v18.8B}, [x0], x3
st1 {v19.8B}, [x0], x3
st1 {v20.8B}, [x0], x3
st1 {v21.8B}, [x0], x3
st1 {v22.8B}, [x0], x3
st1 {v23.8B}, [x0], x3
st1 {v16.8b}, [x0], x3
st1 {v17.8b}, [x0], x3
st1 {v18.8b}, [x0], x3
st1 {v19.8b}, [x0], x3
st1 {v20.8b}, [x0], x3
st1 {v21.8b}, [x0], x3
st1 {v22.8b}, [x0], x3
st1 {v23.8b}, [x0], x3

ret
endfunc
@ -417,19 +417,19 @@ endfunc

function put_h264_qpel8_hv_lowpass_neon_top
lowpass_const w12
ld1 {v16.8H}, [x1], x3
ld1 {v17.8H}, [x1], x3
ld1 {v18.8H}, [x1], x3
ld1 {v19.8H}, [x1], x3
ld1 {v20.8H}, [x1], x3
ld1 {v21.8H}, [x1], x3
ld1 {v22.8H}, [x1], x3
ld1 {v23.8H}, [x1], x3
ld1 {v24.8H}, [x1], x3
ld1 {v25.8H}, [x1], x3
ld1 {v26.8H}, [x1], x3
ld1 {v27.8H}, [x1], x3
ld1 {v28.8H}, [x1]
ld1 {v16.8h}, [x1], x3
ld1 {v17.8h}, [x1], x3
ld1 {v18.8h}, [x1], x3
ld1 {v19.8h}, [x1], x3
ld1 {v20.8h}, [x1], x3
ld1 {v21.8h}, [x1], x3
ld1 {v22.8h}, [x1], x3
ld1 {v23.8h}, [x1], x3
ld1 {v24.8h}, [x1], x3
ld1 {v25.8h}, [x1], x3
ld1 {v26.8h}, [x1], x3
ld1 {v27.8h}, [x1], x3
ld1 {v28.8h}, [x1]
lowpass_8H v16, v17
lowpass_8H v18, v19
lowpass_8H v20, v21
@ -458,33 +458,33 @@ function \type\()_h264_qpel8_hv_lowpass_neon
mov x10, x30
bl put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
ld1 {v0.8B}, [x0], x2
ld1 {v1.8B}, [x0], x2
ld1 {v2.8B}, [x0], x2
urhadd v16.8B, v16.8B, v0.8B
ld1 {v3.8B}, [x0], x2
urhadd v17.8B, v17.8B, v1.8B
ld1 {v4.8B}, [x0], x2
urhadd v18.8B, v18.8B, v2.8B
ld1 {v5.8B}, [x0], x2
urhadd v19.8B, v19.8B, v3.8B
ld1 {v6.8B}, [x0], x2
urhadd v20.8B, v20.8B, v4.8B
ld1 {v7.8B}, [x0], x2
urhadd v21.8B, v21.8B, v5.8B
urhadd v22.8B, v22.8B, v6.8B
urhadd v23.8B, v23.8B, v7.8B
ld1 {v0.8b}, [x0], x2
ld1 {v1.8b}, [x0], x2
ld1 {v2.8b}, [x0], x2
urhadd v16.8b, v16.8b, v0.8b
ld1 {v3.8b}, [x0], x2
urhadd v17.8b, v17.8b, v1.8b
ld1 {v4.8b}, [x0], x2
urhadd v18.8b, v18.8b, v2.8b
ld1 {v5.8b}, [x0], x2
urhadd v19.8b, v19.8b, v3.8b
ld1 {v6.8b}, [x0], x2
urhadd v20.8b, v20.8b, v4.8b
ld1 {v7.8b}, [x0], x2
urhadd v21.8b, v21.8b, v5.8b
urhadd v22.8b, v22.8b, v6.8b
urhadd v23.8b, v23.8b, v7.8b
sub x0, x0, x2, lsl #3
.endif

st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
st1 {v18.8B}, [x0], x2
st1 {v19.8B}, [x0], x2
st1 {v20.8B}, [x0], x2
st1 {v21.8B}, [x0], x2
st1 {v22.8B}, [x0], x2
st1 {v23.8B}, [x0], x2
st1 {v16.8b}, [x0], x2
st1 {v17.8b}, [x0], x2
st1 {v18.8b}, [x0], x2
st1 {v19.8b}, [x0], x2
st1 {v20.8b}, [x0], x2
st1 {v21.8b}, [x0], x2
st1 {v22.8b}, [x0], x2
st1 {v23.8b}, [x0], x2

ret x10
endfunc
@ -498,45 +498,45 @@ function \type\()_h264_qpel8_hv_lowpass_l2_neon
mov x10, x30
bl put_h264_qpel8_hv_lowpass_neon_top

ld1 {v0.8B, v1.8B}, [x2], #16
ld1 {v2.8B, v3.8B}, [x2], #16
urhadd v0.8B, v0.8B, v16.8B
urhadd v1.8B, v1.8B, v17.8B
ld1 {v4.8B, v5.8B}, [x2], #16
urhadd v2.8B, v2.8B, v18.8B
urhadd v3.8B, v3.8B, v19.8B
ld1 {v6.8B, v7.8B}, [x2], #16
urhadd v4.8B, v4.8B, v20.8B
urhadd v5.8B, v5.8B, v21.8B
urhadd v6.8B, v6.8B, v22.8B
urhadd v7.8B, v7.8B, v23.8B
ld1 {v0.8b, v1.8b}, [x2], #16
ld1 {v2.8b, v3.8b}, [x2], #16
urhadd v0.8b, v0.8b, v16.8b
urhadd v1.8b, v1.8b, v17.8b
ld1 {v4.8b, v5.8b}, [x2], #16
urhadd v2.8b, v2.8b, v18.8b
urhadd v3.8b, v3.8b, v19.8b
ld1 {v6.8b, v7.8b}, [x2], #16
urhadd v4.8b, v4.8b, v20.8b
urhadd v5.8b, v5.8b, v21.8b
urhadd v6.8b, v6.8b, v22.8b
urhadd v7.8b, v7.8b, v23.8b
.ifc \type,avg
ld1 {v16.8B}, [x0], x3
ld1 {v17.8B}, [x0], x3
ld1 {v18.8B}, [x0], x3
urhadd v0.8B, v0.8B, v16.8B
ld1 {v19.8B}, [x0], x3
urhadd v1.8B, v1.8B, v17.8B
ld1 {v20.8B}, [x0], x3
urhadd v2.8B, v2.8B, v18.8B
ld1 {v21.8B}, [x0], x3
urhadd v3.8B, v3.8B, v19.8B
ld1 {v22.8B}, [x0], x3
urhadd v4.8B, v4.8B, v20.8B
ld1 {v23.8B}, [x0], x3
urhadd v5.8B, v5.8B, v21.8B
urhadd v6.8B, v6.8B, v22.8B
urhadd v7.8B, v7.8B, v23.8B
ld1 {v16.8b}, [x0], x3
ld1 {v17.8b}, [x0], x3
ld1 {v18.8b}, [x0], x3
urhadd v0.8b, v0.8b, v16.8b
ld1 {v19.8b}, [x0], x3
urhadd v1.8b, v1.8b, v17.8b
ld1 {v20.8b}, [x0], x3
urhadd v2.8b, v2.8b, v18.8b
ld1 {v21.8b}, [x0], x3
urhadd v3.8b, v3.8b, v19.8b
ld1 {v22.8b}, [x0], x3
urhadd v4.8b, v4.8b, v20.8b
ld1 {v23.8b}, [x0], x3
urhadd v5.8b, v5.8b, v21.8b
urhadd v6.8b, v6.8b, v22.8b
urhadd v7.8b, v7.8b, v23.8b
sub x0, x0, x3, lsl #3
.endif
st1 {v0.8B}, [x0], x3
st1 {v1.8B}, [x0], x3
st1 {v2.8B}, [x0], x3
st1 {v3.8B}, [x0], x3
st1 {v4.8B}, [x0], x3
st1 {v5.8B}, [x0], x3
st1 {v6.8B}, [x0], x3
st1 {v7.8B}, [x0], x3
st1 {v0.8b}, [x0], x3
st1 {v1.8b}, [x0], x3
st1 {v2.8b}, [x0], x3
st1 {v3.8b}, [x0], x3
st1 {v4.8b}, [x0], x3
st1 {v5.8b}, [x0], x3
st1 {v6.8b}, [x0], x3
st1 {v7.8b}, [x0], x3

ret x10
endfunc

@ -26,295 +26,295 @@
.if \avg
mov x12, x0
.endif
1: ld1 {v0.16B}, [x1], x2
ld1 {v1.16B}, [x1], x2
ld1 {v2.16B}, [x1], x2
ld1 {v3.16B}, [x1], x2
1: ld1 {v0.16b}, [x1], x2
ld1 {v1.16b}, [x1], x2
ld1 {v2.16b}, [x1], x2
ld1 {v3.16b}, [x1], x2
.if \avg
ld1 {v4.16B}, [x12], x2
urhadd v0.16B, v0.16B, v4.16B
ld1 {v5.16B}, [x12], x2
urhadd v1.16B, v1.16B, v5.16B
ld1 {v6.16B}, [x12], x2
urhadd v2.16B, v2.16B, v6.16B
ld1 {v7.16B}, [x12], x2
urhadd v3.16B, v3.16B, v7.16B
ld1 {v4.16b}, [x12], x2
urhadd v0.16b, v0.16b, v4.16b
ld1 {v5.16b}, [x12], x2
urhadd v1.16b, v1.16b, v5.16b
ld1 {v6.16b}, [x12], x2
urhadd v2.16b, v2.16b, v6.16b
ld1 {v7.16b}, [x12], x2
urhadd v3.16b, v3.16b, v7.16b
.endif
subs w3, w3, #4
st1 {v0.16B}, [x0], x2
st1 {v1.16B}, [x0], x2
st1 {v2.16B}, [x0], x2
st1 {v3.16B}, [x0], x2
st1 {v0.16b}, [x0], x2
st1 {v1.16b}, [x0], x2
st1 {v2.16b}, [x0], x2
st1 {v3.16b}, [x0], x2
b.ne 1b
ret
.endm

.macro pixels16_x2 rnd=1, avg=0
1: ld1 {v0.16B, v1.16B}, [x1], x2
ld1 {v2.16B, v3.16B}, [x1], x2
1: ld1 {v0.16b, v1.16b}, [x1], x2
ld1 {v2.16b, v3.16b}, [x1], x2
subs w3, w3, #2
ext v1.16B, v0.16B, v1.16B, #1
avg v0.16B, v0.16B, v1.16B
ext v3.16B, v2.16B, v3.16B, #1
avg v2.16B, v2.16B, v3.16B
ext v1.16b, v0.16b, v1.16b, #1
avg v0.16b, v0.16b, v1.16b
ext v3.16b, v2.16b, v3.16b, #1
avg v2.16b, v2.16b, v3.16b
.if \avg
ld1 {v1.16B}, [x0], x2
ld1 {v3.16B}, [x0]
urhadd v0.16B, v0.16B, v1.16B
urhadd v2.16B, v2.16B, v3.16B
ld1 {v1.16b}, [x0], x2
ld1 {v3.16b}, [x0]
urhadd v0.16b, v0.16b, v1.16b
urhadd v2.16b, v2.16b, v3.16b
sub x0, x0, x2
.endif
st1 {v0.16B}, [x0], x2
st1 {v2.16B}, [x0], x2
st1 {v0.16b}, [x0], x2
st1 {v2.16b}, [x0], x2
b.ne 1b
ret
.endm

.macro pixels16_y2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.16B}, [x1], x2
ld1 {v1.16B}, [x1], x2
ld1 {v0.16b}, [x1], x2
ld1 {v1.16b}, [x1], x2
1: subs w3, w3, #2
avg v2.16B, v0.16B, v1.16B
ld1 {v0.16B}, [x1], x2
avg v3.16B, v0.16B, v1.16B
ld1 {v1.16B}, [x1], x2
avg v2.16b, v0.16b, v1.16b
ld1 {v0.16b}, [x1], x2
avg v3.16b, v0.16b, v1.16b
ld1 {v1.16b}, [x1], x2
.if \avg
ld1 {v4.16B}, [x0], x2
ld1 {v5.16B}, [x0]
urhadd v2.16B, v2.16B, v4.16B
urhadd v3.16B, v3.16B, v5.16B
ld1 {v4.16b}, [x0], x2
ld1 {v5.16b}, [x0]
urhadd v2.16b, v2.16b, v4.16b
urhadd v3.16b, v3.16b, v5.16b
sub x0, x0, x2
.endif
st1 {v2.16B}, [x0], x2
st1 {v3.16B}, [x0], x2
st1 {v2.16b}, [x0], x2
st1 {v3.16b}, [x0], x2
b.ne 1b

avg v2.16B, v0.16B, v1.16B
ld1 {v0.16B}, [x1], x2
avg v3.16B, v0.16B, v1.16B
avg v2.16b, v0.16b, v1.16b
ld1 {v0.16b}, [x1], x2
avg v3.16b, v0.16b, v1.16b
.if \avg
ld1 {v4.16B}, [x0], x2
ld1 {v5.16B}, [x0]
urhadd v2.16B, v2.16B, v4.16B
urhadd v3.16B, v3.16B, v5.16B
ld1 {v4.16b}, [x0], x2
ld1 {v5.16b}, [x0]
urhadd v2.16b, v2.16b, v4.16b
urhadd v3.16b, v3.16b, v5.16b
sub x0, x0, x2
.endif
st1 {v2.16B}, [x0], x2
st1 {v3.16B}, [x0], x2
st1 {v2.16b}, [x0], x2
st1 {v3.16b}, [x0], x2

ret
.endm

.macro pixels16_xy2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.16B, v1.16B}, [x1], x2
ld1 {v4.16B, v5.16B}, [x1], x2
ld1 {v0.16b, v1.16b}, [x1], x2
ld1 {v4.16b, v5.16b}, [x1], x2
NRND movi v26.8H, #1
ext v1.16B, v0.16B, v1.16B, #1
ext v5.16B, v4.16B, v5.16B, #1
uaddl v16.8H, v0.8B, v1.8B
uaddl2 v20.8H, v0.16B, v1.16B
uaddl v18.8H, v4.8B, v5.8B
uaddl2 v22.8H, v4.16B, v5.16B
ext v1.16b, v0.16b, v1.16b, #1
ext v5.16b, v4.16b, v5.16b, #1
uaddl v16.8h, v0.8b, v1.8b
uaddl2 v20.8h, v0.16b, v1.16b
uaddl v18.8h, v4.8b, v5.8b
uaddl2 v22.8h, v4.16b, v5.16b
1: subs w3, w3, #2
ld1 {v0.16B, v1.16B}, [x1], x2
add v24.8H, v16.8H, v18.8H
ld1 {v0.16b, v1.16b}, [x1], x2
add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
ext v30.16B, v0.16B, v1.16B, #1
add v1.8H, v20.8H, v22.8H
mshrn v28.8B, v24.8H, #2
ext v30.16b, v0.16b, v1.16b, #1
add v1.8h, v20.8h, v22.8h
mshrn v28.8b, v24.8h, #2
NRND add v1.8H, v1.8H, v26.8H
mshrn2 v28.16B, v1.8H, #2
mshrn2 v28.16b, v1.8h, #2
.if \avg
ld1 {v16.16B}, [x0]
urhadd v28.16B, v28.16B, v16.16B
ld1 {v16.16b}, [x0]
urhadd v28.16b, v28.16b, v16.16b
.endif
uaddl v16.8H, v0.8B, v30.8B
ld1 {v2.16B, v3.16B}, [x1], x2
uaddl2 v20.8H, v0.16B, v30.16B
st1 {v28.16B}, [x0], x2
add v24.8H, v16.8H, v18.8H
uaddl v16.8h, v0.8b, v30.8b
ld1 {v2.16b, v3.16b}, [x1], x2
uaddl2 v20.8h, v0.16b, v30.16b
st1 {v28.16b}, [x0], x2
add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
ext v3.16B, v2.16B, v3.16B, #1
add v0.8H, v20.8H, v22.8H
mshrn v30.8B, v24.8H, #2
ext v3.16b, v2.16b, v3.16b, #1
add v0.8h, v20.8h, v22.8h
mshrn v30.8b, v24.8h, #2
NRND add v0.8H, v0.8H, v26.8H
mshrn2 v30.16B, v0.8H, #2
mshrn2 v30.16b, v0.8h, #2
.if \avg
ld1 {v18.16B}, [x0]
urhadd v30.16B, v30.16B, v18.16B
ld1 {v18.16b}, [x0]
urhadd v30.16b, v30.16b, v18.16b
.endif
uaddl v18.8H, v2.8B, v3.8B
uaddl2 v22.8H, v2.16B, v3.16B
st1 {v30.16B}, [x0], x2
uaddl v18.8h, v2.8b, v3.8b
uaddl2 v22.8h, v2.16b, v3.16b
st1 {v30.16b}, [x0], x2
b.gt 1b

ld1 {v0.16B, v1.16B}, [x1], x2
add v24.8H, v16.8H, v18.8H
ld1 {v0.16b, v1.16b}, [x1], x2
add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
ext v30.16B, v0.16B, v1.16B, #1
add v1.8H, v20.8H, v22.8H
mshrn v28.8B, v24.8H, #2
ext v30.16b, v0.16b, v1.16b, #1
add v1.8h, v20.8h, v22.8h
mshrn v28.8b, v24.8h, #2
NRND add v1.8H, v1.8H, v26.8H
mshrn2 v28.16B, v1.8H, #2
mshrn2 v28.16b, v1.8h, #2
.if \avg
ld1 {v16.16B}, [x0]
urhadd v28.16B, v28.16B, v16.16B
ld1 {v16.16b}, [x0]
urhadd v28.16b, v28.16b, v16.16b
.endif
uaddl v16.8H, v0.8B, v30.8B
uaddl2 v20.8H, v0.16B, v30.16B
st1 {v28.16B}, [x0], x2
add v24.8H, v16.8H, v18.8H
uaddl v16.8h, v0.8b, v30.8b
uaddl2 v20.8h, v0.16b, v30.16b
st1 {v28.16b}, [x0], x2
add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
add v0.8H, v20.8H, v22.8H
mshrn v30.8B, v24.8H, #2
add v0.8h, v20.8h, v22.8h
mshrn v30.8b, v24.8h, #2
NRND add v0.8H, v0.8H, v26.8H
mshrn2 v30.16B, v0.8H, #2
mshrn2 v30.16b, v0.8h, #2
.if \avg
ld1 {v18.16B}, [x0]
urhadd v30.16B, v30.16B, v18.16B
ld1 {v18.16b}, [x0]
urhadd v30.16b, v30.16b, v18.16b
.endif
st1 {v30.16B}, [x0], x2
st1 {v30.16b}, [x0], x2

ret
.endm

.macro pixels8 rnd=1, avg=0
1: ld1 {v0.8B}, [x1], x2
ld1 {v1.8B}, [x1], x2
ld1 {v2.8B}, [x1], x2
ld1 {v3.8B}, [x1], x2
1: ld1 {v0.8b}, [x1], x2
ld1 {v1.8b}, [x1], x2
ld1 {v2.8b}, [x1], x2
ld1 {v3.8b}, [x1], x2
.if \avg
ld1 {v4.8B}, [x0], x2
urhadd v0.8B, v0.8B, v4.8B
ld1 {v5.8B}, [x0], x2
urhadd v1.8B, v1.8B, v5.8B
ld1 {v6.8B}, [x0], x2
urhadd v2.8B, v2.8B, v6.8B
ld1 {v7.8B}, [x0], x2
urhadd v3.8B, v3.8B, v7.8B
ld1 {v4.8b}, [x0], x2
urhadd v0.8b, v0.8b, v4.8b
ld1 {v5.8b}, [x0], x2
urhadd v1.8b, v1.8b, v5.8b
ld1 {v6.8b}, [x0], x2
urhadd v2.8b, v2.8b, v6.8b
ld1 {v7.8b}, [x0], x2
urhadd v3.8b, v3.8b, v7.8b
sub x0, x0, x2, lsl #2
.endif
subs w3, w3, #4
st1 {v0.8B}, [x0], x2
st1 {v1.8B}, [x0], x2
st1 {v2.8B}, [x0], x2
st1 {v3.8B}, [x0], x2
st1 {v0.8b}, [x0], x2
st1 {v1.8b}, [x0], x2
st1 {v2.8b}, [x0], x2
st1 {v3.8b}, [x0], x2
b.ne 1b
ret
.endm

.macro pixels8_x2 rnd=1, avg=0
1: ld1 {v0.8B, v1.8B}, [x1], x2
ext v1.8B, v0.8B, v1.8B, #1
ld1 {v2.8B, v3.8B}, [x1], x2
ext v3.8B, v2.8B, v3.8B, #1
1: ld1 {v0.8b, v1.8b}, [x1], x2
ext v1.8b, v0.8b, v1.8b, #1
ld1 {v2.8b, v3.8b}, [x1], x2
ext v3.8b, v2.8b, v3.8b, #1
subs w3, w3, #2
avg v0.8B, v0.8B, v1.8B
avg v2.8B, v2.8B, v3.8B
avg v0.8b, v0.8b, v1.8b
avg v2.8b, v2.8b, v3.8b
.if \avg
ld1 {v4.8B}, [x0], x2
ld1 {v5.8B}, [x0]
urhadd v0.8B, v0.8B, v4.8B
urhadd v2.8B, v2.8B, v5.8B
ld1 {v4.8b}, [x0], x2
ld1 {v5.8b}, [x0]
urhadd v0.8b, v0.8b, v4.8b
urhadd v2.8b, v2.8b, v5.8b
sub x0, x0, x2
.endif
st1 {v0.8B}, [x0], x2
st1 {v2.8B}, [x0], x2
st1 {v0.8b}, [x0], x2
st1 {v2.8b}, [x0], x2
b.ne 1b
ret
.endm

.macro pixels8_y2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.8B}, [x1], x2
ld1 {v1.8B}, [x1], x2
ld1 {v0.8b}, [x1], x2
ld1 {v1.8b}, [x1], x2
1: subs w3, w3, #2
avg v4.8B, v0.8B, v1.8B
ld1 {v0.8B}, [x1], x2
avg v5.8B, v0.8B, v1.8B
ld1 {v1.8B}, [x1], x2
avg v4.8b, v0.8b, v1.8b
ld1 {v0.8b}, [x1], x2
avg v5.8b, v0.8b, v1.8b
ld1 {v1.8b}, [x1], x2
.if \avg
ld1 {v2.8B}, [x0], x2
ld1 {v3.8B}, [x0]
urhadd v4.8B, v4.8B, v2.8B
urhadd v5.8B, v5.8B, v3.8B
ld1 {v2.8b}, [x0], x2
ld1 {v3.8b}, [x0]
urhadd v4.8b, v4.8b, v2.8b
urhadd v5.8b, v5.8b, v3.8b
sub x0, x0, x2
.endif
st1 {v4.8B}, [x0], x2
st1 {v5.8B}, [x0], x2
st1 {v4.8b}, [x0], x2
st1 {v5.8b}, [x0], x2
b.ne 1b

avg v4.8B, v0.8B, v1.8B
ld1 {v0.8B}, [x1], x2
avg v5.8B, v0.8B, v1.8B
avg v4.8b, v0.8b, v1.8b
ld1 {v0.8b}, [x1], x2
avg v5.8b, v0.8b, v1.8b
.if \avg
ld1 {v2.8B}, [x0], x2
ld1 {v3.8B}, [x0]
urhadd v4.8B, v4.8B, v2.8B
urhadd v5.8B, v5.8B, v3.8B
ld1 {v2.8b}, [x0], x2
ld1 {v3.8b}, [x0]
urhadd v4.8b, v4.8b, v2.8b
urhadd v5.8b, v5.8b, v3.8b
sub x0, x0, x2
.endif
st1 {v4.8B}, [x0], x2
st1 {v5.8B}, [x0], x2
st1 {v4.8b}, [x0], x2
st1 {v5.8b}, [x0], x2

ret
.endm

.macro pixels8_xy2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.16B}, [x1], x2
ld1 {v1.16B}, [x1], x2
ld1 {v0.16b}, [x1], x2
ld1 {v1.16b}, [x1], x2
NRND movi v19.8H, #1
ext v4.16B, v0.16B, v4.16B, #1
ext v6.16B, v1.16B, v6.16B, #1
uaddl v16.8H, v0.8B, v4.8B
uaddl v17.8H, v1.8B, v6.8B
ext v4.16b, v0.16b, v4.16b, #1
ext v6.16b, v1.16b, v6.16b, #1
uaddl v16.8h, v0.8b, v4.8b
uaddl v17.8h, v1.8b, v6.8b
1: subs w3, w3, #2
ld1 {v0.16B}, [x1], x2
add v18.8H, v16.8H, v17.8H
ext v4.16B, v0.16B, v4.16B, #1
ld1 {v0.16b}, [x1], x2
add v18.8h, v16.8h, v17.8h
ext v4.16b, v0.16b, v4.16b, #1
NRND add v18.8H, v18.8H, v19.8H
uaddl v16.8H, v0.8B, v4.8B
mshrn v5.8B, v18.8H, #2
ld1 {v1.16B}, [x1], x2
add v18.8H, v16.8H, v17.8H
uaddl v16.8h, v0.8b, v4.8b
mshrn v5.8b, v18.8h, #2
ld1 {v1.16b}, [x1], x2
add v18.8h, v16.8h, v17.8h
.if \avg
ld1 {v7.8B}, [x0]
urhadd v5.8B, v5.8B, v7.8B
ld1 {v7.8b}, [x0]
urhadd v5.8b, v5.8b, v7.8b
.endif
NRND add v18.8H, v18.8H, v19.8H
st1 {v5.8B}, [x0], x2
mshrn v7.8B, v18.8H, #2
st1 {v5.8b}, [x0], x2
mshrn v7.8b, v18.8h, #2
.if \avg
ld1 {v5.8B}, [x0]
urhadd v7.8B, v7.8B, v5.8B
ld1 {v5.8b}, [x0]
urhadd v7.8b, v7.8b, v5.8b
.endif
ext v6.16B, v1.16B, v6.16B, #1
uaddl v17.8H, v1.8B, v6.8B
st1 {v7.8B}, [x0], x2
ext v6.16b, v1.16b, v6.16b, #1
uaddl v17.8h, v1.8b, v6.8b
st1 {v7.8b}, [x0], x2
b.gt 1b

ld1 {v0.16B}, [x1], x2
add v18.8H, v16.8H, v17.8H
ext v4.16B, v0.16B, v4.16B, #1
ld1 {v0.16b}, [x1], x2
add v18.8h, v16.8h, v17.8h
ext v4.16b, v0.16b, v4.16b, #1
NRND add v18.8H, v18.8H, v19.8H
uaddl v16.8H, v0.8B, v4.8B
mshrn v5.8B, v18.8H, #2
add v18.8H, v16.8H, v17.8H
uaddl v16.8h, v0.8b, v4.8b
mshrn v5.8b, v18.8h, #2
add v18.8h, v16.8h, v17.8h
.if \avg
ld1 {v7.8B}, [x0]
urhadd v5.8B, v5.8B, v7.8B
ld1 {v7.8b}, [x0]
urhadd v5.8b, v5.8b, v7.8b
.endif
NRND add v18.8H, v18.8H, v19.8H
st1 {v5.8B}, [x0], x2
mshrn v7.8B, v18.8H, #2
st1 {v5.8b}, [x0], x2
mshrn v7.8b, v18.8h, #2
.if \avg
ld1 {v5.8B}, [x0]
urhadd v7.8B, v7.8B, v5.8B
ld1 {v5.8b}, [x0]
urhadd v7.8b, v7.8b, v5.8b
.endif
st1 {v7.8B}, [x0], x2
st1 {v7.8b}, [x0], x2

ret
.endm

@ -1099,7 +1099,7 @@ function vsse_intra16_neon, export=1
cbnz w4, 2b

3:
add v16.4s, v16.4s, v17.4S
add v16.4s, v16.4s, v17.4s
uaddlv d17, v16.4s
fmov w0, s17

@ -28,146 +28,146 @@
.endm

.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8B, \r0\().8B, \r1\().8B
trn2 \r9\().8B, \r0\().8B, \r1\().8B
trn1 \r1\().8B, \r2\().8B, \r3\().8B
trn2 \r3\().8B, \r2\().8B, \r3\().8B
trn1 \r0\().8B, \r4\().8B, \r5\().8B
trn2 \r5\().8B, \r4\().8B, \r5\().8B
trn1 \r2\().8B, \r6\().8B, \r7\().8B
trn2 \r7\().8B, \r6\().8B, \r7\().8B
trn1 \r8\().8b, \r0\().8b, \r1\().8b
trn2 \r9\().8b, \r0\().8b, \r1\().8b
trn1 \r1\().8b, \r2\().8b, \r3\().8b
trn2 \r3\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().8b, \r4\().8b, \r5\().8b
trn2 \r5\().8b, \r4\().8b, \r5\().8b
trn1 \r2\().8b, \r6\().8b, \r7\().8b
trn2 \r7\().8b, \r6\().8b, \r7\().8b

trn1 \r4\().4H, \r0\().4H, \r2\().4H
trn2 \r2\().4H, \r0\().4H, \r2\().4H
trn1 \r6\().4H, \r5\().4H, \r7\().4H
trn2 \r7\().4H, \r5\().4H, \r7\().4H
trn1 \r5\().4H, \r9\().4H, \r3\().4H
trn2 \r9\().4H, \r9\().4H, \r3\().4H
trn1 \r3\().4H, \r8\().4H, \r1\().4H
trn2 \r8\().4H, \r8\().4H, \r1\().4H
trn1 \r4\().4h, \r0\().4h, \r2\().4h
trn2 \r2\().4h, \r0\().4h, \r2\().4h
trn1 \r6\().4h, \r5\().4h, \r7\().4h
trn2 \r7\().4h, \r5\().4h, \r7\().4h
trn1 \r5\().4h, \r9\().4h, \r3\().4h
trn2 \r9\().4h, \r9\().4h, \r3\().4h
trn1 \r3\().4h, \r8\().4h, \r1\().4h
trn2 \r8\().4h, \r8\().4h, \r1\().4h

trn1 \r0\().2S, \r3\().2S, \r4\().2S
trn2 \r4\().2S, \r3\().2S, \r4\().2S
trn1 \r0\().2s, \r3\().2s, \r4\().2s
trn2 \r4\().2s, \r3\().2s, \r4\().2s

trn1 \r1\().2S, \r5\().2S, \r6\().2S
trn2 \r5\().2S, \r5\().2S, \r6\().2S
trn1 \r1\().2s, \r5\().2s, \r6\().2s
trn2 \r5\().2s, \r5\().2s, \r6\().2s

trn2 \r6\().2S, \r8\().2S, \r2\().2S
trn1 \r2\().2S, \r8\().2S, \r2\().2S
trn2 \r6\().2s, \r8\().2s, \r2\().2s
trn1 \r2\().2s, \r8\().2s, \r2\().2s

trn1 \r3\().2S, \r9\().2S, \r7\().2S
trn2 \r7\().2S, \r9\().2S, \r7\().2S
trn1 \r3\().2s, \r9\().2s, \r7\().2s
trn2 \r7\().2s, \r9\().2s, \r7\().2s
.endm

.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
trn1 \t0\().16B, \r0\().16B, \r1\().16B
trn2 \t1\().16B, \r0\().16B, \r1\().16B
trn1 \r1\().16B, \r2\().16B, \r3\().16B
trn2 \r3\().16B, \r2\().16B, \r3\().16B
trn1 \r0\().16B, \r4\().16B, \r5\().16B
trn2 \r5\().16B, \r4\().16B, \r5\().16B
trn1 \r2\().16B, \r6\().16B, \r7\().16B
trn2 \r7\().16B, \r6\().16B, \r7\().16B
trn1 \t0\().16b, \r0\().16b, \r1\().16b
trn2 \t1\().16b, \r0\().16b, \r1\().16b
trn1 \r1\().16b, \r2\().16b, \r3\().16b
trn2 \r3\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().16b, \r4\().16b, \r5\().16b
trn2 \r5\().16b, \r4\().16b, \r5\().16b
trn1 \r2\().16b, \r6\().16b, \r7\().16b
trn2 \r7\().16b, \r6\().16b, \r7\().16b

trn1 \r4\().8H, \r0\().8H, \r2\().8H
trn2 \r2\().8H, \r0\().8H, \r2\().8H
trn1 \r6\().8H, \r5\().8H, \r7\().8H
trn2 \r7\().8H, \r5\().8H, \r7\().8H
trn1 \r5\().8H, \t1\().8H, \r3\().8H
trn2 \t1\().8H, \t1\().8H, \r3\().8H
trn1 \r3\().8H, \t0\().8H, \r1\().8H
trn2 \t0\().8H, \t0\().8H, \r1\().8H
trn1 \r4\().8h, \r0\().8h, \r2\().8h
trn2 \r2\().8h, \r0\().8h, \r2\().8h
trn1 \r6\().8h, \r5\().8h, \r7\().8h
trn2 \r7\().8h, \r5\().8h, \r7\().8h
trn1 \r5\().8h, \t1\().8h, \r3\().8h
trn2 \t1\().8h, \t1\().8h, \r3\().8h
trn1 \r3\().8h, \t0\().8h, \r1\().8h
trn2 \t0\().8h, \t0\().8h, \r1\().8h

trn1 \r0\().4S, \r3\().4S, \r4\().4S
trn2 \r4\().4S, \r3\().4S, \r4\().4S
trn1 \r0\().4s, \r3\().4s, \r4\().4s
trn2 \r4\().4s, \r3\().4s, \r4\().4s

trn1 \r1\().4S, \r5\().4S, \r6\().4S
trn2 \r5\().4S, \r5\().4S, \r6\().4S
trn1 \r1\().4s, \r5\().4s, \r6\().4s
trn2 \r5\().4s, \r5\().4s, \r6\().4s

trn2 \r6\().4S, \t0\().4S, \r2\().4S
trn1 \r2\().4S, \t0\().4S, \r2\().4S
trn2 \r6\().4s, \t0\().4s, \r2\().4s
trn1 \r2\().4s, \t0\().4s, \r2\().4s

trn1 \r3\().4S, \t1\().4S, \r7\().4S
trn2 \r7\().4S, \t1\().4S, \r7\().4S
trn1 \r3\().4s, \t1\().4s, \r7\().4s
trn2 \r7\().4s, \t1\().4s, \r7\().4s
.endm

.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16B, \r0\().16B, \r1\().16B
trn2 \t5\().16B, \r0\().16B, \r1\().16B
trn1 \t6\().16B, \r2\().16B, \r3\().16B
trn2 \t7\().16B, \r2\().16B, \r3\().16B
trn1 \t4\().16b, \r0\().16b, \r1\().16b
trn2 \t5\().16b, \r0\().16b, \r1\().16b
trn1 \t6\().16b, \r2\().16b, \r3\().16b
trn2 \t7\().16b, \r2\().16b, \r3\().16b

trn1 \r0\().8H, \t4\().8H, \t6\().8H
trn2 \r2\().8H, \t4\().8H, \t6\().8H
trn1 \r1\().8H, \t5\().8H, \t7\().8H
trn2 \r3\().8H, \t5\().8H, \t7\().8H
trn1 \r0\().8h, \t4\().8h, \t6\().8h
trn2 \r2\().8h, \t4\().8h, \t6\().8h
trn1 \r1\().8h, \t5\().8h, \t7\().8h
trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm

.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8B, \r0\().8B, \r1\().8B
trn2 \t5\().8B, \r0\().8B, \r1\().8B
trn1 \t6\().8B, \r2\().8B, \r3\().8B
trn2 \t7\().8B, \r2\().8B, \r3\().8B
trn1 \t4\().8b, \r0\().8b, \r1\().8b
trn2 \t5\().8b, \r0\().8b, \r1\().8b
trn1 \t6\().8b, \r2\().8b, \r3\().8b
trn2 \t7\().8b, \r2\().8b, \r3\().8b

trn1 \r0\().4H, \t4\().4H, \t6\().4H
trn2 \r2\().4H, \t4\().4H, \t6\().4H
trn1 \r1\().4H, \t5\().4H, \t7\().4H
trn2 \r3\().4H, \t5\().4H, \t7\().4H
trn1 \r0\().4h, \t4\().4h, \t6\().4h
trn2 \r2\().4h, \t4\().4h, \t6\().4h
trn1 \r1\().4h, \t5\().4h, \t7\().4h
trn2 \r3\().4h, \t5\().4h, \t7\().4h
.endm

.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4H, \r0\().4H, \r1\().4H
trn2 \r5\().4H, \r0\().4H, \r1\().4H
trn1 \r6\().4H, \r2\().4H, \r3\().4H
trn2 \r7\().4H, \r2\().4H, \r3\().4H
trn1 \r4\().4h, \r0\().4h, \r1\().4h
trn2 \r5\().4h, \r0\().4h, \r1\().4h
trn1 \r6\().4h, \r2\().4h, \r3\().4h
trn2 \r7\().4h, \r2\().4h, \r3\().4h

trn1 \r0\().2S, \r4\().2S, \r6\().2S
trn2 \r2\().2S, \r4\().2S, \r6\().2S
trn1 \r1\().2S, \r5\().2S, \r7\().2S
trn2 \r3\().2S, \r5\().2S, \r7\().2S
trn1 \r0\().2s, \r4\().2s, \r6\().2s
trn2 \r2\().2s, \r4\().2s, \r6\().2s
trn1 \r1\().2s, \r5\().2s, \r7\().2s
trn2 \r3\().2s, \r5\().2s, \r7\().2s
.endm

.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8H, \r0\().8H, \r1\().8H
trn2 \t5\().8H, \r0\().8H, \r1\().8H
trn1 \t6\().8H, \r2\().8H, \r3\().8H
trn2 \t7\().8H, \r2\().8H, \r3\().8H
trn1 \t4\().8h, \r0\().8h, \r1\().8h
trn2 \t5\().8h, \r0\().8h, \r1\().8h
trn1 \t6\().8h, \r2\().8h, \r3\().8h
trn2 \t7\().8h, \r2\().8h, \r3\().8h

trn1 \r0\().4S, \t4\().4S, \t6\().4S
trn2 \r2\().4S, \t4\().4S, \t6\().4S
trn1 \r1\().4S, \t5\().4S, \t7\().4S
trn2 \r3\().4S, \t5\().4S, \t7\().4S
trn1 \r0\().4s, \t4\().4s, \t6\().4s
trn2 \r2\().4s, \t4\().4s, \t6\().4s
trn1 \r1\().4s, \t5\().4s, \t7\().4s
trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm

.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8H, \r0\().8H, \r1\().8H
trn2 \r9\().8H, \r0\().8H, \r1\().8H
trn1 \r1\().8H, \r2\().8H, \r3\().8H
trn2 \r3\().8H, \r2\().8H, \r3\().8H
trn1 \r0\().8H, \r4\().8H, \r5\().8H
trn2 \r5\().8H, \r4\().8H, \r5\().8H
trn1 \r2\().8H, \r6\().8H, \r7\().8H
trn2 \r7\().8H, \r6\().8H, \r7\().8H
trn1 \r8\().8h, \r0\().8h, \r1\().8h
trn2 \r9\().8h, \r0\().8h, \r1\().8h
trn1 \r1\().8h, \r2\().8h, \r3\().8h
trn2 \r3\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().8h, \r4\().8h, \r5\().8h
trn2 \r5\().8h, \r4\().8h, \r5\().8h
trn1 \r2\().8h, \r6\().8h, \r7\().8h
trn2 \r7\().8h, \r6\().8h, \r7\().8h

trn1 \r4\().4S, \r0\().4S, \r2\().4S
trn2 \r2\().4S, \r0\().4S, \r2\().4S
trn1 \r6\().4S, \r5\().4S, \r7\().4S
trn2 \r7\().4S, \r5\().4S, \r7\().4S
trn1 \r5\().4S, \r9\().4S, \r3\().4S
trn2 \r9\().4S, \r9\().4S, \r3\().4S
trn1 \r3\().4S, \r8\().4S, \r1\().4S
trn2 \r8\().4S, \r8\().4S, \r1\().4S
trn1 \r4\().4s, \r0\().4s, \r2\().4s
trn2 \r2\().4s, \r0\().4s, \r2\().4s
trn1 \r6\().4s, \r5\().4s, \r7\().4s
trn2 \r7\().4s, \r5\().4s, \r7\().4s
trn1 \r5\().4s, \r9\().4s, \r3\().4s
trn2 \r9\().4s, \r9\().4s, \r3\().4s
trn1 \r3\().4s, \r8\().4s, \r1\().4s
trn2 \r8\().4s, \r8\().4s, \r1\().4s

trn1 \r0\().2D, \r3\().2D, \r4\().2D
trn2 \r4\().2D, \r3\().2D, \r4\().2D
trn1 \r0\().2d, \r3\().2d, \r4\().2d
trn2 \r4\().2d, \r3\().2d, \r4\().2d

trn1 \r1\().2D, \r5\().2D, \r6\().2D
trn2 \r5\().2D, \r5\().2D, \r6\().2D
trn1 \r1\().2d, \r5\().2d, \r6\().2d
trn2 \r5\().2d, \r5\().2d, \r6\().2d

trn2 \r6\().2D, \r8\().2D, \r2\().2D
trn1 \r2\().2D, \r8\().2D, \r2\().2D
trn2 \r6\().2d, \r8\().2d, \r2\().2d
trn1 \r2\().2d, \r8\().2d, \r2\().2d

trn1 \r3\().2D, \r9\().2D, \r7\().2D
trn2 \r7\().2D, \r9\().2D, \r7\().2D
trn1 \r3\().2d, \r9\().2d, \r7\().2d
trn2 \r7\().2d, \r9\().2d, \r7\().2d

.endm

@ -46,49 +46,49 @@ function ff_sbr_sum64x5_neon, export=1
|
||||
add x3, x0, #192*4
|
||||
add x4, x0, #256*4
|
||||
mov x5, #64
|
||||
1: ld1 {v0.4S}, [x0]
|
||||
ld1 {v1.4S}, [x1], #16
|
||||
fadd v0.4S, v0.4S, v1.4S
|
||||
ld1 {v2.4S}, [x2], #16
|
||||
fadd v0.4S, v0.4S, v2.4S
|
||||
ld1 {v3.4S}, [x3], #16
|
||||
fadd v0.4S, v0.4S, v3.4S
|
||||
ld1 {v4.4S}, [x4], #16
|
||||
fadd v0.4S, v0.4S, v4.4S
|
||||
st1 {v0.4S}, [x0], #16
|
||||
1: ld1 {v0.4s}, [x0]
|
||||
ld1 {v1.4s}, [x1], #16
|
||||
fadd v0.4s, v0.4s, v1.4s
|
||||
ld1 {v2.4s}, [x2], #16
|
||||
fadd v0.4s, v0.4s, v2.4s
|
||||
ld1 {v3.4s}, [x3], #16
|
||||
fadd v0.4s, v0.4s, v3.4s
|
||||
ld1 {v4.4s}, [x4], #16
|
||||
fadd v0.4s, v0.4s, v4.4s
|
||||
st1 {v0.4s}, [x0], #16
|
||||
subs x5, x5, #4
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_sbr_sum_square_neon, export=1
|
||||
movi v0.4S, #0
|
||||
1: ld1 {v1.4S}, [x0], #16
|
||||
fmla v0.4S, v1.4S, v1.4S
|
||||
movi v0.4s, #0
|
||||
1: ld1 {v1.4s}, [x0], #16
|
||||
fmla v0.4s, v1.4s, v1.4s
|
||||
subs w1, w1, #2
|
||||
b.gt 1b
|
||||
faddp v0.4S, v0.4S, v0.4S
|
||||
faddp v0.4S, v0.4S, v0.4S
|
||||
faddp v0.4s, v0.4s, v0.4s
|
||||
faddp v0.4s, v0.4s, v0.4s
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_sbr_neg_odd_64_neon, export=1
|
||||
mov x1, x0
|
||||
movi v5.4S, #1<<7, lsl #24
|
||||
ld2 {v0.4S, v1.4S}, [x0], #32
|
||||
eor v1.16B, v1.16B, v5.16B
|
||||
ld2 {v2.4S, v3.4S}, [x0], #32
|
||||
movi v5.4s, #1<<7, lsl #24
|
||||
ld2 {v0.4s, v1.4s}, [x0], #32
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
ld2 {v2.4s, v3.4s}, [x0], #32
|
||||
.rept 3
|
||||
st2 {v0.4S, v1.4S}, [x1], #32
|
||||
eor v3.16B, v3.16B, v5.16B
|
||||
ld2 {v0.4S, v1.4S}, [x0], #32
|
||||
st2 {v2.4S, v3.4S}, [x1], #32
|
||||
eor v1.16B, v1.16B, v5.16B
|
||||
ld2 {v2.4S, v3.4S}, [x0], #32
|
||||
st2 {v0.4s, v1.4s}, [x1], #32
|
||||
eor v3.16b, v3.16b, v5.16b
|
||||
ld2 {v0.4s, v1.4s}, [x0], #32
|
||||
st2 {v2.4s, v3.4s}, [x1], #32
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
ld2 {v2.4s, v3.4s}, [x0], #32
|
||||
.endr
|
||||
eor v3.16B, v3.16B, v5.16B
|
||||
st2 {v0.4S, v1.4S}, [x1], #32
|
||||
st2 {v2.4S, v3.4S}, [x1], #32
|
||||
eor v3.16b, v3.16b, v5.16b
|
||||
st2 {v0.4s, v1.4s}, [x1], #32
|
||||
st2 {v2.4s, v3.4s}, [x1], #32
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@ -97,26 +97,26 @@ function ff_sbr_qmf_pre_shuffle_neon, export=1
|
||||
add x2, x0, #64*4
|
||||
mov x3, #-16
|
||||
mov x4, #-4
|
||||
movi v6.4S, #1<<7, lsl #24
|
||||
ld1 {v0.2S}, [x0], #8
|
||||
st1 {v0.2S}, [x2], #8
|
||||
movi v6.4s, #1<<7, lsl #24
|
||||
ld1 {v0.2s}, [x0], #8
|
||||
st1 {v0.2s}, [x2], #8
|
||||
.rept 7
|
||||
ld1 {v1.4S}, [x1], x3
|
||||
ld1 {v2.4S}, [x0], #16
|
||||
eor v1.16B, v1.16B, v6.16B
|
||||
rev64 v1.4S, v1.4S
|
||||
ext v1.16B, v1.16B, v1.16B, #8
|
||||
st2 {v1.4S, v2.4S}, [x2], #32
|
||||
ld1 {v1.4s}, [x1], x3
|
||||
ld1 {v2.4s}, [x0], #16
|
||||
eor v1.16b, v1.16b, v6.16b
|
||||
rev64 v1.4s, v1.4s
|
||||
ext v1.16b, v1.16b, v1.16b, #8
|
||||
st2 {v1.4s, v2.4s}, [x2], #32
|
||||
.endr
|
||||
add x1, x1, #8
|
||||
ld1 {v1.2S}, [x1], x4
|
||||
ld1 {v2.2S}, [x0], #8
|
||||
ld1 {v1.S}[3], [x1]
|
||||
ld1 {v2.S}[2], [x0]
|
||||
eor v1.16B, v1.16B, v6.16B
|
||||
rev64 v1.4S, v1.4S
|
||||
st2 {v1.2S, v2.2S}, [x2], #16
|
||||
st2 {v1.S, v2.S}[2], [x2]
|
||||
ld1 {v1.2s}, [x1], x4
|
||||
ld1 {v2.2s}, [x0], #8
|
||||
ld1 {v1.s}[3], [x1]
|
||||
ld1 {v2.s}[2], [x0]
|
||||
eor v1.16b, v1.16b, v6.16b
|
||||
rev64 v1.4s, v1.4s
|
||||
st2 {v1.2s, v2.2s}, [x2], #16
|
||||
st2 {v1.s, v2.s}[2], [x2]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@ -124,13 +124,13 @@ function ff_sbr_qmf_post_shuffle_neon, export=1
|
||||
add x2, x1, #60*4
|
||||
mov x3, #-16
|
||||
mov x4, #32
|
||||
movi v6.4S, #1<<7, lsl #24
|
||||
1: ld1 {v0.4S}, [x2], x3
|
||||
ld1 {v1.4S}, [x1], #16
|
||||
eor v0.16B, v0.16B, v6.16B
|
||||
rev64 v0.4S, v0.4S
|
||||
ext v0.16B, v0.16B, v0.16B, #8
|
||||
st2 {v0.4S, v1.4S}, [x0], #32
|
||||
movi v6.4s, #1<<7, lsl #24
|
||||
1: ld1 {v0.4s}, [x2], x3
|
||||
ld1 {v1.4s}, [x1], #16
|
||||
eor v0.16b, v0.16b, v6.16b
|
||||
rev64 v0.4s, v0.4s
|
||||
ext v0.16b, v0.16b, v0.16b, #8
|
||||
st2 {v0.4s, v1.4s}, [x0], #32
|
||||
subs x4, x4, #4
|
||||
b.gt 1b
|
||||
ret
|
||||
@ -141,13 +141,13 @@ function ff_sbr_qmf_deint_neg_neon, export=1
|
||||
add x2, x0, #60*4
|
||||
mov x3, #-32
|
||||
mov x4, #32
|
||||
movi v2.4S, #1<<7, lsl #24
|
||||
1: ld2 {v0.4S, v1.4S}, [x1], x3
|
||||
eor v0.16B, v0.16B, v2.16B
|
||||
rev64 v1.4S, v1.4S
|
||||
ext v1.16B, v1.16B, v1.16B, #8
|
||||
st1 {v0.4S}, [x2]
|
||||
st1 {v1.4S}, [x0], #16
|
||||
movi v2.4s, #1<<7, lsl #24
|
||||
1: ld2 {v0.4s, v1.4s}, [x1], x3
|
||||
eor v0.16b, v0.16b, v2.16b
|
||||
rev64 v1.4s, v1.4s
|
||||
ext v1.16b, v1.16b, v1.16b, #8
|
||||
st1 {v0.4s}, [x2]
|
||||
st1 {v1.4s}, [x0], #16
|
||||
sub x2, x2, #16
|
||||
subs x4, x4, #4
|
||||
b.gt 1b
|
||||
@ -159,16 +159,16 @@ function ff_sbr_qmf_deint_bfly_neon, export=1
|
||||
add x3, x0, #124*4
|
||||
mov x4, #64
|
||||
mov x5, #-16
|
||||
1: ld1 {v0.4S}, [x1], #16
|
||||
ld1 {v1.4S}, [x2], x5
|
||||
rev64 v2.4S, v0.4S
|
||||
ext v2.16B, v2.16B, v2.16B, #8
|
||||
rev64 v3.4S, v1.4S
|
||||
ext v3.16B, v3.16B, v3.16B, #8
|
||||
fadd v1.4S, v1.4S, v2.4S
|
||||
fsub v0.4S, v0.4S, v3.4S
|
||||
st1 {v0.4S}, [x0], #16
|
||||
st1 {v1.4S}, [x3], x5
|
||||
1: ld1 {v0.4s}, [x1], #16
|
||||
ld1 {v1.4s}, [x2], x5
|
||||
rev64 v2.4s, v0.4s
|
||||
ext v2.16b, v2.16b, v2.16b, #8
|
||||
rev64 v3.4s, v1.4s
|
||||
ext v3.16b, v3.16b, v3.16b, #8
|
||||
fadd v1.4s, v1.4s, v2.4s
|
||||
fsub v0.4s, v0.4s, v3.4s
|
||||
st1 {v0.4s}, [x0], #16
|
||||
st1 {v1.4s}, [x3], x5
|
||||
subs x4, x4, #4
|
||||
b.gt 1b
|
||||
ret
|
||||
@ -178,32 +178,32 @@ function ff_sbr_hf_gen_neon, export=1
|
||||
sxtw x4, w4
|
||||
sxtw x5, w5
|
||||
movrel x6, factors
|
||||
ld1 {v7.4S}, [x6]
|
||||
dup v1.4S, v0.S[0]
|
||||
mov v2.8B, v1.8B
|
||||
mov v2.S[2], v7.S[0]
|
||||
mov v2.S[3], v7.S[0]
|
||||
fmul v1.4S, v1.4S, v2.4S
|
||||
ld1 {v0.D}[0], [x3]
|
||||
ld1 {v0.D}[1], [x2]
|
||||
fmul v0.4S, v0.4S, v1.4S
|
||||
fmul v1.4S, v0.4S, v7.4S
|
||||
rev64 v0.4S, v0.4S
|
||||
ld1 {v7.4s}, [x6]
|
||||
dup v1.4s, v0.s[0]
|
||||
mov v2.8b, v1.8b
|
||||
mov v2.s[2], v7.s[0]
|
||||
mov v2.s[3], v7.s[0]
|
||||
fmul v1.4s, v1.4s, v2.4s
|
||||
ld1 {v0.d}[0], [x3]
|
||||
ld1 {v0.d}[1], [x2]
|
||||
fmul v0.4s, v0.4s, v1.4s
|
||||
fmul v1.4s, v0.4s, v7.4s
|
||||
rev64 v0.4s, v0.4s
|
||||
sub x7, x5, x4
|
||||
add x0, x0, x4, lsl #3
|
||||
add x1, x1, x4, lsl #3
|
||||
sub x1, x1, #16
|
||||
1: ld1 {v2.4S}, [x1], #16
|
||||
ld1 {v3.2S}, [x1]
|
||||
fmul v4.4S, v2.4S, v1.4S
|
||||
fmul v5.4S, v2.4S, v0.4S
|
||||
faddp v4.4S, v4.4S, v4.4S
|
||||
faddp v5.4S, v5.4S, v5.4S
|
||||
faddp v4.4S, v4.4S, v4.4S
|
||||
faddp v5.4S, v5.4S, v5.4S
|
||||
mov v4.S[1], v5.S[0]
|
||||
fadd v4.2S, v4.2S, v3.2S
|
||||
st1 {v4.2S}, [x0], #8
|
||||
1: ld1 {v2.4s}, [x1], #16
|
||||
ld1 {v3.2s}, [x1]
|
||||
fmul v4.4s, v2.4s, v1.4s
|
||||
fmul v5.4s, v2.4s, v0.4s
|
||||
faddp v4.4s, v4.4s, v4.4s
|
||||
faddp v5.4s, v5.4s, v5.4s
|
||||
faddp v4.4s, v4.4s, v4.4s
|
||||
faddp v5.4s, v5.4s, v5.4s
|
||||
mov v4.s[1], v5.s[0]
|
||||
fadd v4.2s, v4.2s, v3.2s
|
||||
st1 {v4.2s}, [x0], #8
|
||||
sub x1, x1, #8
|
||||
subs x7, x7, #1
|
||||
b.gt 1b
|
||||
@ -215,10 +215,10 @@ function ff_sbr_hf_g_filt_neon, export=1
|
||||
sxtw x4, w4
|
||||
mov x5, #40*2*4
|
||||
add x1, x1, x4, lsl #3
|
||||
1: ld1 {v0.2S}, [x1], x5
|
||||
ld1 {v1.S}[0], [x2], #4
|
||||
fmul v2.4S, v0.4S, v1.S[0]
|
||||
st1 {v2.2S}, [x0], #8
|
||||
1: ld1 {v0.2s}, [x1], x5
|
||||
ld1 {v1.s}[0], [x2], #4
|
||||
fmul v2.4s, v0.4s, v1.s[0]
|
||||
st1 {v2.2s}, [x0], #8
|
||||
subs x3, x3, #1
|
||||
b.gt 1b
|
||||
ret
|
||||
@ -227,46 +227,46 @@ endfunc
function ff_sbr_autocorrelate_neon, export=1
mov x2, #38
movrel x3, factors
ld1 {v0.4S}, [x3]
movi v1.4S, #0
movi v2.4S, #0
movi v3.4S, #0
ld1 {v4.2S}, [x0], #8
ld1 {v5.2S}, [x0], #8
fmul v16.2S, v4.2S, v4.2S
fmul v17.2S, v5.2S, v4.S[0]
fmul v18.2S, v5.2S, v4.S[1]
1: ld1 {v5.D}[1], [x0], #8
fmla v1.2S, v4.2S, v4.2S
fmla v2.4S, v5.4S, v4.S[0]
fmla v3.4S, v5.4S, v4.S[1]
mov v4.D[0], v5.D[0]
mov v5.D[0], v5.D[1]
ld1 {v0.4s}, [x3]
movi v1.4s, #0
movi v2.4s, #0
movi v3.4s, #0
ld1 {v4.2s}, [x0], #8
ld1 {v5.2s}, [x0], #8
fmul v16.2s, v4.2s, v4.2s
fmul v17.2s, v5.2s, v4.s[0]
fmul v18.2s, v5.2s, v4.s[1]
1: ld1 {v5.d}[1], [x0], #8
fmla v1.2s, v4.2s, v4.2s
fmla v2.4s, v5.4s, v4.s[0]
fmla v3.4s, v5.4s, v4.s[1]
mov v4.d[0], v5.d[0]
mov v5.d[0], v5.d[1]
subs x2, x2, #1
b.gt 1b
fmul v19.2S, v4.2S, v4.2S
fmul v20.2S, v5.2S, v4.S[0]
fmul v21.2S, v5.2S, v4.S[1]
fadd v22.4S, v2.4S, v20.4S
fsub v22.4S, v22.4S, v17.4S
fadd v23.4S, v3.4S, v21.4S
fsub v23.4S, v23.4S, v18.4S
rev64 v23.4S, v23.4S
fmul v23.4S, v23.4S, v0.4S
fadd v22.4S, v22.4S, v23.4S
st1 {v22.4S}, [x1], #16
fadd v23.2S, v1.2S, v19.2S
fsub v23.2S, v23.2S, v16.2S
faddp v23.2S, v23.2S, v23.2S
st1 {v23.S}[0], [x1]
fmul v19.2s, v4.2s, v4.2s
fmul v20.2s, v5.2s, v4.s[0]
fmul v21.2s, v5.2s, v4.s[1]
fadd v22.4s, v2.4s, v20.4s
fsub v22.4s, v22.4s, v17.4s
fadd v23.4s, v3.4s, v21.4s
fsub v23.4s, v23.4s, v18.4s
rev64 v23.4s, v23.4s
fmul v23.4s, v23.4s, v0.4s
fadd v22.4s, v22.4s, v23.4s
st1 {v22.4s}, [x1], #16
fadd v23.2s, v1.2s, v19.2s
fsub v23.2s, v23.2s, v16.2s
faddp v23.2s, v23.2s, v23.2s
st1 {v23.s}[0], [x1]
add x1, x1, #8
rev64 v3.2S, v3.2S
fmul v3.2S, v3.2S, v0.2S
fadd v2.2S, v2.2S, v3.2S
st1 {v2.2S}, [x1]
rev64 v3.2s, v3.2s
fmul v3.2s, v3.2s, v0.2s
fadd v2.2s, v2.2s, v3.2s
st1 {v2.2s}, [x1]
add x1, x1, #16
faddp v1.2S, v1.2S, v1.2S
st1 {v1.S}[0], [x1]
faddp v1.2s, v1.2s, v1.2s
st1 {v1.s}[0], [x1]
ret
endfunc

@ -278,25 +278,25 @@ endfunc
1: and x3, x3, #0x1ff
add x8, x7, x3, lsl #3
add x3, x3, #2
ld1 {v2.4S}, [x0]
ld1 {v3.2S}, [x1], #8
ld1 {v4.2S}, [x2], #8
ld1 {v5.4S}, [x8]
mov v6.16B, v2.16B
zip1 v3.4S, v3.4S, v3.4S
zip1 v4.4S, v4.4S, v4.4S
fmla v6.4S, v1.4S, v3.4S
fmla v2.4S, v5.4S, v4.4S
fcmeq v7.4S, v3.4S, #0
bif v2.16B, v6.16B, v7.16B
st1 {v2.4S}, [x0], #16
ld1 {v2.4s}, [x0]
ld1 {v3.2s}, [x1], #8
ld1 {v4.2s}, [x2], #8
ld1 {v5.4s}, [x8]
mov v6.16b, v2.16b
zip1 v3.4s, v3.4s, v3.4s
zip1 v4.4s, v4.4s, v4.4s
fmla v6.4s, v1.4s, v3.4s
fmla v2.4s, v5.4s, v4.4s
fcmeq v7.4s, v3.4s, #0
bif v2.16b, v6.16b, v7.16b
st1 {v2.4s}, [x0], #16
subs x5, x5, #2
b.gt 1b
.endm

function ff_sbr_hf_apply_noise_0_neon, export=1
movrel x9, phi_noise_0
ld1 {v1.4S}, [x9]
ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc
@ -305,14 +305,14 @@ function ff_sbr_hf_apply_noise_1_neon, export=1
movrel x9, phi_noise_1
and x4, x4, #1
add x9, x9, x4, lsl #4
ld1 {v1.4S}, [x9]
ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc

function ff_sbr_hf_apply_noise_2_neon, export=1
movrel x9, phi_noise_2
ld1 {v1.4S}, [x9]
ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc
@ -321,7 +321,7 @@ function ff_sbr_hf_apply_noise_3_neon, export=1
movrel x9, phi_noise_3
and x4, x4, #1
add x9, x9, x4, lsl #4
ld1 {v1.4S}, [x9]
ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc

@ -54,7 +54,7 @@ endconst
prfm pldl1keep, [\data]
mov x10, x30
movrel x3, idct_coeff_neon
ld1 {v0.2D}, [x3]
ld1 {v0.2d}, [x3]
.endm

.macro idct_end
@ -74,146 +74,146 @@ endconst
.endm

.macro idct_col4_top y1, y2, y3, y4, i, l
smull\i v7.4S, \y3\l, z2
smull\i v16.4S, \y3\l, z6
smull\i v17.4S, \y2\l, z1
add v19.4S, v23.4S, v7.4S
smull\i v18.4S, \y2\l, z3
add v20.4S, v23.4S, v16.4S
smull\i v5.4S, \y2\l, z5
sub v21.4S, v23.4S, v16.4S
smull\i v6.4S, \y2\l, z7
sub v22.4S, v23.4S, v7.4S
smull\i v7.4s, \y3\l, z2
smull\i v16.4s, \y3\l, z6
smull\i v17.4s, \y2\l, z1
add v19.4s, v23.4s, v7.4s
smull\i v18.4s, \y2\l, z3
add v20.4s, v23.4s, v16.4s
smull\i v5.4s, \y2\l, z5
sub v21.4s, v23.4s, v16.4s
smull\i v6.4s, \y2\l, z7
sub v22.4s, v23.4s, v7.4s

smlal\i v17.4S, \y4\l, z3
smlsl\i v18.4S, \y4\l, z7
smlsl\i v5.4S, \y4\l, z1
smlsl\i v6.4S, \y4\l, z5
smlal\i v17.4s, \y4\l, z3
smlsl\i v18.4s, \y4\l, z7
smlsl\i v5.4s, \y4\l, z1
smlsl\i v6.4s, \y4\l, z5
.endm

.macro idct_row4_neon y1, y2, y3, y4, pass
ld1 {\y1\().2D,\y2\().2D}, [x2], #32
movi v23.4S, #1<<2, lsl #8
orr v5.16B, \y1\().16B, \y2\().16B
ld1 {\y3\().2D,\y4\().2D}, [x2], #32
orr v6.16B, \y3\().16B, \y4\().16B
orr v5.16B, v5.16B, v6.16B
mov x3, v5.D[1]
smlal v23.4S, \y1\().4H, z4
ld1 {\y1\().2d,\y2\().2d}, [x2], #32
movi v23.4s, #1<<2, lsl #8
orr v5.16b, \y1\().16b, \y2\().16b
ld1 {\y3\().2d,\y4\().2d}, [x2], #32
orr v6.16b, \y3\().16b, \y4\().16b
orr v5.16b, v5.16b, v6.16b
mov x3, v5.d[1]
smlal v23.4s, \y1\().4h, z4

idct_col4_top \y1, \y2, \y3, \y4, 1, .4H
idct_col4_top \y1, \y2, \y3, \y4, 1, .4h

cmp x3, #0
b.eq \pass\()f

smull2 v7.4S, \y1\().8H, z4
smlal2 v17.4S, \y2\().8H, z5
smlsl2 v18.4S, \y2\().8H, z1
smull2 v16.4S, \y3\().8H, z2
smlal2 v5.4S, \y2\().8H, z7
add v19.4S, v19.4S, v7.4S
sub v20.4S, v20.4S, v7.4S
sub v21.4S, v21.4S, v7.4S
add v22.4S, v22.4S, v7.4S
smlal2 v6.4S, \y2\().8H, z3
smull2 v7.4S, \y3\().8H, z6
smlal2 v17.4S, \y4\().8H, z7
smlsl2 v18.4S, \y4\().8H, z5
smlal2 v5.4S, \y4\().8H, z3
smlsl2 v6.4S, \y4\().8H, z1
add v19.4S, v19.4S, v7.4S
sub v20.4S, v20.4S, v16.4S
add v21.4S, v21.4S, v16.4S
sub v22.4S, v22.4S, v7.4S
smull2 v7.4s, \y1\().8h, z4
smlal2 v17.4s, \y2\().8h, z5
smlsl2 v18.4s, \y2\().8h, z1
smull2 v16.4s, \y3\().8h, z2
smlal2 v5.4s, \y2\().8h, z7
add v19.4s, v19.4s, v7.4s
sub v20.4s, v20.4s, v7.4s
sub v21.4s, v21.4s, v7.4s
add v22.4s, v22.4s, v7.4s
smlal2 v6.4s, \y2\().8h, z3
smull2 v7.4s, \y3\().8h, z6
smlal2 v17.4s, \y4\().8h, z7
smlsl2 v18.4s, \y4\().8h, z5
smlal2 v5.4s, \y4\().8h, z3
smlsl2 v6.4s, \y4\().8h, z1
add v19.4s, v19.4s, v7.4s
sub v20.4s, v20.4s, v16.4s
add v21.4s, v21.4s, v16.4s
sub v22.4s, v22.4s, v7.4s

\pass: add \y3\().4S, v19.4S, v17.4S
add \y4\().4S, v20.4S, v18.4S
shrn \y1\().4H, \y3\().4S, #ROW_SHIFT
shrn \y2\().4H, \y4\().4S, #ROW_SHIFT
add v7.4S, v21.4S, v5.4S
add v16.4S, v22.4S, v6.4S
shrn \y3\().4H, v7.4S, #ROW_SHIFT
shrn \y4\().4H, v16.4S, #ROW_SHIFT
sub v22.4S, v22.4S, v6.4S
sub v19.4S, v19.4S, v17.4S
sub v21.4S, v21.4S, v5.4S
shrn2 \y1\().8H, v22.4S, #ROW_SHIFT
sub v20.4S, v20.4S, v18.4S
shrn2 \y2\().8H, v21.4S, #ROW_SHIFT
shrn2 \y3\().8H, v20.4S, #ROW_SHIFT
shrn2 \y4\().8H, v19.4S, #ROW_SHIFT
\pass: add \y3\().4s, v19.4s, v17.4s
add \y4\().4s, v20.4s, v18.4s
shrn \y1\().4h, \y3\().4s, #ROW_SHIFT
shrn \y2\().4h, \y4\().4s, #ROW_SHIFT
add v7.4s, v21.4s, v5.4s
add v16.4s, v22.4s, v6.4s
shrn \y3\().4h, v7.4s, #ROW_SHIFT
shrn \y4\().4h, v16.4s, #ROW_SHIFT
sub v22.4s, v22.4s, v6.4s
sub v19.4s, v19.4s, v17.4s
sub v21.4s, v21.4s, v5.4s
shrn2 \y1\().8h, v22.4s, #ROW_SHIFT
sub v20.4s, v20.4s, v18.4s
shrn2 \y2\().8h, v21.4s, #ROW_SHIFT
shrn2 \y3\().8h, v20.4s, #ROW_SHIFT
shrn2 \y4\().8h, v19.4s, #ROW_SHIFT

trn1 v16.8H, \y1\().8H, \y2\().8H
trn2 v17.8H, \y1\().8H, \y2\().8H
trn1 v18.8H, \y3\().8H, \y4\().8H
trn2 v19.8H, \y3\().8H, \y4\().8H
trn1 \y1\().4S, v16.4S, v18.4S
trn1 \y2\().4S, v17.4S, v19.4S
trn2 \y3\().4S, v16.4S, v18.4S
trn2 \y4\().4S, v17.4S, v19.4S
trn1 v16.8h, \y1\().8h, \y2\().8h
trn2 v17.8h, \y1\().8h, \y2\().8h
trn1 v18.8h, \y3\().8h, \y4\().8h
trn2 v19.8h, \y3\().8h, \y4\().8h
trn1 \y1\().4s, v16.4s, v18.4s
trn1 \y2\().4s, v17.4s, v19.4s
trn2 \y3\().4s, v16.4s, v18.4s
trn2 \y4\().4s, v17.4s, v19.4s
.endm

.macro declare_idct_col4_neon i, l
function idct_col4_neon\i
dup v23.4H, z4c
dup v23.4h, z4c
.if \i == 1
add v23.4H, v23.4H, v24.4H
add v23.4h, v23.4h, v24.4h
.else
mov v5.D[0], v24.D[1]
add v23.4H, v23.4H, v5.4H
mov v5.d[0], v24.d[1]
add v23.4h, v23.4h, v5.4h
.endif
smull v23.4S, v23.4H, z4
smull v23.4s, v23.4h, z4

idct_col4_top v24, v25, v26, v27, \i, \l

mov x4, v28.D[\i - 1]
mov x5, v29.D[\i - 1]
mov x4, v28.d[\i - 1]
mov x5, v29.d[\i - 1]
cmp x4, #0
b.eq 1f

smull\i v7.4S, v28\l, z4
add v19.4S, v19.4S, v7.4S
sub v20.4S, v20.4S, v7.4S
sub v21.4S, v21.4S, v7.4S
add v22.4S, v22.4S, v7.4S
smull\i v7.4s, v28\l, z4
add v19.4s, v19.4s, v7.4s
sub v20.4s, v20.4s, v7.4s
sub v21.4s, v21.4s, v7.4s
add v22.4s, v22.4s, v7.4s

1: mov x4, v30.D[\i - 1]
1: mov x4, v30.d[\i - 1]
cmp x5, #0
b.eq 2f

smlal\i v17.4S, v29\l, z5
smlsl\i v18.4S, v29\l, z1
smlal\i v5.4S, v29\l, z7
smlal\i v6.4S, v29\l, z3
smlal\i v17.4s, v29\l, z5
smlsl\i v18.4s, v29\l, z1
smlal\i v5.4s, v29\l, z7
smlal\i v6.4s, v29\l, z3

2: mov x5, v31.D[\i - 1]
2: mov x5, v31.d[\i - 1]
cmp x4, #0
b.eq 3f

smull\i v7.4S, v30\l, z6
smull\i v16.4S, v30\l, z2
add v19.4S, v19.4S, v7.4S
sub v22.4S, v22.4S, v7.4S
sub v20.4S, v20.4S, v16.4S
add v21.4S, v21.4S, v16.4S
smull\i v7.4s, v30\l, z6
smull\i v16.4s, v30\l, z2
add v19.4s, v19.4s, v7.4s
sub v22.4s, v22.4s, v7.4s
sub v20.4s, v20.4s, v16.4s
add v21.4s, v21.4s, v16.4s

3: cmp x5, #0
b.eq 4f

smlal\i v17.4S, v31\l, z7
smlsl\i v18.4S, v31\l, z5
smlal\i v5.4S, v31\l, z3
smlsl\i v6.4S, v31\l, z1
smlal\i v17.4s, v31\l, z7
smlsl\i v18.4s, v31\l, z5
smlal\i v5.4s, v31\l, z3
smlsl\i v6.4s, v31\l, z1

4: addhn v7.4H, v19.4S, v17.4S
addhn2 v7.8H, v20.4S, v18.4S
subhn v18.4H, v20.4S, v18.4S
subhn2 v18.8H, v19.4S, v17.4S
4: addhn v7.4h, v19.4s, v17.4s
addhn2 v7.8h, v20.4s, v18.4s
subhn v18.4h, v20.4s, v18.4s
subhn2 v18.8h, v19.4s, v17.4s

addhn v16.4H, v21.4S, v5.4S
addhn2 v16.8H, v22.4S, v6.4S
subhn v17.4H, v22.4S, v6.4S
subhn2 v17.8H, v21.4S, v5.4S
addhn v16.4h, v21.4s, v5.4s
addhn2 v16.8h, v22.4s, v6.4s
subhn v17.4h, v22.4s, v6.4s
subhn2 v17.8h, v21.4s, v5.4s

ret
endfunc
@ -229,33 +229,33 @@ function ff_simple_idct_put_neon, export=1
idct_row4_neon v28, v29, v30, v31, 2
bl idct_col4_neon1

sqshrun v1.8B, v7.8H, #COL_SHIFT-16
sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16
sqshrun v3.8B, v17.8H, #COL_SHIFT-16
sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16
sqshrun v1.8b, v7.8h, #COL_SHIFT-16
sqshrun2 v1.16b, v16.8h, #COL_SHIFT-16
sqshrun v3.8b, v17.8h, #COL_SHIFT-16
sqshrun2 v3.16b, v18.8h, #COL_SHIFT-16

bl idct_col4_neon2

sqshrun v2.8B, v7.8H, #COL_SHIFT-16
sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16
sqshrun v4.8B, v17.8H, #COL_SHIFT-16
sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
sqshrun v2.8b, v7.8h, #COL_SHIFT-16
sqshrun2 v2.16b, v16.8h, #COL_SHIFT-16
sqshrun v4.8b, v17.8h, #COL_SHIFT-16
sqshrun2 v4.16b, v18.8h, #COL_SHIFT-16

zip1 v16.4S, v1.4S, v2.4S
zip2 v17.4S, v1.4S, v2.4S
zip1 v16.4s, v1.4s, v2.4s
zip2 v17.4s, v1.4s, v2.4s

st1 {v16.D}[0], [x0], x1
st1 {v16.D}[1], [x0], x1
st1 {v16.d}[0], [x0], x1
st1 {v16.d}[1], [x0], x1

zip1 v18.4S, v3.4S, v4.4S
zip2 v19.4S, v3.4S, v4.4S
zip1 v18.4s, v3.4s, v4.4s
zip2 v19.4s, v3.4s, v4.4s

st1 {v17.D}[0], [x0], x1
st1 {v17.D}[1], [x0], x1
st1 {v18.D}[0], [x0], x1
st1 {v18.D}[1], [x0], x1
st1 {v19.D}[0], [x0], x1
st1 {v19.D}[1], [x0], x1
st1 {v17.d}[0], [x0], x1
st1 {v17.d}[1], [x0], x1
st1 {v18.d}[0], [x0], x1
st1 {v18.d}[1], [x0], x1
st1 {v19.d}[0], [x0], x1
st1 {v19.d}[1], [x0], x1

idct_end
endfunc
@ -267,59 +267,59 @@ function ff_simple_idct_add_neon, export=1
idct_row4_neon v28, v29, v30, v31, 2
bl idct_col4_neon1

sshr v1.8H, v7.8H, #COL_SHIFT-16
sshr v2.8H, v16.8H, #COL_SHIFT-16
sshr v3.8H, v17.8H, #COL_SHIFT-16
sshr v4.8H, v18.8H, #COL_SHIFT-16
sshr v1.8h, v7.8h, #COL_SHIFT-16
sshr v2.8h, v16.8h, #COL_SHIFT-16
sshr v3.8h, v17.8h, #COL_SHIFT-16
sshr v4.8h, v18.8h, #COL_SHIFT-16

bl idct_col4_neon2

sshr v7.8H, v7.8H, #COL_SHIFT-16
sshr v16.8H, v16.8H, #COL_SHIFT-16
sshr v17.8H, v17.8H, #COL_SHIFT-16
sshr v18.8H, v18.8H, #COL_SHIFT-16
sshr v7.8h, v7.8h, #COL_SHIFT-16
sshr v16.8h, v16.8h, #COL_SHIFT-16
sshr v17.8h, v17.8h, #COL_SHIFT-16
sshr v18.8h, v18.8h, #COL_SHIFT-16

mov x9, x0
ld1 {v19.D}[0], [x0], x1
zip1 v23.2D, v1.2D, v7.2D
zip2 v24.2D, v1.2D, v7.2D
ld1 {v19.D}[1], [x0], x1
zip1 v25.2D, v2.2D, v16.2D
zip2 v26.2D, v2.2D, v16.2D
ld1 {v20.D}[0], [x0], x1
zip1 v27.2D, v3.2D, v17.2D
zip2 v28.2D, v3.2D, v17.2D
ld1 {v20.D}[1], [x0], x1
zip1 v29.2D, v4.2D, v18.2D
zip2 v30.2D, v4.2D, v18.2D
ld1 {v21.D}[0], [x0], x1
uaddw v23.8H, v23.8H, v19.8B
uaddw2 v24.8H, v24.8H, v19.16B
ld1 {v21.D}[1], [x0], x1
sqxtun v23.8B, v23.8H
sqxtun2 v23.16B, v24.8H
ld1 {v22.D}[0], [x0], x1
uaddw v24.8H, v25.8H, v20.8B
uaddw2 v25.8H, v26.8H, v20.16B
ld1 {v22.D}[1], [x0], x1
sqxtun v24.8B, v24.8H
sqxtun2 v24.16B, v25.8H
st1 {v23.D}[0], [x9], x1
uaddw v25.8H, v27.8H, v21.8B
uaddw2 v26.8H, v28.8H, v21.16B
st1 {v23.D}[1], [x9], x1
sqxtun v25.8B, v25.8H
sqxtun2 v25.16B, v26.8H
st1 {v24.D}[0], [x9], x1
uaddw v26.8H, v29.8H, v22.8B
uaddw2 v27.8H, v30.8H, v22.16B
st1 {v24.D}[1], [x9], x1
sqxtun v26.8B, v26.8H
sqxtun2 v26.16B, v27.8H
st1 {v25.D}[0], [x9], x1
st1 {v25.D}[1], [x9], x1
st1 {v26.D}[0], [x9], x1
st1 {v26.D}[1], [x9], x1
ld1 {v19.d}[0], [x0], x1
zip1 v23.2d, v1.2d, v7.2d
zip2 v24.2d, v1.2d, v7.2d
ld1 {v19.d}[1], [x0], x1
zip1 v25.2d, v2.2d, v16.2d
zip2 v26.2d, v2.2d, v16.2d
ld1 {v20.d}[0], [x0], x1
zip1 v27.2d, v3.2d, v17.2d
zip2 v28.2d, v3.2d, v17.2d
ld1 {v20.d}[1], [x0], x1
zip1 v29.2d, v4.2d, v18.2d
zip2 v30.2d, v4.2d, v18.2d
ld1 {v21.d}[0], [x0], x1
uaddw v23.8h, v23.8h, v19.8b
uaddw2 v24.8h, v24.8h, v19.16b
ld1 {v21.d}[1], [x0], x1
sqxtun v23.8b, v23.8h
sqxtun2 v23.16b, v24.8h
ld1 {v22.d}[0], [x0], x1
uaddw v24.8h, v25.8h, v20.8b
uaddw2 v25.8h, v26.8h, v20.16b
ld1 {v22.d}[1], [x0], x1
sqxtun v24.8b, v24.8h
sqxtun2 v24.16b, v25.8h
st1 {v23.d}[0], [x9], x1
uaddw v25.8h, v27.8h, v21.8b
uaddw2 v26.8h, v28.8h, v21.16b
st1 {v23.d}[1], [x9], x1
sqxtun v25.8b, v25.8h
sqxtun2 v25.16b, v26.8h
st1 {v24.d}[0], [x9], x1
uaddw v26.8h, v29.8h, v22.8b
uaddw2 v27.8h, v30.8h, v22.16b
st1 {v24.d}[1], [x9], x1
sqxtun v26.8b, v26.8h
sqxtun2 v26.16b, v27.8h
st1 {v25.d}[0], [x9], x1
st1 {v25.d}[1], [x9], x1
st1 {v26.d}[0], [x9], x1
st1 {v26.d}[1], [x9], x1

idct_end
endfunc
@ -333,30 +333,30 @@ function ff_simple_idct_neon, export=1
sub x2, x2, #128
bl idct_col4_neon1

sshr v1.8H, v7.8H, #COL_SHIFT-16
sshr v2.8H, v16.8H, #COL_SHIFT-16
sshr v3.8H, v17.8H, #COL_SHIFT-16
sshr v4.8H, v18.8H, #COL_SHIFT-16
sshr v1.8h, v7.8h, #COL_SHIFT-16
sshr v2.8h, v16.8h, #COL_SHIFT-16
sshr v3.8h, v17.8h, #COL_SHIFT-16
sshr v4.8h, v18.8h, #COL_SHIFT-16

bl idct_col4_neon2

sshr v7.8H, v7.8H, #COL_SHIFT-16
sshr v16.8H, v16.8H, #COL_SHIFT-16
sshr v17.8H, v17.8H, #COL_SHIFT-16
sshr v18.8H, v18.8H, #COL_SHIFT-16
sshr v7.8h, v7.8h, #COL_SHIFT-16
sshr v16.8h, v16.8h, #COL_SHIFT-16
sshr v17.8h, v17.8h, #COL_SHIFT-16
sshr v18.8h, v18.8h, #COL_SHIFT-16

zip1 v23.2D, v1.2D, v7.2D
zip2 v24.2D, v1.2D, v7.2D
st1 {v23.2D,v24.2D}, [x2], #32
zip1 v25.2D, v2.2D, v16.2D
zip2 v26.2D, v2.2D, v16.2D
st1 {v25.2D,v26.2D}, [x2], #32
zip1 v27.2D, v3.2D, v17.2D
zip2 v28.2D, v3.2D, v17.2D
st1 {v27.2D,v28.2D}, [x2], #32
zip1 v29.2D, v4.2D, v18.2D
zip2 v30.2D, v4.2D, v18.2D
st1 {v29.2D,v30.2D}, [x2], #32
zip1 v23.2d, v1.2d, v7.2d
zip2 v24.2d, v1.2d, v7.2d
st1 {v23.2d,v24.2d}, [x2], #32
zip1 v25.2d, v2.2d, v16.2d
zip2 v26.2d, v2.2d, v16.2d
st1 {v25.2d,v26.2d}, [x2], #32
zip1 v27.2d, v3.2d, v17.2d
zip2 v28.2d, v3.2d, v17.2d
st1 {v27.2d,v28.2d}, [x2], #32
zip1 v29.2d, v4.2d, v18.2d
zip2 v30.2d, v4.2d, v18.2d
st1 {v29.2d,v30.2d}, [x2], #32

idct_end
endfunc

@ -22,19 +22,19 @@

// acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
.macro acc_sum_store x, xb
dup v24.4S, v24.S[3] // ...X -> XXXX
ext v25.16B, v26.16B, \xb, #12 // ext(0000,ABCD,12)=0ABC
add v24.4S, v24.4S, \x // XXXX+ABCD={X+A,X+B,X+C,X+D}
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B,X+D+C} (+0ABC)
ext v25.16B, v26.16B, v25.16B, #12 // ext(0000,0ABC,12)=00AB
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B+A,X+D+C+B} (+00AB)
ext v25.16B, v26.16B, v25.16B, #12 // ext(0000,00AB,12)=000A
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
st1 {v24.4S}, [x0], #16 // write 4x32-bit final values
dup v24.4s, v24.s[3] // ...X -> XXXX
ext v25.16b, v26.16b, \xb, #12 // ext(0000,ABCD,12)=0ABC
add v24.4s, v24.4s, \x // XXXX+ABCD={X+A,X+B,X+C,X+D}
add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B,X+D+C} (+0ABC)
ext v25.16b, v26.16b, v25.16b, #12 // ext(0000,0ABC,12)=00AB
add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B+A,X+D+C+B} (+00AB)
ext v25.16b, v26.16b, v25.16b, #12 // ext(0000,00AB,12)=000A
add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
st1 {v24.4s}, [x0], #16 // write 4x32-bit final values
.endm
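The inline comments above fully specify the computation: an inclusive prefix sum of four new values on top of a running carry X, built from three shift-by-one-lane ext/add steps. A scalar equivalent for reference (a hypothetical helper, not part of the source):

    #include <stdint.h>

    /* carry = X, in = {A,B,C,D}  ->  out = {X+A, X+A+B, X+A+B+C, X+A+B+C+D} */
    static void acc_sum_store_model(uint32_t out[4], const uint32_t in[4],
                                    uint32_t *carry)
    {
        for (int i = 0; i < 4; i++) {
            *carry += in[i]; /* running sum carried across calls, like v24.s[3] */
            out[i] = *carry;
        }
    }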

function ff_compute_safe_ssd_integral_image_neon, export=1
movi v26.4S, #0 // used as zero for the "rotations" in acc_sum_store
movi v26.4s, #0 // used as zero for the "rotations" in acc_sum_store
sub x3, x3, w6, UXTW // s1 padding (s1_linesize - w)
sub x5, x5, w6, UXTW // s2 padding (s2_linesize - w)
sub x9, x0, w1, UXTW #2 // dst_top
@ -43,31 +43,31 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
1: mov w10, w6 // width copy for each line
sub x0, x0, #16 // beginning of the dst line minus 4 sums
sub x8, x9, #4 // dst_top-1
ld1 {v24.4S}, [x0], #16 // load ...X (contextual last sums)
2: ld1 {v0.16B}, [x2], #16 // s1[x + 0..15]
ld1 {v1.16B}, [x4], #16 // s2[x + 0..15]
ld1 {v16.4S,v17.4S}, [x8], #32 // dst_top[x + 0..7 - 1]
usubl v2.8H, v0.8B, v1.8B // d[x + 0..7] = s1[x + 0..7] - s2[x + 0..7]
usubl2 v3.8H, v0.16B, v1.16B // d[x + 8..15] = s1[x + 8..15] - s2[x + 8..15]
ld1 {v18.4S,v19.4S}, [x8], #32 // dst_top[x + 8..15 - 1]
smull v4.4S, v2.4H, v2.4H // d[x + 0..3]^2
smull2 v5.4S, v2.8H, v2.8H // d[x + 4..7]^2
ld1 {v20.4S,v21.4S}, [x9], #32 // dst_top[x + 0..7]
smull v6.4S, v3.4H, v3.4H // d[x + 8..11]^2
smull2 v7.4S, v3.8H, v3.8H // d[x + 12..15]^2
ld1 {v22.4S,v23.4S}, [x9], #32 // dst_top[x + 8..15]
sub v0.4S, v20.4S, v16.4S // dst_top[x + 0..3] - dst_top[x + 0..3 - 1]
sub v1.4S, v21.4S, v17.4S // dst_top[x + 4..7] - dst_top[x + 4..7 - 1]
add v0.4S, v0.4S, v4.4S // + d[x + 0..3]^2
add v1.4S, v1.4S, v5.4S // + d[x + 4..7]^2
sub v2.4S, v22.4S, v18.4S // dst_top[x + 8..11] - dst_top[x + 8..11 - 1]
sub v3.4S, v23.4S, v19.4S // dst_top[x + 12..15] - dst_top[x + 12..15 - 1]
add v2.4S, v2.4S, v6.4S // + d[x + 8..11]^2
add v3.4S, v3.4S, v7.4S // + d[x + 12..15]^2
acc_sum_store v0.4S, v0.16B // accumulate and store dst[ 0..3]
acc_sum_store v1.4S, v1.16B // accumulate and store dst[ 4..7]
acc_sum_store v2.4S, v2.16B // accumulate and store dst[ 8..11]
acc_sum_store v3.4S, v3.16B // accumulate and store dst[12..15]
ld1 {v24.4s}, [x0], #16 // load ...X (contextual last sums)
2: ld1 {v0.16b}, [x2], #16 // s1[x + 0..15]
ld1 {v1.16b}, [x4], #16 // s2[x + 0..15]
ld1 {v16.4s,v17.4s}, [x8], #32 // dst_top[x + 0..7 - 1]
usubl v2.8h, v0.8b, v1.8b // d[x + 0..7] = s1[x + 0..7] - s2[x + 0..7]
usubl2 v3.8h, v0.16b, v1.16b // d[x + 8..15] = s1[x + 8..15] - s2[x + 8..15]
ld1 {v18.4s,v19.4s}, [x8], #32 // dst_top[x + 8..15 - 1]
smull v4.4s, v2.4h, v2.4h // d[x + 0..3]^2
smull2 v5.4s, v2.8h, v2.8h // d[x + 4..7]^2
ld1 {v20.4s,v21.4s}, [x9], #32 // dst_top[x + 0..7]
smull v6.4s, v3.4h, v3.4h // d[x + 8..11]^2
smull2 v7.4s, v3.8h, v3.8h // d[x + 12..15]^2
ld1 {v22.4s,v23.4s}, [x9], #32 // dst_top[x + 8..15]
sub v0.4s, v20.4s, v16.4s // dst_top[x + 0..3] - dst_top[x + 0..3 - 1]
sub v1.4s, v21.4s, v17.4s // dst_top[x + 4..7] - dst_top[x + 4..7 - 1]
add v0.4s, v0.4s, v4.4s // + d[x + 0..3]^2
add v1.4s, v1.4s, v5.4s // + d[x + 4..7]^2
sub v2.4s, v22.4s, v18.4s // dst_top[x + 8..11] - dst_top[x + 8..11 - 1]
sub v3.4s, v23.4s, v19.4s // dst_top[x + 12..15] - dst_top[x + 12..15 - 1]
add v2.4s, v2.4s, v6.4s // + d[x + 8..11]^2
add v3.4s, v3.4s, v7.4s // + d[x + 12..15]^2
acc_sum_store v0.4s, v0.16b // accumulate and store dst[ 0..3]
acc_sum_store v1.4s, v1.16b // accumulate and store dst[ 4..7]
acc_sum_store v2.4s, v2.16b // accumulate and store dst[ 8..11]
acc_sum_store v3.4s, v3.16b // accumulate and store dst[12..15]
subs w10, w10, #16 // width dec
b.ne 2b // loop til next line
add x2, x2, x3 // skip to next line (s1)
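Taken together, the comments describe one row of an integral image of squared differences between s1 and s2. A scalar sketch, assuming dst[-1] and dst_top[-1] denote the previous column's sums (which is what the "dst line minus 4 sums" and "dst_top-1" setup above provides):

    #include <stdint.h>

    /* The dst row becomes the running 2-D integral of (s1[x] - s2[x])^2. */
    static void ssd_integral_row(uint32_t *dst, const uint32_t *dst_top,
                                 const uint8_t *s1, const uint8_t *s2, int w)
    {
        for (int x = 0; x < w; x++) {
            int d = s1[x] - s2[x];                 /* usubl/usubl2          */
            dst[x] = dst[x - 1] + dst_top[x] - dst_top[x - 1]
                   + (uint32_t)(d * d);            /* smull + acc_sum_store */
        }
    }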
@ -25,16 +25,16 @@

function ff_vector_fmul_neon, export=1
1: subs w3, w3, #16
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4S, v3.4S}, [x1], #32
ld1 {v4.4S, v5.4S}, [x2], #32
ld1 {v6.4S, v7.4S}, [x2], #32
fmul v16.4S, v0.4S, v4.4S
fmul v17.4S, v1.4S, v5.4S
fmul v18.4S, v2.4S, v6.4S
fmul v19.4S, v3.4S, v7.4S
st1 {v16.4S, v17.4S}, [x0], #32
st1 {v18.4S, v19.4S}, [x0], #32
ld1 {v0.4s, v1.4s}, [x1], #32
ld1 {v2.4s, v3.4s}, [x1], #32
ld1 {v4.4s, v5.4s}, [x2], #32
ld1 {v6.4s, v7.4s}, [x2], #32
fmul v16.4s, v0.4s, v4.4s
fmul v17.4s, v1.4s, v5.4s
fmul v18.4s, v2.4s, v6.4s
fmul v19.4s, v3.4s, v7.4s
st1 {v16.4s, v17.4s}, [x0], #32
st1 {v18.4s, v19.4s}, [x0], #32
b.ne 1b
ret
endfunc
@ -42,16 +42,16 @@ endfunc
function ff_vector_fmac_scalar_neon, export=1
mov x3, #-32
1: subs w2, w2, #16
ld1 {v16.4S, v17.4S}, [x0], #32
ld1 {v18.4S, v19.4S}, [x0], x3
ld1 {v4.4S, v5.4S}, [x1], #32
ld1 {v6.4S, v7.4S}, [x1], #32
fmla v16.4S, v4.4S, v0.S[0]
fmla v17.4S, v5.4S, v0.S[0]
fmla v18.4S, v6.4S, v0.S[0]
fmla v19.4S, v7.4S, v0.S[0]
st1 {v16.4S, v17.4S}, [x0], #32
st1 {v18.4S, v19.4S}, [x0], #32
ld1 {v16.4s, v17.4s}, [x0], #32
ld1 {v18.4s, v19.4s}, [x0], x3
ld1 {v4.4s, v5.4s}, [x1], #32
ld1 {v6.4s, v7.4s}, [x1], #32
fmla v16.4s, v4.4s, v0.s[0]
fmla v17.4s, v5.4s, v0.s[0]
fmla v18.4s, v6.4s, v0.s[0]
fmla v19.4s, v7.4s, v0.s[0]
st1 {v16.4s, v17.4s}, [x0], #32
st1 {v18.4s, v19.4s}, [x0], #32
b.ne 1b
ret
endfunc
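The loads from x0 and the fmla by v0.s[0] above amount to a fused multiply-accumulate of one scalar into dst. A scalar model (the signature is assumed from the register usage, not quoted from the header):

    static void vector_fmac_scalar_model(float *dst, const float *src,
                                         float mul, int len)
    {
        for (int i = 0; i < len; i++) /* len is a multiple of 16 in the asm */
            dst[i] += src[i] * mul;   /* fmla vDst, vSrc, v0.s[0] */
    }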
@ -59,43 +59,43 @@ endfunc
function ff_vector_fmul_scalar_neon, export=1
mov w4, #15
bics w3, w2, w4
dup v16.4S, v0.S[0]
dup v16.4s, v0.s[0]
b.eq 3f
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v0.4s, v1.4s}, [x1], #32
1: subs w3, w3, #16
fmul v0.4S, v0.4S, v16.4S
ld1 {v2.4S, v3.4S}, [x1], #32
fmul v1.4S, v1.4S, v16.4S
fmul v2.4S, v2.4S, v16.4S
st1 {v0.4S, v1.4S}, [x0], #32
fmul v3.4S, v3.4S, v16.4S
fmul v0.4s, v0.4s, v16.4s
ld1 {v2.4s, v3.4s}, [x1], #32
fmul v1.4s, v1.4s, v16.4s
fmul v2.4s, v2.4s, v16.4s
st1 {v0.4s, v1.4s}, [x0], #32
fmul v3.4s, v3.4s, v16.4s
b.eq 2f
ld1 {v0.4S, v1.4S}, [x1], #32
st1 {v2.4S, v3.4S}, [x0], #32
ld1 {v0.4s, v1.4s}, [x1], #32
st1 {v2.4s, v3.4s}, [x0], #32
b 1b
2: ands w2, w2, #15
st1 {v2.4S, v3.4S}, [x0], #32
st1 {v2.4s, v3.4s}, [x0], #32
b.eq 4f
3: ld1 {v0.4S}, [x1], #16
fmul v0.4S, v0.4S, v16.4S
st1 {v0.4S}, [x0], #16
3: ld1 {v0.4s}, [x1], #16
fmul v0.4s, v0.4s, v16.4s
st1 {v0.4s}, [x0], #16
subs w2, w2, #4
b.gt 3b
4: ret
endfunc

function ff_vector_dmul_scalar_neon, export=1
dup v16.2D, v0.D[0]
ld1 {v0.2D, v1.2D}, [x1], #32
dup v16.2d, v0.d[0]
ld1 {v0.2d, v1.2d}, [x1], #32
1: subs w2, w2, #8
fmul v0.2D, v0.2D, v16.2D
ld1 {v2.2D, v3.2D}, [x1], #32
fmul v1.2D, v1.2D, v16.2D
fmul v2.2D, v2.2D, v16.2D
st1 {v0.2D, v1.2D}, [x0], #32
fmul v3.2D, v3.2D, v16.2D
ld1 {v0.2D, v1.2D}, [x1], #32
st1 {v2.2D, v3.2D}, [x0], #32
fmul v0.2d, v0.2d, v16.2d
ld1 {v2.2d, v3.2d}, [x1], #32
fmul v1.2d, v1.2d, v16.2d
fmul v2.2d, v2.2d, v16.2d
st1 {v0.2d, v1.2d}, [x0], #32
fmul v3.2d, v3.2d, v16.2d
ld1 {v0.2d, v1.2d}, [x1], #32
st1 {v2.2d, v3.2d}, [x0], #32
b.gt 1b
ret
endfunc
@ -108,49 +108,49 @@ function ff_vector_fmul_window_neon, export=1
add x6, x3, x5, lsl #3 // win + 8 * (len - 2)
add x5, x0, x5, lsl #3 // dst + 8 * (len - 2)
mov x7, #-16
ld1 {v0.4S}, [x1], #16 // s0
ld1 {v2.4S}, [x3], #16 // wi
ld1 {v1.4S}, [x2], x7 // s1
1: ld1 {v3.4S}, [x6], x7 // wj
ld1 {v0.4s}, [x1], #16 // s0
ld1 {v2.4s}, [x3], #16 // wi
ld1 {v1.4s}, [x2], x7 // s1
1: ld1 {v3.4s}, [x6], x7 // wj
subs x4, x4, #4
fmul v17.4S, v0.4S, v2.4S // s0 * wi
rev64 v4.4S, v1.4S
rev64 v5.4S, v3.4S
rev64 v17.4S, v17.4S
ext v4.16B, v4.16B, v4.16B, #8 // s1_r
ext v5.16B, v5.16B, v5.16B, #8 // wj_r
ext v17.16B, v17.16B, v17.16B, #8 // (s0 * wi)_rev
fmul v16.4S, v0.4S, v5.4S // s0 * wj_r
fmla v17.4S, v1.4S, v3.4S // (s0 * wi)_rev + s1 * wj
fmul v17.4s, v0.4s, v2.4s // s0 * wi
rev64 v4.4s, v1.4s
rev64 v5.4s, v3.4s
rev64 v17.4s, v17.4s
ext v4.16b, v4.16b, v4.16b, #8 // s1_r
ext v5.16b, v5.16b, v5.16b, #8 // wj_r
ext v17.16b, v17.16b, v17.16b, #8 // (s0 * wi)_rev
fmul v16.4s, v0.4s, v5.4s // s0 * wj_r
fmla v17.4s, v1.4s, v3.4s // (s0 * wi)_rev + s1 * wj
b.eq 2f
ld1 {v0.4S}, [x1], #16
fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
st1 {v17.4S}, [x5], x7
ld1 {v2.4S}, [x3], #16
ld1 {v1.4S}, [x2], x7
st1 {v16.4S}, [x0], #16
ld1 {v0.4s}, [x1], #16
fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi
st1 {v17.4s}, [x5], x7
ld1 {v2.4s}, [x3], #16
ld1 {v1.4s}, [x2], x7
st1 {v16.4s}, [x0], #16
b 1b
2:
fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
st1 {v17.4S}, [x5], x7
st1 {v16.4S}, [x0], #16
fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi
st1 {v17.4s}, [x5], x7
st1 {v16.4s}, [x0], #16
ret
endfunc
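A scalar model of the windowed overlap-add above, matching the forward/backward pointer walks and the fmul/fmls/fmla pairs. This is a sketch under the assumption that dst, src0 and win address the middle of 2*len-element buffers, not a quote of the C reference:

    static void fmul_window_model(float *dst, const float *src0,
                                  const float *src1, const float *win, int len)
    {
        dst  += len;
        win  += len;
        src0 += len;
        for (int i = -len, j = len - 1; i < 0; i++, j--) {
            float s0 = src0[i], s1 = src1[j];
            float wi = win[i],  wj = win[j];
            dst[i] = s0 * wj - s1 * wi; /* fmul v16 + fmls path            */
            dst[j] = s0 * wi + s1 * wj; /* fmul/fmla path, stored reversed */
        }
    }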

function ff_vector_fmul_add_neon, export=1
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4S, v3.4S}, [x2], #32
ld1 {v4.4S, v5.4S}, [x3], #32
ld1 {v0.4s, v1.4s}, [x1], #32
ld1 {v2.4s, v3.4s}, [x2], #32
ld1 {v4.4s, v5.4s}, [x3], #32
1: subs w4, w4, #8
fmla v4.4S, v0.4S, v2.4S
fmla v5.4S, v1.4S, v3.4S
fmla v4.4s, v0.4s, v2.4s
fmla v5.4s, v1.4s, v3.4s
b.eq 2f
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4S, v3.4S}, [x2], #32
st1 {v4.4S, v5.4S}, [x0], #32
ld1 {v4.4S, v5.4S}, [x3], #32
ld1 {v0.4s, v1.4s}, [x1], #32
ld1 {v2.4s, v3.4s}, [x2], #32
st1 {v4.4s, v5.4s}, [x0], #32
ld1 {v4.4s, v5.4s}, [x3], #32
b 1b
2: st1 {v4.4S, v5.4S}, [x0], #32
2: st1 {v4.4s, v5.4s}, [x0], #32
ret
endfunc

@ -159,44 +159,44 @@ function ff_vector_fmul_reverse_neon, export=1
add x2, x2, x3, lsl #2
sub x2, x2, #32
mov x4, #-32
ld1 {v2.4S, v3.4S}, [x2], x4
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4s, v3.4s}, [x2], x4
ld1 {v0.4s, v1.4s}, [x1], #32
1: subs x3, x3, #8
rev64 v3.4S, v3.4S
rev64 v2.4S, v2.4S
ext v3.16B, v3.16B, v3.16B, #8
ext v2.16B, v2.16B, v2.16B, #8
fmul v16.4S, v0.4S, v3.4S
fmul v17.4S, v1.4S, v2.4S
rev64 v3.4s, v3.4s
rev64 v2.4s, v2.4s
ext v3.16b, v3.16b, v3.16b, #8
ext v2.16b, v2.16b, v2.16b, #8
fmul v16.4s, v0.4s, v3.4s
fmul v17.4s, v1.4s, v2.4s
b.eq 2f
ld1 {v2.4S, v3.4S}, [x2], x4
ld1 {v0.4S, v1.4S}, [x1], #32
st1 {v16.4S, v17.4S}, [x0], #32
ld1 {v2.4s, v3.4s}, [x2], x4
ld1 {v0.4s, v1.4s}, [x1], #32
st1 {v16.4s, v17.4s}, [x0], #32
b 1b
2: st1 {v16.4S, v17.4S}, [x0], #32
2: st1 {v16.4s, v17.4s}, [x0], #32
ret
endfunc

function ff_butterflies_float_neon, export=1
1: ld1 {v0.4S}, [x0]
ld1 {v1.4S}, [x1]
1: ld1 {v0.4s}, [x0]
ld1 {v1.4s}, [x1]
subs w2, w2, #4
fsub v2.4S, v0.4S, v1.4S
fadd v3.4S, v0.4S, v1.4S
st1 {v2.4S}, [x1], #16
st1 {v3.4S}, [x0], #16
fsub v2.4s, v0.4s, v1.4s
fadd v3.4s, v0.4s, v1.4s
st1 {v2.4s}, [x1], #16
st1 {v3.4s}, [x0], #16
b.gt 1b
ret
endfunc
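The butterfly overwrites both inputs: sums go back through x0, differences through x1. A scalar model read directly off the fsub/fadd pair and the two store targets:

    static void butterflies_float_model(float *v1, float *v2, int len)
    {
        for (int i = 0; i < len; i++) {
            float t = v1[i] - v2[i]; /* fsub, stored via x1 */
            v1[i]  += v2[i];         /* fadd, stored via x0 */
            v2[i]   = t;
        }
    }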

function ff_scalarproduct_float_neon, export=1
movi v2.4S, #0
1: ld1 {v0.4S}, [x0], #16
ld1 {v1.4S}, [x1], #16
movi v2.4s, #0
1: ld1 {v0.4s}, [x0], #16
ld1 {v1.4s}, [x1], #16
subs w2, w2, #4
fmla v2.4S, v0.4S, v1.4S
fmla v2.4s, v0.4s, v1.4s
b.gt 1b
faddp v0.4S, v2.4S, v2.4S
faddp s0, v0.2S
faddp v0.4s, v2.4s, v2.4s
faddp s0, v0.2s
ret
endfunc
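The fmla loop keeps four partial dot products in v2; the two trailing faddp steps collapse them into the single float returned in s0. Scalar equivalent:

    static float scalarproduct_float_model(const float *v1, const float *v2,
                                           int len)
    {
        float p = 0.0f;
        for (int i = 0; i < len; i++)
            p += v1[i] * v2[i]; /* fmla across lanes, reduced with faddp */
        return p;
    }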

@ -21,57 +21,57 @@
#include "libavutil/aarch64/asm.S"

function ff_resample_common_apply_filter_x4_float_neon, export=1
movi v0.4S, #0 // accumulator
1: ld1 {v1.4S}, [x1], #16 // src[0..3]
ld1 {v2.4S}, [x2], #16 // filter[0..3]
fmla v0.4S, v1.4S, v2.4S // accumulator += src[0..3] * filter[0..3]
movi v0.4s, #0 // accumulator
1: ld1 {v1.4s}, [x1], #16 // src[0..3]
ld1 {v2.4s}, [x2], #16 // filter[0..3]
fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
subs w3, w3, #4 // filter_length -= 4
b.gt 1b // loop until filter_length
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc
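Per the comments, each call emits one output sample: the dot product of the source against one filter phase, four taps per iteration. A scalar sketch (names assumed from the comments, not the resampler's prototype):

    static void apply_filter_x4_float_model(float *dst, const float *src,
                                            const float *filter,
                                            int filter_length)
    {
        float acc = 0.0f;
        for (int i = 0; i < filter_length; i++) /* multiple of 4 here */
            acc += src[i] * filter[i];          /* fmla + faddp reduction */
        *dst = acc;
    }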

function ff_resample_common_apply_filter_x8_float_neon, export=1
movi v0.4S, #0 // accumulator
1: ld1 {v1.4S}, [x1], #16 // src[0..3]
ld1 {v2.4S}, [x2], #16 // filter[0..3]
ld1 {v3.4S}, [x1], #16 // src[4..7]
ld1 {v4.4S}, [x2], #16 // filter[4..7]
fmla v0.4S, v1.4S, v2.4S // accumulator += src[0..3] * filter[0..3]
fmla v0.4S, v3.4S, v4.4S // accumulator += src[4..7] * filter[4..7]
movi v0.4s, #0 // accumulator
1: ld1 {v1.4s}, [x1], #16 // src[0..3]
ld1 {v2.4s}, [x2], #16 // filter[0..3]
ld1 {v3.4s}, [x1], #16 // src[4..7]
ld1 {v4.4s}, [x2], #16 // filter[4..7]
fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
fmla v0.4s, v3.4s, v4.4s // accumulator += src[4..7] * filter[4..7]
subs w3, w3, #8 // filter_length -= 8
b.gt 1b // loop until filter_length
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc

function ff_resample_common_apply_filter_x4_s16_neon, export=1
movi v0.4S, #0 // accumulator
1: ld1 {v1.4H}, [x1], #8 // src[0..3]
ld1 {v2.4H}, [x2], #8 // filter[0..3]
smlal v0.4S, v1.4H, v2.4H // accumulator += src[0..3] * filter[0..3]
movi v0.4s, #0 // accumulator
1: ld1 {v1.4h}, [x1], #8 // src[0..3]
ld1 {v2.4h}, [x2], #8 // filter[0..3]
smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
subs w3, w3, #4 // filter_length -= 4
b.gt 1b // loop until filter_length
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc

function ff_resample_common_apply_filter_x8_s16_neon, export=1
movi v0.4S, #0 // accumulator
1: ld1 {v1.8H}, [x1], #16 // src[0..7]
ld1 {v2.8H}, [x2], #16 // filter[0..7]
smlal v0.4S, v1.4H, v2.4H // accumulator += src[0..3] * filter[0..3]
smlal2 v0.4S, v1.8H, v2.8H // accumulator += src[4..7] * filter[4..7]
movi v0.4s, #0 // accumulator
1: ld1 {v1.8h}, [x1], #16 // src[0..7]
ld1 {v2.8h}, [x2], #16 // filter[0..7]
smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
smlal2 v0.4s, v1.8h, v2.8h // accumulator += src[4..7] * filter[4..7]
subs w3, w3, #8 // filter_length -= 8
b.gt 1b // loop until filter_length
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc

@ -50,43 +50,43 @@ function ff_hscale8to15_X8_neon, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3])
movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w0, UXTW // srcp + filterPos[1]
add x0, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v4.8H, v4.8B // unpack part 1 to 16-bit
smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v6.8H, v6.8B // unpack part 2 to 16-bit
smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
uxtl v16.8H, v16.8B // unpack part 3 to 16-bit
smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v18.8H, v18.8B // unpack part 4 to 16-bit
smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
st1 {v0.4H}, [x1], #8 // write to destination part0123
sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
st1 {v0.4h}, [x1], #8 // write to destination part0123
b.gt 1b // loop until end of line
ret
endfunc
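The X8 kernel above evaluates, four output pixels at a time, the sum its comments spell out; the final sqshrn #7 is a saturating narrowing right shift to int16_t. A scalar sketch of the whole loop (parameter names assumed from the comments, not the swscale prototype):

    #include <stdint.h>

    static void hscale8to15_model(int16_t *dst, int dstW, const uint8_t *src,
                                  const int16_t *filter, const int *filterPos,
                                  int filterSize)
    {
        for (int i = 0; i < dstW; i++) {
            int val = 0;
            for (int j = 0; j < filterSize; j++)
                val += src[filterPos[i] + j] * filter[i * filterSize + j];
            val >>= 7;                      /* sqshrn #7: shift ...       */
            if (val >  32767) val =  32767; /* ... with signed saturation */
            if (val < -32768) val = -32768;
            dst[i] = (int16_t)val;
        }
    }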
|
||||
@ -245,7 +245,7 @@ function ff_hscale8to15_4_neon, export=1
|
||||
stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
|
||||
|
||||
1:
|
||||
ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp] // transpose 8 bytes each from src into 4 registers
|
||||
ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers
|
||||
|
||||
// load 8 values from filterPos to be used as offsets into src
|
||||
ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration
|
||||
@ -253,74 +253,74 @@ function ff_hscale8to15_4_neon, export=1
|
||||
ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
|
||||
ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
|
||||
|
||||
movi v0.2D, #0 // Clear madd accumulator for idx 0..3
|
||||
movi v5.2D, #0 // Clear madd accumulator for idx 4..7
|
||||
movi v0.2d, #0 // Clear madd accumulator for idx 0..3
|
||||
movi v5.2d, #0 // Clear madd accumulator for idx 4..7
|
||||
|
||||
ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7
|
||||
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
|
||||
|
||||
add x5, x5, #32 // advance filterPos
|
||||
|
||||
// interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
|
||||
uxtl v16.8H, v16.8B // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v17.8H, v17.8B // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit
|
||||
ldr w8, [x3, w8, UXTW] // src[filterPos[idx + 0]], next iteration
|
||||
ldr w9, [x3, w9, UXTW] // src[filterPos[idx + 1]], next iteration
|
||||
uxtl v18.8H, v18.8B // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v19.8H, v19.8B // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v18.8h, v18.8b // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v19.8h, v19.8b // unsigned extend long, covert src data to 16-bit
|
||||
ldr w10, [x3, w10, UXTW] // src[filterPos[idx + 2]], next iteration
|
||||
ldr w11, [x3, w11, UXTW] // src[filterPos[idx + 3]], next iteration
|
||||
|
||||
smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3
|
||||
smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3
|
||||
smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
|
||||
smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
|
||||
ldr w12, [x3, w12, UXTW] // src[filterPos[idx + 4]], next iteration
|
||||
ldr w13, [x3, w13, UXTW] // src[filterPos[idx + 5]], next iteration
|
||||
smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3
|
||||
smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3
|
||||
smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
|
||||
smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
|
||||
ldr w14, [x3, w14, UXTW] // src[filterPos[idx + 6]], next iteration
|
||||
ldr w15, [x3, w15, UXTW] // src[filterPos[idx + 7]], next iteration
|
||||
|
||||
smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7
|
||||
smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7
|
||||
smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
|
||||
smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
|
||||
stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
|
||||
stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
|
||||
smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7
|
||||
smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7
|
||||
smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
|
||||
smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
|
||||
stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
|
||||
stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
|
||||
|
||||
sub w2, w2, #8 // dstW -= 8
|
||||
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
|
||||
sqshrn v1.4H, v5.4S, #7 // shift and clip the 2x16-bit final values
|
||||
st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7]
|
||||
sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
|
||||
sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values
|
||||
st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
|
||||
cmp w2, #16 // continue on main loop if there are at least 16 iterations left
|
||||
b.ge 1b
|
||||
|
||||
// last full iteration
|
||||
ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp]
|
||||
ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7
|
||||
ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
|
||||
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
|
||||
|
||||
movi v0.2D, #0 // Clear madd accumulator for idx 0..3
|
||||
movi v5.2D, #0 // Clear madd accumulator for idx 4..7
|
||||
movi v0.2d, #0 // Clear madd accumulator for idx 0..3
|
||||
movi v5.2d, #0 // Clear madd accumulator for idx 4..7
|
||||
|
||||
uxtl v16.8H, v16.8B // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v17.8H, v17.8B // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v18.8H, v18.8B // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v19.8H, v19.8B // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v18.8h, v18.8b // unsigned extend long, covert src data to 16-bit
|
||||
uxtl v19.8h, v19.8b // unsigned extend long, covert src data to 16-bit
|
||||
|
||||
smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3
|
||||
smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3
|
||||
smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3
|
||||
smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3
|
||||
smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
|
||||
smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
|
||||
smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
|
||||
smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
|
||||
|
||||
smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7
|
||||
smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7
|
||||
smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7
|
||||
smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7
|
||||
smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
|
||||
smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
|
||||
smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
|
||||
smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
|
||||
|
||||
subs w2, w2, #8 // dstW -= 8
|
||||
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
|
||||
sqshrn v1.4H, v5.4S, #7 // shift and clip the 2x16-bit final values
|
||||
st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7]
|
||||
sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
|
||||
sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values
|
||||
st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
|
||||
|
||||
cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section
|
||||
|
||||
@ -332,15 +332,15 @@ function ff_hscale8to15_4_neon, export=1
|
||||
// load src
|
||||
ldr w8, [x5], #4 // filterPos[i]
|
||||
add x9, x3, w8, UXTW // calculate the address for src load
|
||||
ld1 {v5.S}[0], [x9] // src[filterPos[i] + 0..3]
|
||||
ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3]
|
||||
// load filter
|
||||
ld1 {v6.4H}, [x4], #8 // filter[filterSize * i + 0..3]
|
||||
ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3]
|
||||
|
||||
uxtl v5.8H, v5.8B // unsigned exten long, convert src data to 16-bit
|
||||
smull v0.4S, v5.4H, v6.4H // 4 iterations of src[...] * filter[...]
|
||||
addv s0, v0.4S // add up products of src and filter values
|
||||
uxtl v5.8h, v5.8b // unsigned exten long, convert src data to 16-bit
|
||||
smull v0.4s, v5.4h, v6.4h // 4 iterations of src[...] * filter[...]
|
||||
addv s0, v0.4s // add up products of src and filter values
|
||||
sqshrn h0, s0, #7 // shift and clip the 2x16-bit final value
|
||||
st1 {v0.H}[0], [x1], #2 // dst[i] = ...
|
||||
st1 {v0.h}[0], [x1], #2 // dst[i] = ...
|
||||
sub w2, w2, #1 // dstW--
|
||||
cbnz w2, 2b
|
||||
|
||||
@ -445,12 +445,12 @@ function ff_hscale8to19_4_neon, export=1
|
||||
smull v5.4s, v0.4h, v28.4h
|
||||
smull2 v6.4s, v0.8h, v28.8h
|
||||
uxtl v2.8h, v2.8b
|
||||
smlal v5.4s, v1.4h, v29.4H
|
||||
smlal2 v6.4s, v1.8h, v29.8H
|
||||
smlal v5.4s, v1.4h, v29.4h
|
||||
smlal2 v6.4s, v1.8h, v29.8h
|
||||
uxtl v3.8h, v3.8b
|
||||
smlal v5.4s, v2.4h, v30.4H
|
||||
smlal2 v6.4s, v2.8h, v30.8H
|
||||
smlal v5.4s, v3.4h, v31.4H
|
||||
smlal v5.4s, v2.4h, v30.4h
|
||||
smlal2 v6.4s, v2.8h, v30.8h
|
||||
smlal v5.4s, v3.4h, v31.4h
|
||||
smlal2 v6.4s, v3.8h, v31.8h
|
||||
|
||||
sshr v5.4s, v5.4s, #3
|
||||
@ -472,8 +472,8 @@ function ff_hscale8to19_4_neon, export=1
|
||||
ld1 {v0.s}[0], [x9] // load 4 * uint8_t* into one single
|
||||
ld1 {v31.4h}, [x4], #8
|
||||
uxtl v0.8h, v0.8b
|
||||
smull v5.4s, v0.4h, v31.4H
|
||||
saddlv d0, v5.4S
|
||||
smull v5.4s, v0.4h, v31.4h
|
||||
saddlv d0, v5.4s
|
||||
sqshrn s0, d0, #3
|
||||
smin v0.4s, v0.4s, v18.4s
|
||||
st1 {v0.s}[0], [x1], #4
|
||||
@ -499,42 +499,42 @@ function ff_hscale8to19_X8_neon, export=1
|
||||
ldr w11, [x5], #4 // filterPos[idx + 2]
|
||||
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
||||
ldr w9, [x5], #4 // filterPos[idx + 3]
|
||||
movi v0.2D, #0 // val sum part 1 (for dst[0])
|
||||
movi v1.2D, #0 // val sum part 2 (for dst[1])
|
||||
movi v2.2D, #0 // val sum part 3 (for dst[2])
|
||||
movi v3.2D, #0 // val sum part 4 (for dst[3])
|
||||
movi v0.2d, #0 // val sum part 1 (for dst[0])
|
||||
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
||||
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
||||
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
||||
add x17, x3, w8, UXTW // srcp + filterPos[0]
|
||||
add x8, x3, w0, UXTW // srcp + filterPos[1]
|
||||
add x0, x3, w11, UXTW // srcp + filterPos[2]
|
||||
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
uxtl v4.8H, v4.8B // unpack part 1 to 16-bit
smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}]
smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}]
uxtl v6.8H, v6.8B // unpack part 2 to 16-bit
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v16.8H, v16.8B // unpack part 3 to 16-bit
smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
uxtl v18.8H, v18.8B // unpack part 4 to 16-bit
smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
subs w15, w15, #8 // j -= 8: processed 8/filterSize
smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sshr v0.4s, v0.4S, #3 // shift and clip the 4x32-bit final values
sshr v0.4s, v0.4s, #3 // shift and clip the 4x32-bit final values
smin v0.4s, v0.4s, v20.4s
st1 {v0.4s}, [x1], #16 // write to destination part0123
b.gt 1b // loop until end of line
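// In rough C terms, each pass of the outer loop above produces four adjacent
// outputs of the horizontal scaler (an illustrative sketch, not the exact
// scaler source; "fs" is filterSize, a multiple of 8, and the 19-bit clip
// limit is assumed to be what v20 holds):
//
//     for (int k = 0; k < 4; k++) {
//         int32_t sum = 0;
//         for (int j = 0; j < fs; j++)
//             sum += srcp[filterPos[i + k] + j] * filter[(i + k) * fs + j];
//         dst[i + k] = FFMIN(sum >> 3, (1 << 19) - 1);
//     }
//
// The smlal/smlal2 pairs keep the four sums vectorized in v0-v3, and the
// addp chain folds each 4-lane accumulator into a single 32-bit result.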
@ -588,16 +588,16 @@ function ff_hscale8to19_X4_neon, export=1
smlal2 v16.4s, v4.8h, v31.8h // multiplication of upper half for idx 0
ldr d6, [x10], #8 // load src values for idx 2
ldr q29, [x14, x16] // load filter values for idx 2
smlal v17.4s, v5.4h, v30.4H // multiplication of lower half for idx 1
smlal v17.4s, v5.4h, v30.4h // multiplication of lower half for idx 1
ldr d7, [x11], #8 // load src values for idx 3
smlal2 v17.4s, v5.8h, v30.8H // multiplication of upper half for idx 1
uxtl v6.8h, v6.8B // extend type to match the filter's size
smlal2 v17.4s, v5.8h, v30.8h // multiplication of upper half for idx 1
uxtl v6.8h, v6.8b // extend type to match the filter's size
ldr q28, [x15, x16] // load filter values for idx 3
smlal v18.4s, v6.4h, v29.4h // multiplication of lower half for idx 2
uxtl v7.8h, v7.8B
smlal2 v18.4s, v6.8h, v29.8H // multiplication of upper half for idx 2
uxtl v7.8h, v7.8b
smlal2 v18.4s, v6.8h, v29.8h // multiplication of upper half for idx 2
sub w0, w0, #8
smlal v19.4s, v7.4h, v28.4H // multiplication of lower half for idx 3
smlal v19.4s, v7.4h, v28.4h // multiplication of lower half for idx 3
cmp w0, #8
smlal2 v19.4s, v7.8h, v28.8h // multiplication of upper half for idx 3
add x16, x16, #16 // advance filter values indexing
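// smlal multiplies the four low 16-bit lanes of its two sources, widens each
// product to 32 bits and adds it to the accumulator; smlal2 does the same
// with the four high lanes. One smlal/smlal2 pair therefore consumes a full
// 8x16-bit src/filter register pair. Per lane it is roughly:
//     acc[l] += (int32_t)a[l] * (int32_t)b[l];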
@ -618,11 +618,11 @@ function ff_hscale8to19_X4_neon, export=1
uxtl v5.8h, v5.8b // extend type to match the filter's size
ldr s6, [x10] // load src values for idx 2
smlal v17.4s, v5.4h, v30.4h
uxtl v6.8h, v6.8B // extend type to match the filter's size
uxtl v6.8h, v6.8b // extend type to match the filter's size
ldr d29, [x14, x17] // load filter values for idx 2
ldr s7, [x11] // load src values for idx 3
addp v16.4s, v16.4s, v17.4s
uxtl v7.8h, v7.8B
uxtl v7.8h, v7.8b
ldr d28, [x15, x17] // load filter values for idx 3
smlal v18.4s, v6.4h, v29.4h
smlal v19.4s, v7.4h, v28.4h
@ -700,31 +700,31 @@ function ff_hscale16to15_4_neon_asm, export=1
// Extending to 32 bits is necessary, as uint16_t values can't
// be represented as int16_t without type promotion.
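// For example, the sample 0xFFFF must contribute 65535 to the sum; read as
// int16_t it would contribute -1. Roughly, in C terms (illustrative only):
//     int32_t ok  = (int32_t)(uint16_t)src[j] * filter[j]; // correct
//     int32_t bad = (int16_t)src[j] * filter[j];           // wrong for src[j] > 0x7FFF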
uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H
sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H
sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s

sxtl v27.4s, v29.4H
sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v29.8H
sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v28.4s, v0.4s

sxtl v27.4s, v30.4H
sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v30.8H
sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v28.4s, v0.4s

sxtl v27.4s, v31.4H
sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v31.8H
sxtl2 v28.4s, v31.8h
sub w2, w2, #8
mla v6.4s, v28.4s, v0.4s

@ -775,31 +775,31 @@ function ff_hscale16to15_4_neon_asm, export=1
ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64

uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H
sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H
sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s

sxtl v27.4s, v29.4H
sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v29.8H
sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v0.4s, v28.4s

sxtl v27.4s, v30.4H
sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v30.8H
sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v0.4s, v28.4s

sxtl v27.4s, v31.4H
sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v31.8H
sxtl2 v28.4s, v31.8h
subs w2, w2, #8
mla v6.4s, v0.4s, v28.4s

@ -807,7 +807,7 @@ function ff_hscale16to15_4_neon_asm, export=1
sshl v6.4s, v6.4s, v17.4s
smin v5.4s, v5.4s, v18.4s
smin v6.4s, v6.4s, v18.4s
xtn v5.4h, v5.4S
xtn v5.4h, v5.4s
xtn2 v5.8h, v6.4s

st1 {v5.8h}, [x1], #16
@ -826,7 +826,7 @@ function ff_hscale16to15_4_neon_asm, export=1
uxtl v0.4s, v0.4h
sxtl v31.4s, v31.4h
mul v5.4s, v0.4s, v31.4s
addv s0, v5.4S
addv s0, v5.4s
sshl v0.4s, v0.4s, v17.4s
smin v0.4s, v0.4s, v18.4s
st1 {v0.h}[0], [x1], #2
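// addv s0, v5.4s reduces the vector horizontally: s0 = v5[0] + v5[1] +
// v5[2] + v5[3]. That is all the tail loop above needs, since it produces a
// single output value per iteration instead of four.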
@ -865,58 +865,58 @@ function ff_hscale16to15_X8_neon_asm, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3])
movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w10, UXTW // srcp + filterPos[1]
add x10, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
2: ld1 {v4.8H}, [x17], #16 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8H}, [x8], #16 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v24.4s, v4.4H // extend srcp lower half to 32 bits to preserve sign
sxtl v25.4s, v5.4H // extend filter lower half to 32 bits to match srcp size
2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
mla v0.4S, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
sxtl v27.4s, v7.4H // extend filter lower half
uxtl2 v6.4s, v6.8H // extend srcp upper half
mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
sxtl v27.4s, v7.4h // extend filter lower half
uxtl2 v6.4s, v6.8h // extend srcp upper half
sxtl2 v7.4s, v7.8h // extend filter upper half
ld1 {v16.8H}, [x10], #16 // srcp[filterPos[2] + {0..7}]
mla v1.4S, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v22.4s, v16.4H // extend srcp lower half
sxtl v23.4s, v17.4H // extend filter lower half
uxtl2 v16.4s, v16.8H // extend srcp upper half
ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v22.4s, v16.4h // extend srcp lower half
sxtl v23.4s, v17.4h // extend filter lower half
uxtl2 v16.4s, v16.8h // extend srcp upper half
sxtl2 v17.4s, v17.8h // extend filter upper half
mla v2.4S, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
mla v2.4S, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8H}, [x11], #16 // srcp[filterPos[3] + {0..7}]
mla v1.4S, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v28.4s, v18.4H // extend srcp lower half
sxtl v29.4s, v19.4H // extend filter lower half
uxtl2 v18.4s, v18.8H // extend srcp upper half
uxtl v28.4s, v18.4h // extend srcp lower half
sxtl v29.4s, v19.4h // extend filter lower half
uxtl2 v18.4s, v18.8h // extend srcp upper half
sxtl2 v19.4s, v19.8h // extend filter upper half
mla v3.4S, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
mla v3.4S, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sshl v0.4s, v0.4s, v21.4s // shift left (effectively right, as shift is negative); overflow expected
smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)
xtn v0.4h, v0.4s // narrow down to 16 bits

st1 {v0.4H}, [x1], #8 // write to destination part0123
st1 {v0.4h}, [x1], #8 // write to destination part0123
b.gt 1b // loop until end of line
ret
endfunc
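// The 16-bit input paths follow the same scheme as the 8-bit ones, but have
// to widen both operands to 32 bits first (uxtl for the unsigned samples,
// sxtl for the signed coefficients) and accumulate with mla, as there is no
// single widening multiply-accumulate mixing signed and unsigned 16-bit
// operands. A rough sketch of one output (illustrative only; "sh" stands for
// the per-depth shift whose negated value is assumed to sit in v21 for sshl):
//
//     int32_t sum = 0;
//     for (int j = 0; j < fs; j++)
//         sum += (int32_t)(uint16_t)srcp[pos + j] * filter[j];
//     dst[k] = FFMIN(sum >> sh, (1 << 15) - 1);
//
// sshl with a negative shift count shifts right, and the following smin
// applies the clip that a saturating narrow would otherwise perform.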
@ -1108,31 +1108,31 @@ function ff_hscale16to19_4_neon_asm, export=1
// Extending to 32 bits is necessary, as uint16_t values can't
// be represented as int16_t without type promotion.
uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H
sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H
sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s

sxtl v27.4s, v29.4H
sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v29.8H
sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v28.4s, v0.4s

sxtl v27.4s, v30.4H
sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v30.8H
sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v28.4s, v0.4s

sxtl v27.4s, v31.4H
sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v31.8H
sxtl2 v28.4s, v31.8h
sub w2, w2, #8
mla v6.4s, v28.4s, v0.4s

@ -1181,31 +1181,31 @@ function ff_hscale16to19_4_neon_asm, export=1
ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64

uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H
sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H
sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s

sxtl v27.4s, v29.4H
sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v29.8H
sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v0.4s, v28.4s

sxtl v27.4s, v30.4H
sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v30.8H
sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v0.4s, v28.4s

sxtl v27.4s, v31.4H
sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v31.8H
sxtl2 v28.4s, v31.8h
subs w2, w2, #8
mla v6.4s, v0.4s, v28.4s

@ -1232,7 +1232,7 @@ function ff_hscale16to19_4_neon_asm, export=1
sxtl v31.4s, v31.4h
subs w2, w2, #1
mul v5.4s, v0.4s, v31.4s
addv s0, v5.4S
addv s0, v5.4s
sshl v0.4s, v0.4s, v17.4s
smin v0.4s, v0.4s, v18.4s
st1 {v0.s}[0], [x1], #4
@ -1270,52 +1270,52 @@ function ff_hscale16to19_X8_neon_asm, export=1
add x13, x12, x7 // filter2 = filter1 + filterSize*2
lsl w10, w10, #1
add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3])
movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w10, UXTW // srcp + filterPos[1]
add x10, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
2: ld1 {v4.8H}, [x17], #16 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8H}, [x8], #16 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v24.4s, v4.4H // extend srcp lower half to 32 bits to preserve sign
sxtl v25.4s, v5.4H // extend filter lower half to 32 bits to match srcp size
2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
mla v0.4S, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
sxtl v27.4s, v7.4H // extend filter lower half
uxtl2 v6.4s, v6.8H // extend srcp upper half
mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
sxtl v27.4s, v7.4h // extend filter lower half
uxtl2 v6.4s, v6.8h // extend srcp upper half
sxtl2 v7.4s, v7.8h // extend filter upper half
ld1 {v16.8H}, [x10], #16 // srcp[filterPos[2] + {0..7}]
mla v1.4S, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v22.4s, v16.4H // extend srcp lower half
sxtl v23.4s, v17.4H // extend filter lower half
uxtl2 v16.4s, v16.8H // extend srcp upper half
ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v22.4s, v16.4h // extend srcp lower half
sxtl v23.4s, v17.4h // extend filter lower half
uxtl2 v16.4s, v16.8h // extend srcp upper half
sxtl2 v17.4s, v17.8h // extend filter upper half
mla v2.4S, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
mla v2.4S, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8H}, [x11], #16 // srcp[filterPos[3] + {0..7}]
mla v1.4S, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v28.4s, v18.4H // extend srcp lower half
sxtl v29.4s, v19.4H // extend filter lower half
uxtl2 v18.4s, v18.8H // extend srcp upper half
uxtl v28.4s, v18.4h // extend srcp lower half
sxtl v29.4s, v19.4h // extend filter lower half
uxtl2 v18.4s, v18.8h // extend srcp upper half
sxtl2 v19.4s, v19.8h // extend filter upper half
mla v3.4S, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
mla v3.4S, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sshl v0.4s, v0.4s, v21.4s // shift left (effectively right, as shift is negative); overflow expected
smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)

@ -29,13 +29,13 @@ function ff_yuv2planeX_8_neon, export=1
// x5 - const uint8_t *dither,
// w6 - int offset

ld1 {v0.8B}, [x5] // load 8x8-bit dither
ld1 {v0.8b}, [x5] // load 8x8-bit dither
and w6, w6, #7
cbz w6, 1f // check if offsetting present
ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only
1: uxtl v0.8H, v0.8B // extend dither to 16-bit
ushll v1.4S, v0.4H, #12 // extend dither to 32-bit with left shift by 12 (part 1)
ushll2 v2.4S, v0.8H, #12 // extend dither to 32-bit with left shift by 12 (part 2)
ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
1: uxtl v0.8h, v0.8b // extend dither to 16-bit
ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1)
ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2)
cmp w1, #8 // if filterSize == 8, branch to specialized version
b.eq 6f
cmp w1, #4 // if filterSize == 4, branch to specialized version
@ -48,8 +48,8 @@ function ff_yuv2planeX_8_neon, export=1
mov x7, #0 // i = 0
tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version
// fs % 2 == 0
2: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
mov w8, w1 // tmpfilterSize = filterSize
mov x9, x2 // srcp = src
mov x10, x0 // filterp = filter
@ -57,12 +57,12 @@ function ff_yuv2planeX_8_neon, export=1
ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1]
add x11, x11, x7, lsl #1 // &src[j ][i]
add x12, x12, x7, lsl #1 // &src[j+1][i]
ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
ld1 {v6.8H}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
smlal v3.4S, v5.4H, v7.H[0] // val0 += {A,B,C,D} * X
smlal2 v4.4S, v5.8H, v7.H[0] // val1 += {E,F,G,H} * X
smlal v3.4S, v6.4H, v7.H[1] // val0 += {I,J,K,L} * Y
smlal2 v4.4S, v6.8H, v7.H[1] // val1 += {M,N,O,P} * Y
ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
smlal v3.4s, v5.4h, v7.h[0] // val0 += {A,B,C,D} * X
smlal2 v4.4s, v5.8h, v7.h[0] // val1 += {E,F,G,H} * X
smlal v3.4s, v6.4h, v7.h[1] // val0 += {I,J,K,L} * Y
smlal2 v4.4s, v6.8h, v7.h[1] // val1 += {M,N,O,P} * Y
subs w8, w8, #2 // tmpfilterSize -= 2
b.gt 3b // loop until filterSize consumed

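// In rough C terms the accumulation above matches the scalar reference
// (sketch only; av_clip_uint8 as in libavutil):
//
//     int32_t val = dither[(i + offset) & 7] << 12;
//     for (int j = 0; j < filterSize; j++)
//         val += src[j][i] * filter[j];
//     dest[i] = av_clip_uint8(val >> 19);
//
// Seeding the accumulators with the dither shifted left by 12 lets the
// final >> 19 apply the dithering without any extra instruction.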
@ -77,17 +77,17 @@ function ff_yuv2planeX_8_neon, export=1

// If filter size is odd (most likely == 1), then use this section.
// fs % 2 != 0
4: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
4: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
mov w8, w1 // tmpfilterSize = filterSize
mov x9, x2 // srcp = src
mov x10, x0 // filterp = filter
5: ldr x11, [x9], #8 // get 1 pointer: src[j]
ldr h6, [x10], #2 // read 1 16 bit coeff X at filter[j]
add x11, x11, x7, lsl #1 // &src[j ][i]
ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
smlal v3.4S, v5.4H, v6.H[0] // val0 += {A,B,C,D} * X
smlal2 v4.4S, v5.8H, v6.H[0] // val1 += {E,F,G,H} * X
ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
smlal v3.4s, v5.4h, v6.h[0] // val0 += {A,B,C,D} * X
smlal2 v4.4s, v5.8h, v6.h[0] // val1 += {E,F,G,H} * X
subs w8, w8, #1 // tmpfilterSize -= 1
b.gt 5b // loop until filterSize consumed

@ -107,36 +107,36 @@ function ff_yuv2planeX_8_neon, export=1
ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7]

// load 8x16-bit values for filter[j], where j=0..7
ld1 {v6.8H}, [x0]
ld1 {v6.8h}, [x0]
7:
mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value

ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v26.8H}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
ld1 {v27.8H}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
ld1 {v28.8H}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}]
ld1 {v29.8H}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}]
ld1 {v30.8H}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}]
ld1 {v31.8H}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}]
ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
ld1 {v28.8h}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}]
ld1 {v29.8h}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}]
ld1 {v30.8h}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}]
ld1 {v31.8h}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}]

smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4S, v26.4H, v6.H[2] // val0 += src[2][i + {0..3}] * filter[2]
smlal2 v4.4S, v26.8H, v6.H[2] // val1 += src[2][i + {4..7}] * filter[2]
smlal v3.4S, v27.4H, v6.H[3] // val0 += src[3][i + {0..3}] * filter[3]
smlal2 v4.4S, v27.8H, v6.H[3] // val1 += src[3][i + {4..7}] * filter[3]
smlal v3.4S, v28.4H, v6.H[4] // val0 += src[4][i + {0..3}] * filter[4]
smlal2 v4.4S, v28.8H, v6.H[4] // val1 += src[4][i + {4..7}] * filter[4]
smlal v3.4S, v29.4H, v6.H[5] // val0 += src[5][i + {0..3}] * filter[5]
smlal2 v4.4S, v29.8H, v6.H[5] // val1 += src[5][i + {4..7}] * filter[5]
smlal v3.4S, v30.4H, v6.H[6] // val0 += src[6][i + {0..3}] * filter[6]
smlal2 v4.4S, v30.8H, v6.H[6] // val1 += src[6][i + {4..7}] * filter[6]
smlal v3.4S, v31.4H, v6.H[7] // val0 += src[7][i + {0..3}] * filter[7]
smlal2 v4.4S, v31.8H, v6.H[7] // val1 += src[7][i + {4..7}] * filter[7]
smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
smlal v3.4s, v28.4h, v6.h[4] // val0 += src[4][i + {0..3}] * filter[4]
smlal2 v4.4s, v28.8h, v6.h[4] // val1 += src[4][i + {4..7}] * filter[4]
smlal v3.4s, v29.4h, v6.h[5] // val0 += src[5][i + {0..3}] * filter[5]
smlal2 v4.4s, v29.8h, v6.h[5] // val1 += src[5][i + {4..7}] * filter[5]
smlal v3.4s, v30.4h, v6.h[6] // val0 += src[6][i + {0..3}] * filter[6]
smlal2 v4.4s, v30.8h, v6.h[6] // val1 += src[6][i + {4..7}] * filter[6]
smlal v3.4s, v31.4h, v6.h[7] // val0 += src[7][i + {0..3}] * filter[7]
smlal2 v4.4s, v31.8h, v6.h[7] // val1 += src[7][i + {4..7}] * filter[7]

sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
@ -151,24 +151,24 @@ function ff_yuv2planeX_8_neon, export=1
ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3]

// load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes
ld1 {v6.4H}, [x0]
ld1 {v6.4h}, [x0]
9:
mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value

ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v26.8H}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
ld1 {v27.8H}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]

smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4S, v26.4H, v6.H[2] // val0 += src[2][i + {0..3}] * filter[2]
smlal2 v4.4S, v26.8H, v6.H[2] // val1 += src[2][i + {4..7}] * filter[2]
smlal v3.4S, v27.4H, v6.H[3] // val0 += src[3][i + {0..3}] * filter[3]
smlal2 v4.4S, v27.8H, v6.H[3] // val1 += src[3][i + {4..7}] * filter[3]
smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]

sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
@ -184,16 +184,16 @@ function ff_yuv2planeX_8_neon, export=1
// load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes
ldr s6, [x0]
11:
mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value

ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]

smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]

sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
@ -210,11 +210,11 @@ function ff_yuv2plane1_8_neon, export=1
// w2 - int dstW,
// x3 - const uint8_t *dither,
// w4 - int offset
ld1 {v0.8B}, [x3] // load 8x8-bit dither
ld1 {v0.8b}, [x3] // load 8x8-bit dither
and w4, w4, #7
cbz w4, 1f // check if offsetting present
ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only
1: uxtl v0.8H, v0.8B // extend dither to 16-bit
ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
1: uxtl v0.8h, v0.8b // extend dither to 16-bit
uxtl v1.4s, v0.4h
uxtl2 v2.4s, v0.8h
2:

@ -33,9 +33,9 @@
.macro load_args_nv12
ldr x8, [sp] // table
load_yoff_ycoeff 8, 16 // y_offset, y_coeff
ld1 {v1.1D}, [x8]
dup v0.8H, w10
dup v3.8H, w9
ld1 {v1.1d}, [x8]
dup v0.8h, w10
dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
@ -51,9 +51,9 @@
ldr w14, [sp, #8] // linesizeV
ldr x8, [sp, #16] // table
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
ld1 {v1.1D}, [x8]
dup v0.8H, w10
dup v3.8H, w9
ld1 {v1.1d}, [x8]
dup v0.8h, w10
dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@ -67,9 +67,9 @@
ldr w14, [sp, #8] // linesizeV
ldr x8, [sp, #16] // table
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
ld1 {v1.1D}, [x8]
dup v0.8H, w10
dup v3.8H, w9
ld1 {v1.1d}, [x8]
dup v0.8h, w10
dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@ -77,22 +77,22 @@
.endm

.macro load_chroma_nv12
ld2 {v16.8B, v17.8B}, [x6], #16
ushll v18.8H, v16.8B, #3
ushll v19.8H, v17.8B, #3
ld2 {v16.8b, v17.8b}, [x6], #16
ushll v18.8h, v16.8b, #3
ushll v19.8h, v17.8b, #3
.endm

.macro load_chroma_nv21
ld2 {v16.8B, v17.8B}, [x6], #16
ushll v19.8H, v16.8B, #3
ushll v18.8H, v17.8B, #3
ld2 {v16.8b, v17.8b}, [x6], #16
ushll v19.8h, v16.8b, #3
ushll v18.8h, v17.8b, #3
.endm

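// ld2 deinterleaves the packed chroma bytes on load, so the nv12 and nv21
// macros above differ only in which destination register receives Cb and
// which receives Cr. The ushll #3 widens the 8-bit chroma to 16 bits while
// pre-multiplying by 1 << 3, the fixed-point scale the conversion math
// further down expects (see the "U*(1<<3)" comments there).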
.macro load_chroma_yuv420p
ld1 {v16.8B}, [ x6], #8
ld1 {v17.8B}, [x13], #8
ushll v18.8H, v16.8B, #3
ushll v19.8H, v17.8B, #3
ld1 {v16.8b}, [ x6], #8
ld1 {v17.8b}, [x13], #8
ushll v18.8h, v16.8b, #3
ushll v19.8h, v17.8b, #3
.endm

.macro load_chroma_yuv422p
@ -123,18 +123,18 @@
.endm

.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
add v20.8H, v26.8H, v20.8H // Y1 + R1
add v21.8H, v27.8H, v21.8H // Y2 + R2
add v22.8H, v26.8H, v22.8H // Y1 + G1
add v23.8H, v27.8H, v23.8H // Y2 + G2
add v24.8H, v26.8H, v24.8H // Y1 + B1
add v25.8H, v27.8H, v25.8H // Y2 + B2
sqrshrun \r1, v20.8H, #1 // clip_u8((Y1 + R1) >> 1)
sqrshrun \r2, v21.8H, #1 // clip_u8((Y2 + R2) >> 1)
sqrshrun \g1, v22.8H, #1 // clip_u8((Y1 + G1) >> 1)
sqrshrun \g2, v23.8H, #1 // clip_u8((Y2 + G2) >> 1)
sqrshrun \b1, v24.8H, #1 // clip_u8((Y1 + B1) >> 1)
sqrshrun \b2, v25.8H, #1 // clip_u8((Y2 + B2) >> 1)
add v20.8h, v26.8h, v20.8h // Y1 + R1
add v21.8h, v27.8h, v21.8h // Y2 + R2
add v22.8h, v26.8h, v22.8h // Y1 + G1
add v23.8h, v27.8h, v23.8h // Y2 + G2
add v24.8h, v26.8h, v24.8h // Y1 + B1
add v25.8h, v27.8h, v25.8h // Y2 + B2
sqrshrun \r1, v20.8h, #1 // clip_u8((Y1 + R1) >> 1)
sqrshrun \r2, v21.8h, #1 // clip_u8((Y2 + R2) >> 1)
sqrshrun \g1, v22.8h, #1 // clip_u8((Y1 + G1) >> 1)
sqrshrun \g2, v23.8h, #1 // clip_u8((Y2 + G2) >> 1)
sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1)
sqrshrun \b2, v25.8h, #1 // clip_u8((Y2 + B2) >> 1)
movi \a1, #255
movi \a2, #255
.endm
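// The math above runs in 16-bit fixed point: the inputs were pre-scaled by
// 1 << 3, and sqdmulh computes a doubling high-half product, roughly
// (a * b) >> 15 with saturation, leaving each Y + R/G/B sum at twice the
// target 8-bit range. sqrshrun #1 then narrows to unsigned 8 bit with
// rounding and saturation, i.e. per lane roughly:
//     out = clip_u8((Y + C + 1) >> 1);   // C = R, G or B contribution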
@ -146,47 +146,47 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
1:
mov w8, w0 // w8 = width
2:
movi v5.8H, #4, lsl #8 // 128 * (1<<3)
movi v5.8h, #4, lsl #8 // 128 * (1<<3)
load_chroma_\ifmt
sub v18.8H, v18.8H, v5.8H // U*(1<<3) - 128*(1<<3)
sub v19.8H, v19.8H, v5.8H // V*(1<<3) - 128*(1<<3)
sqdmulh v20.8H, v19.8H, v1.H[0] // V * v2r (R)
sqdmulh v22.8H, v18.8H, v1.H[1] // U * u2g
sqdmulh v19.8H, v19.8H, v1.H[2] // V * v2g
add v22.8H, v22.8H, v19.8H // U * u2g + V * v2g (G)
sqdmulh v24.8H, v18.8H, v1.H[3] // U * u2b (B)
zip2 v21.8H, v20.8H, v20.8H // R2
zip1 v20.8H, v20.8H, v20.8H // R1
zip2 v23.8H, v22.8H, v22.8H // G2
zip1 v22.8H, v22.8H, v22.8H // G1
zip2 v25.8H, v24.8H, v24.8H // B2
zip1 v24.8H, v24.8H, v24.8H // B1
ld1 {v2.16B}, [x4], #16 // load luma
ushll v26.8H, v2.8B, #3 // Y1*(1<<3)
ushll2 v27.8H, v2.16B, #3 // Y2*(1<<3)
sub v26.8H, v26.8H, v3.8H // Y1*(1<<3) - y_offset
sub v27.8H, v27.8H, v3.8H // Y2*(1<<3) - y_offset
sqdmulh v26.8H, v26.8H, v0.8H // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8H, v27.8H, v0.8H // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3)
sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3)
sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
zip2 v21.8h, v20.8h, v20.8h // R2
zip1 v20.8h, v20.8h, v20.8h // R1
zip2 v23.8h, v22.8h, v22.8h // G2
zip1 v22.8h, v22.8h, v22.8h // G1
zip2 v25.8h, v24.8h, v24.8h // B2
zip1 v24.8h, v24.8h, v24.8h // B1
ld1 {v2.16b}, [x4], #16 // load luma
ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15

.ifc \ofmt,argb // 1 2 3 0
compute_rgba v5.8B,v6.8B,v7.8B,v4.8B, v17.8B,v18.8B,v19.8B,v16.8B
compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif

.ifc \ofmt,rgba // 0 1 2 3
compute_rgba v4.8B,v5.8B,v6.8B,v7.8B, v16.8B,v17.8B,v18.8B,v19.8B
compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif

.ifc \ofmt,abgr // 3 2 1 0
compute_rgba v7.8B,v6.8B,v5.8B,v4.8B, v19.8B,v18.8B,v17.8B,v16.8B
compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif

.ifc \ofmt,bgra // 2 1 0 3
compute_rgba v6.8B,v5.8B,v4.8B,v7.8B, v18.8B,v17.8B,v16.8B,v19.8B
compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif

st4 { v4.8B, v5.8B, v6.8B, v7.8B}, [x2], #32
st4 {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
subs w8, w8, #16 // width -= 16
b.gt 2b
add x2, x2, w3, SXTW // dst += padding