mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
h264/aarch64: optimize neon loop filter
Exit as soon as possible if no filtering will be done.

Improves the checkasm --bench cycle count on a Snapdragon 820e:

h264_h_loop_filter_luma_8bpp_c: 72.4 -> 72.5
h264_h_loop_filter_luma_8bpp_neon: 97.1 -> 56.3
h264_v_loop_filter_luma_8bpp_c: 174.0 -> 173.5
h264_v_loop_filter_luma_8bpp_neon: 62.9 -> 60.9
h264_h_loop_filter_chroma_8bpp_c: 30.2 -> 30.3
h264_h_loop_filter_chroma_8bpp_neon: 51.6 -> 25.7
h264_v_loop_filter_chroma_8bpp_c: 57.3 -> 57.3
h264_v_loop_filter_chroma_8bpp_neon: 28.0 -> 24.0
This commit is contained in:
parent
d7f4f5c4a1
commit
846c3d6aca
@ -54,9 +54,12 @@
|
|||||||
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
|
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
|
||||||
and v21.16B, v21.16B, v28.16B
|
and v21.16B, v21.16B, v28.16B
|
||||||
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
|
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
|
||||||
|
and v21.16B, v21.16B, v30.16B // < beta
|
||||||
|
shrn v30.8b, v21.8h, #4
|
||||||
|
mov x7, v30.d[0]
|
||||||
cmhi v17.16B, v22.16B, v17.16B // < beta
|
cmhi v17.16B, v22.16B, v17.16B // < beta
|
||||||
and v21.16B, v21.16B, v30.16B
|
|
||||||
cmhi v19.16B, v22.16B, v19.16B // < beta
|
cmhi v19.16B, v22.16B, v19.16B // < beta
|
||||||
|
cbz x7, 9f
|
||||||
and v17.16B, v17.16B, v21.16B
|
and v17.16B, v17.16B, v21.16B
|
||||||
and v19.16B, v19.16B, v21.16B
|
and v19.16B, v19.16B, v21.16B
|
||||||
and v24.16B, v24.16B, v21.16B
|
and v24.16B, v24.16B, v21.16B
|
||||||
@ -124,7 +127,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
|
|||||||
st1 {v16.16B}, [x0], x1
|
st1 {v16.16B}, [x0], x1
|
||||||
st1 {v0.16B}, [x0], x1
|
st1 {v0.16B}, [x0], x1
|
||||||
st1 {v19.16B}, [x0]
|
st1 {v19.16B}, [x0]
|
||||||
|
9:
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
@ -174,32 +177,34 @@ function ff_h264_h_loop_filter_luma_neon, export=1
|
|||||||
st1 {v16.S}[3], [x0], x1
|
st1 {v16.S}[3], [x0], x1
|
||||||
st1 {v0.S}[3], [x0], x1
|
st1 {v0.S}[3], [x0], x1
|
||||||
st1 {v19.S}[3], [x0], x1
|
st1 {v19.S}[3], [x0], x1
|
||||||
|
9:
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
.macro h264_loop_filter_chroma
|
.macro h264_loop_filter_chroma
|
||||||
dup v22.8B, w2 // alpha
|
dup v22.8B, w2 // alpha
|
||||||
|
dup v23.8B, w3 // beta
|
||||||
uxtl v24.8H, v24.8B
|
uxtl v24.8H, v24.8B
|
||||||
uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
|
uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
|
||||||
uxtl v4.8H, v0.8B
|
|
||||||
uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
|
uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
|
||||||
usubw v4.8H, v4.8H, v16.8B
|
|
||||||
sli v24.8H, v24.8H, #8
|
|
||||||
shl v4.8H, v4.8H, #2
|
|
||||||
uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
|
uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
|
||||||
uaddw v4.8H, v4.8H, v18.8B
|
|
||||||
cmhi v26.8B, v22.8B, v26.8B // < alpha
|
cmhi v26.8B, v22.8B, v26.8B // < alpha
|
||||||
|
cmhi v28.8B, v23.8B, v28.8B // < beta
|
||||||
|
cmhi v30.8B, v23.8B, v30.8B // < beta
|
||||||
|
uxtl v4.8H, v0.8B
|
||||||
|
and v26.8B, v26.8B, v28.8B
|
||||||
|
usubw v4.8H, v4.8H, v16.8B
|
||||||
|
and v26.8B, v26.8B, v30.8B
|
||||||
|
shl v4.8H, v4.8H, #2
|
||||||
|
mov x2, v26.d[0]
|
||||||
|
sli v24.8H, v24.8H, #8
|
||||||
|
uaddw v4.8H, v4.8H, v18.8B
|
||||||
|
cbz x2, 9f
|
||||||
usubw v4.8H, v4.8H, v2.8B
|
usubw v4.8H, v4.8H, v2.8B
|
||||||
dup v22.8B, w3 // beta
|
|
||||||
rshrn v4.8B, v4.8H, #3
|
rshrn v4.8B, v4.8H, #3
|
||||||
cmhi v28.8B, v22.8B, v28.8B // < beta
|
|
||||||
cmhi v30.8B, v22.8B, v30.8B // < beta
|
|
||||||
smin v4.8B, v4.8B, v24.8B
|
smin v4.8B, v4.8B, v24.8B
|
||||||
neg v25.8B, v24.8B
|
neg v25.8B, v24.8B
|
||||||
and v26.8B, v26.8B, v28.8B
|
|
||||||
smax v4.8B, v4.8B, v25.8B
|
smax v4.8B, v4.8B, v25.8B
|
||||||
and v26.8B, v26.8B, v30.8B
|
|
||||||
uxtl v22.8H, v0.8B
|
uxtl v22.8H, v0.8B
|
||||||
and v4.8B, v4.8B, v26.8B
|
and v4.8B, v4.8B, v26.8B
|
||||||
uxtl v28.8H, v16.8B
|
uxtl v28.8H, v16.8B
|
||||||
@ -224,7 +229,7 @@ function ff_h264_v_loop_filter_chroma_neon, export=1
|
|||||||
sub x0, x0, x1, lsl #1
|
sub x0, x0, x1, lsl #1
|
||||||
st1 {v16.8B}, [x0], x1
|
st1 {v16.8B}, [x0], x1
|
||||||
st1 {v0.8B}, [x0], x1
|
st1 {v0.8B}, [x0], x1
|
||||||
|
9:
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
@ -257,7 +262,7 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
|
|||||||
st1 {v16.S}[1], [x0], x1
|
st1 {v16.S}[1], [x0], x1
|
||||||
st1 {v0.S}[1], [x0], x1
|
st1 {v0.S}[1], [x0], x1
|
||||||
st1 {v2.S}[1], [x0], x1
|
st1 {v2.S}[1], [x0], x1
|
||||||
|
9:
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user