1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-03 05:10:03 +02:00

aarch64: hevc: Deduplicate the hevc_put_hevc_qpel_uni_w_hv*_8_end_neon functions

The hv32 and hv64 functions were identical - both loop and
process 16 pixels at a time.

The hv16 function was near identical, except for the outer loop
(and using sp instead of a separate register).

Given the size of these functions, the extra cost of the outer
loop is negligible, so use the same function for hv16 as well.

This removes over 200 lines of duplicated assembly, and over 4 KB
of binary size.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö 2024-03-25 13:32:18 +02:00
parent 4063e50eec
commit 4f71e4ebf2

View File

@ -4381,231 +4381,17 @@ function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_i8mm, export=1
b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
endfunc
// Tail of the 8-bit qpel uni_w hv16 path; it is reached with a plain `b`
// (no link) from ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_i8mm.
//
// Each row is read as a pair of q registers (32 bytes) from [sp], and sp
// itself is advanced by x10 after every load, so this routine walks the
// stack pointer instead of using a separate cursor register.
// NOTE(review): x10 looks like the row stride of an intermediate buffer and
// w22 like the remaining-row counter; both are set up outside this view -
// confirm against QPEL_UNI_W_HV_HEADER.
// QPEL_FILTER_H, QPEL_FILTER_H2, QPEL_UNI_W_HV_16 and QPEL_UNI_W_HV_END are
// macros defined elsewhere in the file.
function hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
// Preload the first seven rows: first q of each pair into v16..v22,
// second q into v1..v7.
ldp q16, q1, [sp]
add sp, sp, x10
ldp q17, q2, [sp]
add sp, sp, x10
ldp q18, q3, [sp]
add sp, sp, x10
ldp q19, q4, [sp]
add sp, sp, x10
ldp q20, q5, [sp]
add sp, sp, x10
ldp q21, q6, [sp]
add sp, sp, x10
ldp q22, q7, [sp]
add sp, sp, x10
// Main loop, unrolled eight times. Every step loads one new row into the
// register pair that held the oldest row, so the eight source operands of
// the filter macros rotate by one position per step instead of the data
// being moved between registers.
1:
// Step 0: newest row lands in v23/v31.
ldp q23, q31, [sp]
add sp, sp, x10
QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31
QPEL_UNI_W_HV_16
// One row done; leave through label 2 once the counter reaches zero.
subs w22, w22, #1
b.eq 2f
// Step 1: newest row replaces v16/v1.
ldp q16, q1, [sp]
add sp, sp, x10
QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 2: newest row replaces v17/v2.
ldp q17, q2, [sp]
add sp, sp, x10
QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 3: newest row replaces v18/v3.
ldp q18, q3, [sp]
add sp, sp, x10
QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 4: newest row replaces v19/v4.
ldp q19, q4, [sp]
add sp, sp, x10
QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 5: newest row replaces v20/v5.
ldp q20, q5, [sp]
add sp, sp, x10
QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 6: newest row replaces v21/v6.
ldp q21, q6, [sp]
add sp, sp, x10
QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 7: newest row replaces v22/v7; afterwards the register roles are
// back to what step 0 expects.
ldp q22, q7, [sp]
add sp, sp, x10
QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7
QPEL_UNI_W_HV_16
subs w22, w22, #1
// Go round again while the counter is still non-zero.
b.hi 1b
2:
// NOTE(review): sp has been advanced above; QPEL_UNI_W_HV_END presumably
// restores sp and any saved registers before the return - confirm in the
// macro definition.
QPEL_UNI_W_HV_END
ret
endfunc
// Exported entry point for the width-32 variant: expands
// QPEL_UNI_W_HV_HEADER with argument 32 (macro defined elsewhere), then
// branches without link into the shared tail routine, so that routine's
// `ret` returns straight to this function's caller.
function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
QPEL_UNI_W_HV_HEADER 32
b hevc_put_hevc_qpel_uni_w_hv32_8_end_neon
endfunc
// Tail of the 8-bit qpel uni_w hv32 path. Same eight-step rotating-register
// row loop as the hv16 tail, wrapped in an outer loop (label 3) that
// repeats it once per 16-wide column and reads through x11 instead of
// moving sp.
// NOTE(review): x10 looks like the row stride of an intermediate buffer,
// w22 the row counter, w27 the remaining width and x20 the destination
// pointer; all are set up outside this view - confirm against
// QPEL_UNI_W_HV_HEADER.
function hevc_put_hevc_qpel_uni_w_hv32_8_end_neon
// x11 = read cursor, starting at sp.
mov x11, sp
// w12 = saved copy of the row counter, reloaded for every column.
mov w12, w22
// x13 = saved copy of x20 for the current column.
mov x13, x20
// x14 = start address of the current column in the stack buffer.
mov x14, sp
// Outer loop: one iteration per column.
3:
// Preload the first seven rows: first q of each pair into v16..v22,
// second q into v1..v7.
ldp q16, q1, [x11]
add x11, x11, x10
ldp q17, q2, [x11]
add x11, x11, x10
ldp q18, q3, [x11]
add x11, x11, x10
ldp q19, q4, [x11]
add x11, x11, x10
ldp q20, q5, [x11]
add x11, x11, x10
ldp q21, q6, [x11]
add x11, x11, x10
ldp q22, q7, [x11]
add x11, x11, x10
// Inner loop, unrolled eight times. Each step loads one new row into the
// register pair that held the oldest row and rotates the macro operands
// by one position.
1:
// Step 0: newest row lands in v23/v31.
ldp q23, q31, [x11]
add x11, x11, x10
QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31
QPEL_UNI_W_HV_16
// One row done; leave the inner loop once the counter reaches zero.
subs w22, w22, #1
b.eq 2f
// Step 1: newest row replaces v16/v1.
ldp q16, q1, [x11]
add x11, x11, x10
QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 2: newest row replaces v17/v2.
ldp q17, q2, [x11]
add x11, x11, x10
QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 3: newest row replaces v18/v3.
ldp q18, q3, [x11]
add x11, x11, x10
QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 4: newest row replaces v19/v4.
ldp q19, q4, [x11]
add x11, x11, x10
QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 5: newest row replaces v20/v5.
ldp q20, q5, [x11]
add x11, x11, x10
QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 6: newest row replaces v21/v6.
ldp q21, q6, [x11]
add x11, x11, x10
QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6
QPEL_UNI_W_HV_16
subs w22, w22, #1
b.eq 2f
// Step 7: newest row replaces v22/v7; afterwards the register roles are
// back to what step 0 expects.
ldp q22, q7, [x11]
add x11, x11, x10
QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7
QPEL_UNI_W_HV_16
subs w22, w22, #1
// Go round again while the row counter is still non-zero.
b.hi 1b
2:
// Column finished. The flags set by this subs are the ones tested by the
// b.hi below: the add/mov instructions in between do not write flags.
subs w27, w27, #16
// Move the read cursor to the next column: 32 bytes past the start of the
// current one, i.e. the amount one `ldp q, q` reads per row.
add x11, x14, #32
// Step x20 forward by 16 from its value at the start of this column.
add x20, x13, #16
// Reload the row counter for the next column.
mov w22, w12
// Remember the new column's start addresses.
mov x14, x11
mov x13, x20
b.hi 3b
QPEL_UNI_W_HV_END
ret
// NOTE(review): this branch follows `ret` with no label in between, so it
// can never execute. It looks like residue of a diff whose +/- markers
// were stripped - confirm against the real source file.
b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
endfunc
// Exported entry point for the width-64 variant: expands
// QPEL_UNI_W_HV_HEADER with argument 64 (macro defined elsewhere), then
// branches without link into a tail routine.
// NOTE(review): two unconditional branches follow each other, so only the
// first can execute. Per the commit message in this file the hv32/hv64
// tails were merged into hevc_put_hevc_qpel_uni_w_hv16_8_end_neon, so
// these are most likely the old and new lines of a diff whose +/- markers
// were stripped - confirm which one the real source keeps.
function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
QPEL_UNI_W_HV_HEADER 64
b hevc_put_hevc_qpel_uni_w_hv64_8_end_neon
b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
endfunc
function hevc_put_hevc_qpel_uni_w_hv64_8_end_neon
function hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
mov x11, sp
mov w12, w22
mov x13, x20