From 38cd829dce7184400c944ead299a11e57c8ec7f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Reimar=20D=C3=B6ffinger?= Date: Sun, 9 Oct 2022 21:17:47 +0200 Subject: [PATCH] aarch64: Implement stack spilling in a consistent way. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently it is done in several different ways, which might cause needless dependencies or, in the case of tx_float_neon.S, is incorrect. Reviewed-by: Martin Storsjö Signed-off-by: Reimar Döffinger --- libavcodec/aarch64/fft_neon.S | 3 +- libavcodec/aarch64/h264idct_neon.S | 6 +- libavcodec/aarch64/hevcdsp_sao_neon.S | 3 +- libavcodec/aarch64/mdct_neon.S | 18 ++---- libavcodec/aarch64/me_cmp_neon.S | 6 +- libavcodec/aarch64/synth_filter_neon.S | 3 +- libavcodec/aarch64/vp9itxfm_neon.S | 28 ++++----- libavcodec/aarch64/vp9lpf_16bpp_neon.S | 32 +++++------ libavcodec/aarch64/vp9lpf_neon.S | 80 +++++++++++++------------- libavutil/aarch64/tx_float_neon.S | 52 ++++++++--------- 10 files changed, 109 insertions(+), 122 deletions(-) diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S index 9ff3f9c526..d7225511dd 100644 --- a/libavcodec/aarch64/fft_neon.S +++ b/libavcodec/aarch64/fft_neon.S @@ -342,8 +342,7 @@ endfunc function fft\n\()_neon, align=6 AARCH64_VALID_JUMP_TARGET AARCH64_SIGN_LINK_REGISTER - sub sp, sp, #16 - stp x28, x30, [sp] + stp x28, x30, [sp, #-16]! add x28, x0, #\n4*2*8 bl fft\n2\()_neon mov x0, x28 diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S index 7d2879b0ce..375da31d65 100644 --- a/libavcodec/aarch64/h264idct_neon.S +++ b/libavcodec/aarch64/h264idct_neon.S @@ -157,8 +157,7 @@ function ff_h264_idct_add16intra_neon, export=1 endfunc function ff_h264_idct_add8_neon, export=1 - sub sp, sp, #0x40 - stp x19, x20, [sp] + stp x19, x20, [sp, #-0x40]! 
mov x12, x30 ldp x6, x15, [x0] // dest[0], dest[1] add x5, x1, #16*4 // block_offset @@ -187,8 +186,7 @@ function ff_h264_idct_add8_neon, export=1 csel x6, x15, x6, eq cmp x10, #20 b.lt 1b - ldp x19, x20, [sp] - add sp, sp, #0x40 + ldp x19, x20, [sp], #0x40 ret x12 endfunc diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index d4decfde3b..30e83dda5d 100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -33,8 +33,7 @@ // int16_t *sao_offset_val, int sao_left_class, // int width, int height) function ff_hevc_sao_band_filter_8x8_8_neon, export=1 - sub sp, sp, #64 - stp xzr, xzr, [sp] + stp xzr, xzr, [sp, #-64]! stp xzr, xzr, [sp, #16] stp xzr, xzr, [sp, #32] stp xzr, xzr, [sp, #48] diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S index 6091e72022..98b09bf1ab 100644 --- a/libavcodec/aarch64/mdct_neon.S +++ b/libavcodec/aarch64/mdct_neon.S @@ -23,8 +23,7 @@ #include "libavutil/aarch64/asm.S" function ff_imdct_half_neon, export=1 - sub sp, sp, #32 - stp x19, x20, [sp] + stp x19, x20, [sp, #-32]! AARCH64_SIGN_LINK_REGISTER str x30, [sp, #16] mov x12, #1 @@ -120,17 +119,15 @@ function ff_imdct_half_neon, export=1 st2 {v4.2s,v5.2s}, [x0] st2 {v6.2s,v7.2s}, [x8] - ldp x19, x20, [sp] ldr x30, [sp, #16] AARCH64_VALIDATE_LINK_REGISTER - add sp, sp, #32 + ldp x19, x20, [sp], #32 ret endfunc function ff_imdct_calc_neon, export=1 - sub sp, sp, #32 - stp x19, x20, [sp] + stp x19, x20, [sp, #-32]! AARCH64_SIGN_LINK_REGISTER str x30, [sp, #16] ldr w3, [x0, #28] // mdct_bits @@ -163,18 +160,16 @@ function ff_imdct_calc_neon, export=1 subs x19, x19, #16 b.gt 1b - ldp x19, x20, [sp] ldr x30, [sp, #16] AARCH64_VALIDATE_LINK_REGISTER - add sp, sp, #32 + ldp x19, x20, [sp], #32 ret endfunc function ff_mdct_calc_neon, export=1 - sub sp, sp, #32 - stp x19, x20, [sp] + stp x19, x20, [sp, #-32]! 
AARCH64_SIGN_LINK_REGISTER str x30, [sp, #16] @@ -323,10 +318,9 @@ function ff_mdct_calc_neon, export=1 st2 {v4.2s,v5.2s}, [x0] st2 {v6.2s,v7.2s}, [x8] - ldp x19, x20, [sp] ldr x30, [sp, #16] AARCH64_VALIDATE_LINK_REGISTER - add sp, sp, #32 + ldp x19, x20, [sp], #32 ret endfunc diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index d8a18cd4b8..cf86e5081d 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -1173,10 +1173,9 @@ function nsse16_neon, export=1 bl X(sse16_neon) ldr x30, [sp, #0x30] mov w9, w0 // here we store score1 - ldr x5, [sp] ldp x1, x2, [sp, #0x10] ldp x3, x4, [sp, #0x20] - add sp, sp, #0x40 + ldr x5, [sp], #0x40 movi v16.8h, #0 movi v17.8h, #0 @@ -1295,10 +1294,9 @@ function nsse8_neon, export=1 bl X(sse8_neon) ldr x30, [sp, #0x30] mov w9, w0 // here we store score1 - ldr x5, [sp] ldp x1, x2, [sp, #0x10] ldp x3, x4, [sp, #0x20] - add sp, sp, #0x40 + ldr x5, [sp], #0x40 movi v16.8h, #0 movi v17.8h, #0 diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S index ba79ba9686..4f8494ce30 100644 --- a/libavcodec/aarch64/synth_filter_neon.S +++ b/libavcodec/aarch64/synth_filter_neon.S @@ -61,11 +61,11 @@ function ff_synth_filter_float_neon, export=1 blr x9 - ldp x2, x4, [sp] // synct_buf_2, window ldp x13, x9, [sp, #16] // out, synth_buf ldp x0, x30, [sp, #32] // *synth_buf_offset AARCH64_VALIDATE_LINK_REGISTER ldr s0, [sp, #48] + ldp x2, x4, [sp], #64 // synct_buf_2, window add x3, x2, #16*4 // synct_buf_2 + 16 add x14, x13, #16*4 // out + 16 @@ -116,6 +116,5 @@ function ff_synth_filter_float_neon, export=1 b 1b 10: - add sp, sp, #64 ret endfunc diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 03272eae82..a27f7b8ae5 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -850,10 +850,10 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 mov x15, x30 // iadst16 
requires clobbering v8-v15, but idct16 doesn't need to. .ifnc \txfm1\()_\txfm2,idct_idct - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! - stp d10, d11, [sp, #-0x10]! - stp d8, d9, [sp, #-0x10]! + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] .endif sub sp, sp, #512 @@ -920,10 +920,10 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 add sp, sp, #512 .ifnc \txfm1\()_\txfm2,idct_idct - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 .endif ret x15 endfunc @@ -1486,8 +1486,8 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 mov x15, x30 - stp d10, d11, [sp, #-0x10]! - stp d8, d9, [sp, #-0x10]! + stp d8, d9, [sp, #-0x20]! + stp d10, d11, [sp, #0x10] sub sp, sp, #2048 @@ -1544,8 +1544,8 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 add sp, sp, #2048 - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], #0x20 ret x15 endfunc @@ -1569,8 +1569,8 @@ function idct32x32_\size\()_add_neon add sp, sp, #2048 - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], #0x20 ret x15 endfunc diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S index a092617b92..e3e70491c6 100644 --- a/libavcodec/aarch64/vp9lpf_16bpp_neon.S +++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S @@ -417,10 +417,10 @@ endfunc function ff_\func\()_\bpp\()_neon, export=1 .if \push mov x16, x30 - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! - stp d10, d11, [sp, #-0x10]! - stp d8, d9, [sp, #-0x10]! + stp d8, d9, [sp, #-0x40]! 
+ stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] .endif lsl w2, w2, #\bpp - 8 lsl w3, w3, #\bpp - 8 @@ -430,10 +430,10 @@ function ff_\func\()_\bpp\()_neon, export=1 mov x7, #((1 << \bpp) - 1) .if \push bl \func\()_16_neon - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 ret x16 .else b \func\()_16_neon @@ -450,10 +450,10 @@ endfunc function ff_\func\()_\suffix\()_\bpp\()_neon, export=1 mov x16, x30 .if \push - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! - stp d10, d11, [sp, #-0x10]! - stp d8, d9, [sp, #-0x10]! + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] .endif lsl w2, w2, #\bpp - 8 lsl w3, w3, #\bpp - 8 @@ -469,10 +469,10 @@ function ff_\func\()_\suffix\()_\bpp\()_neon, export=1 .endif bl \func\()_\int_suffix\()_16_neon .if \push - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], 0x40 .endif ret x16 endfunc diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index 694ff8956f..9a79f48df3 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -564,10 +564,10 @@ function vp9_loop_filter_16 loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15 ret 9: - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 ret x10 endfunc @@ -575,10 +575,10 @@ function vp9_loop_filter_16_16b loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15 ret 9: - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 - ldp d12, d13, 
[sp], 0x10 - ldp d14, d15, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 ret x10 endfunc @@ -973,10 +973,10 @@ mix_h_16 88 function ff_vp9_loop_filter_v_16_8_neon, export=1 mov x10, x30 - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! - stp d10, d11, [sp, #-0x10]! - stp d8, d9, [sp, #-0x10]! + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] sub x9, x0, x1, lsl #3 ld1 {v16.8b}, [x9], x1 // p7 ld1 {v24.8b}, [x0], x1 // q0 @@ -1018,10 +1018,10 @@ function ff_vp9_loop_filter_v_16_8_neon, export=1 st1 {v9.8b}, [x9], x1 st1 {v17.8b}, [x0], x1 9: - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 ret x10 8: add x9, x9, x1, lsl #2 @@ -1045,10 +1045,10 @@ endfunc function ff_vp9_loop_filter_v_16_16_neon, export=1 mov x10, x30 - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! - stp d10, d11, [sp, #-0x10]! - stp d8, d9, [sp, #-0x10]! + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] sub x9, x0, x1, lsl #3 ld1 {v16.16b}, [x9], x1 // p7 ld1 {v24.16b}, [x0], x1 // q0 @@ -1087,10 +1087,10 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1 st1 {v9.16b}, [x9], x1 st1 {v17.16b}, [x0], x1 9: - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 ret x10 8: add x9, x9, x1, lsl #2 @@ -1112,10 +1112,10 @@ endfunc function ff_vp9_loop_filter_h_16_8_neon, export=1 mov x10, x30 - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! - stp d10, d11, [sp, #-0x10]! - stp d8, d9, [sp, #-0x10]! + stp d8, d9, [sp, #-0x40]! 
+ stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] sub x9, x0, #8 ld1 {v16.8b}, [x9], x1 ld1 {v24.8b}, [x0], x1 @@ -1164,10 +1164,10 @@ function ff_vp9_loop_filter_h_16_8_neon, export=1 st1 {v9.8b}, [x9], x1 st1 {v31.8b}, [x0], x1 9: - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 ret x10 8: // The same writeback as in loop_filter_h_8_8 @@ -1202,10 +1202,10 @@ endfunc function ff_vp9_loop_filter_h_16_16_neon, export=1 mov x10, x30 - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! - stp d10, d11, [sp, #-0x10]! - stp d8, d9, [sp, #-0x10]! + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] sub x9, x0, #8 ld1 {v16.8b}, [x9], x1 ld1 {v24.8b}, [x0], x1 @@ -1283,10 +1283,10 @@ function ff_vp9_loop_filter_h_16_16_neon, export=1 st1 {v9.d}[1], [x9], x1 st1 {v31.d}[1], [x0], x1 9: - ldp d8, d9, [sp], 0x10 - ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 ret x10 8: sub x9, x0, #4 diff --git a/libavutil/aarch64/tx_float_neon.S b/libavutil/aarch64/tx_float_neon.S index 4126c3b812..e5531dcc7c 100644 --- a/libavutil/aarch64/tx_float_neon.S +++ b/libavutil/aarch64/tx_float_neon.S @@ -866,10 +866,10 @@ FFT16_FN ns_float, 1 .macro FFT32_FN name, no_perm function ff_tx_fft32_\name\()_neon, export=1 - stp d8, d9, [sp, #-16] - stp d10, d11, [sp, #-32] - stp d12, d13, [sp, #-48] - stp d14, d15, [sp, #-64] + stp d14, d15, [sp, #-16*4]! 
+ stp d8, d9, [sp, #16*3] + stp d10, d11, [sp, #16*2] + stp d12, d13, [sp, #16] LOAD_SUBADD SETUP_SR_RECOMB 32, x7, x8, x9 @@ -911,10 +911,10 @@ function ff_tx_fft32_\name\()_neon, export=1 zip2 v31.2d, v11.2d, v15.2d st1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x1] - ldp d14, d15, [sp, #-64] - ldp d12, d13, [sp, #-48] - ldp d10, d11, [sp, #-32] - ldp d8, d9, [sp, #-16] + ldp d12, d13, [sp, #16] + ldp d10, d11, [sp, #16*2] + ldp d8, d9, [sp, #16*3] + ldp d14, d15, [sp], #16*4 ret endfunc @@ -966,12 +966,12 @@ FFT32_FN ns_float, 1 .macro FFT_SPLIT_RADIX_FN name, no_perm function ff_tx_fft_sr_\name\()_neon, export=1 - stp d8, d9, [sp, #-16]! - stp d10, d11, [sp, #-16]! - stp d12, d13, [sp, #-16]! - stp d14, d15, [sp, #-16]! - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! + stp x21, x22, [sp, #-16*6]! + stp d8, d9, [sp, #16*5] + stp d10, d11, [sp, #16*4] + stp d12, d13, [sp, #16*3] + stp d14, d15, [sp, #16*2] + stp x19, x20, [sp, #16] ldr w19, [x0, #0] // global target mov w20, w19 // local length @@ -1185,12 +1185,12 @@ SR_TRANSFORM_DEF 131072 subs w19, w19, #32*4 b.gt 0b - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 + ldp x19, x20, [sp, #16] + ldp d14, d15, [sp, #16*2] + ldp d12, d13, [sp, #16*3] + ldp d10, d11, [sp, #16*4] + ldp d8, d9, [sp, #16*5] + ldp x21, x22, [sp], #16*6 ret @@ -1279,12 +1279,12 @@ SR_TRANSFORM_DEF 131072 zip2 v7.2d, v15.2d, v23.2d st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x15] - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 + ldp x19, x20, [sp, #16] + ldp d14, d15, [sp, #16*2] + ldp d12, d13, [sp, #16*3] + ldp d10, d11, [sp, #16*4] + ldp d8, d9, [sp, #16*5] + ldp x21, x22, [sp], #16*6 ret endfunc