From 2bb626410504dfaa5b00a5b19ec391f37c84aaec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 12 Mar 2026 14:53:26 +0200 Subject: [PATCH] aarch64: hevcdsp: Make returns match the call site For cases when returning early without updating any pixels, we previously returned to the return address in the caller's scope, bypassing one function entirely. While this may seem like a neat optimization, it makes the return stack predictor mispredict the returns - which potentially can cost more performance than it gains. Secondly, if the Armv9.3 feature GCS (Guarded Control Stack) is enabled, then returns _must_ match the expected value; this feature is being enabled across Linux distributions, and by fixing the hevc assembly, we can enable the security feature on ffmpeg as well. (cherry picked from commit 1f7ed8a78de1da743a359913ce05cc258a400b5d) --- libavcodec/aarch64/hevcdsp_deblock_neon.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S index 581056a91e..7a25fe2457 100644 --- a/libavcodec/aarch64/hevcdsp_deblock_neon.S +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S @@ -511,8 +511,11 @@ function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0 sqxtun v6.8b, v6.8h sqxtun v7.8b, v7.8h .endif + // Use x15 to signal whether any pixels should be updated or not. + mov x15, #1 + ret +3: mov x15, #0 ret -3: ret x6 endfunc .endm @@ -562,6 +565,7 @@ function ff_hevc_\dir\()_loop_filter_luma_\bitdepth\()_neon, export=1 .endif .endif bl hevc_loop_filter_luma_body_\bitdepth\()_neon + cbz x15, 9f .if \bitdepth > 8 .ifc \dir, v transpose_8x8H v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 @@ -587,6 +591,7 @@ function ff_hevc_\dir\()_loop_filter_luma_\bitdepth\()_neon, export=1 st1 {v6.8b}, [x10], x1 st1 {v7.8b}, [x10] .endif +9: ret x6 endfunc .endm