1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00

lavc/aarch64: Fix addp overflow in ff_pred16x16_plane_neon_10

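The horizontal sum is now widened to 32 bits (saddlp) before the final
pairwise add, and the 16-bit mul followed by saddw/saddw2 is replaced by a
widening multiply-accumulate (smlal/smlal2).

The mismatch between the NEON and C implementations can be reproduced on
AArch64 devices with the following bitstream and command lines: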

wget https://streams.videolan.org/ffmpeg/incoming/replay_intra_pred_16x16.h264
 ./ffmpeg -cpuflags 0  -threads 1 -i replay_intra_pred_16x16.h264  -f framemd5 -y md5_ref
 ./ffmpeg              -threads 1 -i replay_intra_pred_16x16.h264 -f framemd5 -y md5_neon

Signed-off-by: Bin Peng <pengbin@visionular.com>
This commit is contained in:
Bin Peng
2025-10-24 15:58:08 +08:00
committed by Lynne
parent 7e8ef2ded2
commit 3115c0c0e6

View File

@@ -489,10 +489,10 @@ function ff_pred16x16_plane_neon_10, export=1
mul v2.8h, v2.8h, v0.8h mul v2.8h, v2.8h, v0.8h
mul v3.8h, v3.8h, v0.8h mul v3.8h, v3.8h, v0.8h
addp v2.8h, v2.8h, v3.8h addp v2.8h, v2.8h, v3.8h
addp v2.8h, v2.8h, v2.8h saddlp v2.4s, v2.8h
addp v2.4h, v2.4h, v2.4h addp v2.4s, v2.4s, v2.4s
sshll v3.4s, v2.4h, #2 shl v3.4s, v2.4s, #2
saddw v2.4s, v3.4s, v2.4h add v2.4s, v3.4s, v2.4s
rshrn v4.4h, v2.4s, #6 rshrn v4.4h, v2.4s, #6
trn2 v5.4h, v4.4h, v4.4h trn2 v5.4h, v4.4h, v4.4h
add v2.4h, v4.4h, v5.4h add v2.4h, v4.4h, v5.4h
@@ -506,14 +506,13 @@ function ff_pred16x16_plane_neon_10, export=1
sxtl v6.4s, v5.4h // c sxtl v6.4s, v5.4h // c
mov v0.h[0], wzr mov v0.h[0], wzr
mul v0.8h, v0.8h, v4.h[0]
dup v16.4s, v2.s[0] dup v16.4s, v2.s[0]
dup v17.4s, v2.s[0] dup v17.4s, v2.s[0]
dup v2.8h, v4.h[0] // b dup v2.8h, v4.h[0] // b
dup v3.4s, v6.s[0] // c dup v3.4s, v6.s[0] // c
sshll v2.4s, v2.4h, #3 // b * 8 sshll v2.4s, v2.4h, #3 // b * 8
saddw v16.4s, v16.4s, v0.4h smlal v16.4s, v0.4h, v4.h[0]
saddw2 v17.4s, v17.4s, v0.8h smlal2 v17.4s, v0.8h, v4.h[0]
sub v3.4s, v3.4s, v2.4s sub v3.4s, v3.4s, v2.4s
mov w3, #16 mov w3, #16