1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00

avfilter/x86/vf_fspp: Make ff_column_fidct_mmx() bitexact

It currently is not, because the shortcut mode rounds differently from
both the C code and the non-shortcut code.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-11-12 18:44:49 +01:00
parent 1049a5fba8
commit 66af18d06a

View File

@@ -33,9 +33,6 @@ pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14) pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13) pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13) pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
pw_4: times 4 dw 4 pw_4: times 4 dw 4
pw_2: times 4 dw 2 pw_2: times 4 dw 2
@@ -315,31 +312,34 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
or tmpd, tmpd or tmpd, tmpd
jnz %1 jnz %1
movq m4, [rsp] movq m4, [rsp]
movq m1, m0 psraw m3, m0, 2
pmulhw m0, [pw_3642] psllw m0, 1
movq m2, m1 mova m5, [outq+DCTSIZE*0*2]
movq m5, [outq+DCTSIZE*0*2] pmulhw m1, m0, [pw_3B21]
movq m3, m2 pmulhw m2, m0, [pw_22A3]
pmulhw m1, [pw_2441] pmulhw m0, [pw_2D41]
paddw m5, m4 paddw m5, m4
movq m6, [rsp+8] movq m6, [rsp+8]
psraw m3, 2 psubw m2, m1
pmulhw m2, [pw_0CBB]
psubw m4, m3 psubw m4, m3
movq m7, [outq+DCTSIZE*1*2] movq m7, [outq+DCTSIZE*1*2]
paddw m5, m3 paddw m5, m3
movq [outq+DCTSIZE*7*2], m4 psubw m1, m3
mova [outq+DCTSIZE*7*2], m4
psubw m0, m1
paddw m2, m0
mova [outq+DCTSIZE*0*2], m5
paddw m7, m6 paddw m7, m6
movq m3, [rsp+8*2] movq m3, [rsp+8*2]
psubw m6, m0 psubw m6, m1
movq m4, [outq+DCTSIZE*2*2] movq m4, [outq+DCTSIZE*2*2]
paddw m7, m0 paddw m7, m1
movq [outq], m5 movq [outq], m5
paddw m4, m3 paddw m4, m3
movq [outq+DCTSIZE*6*2], m6 movq [outq+DCTSIZE*6*2], m6
psubw m3, m1 psubw m3, m0
movq m5, [outq+DCTSIZE*5*2] movq m5, [outq+DCTSIZE*5*2]
paddw m4, m1 paddw m4, m0
movq m6, [outq+DCTSIZE*3*2] movq m6, [outq+DCTSIZE*3*2]
paddw m5, m3 paddw m5, m3
movq m0, [rsp+8*3] movq m0, [rsp+8*3]
@@ -347,9 +347,9 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
movq [outq+DCTSIZE*1*2], m7 movq [outq+DCTSIZE*1*2], m7
paddw m6, m0 paddw m6, m0
movq [outq+DCTSIZE*2*2], m4 movq [outq+DCTSIZE*2*2], m4
psubw m0, m2 paddw m0, m2
movq m7, [outq+DCTSIZE*4*2] movq m7, [outq+DCTSIZE*4*2]
paddw m6, m2 psubw m6, m2
movq [outq+DCTSIZE*5*2], m5 movq [outq+DCTSIZE*5*2], m5
paddw m7, m0 paddw m7, m0
movq [outq+DCTSIZE*3*2], m6 movq [outq+DCTSIZE*3*2], m6