1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

x86: Update x86inc.asm

Bring the bundled x86inc.asm up to date with upstream.

https://code.videolan.org/videolan/x86inc.asm
This commit is contained in:
Henrik Gramner 2024-03-16 16:39:37 +01:00
parent 782c4df28d
commit afa471d0ef
12 changed files with 518 additions and 269 deletions

4
configure vendored
View File

@ -2222,7 +2222,6 @@ ARCH_EXT_LIST_RISCV="
ARCH_EXT_LIST_X86="
$ARCH_EXT_LIST_X86_SIMD
cpunop
i686
"
@ -2771,7 +2770,6 @@ mipsdsp_deps="mips"
mipsdspr2_deps="mips"
msa_deps="mipsfpu"
cpunop_deps="i686"
x86_64_select="i686"
x86_64_suggest="fast_cmov"
@ -6401,7 +6399,6 @@ EOF
done
disabled x86asm && die "nasm/yasm not found or too old. Use --disable-x86asm for a crippled build."
X86ASMFLAGS="-f $objformat"
enabled pic && append X86ASMFLAGS "-DPIC"
test -n "$extern_prefix" && append X86ASMFLAGS "-DPREFIX"
case "$objformat" in
elf*) enabled debug && append X86ASMFLAGS $x86asm_debug ;;
@ -6412,7 +6409,6 @@ EOF
enabled avx2 && check_x86asm avx2_external "vextracti128 xmm0, ymm0, 0"
enabled xop && check_x86asm xop_external "vpmacsdd xmm0, xmm1, xmm2, xmm3"
enabled fma4 && check_x86asm fma4_external "vfmaddps ymm0, ymm1, ymm2, ymm3"
check_x86asm cpunop "CPU amdnop"
fi
case "$cpu" in

View File

@ -74,7 +74,7 @@ SECTION .text
; "movaps m0, [r5 + r4]" if PIC is enabled
; "movaps m0, [constant_name + r4]" if textrels are used
%macro SET_PIC_BASE 3; reg, const_label
%ifdef PIC
%if PIC
%{1} %2, [%3] ; lea r5, [rip+const]
%define pic_base_%3 %2
%else
@ -195,7 +195,7 @@ align 16
; PIC relative addressing. Use this
; to count it in cglobal
;
%ifdef PIC
%if PIC
%define num_pic_regs 1
%else
%define num_pic_regs 0

View File

@ -91,7 +91,7 @@ SECTION .text
%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%if PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
@ -147,7 +147,7 @@ cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
or r4d, r5d ; x + y
%ifidn %2, rv40
%ifdef PIC
%if PIC
lea r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
@ -198,7 +198,7 @@ cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
movd m4, r4d ; x
movd m6, r5d ; y
%ifidn %2, rv40
%ifdef PIC
%if PIC
lea r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
@ -283,7 +283,7 @@ cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%if PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
@ -301,7 +301,7 @@ cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
psubw m5, m3
%ifidn %2, rv40
%ifdef PIC
%if PIC
lea r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else

View File

@ -42,7 +42,7 @@ scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
db 6+11*8, 7+11*8, 6+12*8, 7+12*8
db 4+13*8, 5+13*8, 4+14*8, 5+14*8
db 6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%if PIC
%define npicregs 1
%define scan8 picregq
%else
@ -322,7 +322,7 @@ INIT_XMM sse2
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
xor r5, r5
%ifdef PIC
%if PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
@ -398,7 +398,7 @@ h264_idct_add8_mmx_plane:
cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
%ifdef PIC
%if PIC
lea picregq, [scan8_mem]
%endif
%if ARCH_X86_64

View File

@ -1311,10 +1311,7 @@ PRED8x8L_DOWN_RIGHT
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
cglobal pred8x8l_vertical_right_8, 4,5,7
; manually spill XMM registers for Win64 because
; the code here is initialized with INIT_MMX
WIN64_SPILL_XMM 7
cglobal pred8x8l_vertical_right_8, 4,5,6
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
@ -1384,7 +1381,6 @@ cglobal pred8x8l_vertical_right_8, 4,5,7
movq2dq xmm4, mm6
pslldq xmm4, 8
por xmm0, xmm4
movdqa xmm6, [pw_ff00]
movdqa xmm1, xmm0
lea r2, [r1+r3*2]
movdqa xmm2, xmm0
@ -1394,15 +1390,16 @@ cglobal pred8x8l_vertical_right_8, 4,5,7
pavgb xmm2, xmm0
INIT_XMM cpuname
PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
pandn xmm6, xmm4
movdqa xmm0, [pw_ff00]
pandn xmm0, xmm4
movdqa xmm5, xmm4
psrlw xmm4, 8
packuswb xmm6, xmm4
movhlps xmm4, xmm6
packuswb xmm0, xmm4
movhlps xmm4, xmm0
movhps [r0+r3*2], xmm5
movhps [r0+r3*1], xmm2
psrldq xmm5, 4
movss xmm5, xmm6
movss xmm5, xmm0
psrldq xmm2, 4
movss xmm2, xmm4
lea r0, [r2+r3*2]

View File

@ -180,7 +180,7 @@ SECTION .text
%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
%if cpuflag(avx2)
%assign %%offset 32
%ifdef PIC
%if PIC
lea %5q, [hevc_epel_filters_avx2_%1]
%define FILTER %5q
%else
@ -188,7 +188,7 @@ SECTION .text
%endif
%else
%assign %%offset 16
%ifdef PIC
%if PIC
lea %5q, [hevc_epel_filters_sse4_%1]
%define FILTER %5q
%else
@ -216,7 +216,7 @@ SECTION .text
%define %%table hevc_epel_filters_sse4_%1
%endif
%ifdef PIC
%if PIC
lea r3srcq, [%%table]
%define FILTER r3srcq
%else
@ -234,7 +234,7 @@ SECTION .text
%else
%define %%table hevc_epel_filters_sse4_10
%endif
%ifdef PIC
%if PIC
lea r3srcq, [%%table]
%define FILTER r3srcq
%else
@ -257,7 +257,7 @@ SECTION .text
%define %%table hevc_qpel_filters_sse4_%1
%endif
%ifdef PIC
%if PIC
lea rfilterq, [%%table]
%else
%define rfilterq %%table
@ -576,7 +576,7 @@ SECTION .text
%define %%table hevc_qpel_filters_sse4_%2
%endif
%ifdef PIC
%if PIC
lea rfilterq, [%%table]
%else
%define rfilterq %%table
@ -1288,7 +1288,7 @@ HEVC_PUT_HEVC_QPEL_HV 16, 10
%assign %%offset 4
dec %2q
shl %2q, 3
%ifdef PIC
%if PIC
lea %5q, [%%table]
%define FILTER %5q
%else
@ -1365,7 +1365,7 @@ cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 27, dst, src, srcstride, height, mx, m
sub myq, 1
shl myq, 5
%define %%table hevc_qpel_filters_avx512icl_v_%1
%ifdef PIC
%if PIC
lea tmpq, [%%table]
%define FILTER tmpq
%else

View File

@ -51,7 +51,7 @@ sixtap_filter_v_m: times 8 dw 1
times 8 dw 20
times 8 dw 52
%ifdef PIC
%if PIC
%define sixtap_filter_hw picregq
%define sixtap_filter_hb picregq
%define sixtap_filter_v picregq
@ -84,7 +84,7 @@ SECTION .text
%if WIN64
movsxd %1q, %1d
%endif
%ifdef PIC
%if PIC
add %1q, picregq
%else
add %1q, %2
@ -104,7 +104,7 @@ SECTION .text
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
%if PIC
lea picregq, [sixtap_filter_v_m]
%endif
pxor m7, m7
@ -175,7 +175,7 @@ cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height,
%macro FILTER_H 1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
%if PIC
lea picregq, [sixtap_filter_v_m]
%endif
pxor m7, m7
@ -238,7 +238,7 @@ FILTER_V avg
%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
%if PIC
lea picregq, [sixtap_filter_hb_m]
%endif
@ -283,7 +283,7 @@ cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height,
RET
cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
%if PIC
lea picregq, [sixtap_filter_hb_m]
%endif
mova m3, [filter_h6_shuf2]

View File

@ -308,7 +308,7 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z
movq [r2q], m2
RET
%ifdef PIC
%if PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
@ -321,7 +321,7 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%endif
%macro LOAD_NST 1
%ifdef PIC
%if PIC
lea NOISE_TABLE, [%1]
mova m0, [kxq + NOISE_TABLE]
%else
@ -371,7 +371,7 @@ apply_noise_main:
movsxdifnidn noiseq, noised
dec noiseq
shl countd, 2
%ifdef PIC
%if PIC
lea NOISE_TABLE, [sbr_noise_table]
%endif
lea Yq, [Yq + 2*countq]

View File

@ -114,7 +114,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
times 8 db 2, 6
times 8 db 1, 7
%ifdef PIC
%if PIC
%define fourtap_filter_hw picregq
%define sixtap_filter_hw picregq
%define fourtap_filter_hb picregq
@ -166,7 +166,7 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
lea mxd, [mxq*3]
mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3]
%ifdef PIC
%if PIC
lea picregq, [sixtap_filter_hb_m]
%endif
mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
@ -207,7 +207,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
mova m2, [pw_256]
mova m3, [filter_h2_shuf]
mova m4, [filter_h4_shuf]
%ifdef PIC
%if PIC
lea picregq, [fourtap_filter_hb_m]
%endif
mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
@ -234,7 +234,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
%ifdef PIC
%if PIC
lea picregq, [fourtap_filter_hb_m]
%endif
mova m5, [fourtap_filter_hb+myq-16]
@ -272,7 +272,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
lea myd, [myq*3]
%ifdef PIC
%if PIC
lea picregq, [sixtap_filter_hb_m]
%endif
lea myq, [sixtap_filter_hb+myq*8]
@ -326,7 +326,7 @@ FILTER_SSSE3 8
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 4
%ifdef PIC
%if PIC
lea picregq, [fourtap_filter_hw_m]
%endif
movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
@ -374,7 +374,7 @@ cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
lea mxd, [mxq*3]
%ifdef PIC
%if PIC
lea picregq, [sixtap_filter_hw_m]
%endif
movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
@ -431,7 +431,7 @@ cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 5
%ifdef PIC
%if PIC
lea picregq, [fourtap_filter_v_m]
%endif
lea mxq, [fourtap_filter_v+mxq-32]
@ -480,7 +480,7 @@ INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
lea mxd, [mxq*3]
shl mxd, 4
%ifdef PIC
%if PIC
lea picregq, [sixtap_filter_v_m]
%endif
lea mxq, [sixtap_filter_v+mxq-96]
@ -543,7 +543,7 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 5
%ifdef PIC
%if PIC
lea picregq, [fourtap_filter_v_m]
%endif
lea myq, [fourtap_filter_v+myq-32]
@ -597,7 +597,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
lea myq, [myq*3]
%ifdef PIC
%if PIC
lea picregq, [sixtap_filter_v_m]
%endif
lea myq, [sixtap_filter_v+myq-96]
@ -667,7 +667,7 @@ FILTER_V 8
%if cpuflag(ssse3)
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
%ifdef PIC
%if PIC
lea picregq, [bilinear_filter_vb_m]
%endif
pxor m4, m4
@ -697,7 +697,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p
%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
%ifdef PIC
%if PIC
lea picregq, [bilinear_filter_vw_m]
%endif
pxor m6, m6
@ -743,7 +743,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
%if cpuflag(ssse3)
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 4
%ifdef PIC
%if PIC
lea picregq, [bilinear_filter_vb_m]
%endif
pxor m4, m4
@ -773,7 +773,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride
%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 4
%ifdef PIC
%if PIC
lea picregq, [bilinear_filter_vw_m]
%endif
pxor m6, m6

View File

@ -330,7 +330,9 @@ IDCT_4x4_FN ssse3
INIT_MMX %5
cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
%if WIN64 && notcpuflag(ssse3)
INIT_XMM cpuname
WIN64_SPILL_XMM 8
INIT_MMX cpuname
%endif
movdqa xmm5, [pd_8192]
mova m0, [blockq+ 0]

View File

@ -303,7 +303,9 @@ IDCT4_10_FN
%macro IADST4_FN 4
cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
%if WIN64 && notcpuflag(ssse3)
INIT_XMM cpuname
WIN64_SPILL_XMM 8
INIT_MMX cpuname
%endif
movdqa xmm5, [pd_8192]
mova m0, [blockq+0*16+0]
@ -672,7 +674,7 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
mov dstbakq, dstq
movsxd cntq, cntd
%endif
%ifdef PIC
%if PIC
lea ptrq, [default_8x8]
movzx cntd, byte [ptrq+cntq-1]
%else
@ -921,7 +923,7 @@ cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
mov dstbakq, dstq
movsxd cntq, cntd
%endif
%ifdef PIC
%if PIC
lea ptrq, [%5_8x8]
movzx cntd, byte [ptrq+cntq-1]
%else
@ -1128,7 +1130,7 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
mov dstbakq, dstq
movsxd cntq, cntd
%endif
%ifdef PIC
%if PIC
lea ptrq, [default_16x16]
movzx cntd, byte [ptrq+cntq-1]
%else
@ -1445,7 +1447,7 @@ cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
mov dstbakq, dstq
movsxd cntq, cntd
%endif
%ifdef PIC
%if PIC
lea ptrq, [%7_16x16]
movzx cntd, byte [ptrq+cntq-1]
%else
@ -1958,7 +1960,7 @@ cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
mov dstbakq, dstq
movsxd cntq, cntd
%endif
%ifdef PIC
%if PIC
lea ptrq, [default_32x32]
movzx cntd, byte [ptrq+cntq-1]
%else

File diff suppressed because it is too large Load Diff