Mirror of https://github.com/FFmpeg/FFmpeg.git (synced 2024-12-23 12:43:46 +02:00)
Merge commit 'bbe4a6db44f0b55b424a5cc9d3e89cd88e250450'

* commit 'bbe4a6db44f0b55b424a5cc9d3e89cd88e250450':
    x86inc: Utilize the shadow space on 64-bit Windows

Merged-by: Michael Niedermayer <michaelni@gmx.at>

This commit is contained in: 1f17619fe4
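Background for the patch below: the Win64 calling convention requires every caller to reserve 32 bytes of "shadow space" directly above the return address before each call, and the callee may use that area freely. The hunks that follow exploit this in both directions: WIN64_PUSH_XMM now stores the callee-saved xmm6/xmm7 in the shadow space provided by the function's own caller instead of growing the stack frame (only xmm8 and above still need allocated space), while functions that themselves make calls (the indirect call in imdct_calc, the deblock_h_luma functions) account for the 32 bytes of shadow space they owe their own callees when sizing their stack frame.

A minimal standalone NASM sketch of the first idea (not part of the patch; the caller/callee labels and the leaf-function shape are invented for illustration):

    callee:
        ; On entry rsp points at the return address, so the caller's shadow
        ; space is [rsp+8]..[rsp+39]; rsp+8 is 16-byte aligned because the ABI
        ; keeps rsp 16-byte aligned at the call site.
        movaps [rsp +  8], xmm6     ; spill xmm6 into the shadow space
        movaps [rsp + 24], xmm7     ; spill xmm7 into the shadow space
        ; ... function body is free to clobber xmm6/xmm7 here ...
        movaps xmm6, [rsp +  8]     ; restore before returning
        movaps xmm7, [rsp + 24]
        ret

    caller:
        sub  rsp, 40                ; 32 bytes of shadow space + 8 to keep rsp 16-byte aligned at the call
        call callee
        add  rsp, 40
        ret

In the x86inc macros the same two slots are reached as [rstk + stack_offset + 8] and [rstk + stack_offset + 24], since stack_offset tracks how far rsp has moved since function entry.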
@@ -672,13 +672,13 @@ cglobal imdct_calc, 3,5,3
     push   r1
     push   r0
 %else
-    sub    rsp, 8
+    sub    rsp, 8+32*WIN64 ; allocate win64 shadow space
 %endif
     call   r4
 %if ARCH_X86_32
     add    esp, 12
 %else
-    add    rsp, 8
+    add    rsp, 8+32*WIN64
 %endif
     POP    r1
     POP    r3
@@ -331,16 +331,14 @@ cglobal deblock_v_luma_8, 5,5,10
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma_8, 5,9
+cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
     movsxd r7,  r1d
     lea    r8,  [r7+r7*2]
     lea    r6,  [r0-4]
     lea    r5,  [r0-4+r8]
 %if WIN64
-    sub    rsp, 0x98
-    %define pix_tmp rsp+0x30
+    %define pix_tmp rsp+0x30 ; shadow space + r4
 %else
-    sub    rsp, 0x68
     %define pix_tmp rsp
 %endif

@@ -379,11 +377,6 @@ cglobal deblock_h_luma_8, 5,9
     movq   m3, [pix_tmp+0x40]
     TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

-%if WIN64
-    add    rsp, 0x98
-%else
-    add    rsp, 0x68
-%endif
     RET
 %endmacro

@@ -708,13 +701,16 @@ INIT_MMX cpuname
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_8, 4,9
+cglobal deblock_h_luma_intra_8, 4,9,0,0x80
     movsxd r7,  r1d
     lea    r8,  [r7*3]
     lea    r6,  [r0-4]
     lea    r5,  [r0-4+r8]
-    sub    rsp, 0x88
+%if WIN64
+    %define pix_tmp rsp+0x20 ; shadow space
+%else
     %define pix_tmp rsp
+%endif

     ; transpose 8x16 -> tmp space
     TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
@@ -734,7 +730,6 @@ cglobal deblock_h_luma_intra_8, 4,9
     sub    r5,  r7
     shr    r7,  3
     TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
-    add    rsp, 0x88
     RET
 %else
 cglobal deblock_h_luma_intra_8, 2,4,8,0x80
@@ -353,14 +353,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
             %if stack_size < 0
                 %assign stack_size -stack_size
             %endif
-            %if mmsize != 8
-                %assign xmm_regs_used %2
+            %assign stack_size_padded stack_size
+            %if WIN64
+                %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
+                %if mmsize != 8
+                    %assign xmm_regs_used %2
+                    %if xmm_regs_used > 8
+                        %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
+                    %endif
+                %endif
             %endif
             %if mmsize <= 16 && HAVE_ALIGNED_STACK
-                %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
-                %if xmm_regs_used > 6
-                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-                %endif
+                %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                 SUB rsp, stack_size_padded
             %else
                 %assign %%reg_num (regs_used - 1)
@@ -370,14 +374,6 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
                 ; stack in a single instruction (i.e. mov rsp, rstk or mov
                 ; rsp, [rsp+stack_size_padded])
                 mov  rstk, rsp
-                %assign stack_size_padded stack_size
-                %if xmm_regs_used > 6
-                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-                    %if mmsize == 32 && xmm_regs_used & 1
-                        ; re-align to 32 bytes
-                        %assign stack_size_padded (stack_size_padded + 16)
-                    %endif
-                %endif
                 %if %1 < 0 ; need to store rsp on stack
                     sub  rsp, gprsize+stack_size_padded
                     and  rsp, ~(%%stack_alignment-1)
@@ -389,9 +385,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
                     %xdefine rstkm rstk
                 %endif
             %endif
-            %if xmm_regs_used > 6
-                WIN64_PUSH_XMM
-            %endif
+            WIN64_PUSH_XMM
         %endif
     %endif
 %endmacro
@@ -452,40 +446,55 @@ DECLARE_REG 14, R15, 120
 %endmacro

 %macro WIN64_PUSH_XMM 0
-    %assign %%i xmm_regs_used
-    %rep (xmm_regs_used-6)
-        %assign %%i %%i-1
-        movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
-    %endrep
+    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+    %if xmm_regs_used > 6
+        movaps [rstk + stack_offset +  8], xmm6
+    %endif
+    %if xmm_regs_used > 7
+        movaps [rstk + stack_offset + 24], xmm7
+    %endif
+    %if xmm_regs_used > 8
+        %assign %%i 8
+        %rep xmm_regs_used-8
+            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
 %endmacro

 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
     ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 6
-        SUB rsp, (xmm_regs_used-6)*16+16
-        WIN64_PUSH_XMM
+    %if xmm_regs_used > 8
+        %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
+        SUB rsp, stack_size_padded
     %endif
+    WIN64_PUSH_XMM
 %endmacro

 %macro WIN64_RESTORE_XMM_INTERNAL 1
-    %if xmm_regs_used > 6
+    %assign %%pad_size 0
+    %if xmm_regs_used > 8
         %assign %%i xmm_regs_used
-        %rep (xmm_regs_used-6)
+        %rep xmm_regs_used-8
             %assign %%i %%i-1
-            movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
+            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
         %endrep
-        %if stack_size_padded == 0
-            add %1, (xmm_regs_used-6)*16+16
-        %endif
     %endif
     %if stack_size_padded > 0
         %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
             mov rsp, rstkm
         %else
             add %1, stack_size_padded
+            %assign %%pad_size stack_size_padded
         %endif
     %endif
+    %if xmm_regs_used > 7
+        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %endif
+    %if xmm_regs_used > 6
+        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
+    %endif
 %endmacro

 %macro WIN64_RESTORE_XMM 1
@@ -702,12 +711,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %endif
     align function_align
     %2:
-    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
-    %xdefine rstk rsp
-    %assign stack_offset 0
-    %assign stack_size 0
-    %assign stack_size_padded 0
-    %assign xmm_regs_used 0
+    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
+    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+    %assign stack_offset 0      ; stack pointer offset relative to the return address
+    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
+    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
     %ifnidn %3, ""
         PROLOGUE %3
     %endif
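For reference, here is a hedged sketch of how a function would pick up the new behaviour through the x86inc macros above; the name example_fn and its parameters are hypothetical, not taken from the patch:

    ; 3 args, 5 GPRs, 10 XMM registers, 0x40 bytes of scratch space.
    ; On WIN64 the generated prologue stores xmm6/xmm7 in the caller's shadow
    ; space via WIN64_PUSH_XMM and allocates stack only for xmm8/xmm9, the
    ; 0x40 scratch bytes, 32 bytes of outgoing shadow space and padding.
    cglobal example_fn, 3,5,10, 0x40
        ; function body; xmm6-xmm9 may be freely clobbered here
        RET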
|