1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

Merge commit 'bbe4a6db44f0b55b424a5cc9d3e89cd88e250450'

* commit 'bbe4a6db44f0b55b424a5cc9d3e89cd88e250450':
  x86inc: Utilize the shadow space on 64-bit Windows

Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Michael Niedermayer 2013-10-08 11:22:54 +02:00
commit 1f17619fe4
3 changed files with 55 additions and 51 deletions

View File

@@ -672,13 +672,13 @@ cglobal imdct_calc, 3,5,3
 push r1
 push r0
 %else
-sub rsp, 8
+sub rsp, 8+32*WIN64 ; allocate win64 shadow space
 %endif
 call r4
 %if ARCH_X86_32
 add esp, 12
 %else
-add rsp, 8
+add rsp, 8+32*WIN64
 %endif
 POP r1
 POP r3

View File

@@ -331,16 +331,14 @@ cglobal deblock_v_luma_8, 5,5,10
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma_8, 5,9
+cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
 movsxd r7, r1d
 lea r8, [r7+r7*2]
 lea r6, [r0-4]
 lea r5, [r0-4+r8]
 %if WIN64
-sub rsp, 0x98
-%define pix_tmp rsp+0x30
+%define pix_tmp rsp+0x30 ; shadow space + r4
 %else
-sub rsp, 0x68
 %define pix_tmp rsp
 %endif
@@ -379,11 +377,6 @@ cglobal deblock_h_luma_8, 5,9
 movq m3, [pix_tmp+0x40]
 TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
-%if WIN64
-add rsp, 0x98
-%else
-add rsp, 0x68
-%endif
 RET
 %endmacro
@@ -708,13 +701,16 @@ INIT_MMX cpuname
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_8, 4,9
+cglobal deblock_h_luma_intra_8, 4,9,0,0x80
 movsxd r7, r1d
 lea r8, [r7*3]
 lea r6, [r0-4]
 lea r5, [r0-4+r8]
-sub rsp, 0x88
+%if WIN64
+%define pix_tmp rsp+0x20 ; shadow space
+%else
 %define pix_tmp rsp
+%endif
 ; transpose 8x16 -> tmp space
 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
@@ -734,7 +730,6 @@ cglobal deblock_h_luma_intra_8, 4,9
 sub r5, r7
 shr r7, 3
 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
-add rsp, 0x88
 RET
 %else
 cglobal deblock_h_luma_intra_8, 2,4,8,0x80

View File

@@ -353,14 +353,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %if stack_size < 0
 %assign stack_size -stack_size
 %endif
-%if mmsize != 8
-%assign xmm_regs_used %2
+%assign stack_size_padded stack_size
+%if WIN64
+%assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
+%if mmsize != 8
+%assign xmm_regs_used %2
+%if xmm_regs_used > 8
+%assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
+%endif
+%endif
 %endif
 %if mmsize <= 16 && HAVE_ALIGNED_STACK
-%assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
-%if xmm_regs_used > 6
-%assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-%endif
+%assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
 SUB rsp, stack_size_padded
 %else
 %assign %%reg_num (regs_used - 1)
@@ -370,14 +374,6 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 ; stack in a single instruction (i.e. mov rsp, rstk or mov
 ; rsp, [rsp+stack_size_padded])
 mov rstk, rsp
-%assign stack_size_padded stack_size
-%if xmm_regs_used > 6
-%assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-%if mmsize == 32 && xmm_regs_used & 1
-; re-align to 32 bytes
-%assign stack_size_padded (stack_size_padded + 16)
-%endif
-%endif
 %if %1 < 0 ; need to store rsp on stack
 sub rsp, gprsize+stack_size_padded
 and rsp, ~(%%stack_alignment-1)
@@ -389,9 +385,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %xdefine rstkm rstk
 %endif
 %endif
-%if xmm_regs_used > 6
-WIN64_PUSH_XMM
-%endif
+WIN64_PUSH_XMM
 %endif
 %endif
 %endmacro
@@ -452,40 +446,55 @@ DECLARE_REG 14, R15, 120
 %endmacro
 %macro WIN64_PUSH_XMM 0
-%assign %%i xmm_regs_used
-%rep (xmm_regs_used-6)
-%assign %%i %%i-1
-movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
-%endrep
+; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+%if xmm_regs_used > 6
+movaps [rstk + stack_offset + 8], xmm6
+%endif
+%if xmm_regs_used > 7
+movaps [rstk + stack_offset + 24], xmm7
+%endif
+%if xmm_regs_used > 8
+%assign %%i 8
+%rep xmm_regs_used-8
+movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+%assign %%i %%i+1
+%endrep
+%endif
 %endmacro
 %macro WIN64_SPILL_XMM 1
 %assign xmm_regs_used %1
 ASSERT xmm_regs_used <= 16
-%if xmm_regs_used > 6
-SUB rsp, (xmm_regs_used-6)*16+16
-WIN64_PUSH_XMM
+%if xmm_regs_used > 8
+%assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
+SUB rsp, stack_size_padded
 %endif
+WIN64_PUSH_XMM
 %endmacro
 %macro WIN64_RESTORE_XMM_INTERNAL 1
-%if xmm_regs_used > 6
+%assign %%pad_size 0
+%if xmm_regs_used > 8
 %assign %%i xmm_regs_used
-%rep (xmm_regs_used-6)
+%rep xmm_regs_used-8
 %assign %%i %%i-1
-movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
+movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
 %endrep
-%if stack_size_padded == 0
-add %1, (xmm_regs_used-6)*16+16
-%endif
 %endif
 %if stack_size_padded > 0
 %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
 mov rsp, rstkm
 %else
 add %1, stack_size_padded
+%assign %%pad_size stack_size_padded
 %endif
 %endif
+%if xmm_regs_used > 7
+movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+%endif
+%if xmm_regs_used > 6
+movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
+%endif
 %endmacro
 %macro WIN64_RESTORE_XMM 1
@@ -702,12 +711,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 %endif
 align function_align
 %2:
-RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
-%xdefine rstk rsp
-%assign stack_offset 0
-%assign stack_size 0
-%assign stack_size_padded 0
-%assign xmm_regs_used 0
+RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
+%xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+%assign stack_offset 0 ; stack pointer offset relative to the return address
+%assign stack_size 0 ; amount of stack space that can be freely used inside a function
+%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
 %ifnidn %3, ""
 PROLOGUE %3
 %endif