1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

x86inc: support stack mem allocation and re-alignment in PROLOGUE.

Use this in VP8/H264-8bit loopfilter functions so they can be used if
there is no aligned stack (e.g. MSVC 32bit or ICC 10.x).

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Ronald S. Bultje 2012-12-08 16:12:38 -08:00 committed by Michael Niedermayer
parent 82c0211213
commit ce58642ed0
5 changed files with 191 additions and 101 deletions

View File

@ -400,14 +400,12 @@ DEBLOCK_LUMA
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5 cglobal deblock_%1_luma_8, 5,5,8,2*%2
lea r4, [r1*3] lea r4, [r1*3]
dec r2 ; alpha-1 dec r2 ; alpha-1
neg r4 neg r4
dec r3 ; beta-1 dec r3 ; beta-1
add r4, r0 ; pix-3*stride add r4, r0 ; pix-3*stride
%assign pad 2*%2+12-(stack_offset&15)
SUB esp, pad
mova m0, [r4+r1] ; p1 mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0 mova m1, [r4+2*r1] ; p0
@ -445,22 +443,19 @@ cglobal deblock_%1_luma_8, 5,5
DEBLOCK_P0_Q0 DEBLOCK_P0_Q0
mova [r4+2*r1], m1 mova [r4+2*r1], m1
mova [r0], m2 mova [r0], m2
ADD esp, pad
RET RET
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX cpuname INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5 cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
mov r0, r0mp mov r0, r0mp
mov r3, r1m mov r3, r1m
lea r4, [r3*3] lea r4, [r3*3]
sub r0, 4 sub r0, 4
lea r1, [r0+r4] lea r1, [r0+r4]
%assign pad 0x78-(stack_offset&15) %define pix_tmp esp+12*HAVE_ALIGNED_STACK
SUB esp, pad
%define pix_tmp esp+12
; transpose 6x16 -> tmp space ; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
@ -502,7 +497,6 @@ cglobal deblock_h_luma_8, 0,5
movq m3, [pix_tmp+0x48] movq m3, [pix_tmp+0x48]
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
ADD esp, pad
RET RET
%endmacro ; DEBLOCK_LUMA %endmacro ; DEBLOCK_LUMA
@ -635,7 +629,7 @@ DEBLOCK_LUMA v, 16
%define mpb_0 m14 %define mpb_0 m14
%define mpb_1 m15 %define mpb_1 m15
%else %else
%define spill(x) [esp+16*x+((stack_offset+4)&15)] %define spill(x) [esp+16*x]
%define p2 [r4+r1] %define p2 [r4+r1]
%define q2 [r0+2*r1] %define q2 [r0+2*r1]
%define t4 spill(0) %define t4 spill(0)
@ -650,10 +644,7 @@ DEBLOCK_LUMA v, 16
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_intra_8, 4,6,16 cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%if ARCH_X86_64 == 0
sub esp, 0x60
%endif
lea r4, [r1*4] lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride lea r5, [r1*3] ; 3*stride
dec r2d ; alpha-1 dec r2d ; alpha-1
@ -702,9 +693,6 @@ cglobal deblock_%1_luma_intra_8, 4,6,16
LUMA_INTRA_SWAP_PQ LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end: .end:
%if ARCH_X86_64 == 0
add esp, 0x60
%endif
RET RET
INIT_MMX cpuname INIT_MMX cpuname
@ -741,12 +729,10 @@ cglobal deblock_h_luma_intra_8, 4,9
add rsp, 0x88 add rsp, 0x88
RET RET
%else %else
cglobal deblock_h_luma_intra_8, 2,4 cglobal deblock_h_luma_intra_8, 2,4,8,0x80
lea r3, [r1*3] lea r3, [r1*3]
sub r0, 4 sub r0, 4
lea r2, [r0+r3] lea r2, [r0+r3]
%assign pad 0x8c-(stack_offset&15)
SUB rsp, pad
%define pix_tmp rsp %define pix_tmp rsp
; transpose 8x16 -> tmp space ; transpose 8x16 -> tmp space
@ -777,7 +763,6 @@ cglobal deblock_h_luma_intra_8, 2,4
lea r0, [r0+r1*8] lea r0, [r0+r1*8]
lea r2, [r2+r1*8] lea r2, [r2+r1*8]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
ADD rsp, pad
RET RET
%endif ; ARCH_X86_64 %endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA %endmacro ; DEBLOCK_LUMA_INTRA

View File

@ -276,18 +276,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2; c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2; c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
#endif /* HAVE_ALIGNED_STACK */
} }
if (EXTERNAL_SSSE3(mm_flags)) { if (EXTERNAL_SSSE3(mm_flags)) {
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
} }
if (EXTERNAL_AVX(mm_flags) && HAVE_ALIGNED_STACK) { if (EXTERNAL_AVX(mm_flags)) {
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;

View File

@ -1631,28 +1631,31 @@ SIMPLE_LOOPFILTER h, 5
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro INNER_LOOPFILTER 2 %macro INNER_LOOPFILTER 2
%define stack_size 0
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ; [3]=hev() result
%define stack_size mmsize * -4
%else ; h ; extra storage space for transposes
%define stack_size mmsize * -5
%endif
%endif
%if %2 == 8 ; chroma %if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma %else ; luma
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
%endif %endif
%if cpuflag(ssse3) %if cpuflag(ssse3)
pxor m7, m7 pxor m7, m7
%endif %endif
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ; [3]=hev() result %ifndef m8
%assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15)
%else ; h ; extra storage space for transposes
%assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15)
%endif
; splat function arguments ; splat function arguments
SPLATB_REG m0, flimEq, m7 ; E SPLATB_REG m0, flimEq, m7 ; E
SPLATB_REG m1, flimIq, m7 ; I SPLATB_REG m1, flimIq, m7 ; I
SPLATB_REG m2, hevthrq, m7 ; hev_thresh SPLATB_REG m2, hevthrq, m7 ; hev_thresh
SUB rsp, pad
%define m_flimE [rsp] %define m_flimE [rsp]
%define m_flimI [rsp+mmsize] %define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2] %define m_hevthr [rsp+mmsize*2]
@ -2082,12 +2085,10 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
dec cntrq dec cntrq
jg .next8px jg .next8px
%endif %endif
%endif REP_RET
%else ; mmsize == 16
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
ADD rsp, pad
%endif
RET RET
%endif
%endmacro %endmacro
%if ARCH_X86_32 %if ARCH_X86_32
@ -2122,31 +2123,34 @@ INNER_LOOPFILTER h, 8
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MBEDGE_LOOPFILTER 2 %macro MBEDGE_LOOPFILTER 2
%if %2 == 8 ; chroma %define stack_size 0
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr
%endif
%if cpuflag(ssse3)
pxor m7, m7
%endif
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%if mmsize == 16 ; [3]=hev() result %if mmsize == 16 ; [3]=hev() result
; [4]=filter tmp result ; [4]=filter tmp result
; [5]/[6] = p2/q2 backup ; [5]/[6] = p2/q2 backup
; [7]=lim_res sign result ; [7]=lim_res sign result
%assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15) %define stack_size mmsize * -7
%else ; 8 ; extra storage space for transposes %else ; 8 ; extra storage space for transposes
%assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15) %define stack_size mmsize * -8
%endif %endif
%endif
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
%endif
%if cpuflag(ssse3)
pxor m7, m7
%endif
%ifndef m8
; splat function arguments ; splat function arguments
SPLATB_REG m0, flimEq, m7 ; E SPLATB_REG m0, flimEq, m7 ; E
SPLATB_REG m1, flimIq, m7 ; I SPLATB_REG m1, flimIq, m7 ; I
SPLATB_REG m2, hevthrq, m7 ; hev_thresh SPLATB_REG m2, hevthrq, m7 ; hev_thresh
SUB rsp, pad
%define m_flimE [rsp] %define m_flimE [rsp]
%define m_flimI [rsp+mmsize] %define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2] %define m_hevthr [rsp+mmsize*2]
@ -2740,12 +2744,10 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevt
dec cntrq dec cntrq
jg .next8px jg .next8px
%endif %endif
%endif REP_RET
%else ; mmsize == 16
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
ADD rsp, pad
%endif
RET RET
%endif
%endmacro %endmacro
%if ARCH_X86_32 %if ARCH_X86_32

View File

@ -390,13 +390,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
#endif
} }
if (mm_flags & AV_CPU_FLAG_SSE2) { if (mm_flags & AV_CPU_FLAG_SSE2) {
@ -404,13 +402,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
#endif
} }
if (mm_flags & AV_CPU_FLAG_SSSE3) { if (mm_flags & AV_CPU_FLAG_SSSE3) {
@ -424,7 +420,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3; c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3; c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3; c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3; c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
@ -434,17 +429,14 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3; c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3; c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
#endif
} }
if (mm_flags & AV_CPU_FLAG_SSE4) { if (mm_flags & AV_CPU_FLAG_SSE4) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
#endif
} }
#endif /* HAVE_YASM */ #endif /* HAVE_YASM */
} }

View File

@ -130,7 +130,12 @@ CPUNOP amdnop
; %1 = number of arguments. loads them from stack if needed. ; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed. ; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers ; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
; and an extra register will be allocated to hold the original stack
; pointer (to not invalidate r0m etc.). To prevent the use of an extra
; register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal ; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g. ; e.g.
@ -166,11 +171,11 @@ CPUNOP amdnop
%define r%1m %2d %define r%1m %2d
%define r%1mp %2 %define r%1mp %2
%elif ARCH_X86_64 ; memory %elif ARCH_X86_64 ; memory
%define r%1m [rsp + stack_offset + %3] %define r%1m [rstk + stack_offset + %3]
%define r%1mp qword r %+ %1 %+ m %define r%1mp qword r %+ %1m
%else %else
%define r%1m [esp + stack_offset + %3] %define r%1m [rstk + stack_offset + %3]
%define r%1mp dword r %+ %1 %+ m %define r%1mp dword r %+ %1m
%endif %endif
%define r%1 %2 %define r%1 %2
%endmacro %endmacro
@ -231,12 +236,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%macro PUSH 1 %macro PUSH 1
push %1 push %1
%ifidn rstk, rsp
%assign stack_offset stack_offset+gprsize %assign stack_offset stack_offset+gprsize
%endif
%endmacro %endmacro
%macro POP 1 %macro POP 1
pop %1 pop %1
%ifidn rstk, rsp
%assign stack_offset stack_offset-gprsize %assign stack_offset stack_offset-gprsize
%endif
%endmacro %endmacro
%macro PUSH_IF_USED 1-* %macro PUSH_IF_USED 1-*
@ -268,14 +277,14 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%macro SUB 2 %macro SUB 2
sub %1, %2 sub %1, %2
%ifidn %1, rsp %ifidn %1, rstk
%assign stack_offset stack_offset+(%2) %assign stack_offset stack_offset+(%2)
%endif %endif
%endmacro %endmacro
%macro ADD 2 %macro ADD 2
add %1, %2 add %1, %2
%ifidn %1, rsp %ifidn %1, rstk
%assign stack_offset stack_offset-(%2) %assign stack_offset stack_offset-(%2)
%endif %endif
%endmacro %endmacro
@ -333,6 +342,73 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%assign n_arg_names %0 %assign n_arg_names %0
%endmacro %endmacro
; ALLOC_STACK stack_size [, n_xmm_regs]
;   %1 = requested stack size in bytes; may be negative (only its magnitude is
;        reserved, the sign selects where the original rsp is kept, see below).
;        Nothing is emitted unless %1 is numeric and non-zero.
;   %2 = number of xmm registers used (defaults to 0); copied to xmm_regs_used.
; Sets stack_size (always >= 0) and stack_size_padded, and lowers rsp.
;  * mmsize <= 16 and HAVE_ALIGNED_STACK: rsp is lowered by a padded amount
;    through the SUB macro, so stack_offset keeps tracking the adjustment.
;  * otherwise: rstk is redefined to the last allocated GPR (r<regs_used-1>),
;    the incoming rsp is copied into it, and rsp is lowered and masked down to
;    a multiple of the alignment. rstkm names the place the epilogue reloads
;    rsp from: a stack slot for a negative request, rstk itself for a positive
;    one.
; In both paths, 16 extra bytes are reserved per xmm register above 6 and
; WIN64_PUSH_XMM is invoked when xmm_regs_used > 6.
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
%if %1 != 0
; alignment = mmsize rounded up to a multiple of 16
%assign %%stack_alignment ((mmsize + 15) & ~15)
; stack_size = abs(%1)
%assign stack_size %1
%if stack_size < 0
%assign stack_size -stack_size
%endif
%assign xmm_regs_used %2
%if mmsize <= 16 && HAVE_ALIGNED_STACK
; pad using the statically tracked stack_offset; no run-time masking here
; NOTE(review): the "- gprsize" term presumably accounts for the return
; address pushed by the caller - confirm
%assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
%if xmm_regs_used > 6
%assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
%endif
SUB rsp, stack_size_padded
%else
; NOTE(review): reg_num is a plain (non-%%) assign, so it stays defined after
; this macro has expanded
%assign reg_num (regs_used - 1)
%xdefine rstk r %+ reg_num
; align stack, and save original stack location directly above
; it, i.e. in [rsp+stack_size_padded], so we can restore the
; stack in a single instruction (i.e. mov rsp, rstk or mov
; rsp, [rsp+stack_size_padded])
mov rstk, rsp
%assign stack_size_padded stack_size
%if xmm_regs_used > 6
%assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
%endif
%if %1 < 0 ; need to store rsp on stack
; reserve one extra gprsize slot above the padded area for the saved rsp
sub rsp, gprsize+stack_size_padded
and rsp, ~(%%stack_alignment-1)
%xdefine rstkm [rsp+stack_size_padded]
mov rstkm, rstk
%else ; can keep rsp in rstk during whole function
sub rsp, stack_size_padded
and rsp, ~(%%stack_alignment-1)
%xdefine rstkm rstk
%endif
%endif
%if xmm_regs_used > 6
WIN64_PUSH_XMM
%endif
%endif
%endif
%endmacro
; SETUP_STACK_POINTER stack_size
;   %1 = the stack-size argument given to PROLOGUE; ignored unless it is
;        numeric and non-zero.
; Only acts when HAVE_ALIGNED_STACK == 0 or mmsize == 32:
;  * positive size: regs_used is incremented by one.
;  * negative size: regs_used is left alone; on x86-64 a warning is emitted
;    when regs_used == num_args and num_args <= 4 + UNIX64 * 2.
; Reads num_args and regs_used, so it must be invoked after PROLOGUE has
; assigned them.
%macro SETUP_STACK_POINTER 1
%ifnum %1
%if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
%if %1 > 0
; NOTE(review): presumably this is the extra register that holds the original
; stack pointer, as described in the PROLOGUE parameter notes - confirm
%assign regs_used (regs_used + 1)
%elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
%warning "Stack pointer will overwrite register argument"
%endif
%endif
%endif
%endmacro
; DEFINE_ARGS_INTERNAL n_prologue_args, arg4, rest...
;   %1 = argument count of the calling PROLOGUE (callers pass %0)
;   %2 = PROLOGUE's 4th argument: either a stack size or the first arg name
;   %3 = everything after the 4th argument
; Forwards the argument-name list to DEFINE_ARGS:
;  * %2 numeric (a stack size was given): the names are %3.
;  * %2 not numeric and %1 == 4: %2 is the only name.
;  * %2 not numeric and %1 > 4: the names are %2 followed by %3.
; With fewer than 4 PROLOGUE arguments and a non-numeric %2, DEFINE_ARGS is
; not invoked.
%macro DEFINE_ARGS_INTERNAL 3+
%ifnum %2
DEFINE_ARGS %3
%elif %1 == 4
DEFINE_ARGS %2
%elif %1 > 4
DEFINE_ARGS %2, %3
%endif
%endmacro
%if WIN64 ; Windows x64 ;================================================= %if WIN64 ; Windows x64 ;=================================================
DECLARE_REG 0, rcx DECLARE_REG 0, rcx
@ -351,31 +427,37 @@ DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112 DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120 DECLARE_REG 14, R15, 120
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1 %assign num_args %1
%assign regs_used %2 %assign regs_used %2
SETUP_STACK_POINTER %4
ASSERT regs_used >= num_args ASSERT regs_used >= num_args
ASSERT regs_used <= 15 ASSERT regs_used <= 15
PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
%if mmsize == 8
%assign xmm_regs_used 0 %assign xmm_regs_used 0
%else ALLOC_STACK %4, %3
%if mmsize != 8 && stack_size == 0
WIN64_SPILL_XMM %3 WIN64_SPILL_XMM %3
%endif %endif
LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS %4 DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%macro WIN64_PUSH_XMM 0
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
movdqa [rsp + (%%i-6)*16 + stack_size], xmm %+ %%i
%endrep
%endmacro %endmacro
%macro WIN64_SPILL_XMM 1 %macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1 %assign xmm_regs_used %1
ASSERT xmm_regs_used <= 16 ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6 %if xmm_regs_used > 6
SUB rsp, (xmm_regs_used-6)*16+16 %assign stack_size_padded (xmm_regs_used-6)*16+16-gprsize-(stack_offset&15)
%assign %%i xmm_regs_used SUB rsp, stack_size_padded
%rep (xmm_regs_used-6) WIN64_PUSH_XMM
%assign %%i %%i-1
movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
%endrep
%endif %endif
%endmacro %endmacro
@ -384,19 +466,25 @@ DECLARE_REG 14, R15, 120
%assign %%i xmm_regs_used %assign %%i xmm_regs_used
%rep (xmm_regs_used-6) %rep (xmm_regs_used-6)
%assign %%i %%i-1 %assign %%i %%i-1
movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size]
%endrep %endrep
add %1, (xmm_regs_used-6)*16+16 %endif
%if stack_size_padded > 0
%if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
mov rsp, rstkm
%else
add %1, stack_size_padded
%endif
%endif %endif
%endmacro %endmacro
%macro WIN64_RESTORE_XMM 1 %macro WIN64_RESTORE_XMM 1
WIN64_RESTORE_XMM_INTERNAL %1 WIN64_RESTORE_XMM_INTERNAL %1
%assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 %assign stack_offset (stack_offset-stack_size_padded)
%assign xmm_regs_used 0 %assign xmm_regs_used 0
%endmacro %endmacro
%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
%macro RET 0 %macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp WIN64_RESTORE_XMM_INTERNAL rsp
@ -425,19 +513,28 @@ DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64 DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72 DECLARE_REG 14, R15, 72
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1 %assign num_args %1
%assign regs_used %2 %assign regs_used %2
SETUP_STACK_POINTER %4
ASSERT regs_used >= num_args ASSERT regs_used >= num_args
ASSERT regs_used <= 15 ASSERT regs_used <= 15
PUSH_IF_USED 9, 10, 11, 12, 13, 14 PUSH_IF_USED 9, 10, 11, 12, 13, 14
ALLOC_STACK %4
LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS %4 DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro %endmacro
%define has_epilogue regs_used > 9 || mmsize == 32 %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
%macro RET 0 %macro RET 0
%if stack_size_padded > 0
%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
mov rsp, rstkm
%else
add rsp, stack_size_padded
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9 POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32 %if mmsize == 32
vzeroupper vzeroupper
@ -458,7 +555,7 @@ DECLARE_REG 6, ebp, 28
%macro DECLARE_ARG 1-* %macro DECLARE_ARG 1-*
%rep %0 %rep %0
%define r%1m [esp + stack_offset + 4*%1 + 4] %define r%1m [rstk + stack_offset + 4*%1 + 4]
%define r%1mp dword r%1m %define r%1mp dword r%1m
%rotate 1 %rotate 1
%endrep %endrep
@ -466,24 +563,31 @@ DECLARE_REG 6, ebp, 28
DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1 %assign num_args %1
%assign regs_used %2 %assign regs_used %2
%if num_args > 7
%assign num_args 7
%endif
%if regs_used > 7 %if regs_used > 7
%assign regs_used 7 %assign regs_used 7
%endif %endif
SETUP_STACK_POINTER %4
ASSERT regs_used <= 7
ASSERT regs_used >= num_args ASSERT regs_used >= num_args
PUSH_IF_USED 3, 4, 5, 6 PUSH_IF_USED 3, 4, 5, 6
ALLOC_STACK %4
LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
DEFINE_ARGS %4 DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro %endmacro
%define has_epilogue regs_used > 3 || mmsize == 32 %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
%macro RET 0 %macro RET 0
%if stack_size_padded > 0
%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
mov rsp, rstkm
%else
add rsp, stack_size_padded
%endif
%endif
POP_IF_USED 6, 5, 4, 3 POP_IF_USED 6, 5, 4, 3
%if mmsize == 32 %if mmsize == 32
vzeroupper vzeroupper
@ -498,6 +602,8 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endmacro %endmacro
%macro WIN64_RESTORE_XMM 1 %macro WIN64_RESTORE_XMM 1
%endmacro %endmacro
; Empty stub: takes no arguments and expands to nothing.
; NOTE(review): it follows the empty WIN64_RESTORE_XMM stub, so this is
; presumably the variant used when WIN64 is not set - confirm against the
; enclosing %if.
%macro WIN64_PUSH_XMM 0
%endmacro
%endif %endif
%macro REP_RET 0 %macro REP_RET 0
@ -527,8 +633,12 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
; Applies any symbol mangling needed for C linkage, and sets up a define such that ; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version. ; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified. ; Appends cpuflags to the function name if cpuflags has been specified.
%macro cglobal 1-2+ "" ; name, [PROLOGUE args] %macro cglobal 1-2+ ; name, [PROLOGUE args]
%if %0 == 1
cglobal_internal %1 %+ SUFFIX
%else
cglobal_internal %1 %+ SUFFIX, %2 cglobal_internal %1 %+ SUFFIX, %2
%endif
%endmacro %endmacro
%macro cglobal_internal 1-2+ %macro cglobal_internal 1-2+
%ifndef cglobaled_%1 %ifndef cglobaled_%1
@ -545,8 +655,11 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
align function_align align function_align
%1: %1:
RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
%xdefine rstk rsp
%assign stack_offset 0 %assign stack_offset 0
%ifnidn %2, "" %assign stack_size 0
%assign stack_size_padded 0
%if %0 > 1
PROLOGUE %2 PROLOGUE %2
%endif %endif
%endmacro %endmacro