
vp9/x86: use explicit register for relative stack references.

Before this patch, we explicitly modify rsp, which isn't necessarily
universally acceptable, since the space under the stack pointer might
be modified in things like signal handlers. Therefore, use an explicit
register to hold the stack pointer relative to the bottom of the stack
(i.e. rsp). This will also clear out valgrind errors about the use of
uninitialized data that started occurring after the idct16x16/ssse3
optimizations were first merged.
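
In practice the change is mechanical: instead of advancing rsp itself through the on-stack scratch buffer during the per-column loops, rsp stays fixed and a spare general-purpose register walks the buffer. A minimal sketch of the before/after pattern (illustration only, not code taken from the patch; the cnt/tmp names follow the DEFINE_ARGS naming used in the diff below):

    ; before: rsp itself is advanced, so the 16 bytes just processed sit
    ; below the stack pointer until the trailing "sub rsp, 32" restores it;
    ; a signal handler delivered in that window is free to clobber them
    mov     cntd, 2
.loop_before:
    mova    m0, [rsp]          ; process one 16-byte column of the buffer
    add     rsp, 16            ; processed data now lies below rsp
    dec     cntd
    jg .loop_before
    sub     rsp, 32            ; rsp must be restored by hand

    ; after: rsp never moves; a scratch pointer (tmpq) walks the same
    ; buffer, which therefore stays above rsp the whole time
    mov     cntd, 2
    mov     tmpq, rsp
.loop_after:
    mova    m0, [tmpq]
    add     tmpq, 16
    dec     cntd
    jg .loop_after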
Author: Ronald S. Bultje
Date: 2014-01-24 18:48:56 -05:00
parent 97474d527f
commit c9e6325ed9


@@ -772,40 +772,40 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx
 %endmacro
 %macro VP9_IDCT16_1D 2-3 16 ; src, pass, nnzc
-    VP9_IDCT16_1D_START %1, %3, 32, rsp+32
+    VP9_IDCT16_1D_START %1, %3, 32, tmpq+32
 %if %2 == 1
     ; backup a different register
-    mova [rsp+16], m15
-    mova m7, [rsp+32]
+    mova [tmpq+16], m15
+    mova m7, [tmpq+32]
     SUMSUB_BA w, 6, 9, 15 ; t6, t9
     SUMSUB_BA w, 7, 8, 15 ; t7, t8
     TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15
-    mova [rsp+ 0], m0
-    mova [rsp+ 32], m1
-    mova [rsp+ 64], m2
-    mova [rsp+ 96], m3
-    mova [rsp+128], m4
-    mova [rsp+160], m5
-    mova [rsp+192], m6
-    mova [rsp+224], m7
-    mova m15, [rsp+16]
+    mova [tmpq+ 0], m0
+    mova [tmpq+ 32], m1
+    mova [tmpq+ 64], m2
+    mova [tmpq+ 96], m3
+    mova [tmpq+128], m4
+    mova [tmpq+160], m5
+    mova [tmpq+192], m6
+    mova [tmpq+224], m7
+    mova m15, [tmpq+16]
     TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
-    mova [rsp+ 16], m8
-    mova [rsp+ 48], m9
-    mova [rsp+ 80], m10
-    mova [rsp+112], m11
-    mova [rsp+144], m12
-    mova [rsp+176], m13
-    mova [rsp+208], m14
-    mova [rsp+240], m15
+    mova [tmpq+ 16], m8
+    mova [tmpq+ 48], m9
+    mova [tmpq+ 80], m10
+    mova [tmpq+112], m11
+    mova [tmpq+144], m12
+    mova [tmpq+176], m13
+    mova [tmpq+208], m14
+    mova [tmpq+240], m15
 %else ; %2 == 2
     ; backup more registers
-    mova [rsp+64], m8
-    mova [rsp+96], m9
+    mova [tmpq+64], m8
+    mova [tmpq+96], m9
     pxor m7, m7
     pmulhrsw m0, [pw_512]
@@ -823,9 +823,9 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx
     ; restore from cache
     SWAP 0, 7 ; move zero from m7 to m0
-    mova m7, [rsp+32]
-    mova m8, [rsp+64]
-    mova m9, [rsp+96]
+    mova m7, [tmpq+32]
+    mova m8, [tmpq+64]
+    mova m9, [tmpq+96]
     SUMSUB_BA w, 6, 9, 1 ; t6, t9
     SUMSUB_BA w, 7, 8, 1 ; t7, t8
@@ -871,7 +871,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx
 %macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
 INIT_XMM %1
-cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
     ; 2x2=eob=3, 4x4=eob=10
     cmp eobd, 38
     jg .idctfull
@@ -894,19 +894,19 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
     VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
     RET
+    DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
 .idct8x8:
-    DEFINE_ARGS dst, stride, block, cnt, dst_bak
+    mov tmpq, rsp
     VP9_IDCT16_1D blockq, 1, 8
     mov cntd, 2
     mov dst_bakq, dstq
 .loop2_8x8:
-    VP9_IDCT16_1D rsp, 2, 8
+    VP9_IDCT16_1D tmpq, 2, 8
     lea dstq, [dst_bakq+8]
-    add rsp, 16
+    add tmpq, 16
     dec cntd
     jg .loop2_8x8
-    sub rsp, 32
     ; at the end of the loop, m0 should still be zero
     ; use that to zero out block coefficients
@@ -914,26 +914,25 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
     RET
 .idctfull:
-    DEFINE_ARGS dst, stride, block, cnt, dst_bak
     mov cntd, 2
+    mov tmpq, rsp
 .loop1_full:
     VP9_IDCT16_1D blockq, 1
     add blockq, 16
-    add rsp, 256
+    add tmpq, 256
     dec cntd
     jg .loop1_full
     sub blockq, 32
-    sub rsp, 512
     mov cntd, 2
+    mov tmpq, rsp
     mov dst_bakq, dstq
 .loop2_full:
-    VP9_IDCT16_1D rsp, 2
+    VP9_IDCT16_1D tmpq, 2
     lea dstq, [dst_bakq+8]
-    add rsp, 16
+    add tmpq, 16
     dec cntd
     jg .loop2_full
-    sub rsp, 32
     ; at the end of the loop, m0 should still be zero
     ; use that to zero out block coefficients
@@ -970,7 +969,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     VP9_RND_SH_SUMSUB_BA 2, 11, 5, 7, 12, [pd_8192] ; m2=t2[w], m11=t10[w]
     VP9_RND_SH_SUMSUB_BA 3, 10, 4, 6, 12, [pd_8192] ; m3=t3[w], m10=t11[w]
-    mova [rsp+ 0*%%str], m9 ; make some scratch space (t0:m9->r0)
+    mova [tmpq+ 0*%%str], m9 ; make some scratch space (t0:m9->r0)
     mova m4, [%1+ 4*32] ; in4
     mova m5, [%1+11*32] ; in11
     mova m12, [%1+ 3*32] ; in3
@@ -981,10 +980,10 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     VP9_RND_SH_SUMSUB_BA 13, 4, 15, 6, 9, [pd_8192] ; m13=t4[w], m4=t12[w]
     VP9_RND_SH_SUMSUB_BA 12, 5, 14, 7, 9, [pd_8192] ; m12=t5[w], m5=t13[w]
-    mova [rsp+ 2*%%str], m8 ; t1:m9->r2
-    mova [rsp+ 3*%%str], m2 ; t2:m2->r3
-    mova [rsp+ 4*%%str], m3 ; t3:m3->r4
-    mova [rsp+ 5*%%str], m13 ; t4:m13->r5
+    mova [tmpq+ 2*%%str], m8 ; t1:m9->r2
+    mova [tmpq+ 3*%%str], m2 ; t2:m2->r3
+    mova [tmpq+ 4*%%str], m3 ; t3:m3->r4
+    mova [tmpq+ 5*%%str], m13 ; t4:m13->r5
     mova m2, [%1+ 6*32] ; in6
     mova m3, [%1+ 9*32] ; in9
     mova m8, [%1+ 1*32] ; in1
@@ -1030,16 +1029,16 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     ; m3=out1, m11=out2, m1=out5, m5=out6, m4=out9, m0=out10, m10=out13, m2=out14
-    mova m6, [rsp+ 0*%%str]
-    mova m7, [rsp+ 2*%%str]
-    mova m13, [rsp+ 3*%%str]
-    mova m14, [rsp+ 4*%%str]
-    mova m15, [rsp+ 5*%%str]
-    mova [rsp+ 8*%%str], m5
-    mova [rsp+ 9*%%str], m4
-    mova [rsp+10*%%str], m0
-    mova [rsp+11*%%str], m10
-    mova [rsp+12*%%str], m2
+    mova m6, [tmpq+ 0*%%str]
+    mova m7, [tmpq+ 2*%%str]
+    mova m13, [tmpq+ 3*%%str]
+    mova m14, [tmpq+ 4*%%str]
+    mova m15, [tmpq+ 5*%%str]
+    mova [tmpq+ 8*%%str], m5
+    mova [tmpq+ 9*%%str], m4
+    mova [tmpq+10*%%str], m0
+    mova [tmpq+11*%%str], m10
+    mova [tmpq+12*%%str], m2
     ; m6=t0, m7=t1, m13=t2, m14=t3, m15=t4, m12=t5, m9=t6, m8=t7
     ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
@@ -1069,32 +1068,32 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
 %if %2 == 1
-    mova m0, [rsp+ 8*%%str]
+    mova m0, [tmpq+ 8*%%str]
     TRANSPOSE8x8W 9, 3, 11, 14, 7, 1, 0, 12, 2
-    mova [rsp+ 0*16], m9
-    mova [rsp+ 2*16], m3
-    mova [rsp+ 4*16], m11
-    mova [rsp+ 6*16], m14
-    mova m9, [rsp+ 9*%%str]
-    mova m3, [rsp+10*%%str]
-    mova m11, [rsp+11*%%str]
-    mova m14, [rsp+12*%%str]
-    mova [rsp+ 8*16], m7
-    mova [rsp+10*16], m1
-    mova [rsp+12*16], m0
-    mova [rsp+14*16], m12
+    mova [tmpq+ 0*16], m9
+    mova [tmpq+ 2*16], m3
+    mova [tmpq+ 4*16], m11
+    mova [tmpq+ 6*16], m14
+    mova m9, [tmpq+ 9*%%str]
+    mova m3, [tmpq+10*%%str]
+    mova m11, [tmpq+11*%%str]
+    mova m14, [tmpq+12*%%str]
+    mova [tmpq+ 8*16], m7
+    mova [tmpq+10*16], m1
+    mova [tmpq+12*16], m0
+    mova [tmpq+14*16], m12
     TRANSPOSE8x8W 15, 9, 3, 6, 13, 11, 14, 8, 2
-    mova [rsp+ 1*16], m15
-    mova [rsp+ 3*16], m9
-    mova [rsp+ 5*16], m3
-    mova [rsp+ 7*16], m6
-    mova [rsp+ 9*16], m13
-    mova [rsp+11*16], m11
-    mova [rsp+13*16], m14
-    mova [rsp+15*16], m8
+    mova [tmpq+ 1*16], m15
+    mova [tmpq+ 3*16], m9
+    mova [tmpq+ 5*16], m3
+    mova [tmpq+ 7*16], m6
+    mova [tmpq+ 9*16], m13
+    mova [tmpq+11*16], m11
+    mova [tmpq+13*16], m14
+    mova [tmpq+15*16], m8
 %else
-    mova m5, [rsp+ 8*%%str]
+    mova m5, [tmpq+ 8*%%str]
     pxor m0, m0
     pmulhrsw m9, [pw_512]
@@ -1114,10 +1113,10 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     VP9_STORE_2X 5, 12, 2, 4, 0
     lea dstq, [dstq+strideq*2]
-    mova m9, [rsp+ 9*%%str]
-    mova m3, [rsp+10*%%str]
-    mova m11, [rsp+11*%%str]
-    mova m14, [rsp+12*%%str]
+    mova m9, [tmpq+ 9*%%str]
+    mova m3, [tmpq+10*%%str]
+    mova m11, [tmpq+11*%%str]
+    mova m14, [tmpq+12*%%str]
     pmulhrsw m15, [pw_512]
     pmulhrsw m9, [pw_512]
@@ -1139,29 +1138,26 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
 %macro IADST16_FN 5
 INIT_XMM %5
-cglobal vp9_%1_%3_16x16_add, 3, 5, 16, 512, dst, stride, block, eob
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
     ; potential eob checks go here
-    DEFINE_ARGS dst, stride, block, cnt, dst_bak
     mov cntd, 2
+    mov tmpq, rsp
 .loop1_full:
     VP9_%2_1D blockq, 1
     add blockq, 16
-    add rsp, 256
+    add tmpq, 256
     dec cntd
     jg .loop1_full
     sub blockq, 32
-    sub rsp, 512
     mov cntd, 2
+    mov tmpq, rsp
     mov dst_bakq, dstq
 .loop2_full:
-    VP9_%4_1D rsp, 2
+    VP9_%4_1D tmpq, 2
     lea dstq, [dst_bakq+8]
-    add rsp, 16
+    add tmpq, 16
     dec cntd
     jg .loop2_full
-    sub rsp, 32
     ; at the end of the loop, m0 should still be zero
     ; use that to zero out block coefficients
@@ -1183,11 +1179,11 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
 %macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
 %assign %%str 16*%2*%2
     ; first do t0-15, this can be done identical to idct16x16
-    VP9_IDCT16_1D_START %1, %3/2, 64*2, rsp+ 4*%%str
+    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq+ 4*%%str
     ; backup a different register
-    mova [rsp+30*%%str], m15 ; t15
-    mova m7, [rsp+ 4*%%str]
+    mova [tmpq+30*%%str], m15 ; t15
+    mova m7, [tmpq+ 4*%%str]
     SUMSUB_BA w, 6, 9, 15 ; t6, t9
     SUMSUB_BA w, 7, 8, 15 ; t7, t8
@@ -1195,21 +1191,21 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; store everything on stack to make space available for t16-31
     ; we store interleaved with the output of the second half (t16-31)
     ; so we don't need to allocate extra stack space
-    mova [rsp+ 0*%%str], m0 ; t0
-    mova [rsp+ 4*%%str], m1 ; t1
-    mova [rsp+ 8*%%str], m2 ; t2
-    mova [rsp+12*%%str], m3 ; t3
-    mova [rsp+16*%%str], m4 ; t4
-    mova [rsp+20*%%str], m5 ; t5
-    mova [rsp+24*%%str], m6 ; t6
-    mova [rsp+28*%%str], m7 ; t7
-    mova [rsp+ 2*%%str], m8 ; t8
-    mova [rsp+ 6*%%str], m9 ; t9
-    mova [rsp+10*%%str], m10 ; t10
-    mova [rsp+14*%%str], m11 ; t11
-    mova [rsp+18*%%str], m12 ; t12
-    mova [rsp+22*%%str], m13 ; t13
-    mova [rsp+26*%%str], m14 ; t14
+    mova [tmpq+ 0*%%str], m0 ; t0
+    mova [tmpq+ 4*%%str], m1 ; t1
+    mova [tmpq+ 8*%%str], m2 ; t2
+    mova [tmpq+12*%%str], m3 ; t3
+    mova [tmpq+16*%%str], m4 ; t4
+    mova [tmpq+20*%%str], m5 ; t5
+    mova [tmpq+24*%%str], m6 ; t6
+    mova [tmpq+28*%%str], m7 ; t7
+    mova [tmpq+ 2*%%str], m8 ; t8
+    mova [tmpq+ 6*%%str], m9 ; t9
+    mova [tmpq+10*%%str], m10 ; t10
+    mova [tmpq+14*%%str], m11 ; t11
+    mova [tmpq+18*%%str], m12 ; t12
+    mova [tmpq+22*%%str], m13 ; t13
+    mova [tmpq+26*%%str], m14 ; t14
     ; then, secondly, do t16-31
 %if %3 <= 8
@@ -1235,8 +1231,8 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; from 1 stage forward
     SUMSUB_BA w, 8, 4, 1
     ; temporary storage
-    mova [rsp+17*%%str], m8 ; t16
-    mova [rsp+21*%%str], m4 ; t19
+    mova [tmpq+17*%%str], m8 ; t16
+    mova [tmpq+21*%%str], m4 ; t19
     VP9_UNPACK_MULSUB_2W_4X 1, 14, 15, 0, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
     VP9_UNPACK_MULSUB_2W_4X 13, 2, 3, 12, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
@@ -1289,8 +1285,8 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; from 2 stages forward
     SUMSUB_BA w, 8, 4, 2
     ; temporary storage
-    mova [rsp+17*%%str], m8 ; t16
-    mova [rsp+21*%%str], m4 ; t19
+    mova [tmpq+17*%%str], m8 ; t16
+    mova [tmpq+21*%%str], m4 ; t19
 %if %3 <= 16
     pmulhrsw m3, m12, [pw_13160x2]
     pmulhrsw m12, [pw_9760x2]
@@ -1336,7 +1332,7 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
     ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
-    mova m8, [rsp+17*%%str] ; t16
+    mova m8, [tmpq+17*%%str] ; t16
     ; from 2 stages forward
     SUMSUB_BA w, 0, 8, 4
     SUMSUB_BA w, 15, 7, 4
@@ -1345,10 +1341,10 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     pmulhrsw m7, [pw_11585x2]
     pmulhrsw m8, [pw_11585x2]
     ; store t16/t23
-    mova [rsp+ 1*%%str], m0 ; t16
-    mova [rsp+29*%%str], m7 ; t23
-    mova m4, [rsp+21*%%str] ; t19
+    mova [tmpq+ 1*%%str], m0 ; t16
+    mova [tmpq+29*%%str], m7 ; t23
+    mova m4, [tmpq+21*%%str] ; t19
     VP9_UNPACK_MULSUB_2W_4X 10, 5, 15137, 6270, [pd_8192], 0, 7 ; t18, t29
     VP9_UNPACK_MULSUB_2W_4X 11, 4, 15137, 6270, [pd_8192], 0, 7 ; t19, t28
     VP9_UNPACK_MULSUB_2W_4X 3, 12, 6270, m15137, [pd_8192], 0, 7 ; t20, t27
@@ -1384,27 +1380,27 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; store t17-19 (and t20-22 for pass 1) - keep t24-31 in registers for
     ; final sumsub in pass 1, or keep t20-22 and t24-31 in registers for
     ; final sumsub of pass 2
-    mova [rsp+ 5*%%str], m1 ; t17
-    mova [rsp+ 9*%%str], m2 ; t18
-    mova [rsp+13*%%str], m3 ; t19
+    mova [tmpq+ 5*%%str], m1 ; t17
+    mova [tmpq+ 9*%%str], m2 ; t18
+    mova [tmpq+13*%%str], m3 ; t19
     ; then do final pass to sumsub+store the two halves
 %if %2 == 1
-    mova [rsp+17*%%str], m4 ; t20
-    mova [rsp+21*%%str], m5 ; t21
-    mova [rsp+25*%%str], m6 ; t22
-    mova m0, [rsp+ 0*%%str] ; t0
-    mova m1, [rsp+ 4*%%str] ; t1
-    mova m2, [rsp+ 8*%%str] ; t2
-    mova m3, [rsp+12*%%str] ; t3
-    mova m4, [rsp+16*%%str] ; t4
-    mova m5, [rsp+20*%%str] ; t5
-    mova m6, [rsp+24*%%str] ; t6
+    mova [tmpq+17*%%str], m4 ; t20
+    mova [tmpq+21*%%str], m5 ; t21
+    mova [tmpq+25*%%str], m6 ; t22
+    mova m0, [tmpq+ 0*%%str] ; t0
+    mova m1, [tmpq+ 4*%%str] ; t1
+    mova m2, [tmpq+ 8*%%str] ; t2
+    mova m3, [tmpq+12*%%str] ; t3
+    mova m4, [tmpq+16*%%str] ; t4
+    mova m5, [tmpq+20*%%str] ; t5
+    mova m6, [tmpq+24*%%str] ; t6
     SUMSUB_BA w, 15, 0, 7
-    mova [rsp+ 3*%%str], m0 ; t15
-    mova m7, [rsp+28*%%str] ; t7
+    mova [tmpq+ 3*%%str], m0 ; t15
+    mova m7, [tmpq+28*%%str] ; t7
     SUMSUB_BA w, 14, 1, 0
     SUMSUB_BA w, 13, 2, 0
     SUMSUB_BA w, 12, 3, 0
@@ -1414,45 +1410,45 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     SUMSUB_BA w, 8, 7, 0
     TRANSPOSE8x8W 15, 14, 13, 12, 11, 10, 9, 8, 0
-    mova [rsp+ 0*%%str], m15
-    mova [rsp+ 4*%%str], m14
-    mova [rsp+ 8*%%str], m13
-    mova [rsp+12*%%str], m12
-    mova [rsp+16*%%str], m11
-    mova [rsp+20*%%str], m10
-    mova [rsp+24*%%str], m9
-    mova [rsp+28*%%str], m8
-    mova m0, [rsp+ 3*%%str] ; t15
+    mova [tmpq+ 0*%%str], m15
+    mova [tmpq+ 4*%%str], m14
+    mova [tmpq+ 8*%%str], m13
+    mova [tmpq+12*%%str], m12
+    mova [tmpq+16*%%str], m11
+    mova [tmpq+20*%%str], m10
+    mova [tmpq+24*%%str], m9
+    mova [tmpq+28*%%str], m8
+    mova m0, [tmpq+ 3*%%str] ; t15
     TRANSPOSE8x8W 7, 6, 5, 4, 3, 2, 1, 0, 8
-    mova [rsp+ 3*%%str], m7
-    mova [rsp+ 7*%%str], m6
-    mova [rsp+11*%%str], m5
-    mova [rsp+15*%%str], m4
-    mova [rsp+19*%%str], m3
-    mova [rsp+23*%%str], m2
-    mova [rsp+27*%%str], m1
-    mova [rsp+31*%%str], m0
-    mova m15, [rsp+ 2*%%str] ; t8
-    mova m14, [rsp+ 6*%%str] ; t9
-    mova m13, [rsp+10*%%str] ; t10
-    mova m12, [rsp+14*%%str] ; t11
-    mova m11, [rsp+18*%%str] ; t12
-    mova m10, [rsp+22*%%str] ; t13
-    mova m9, [rsp+26*%%str] ; t14
-    mova m8, [rsp+30*%%str] ; t15
-    mova m7, [rsp+ 1*%%str] ; t16
-    mova m6, [rsp+ 5*%%str] ; t17
-    mova m5, [rsp+ 9*%%str] ; t18
-    mova m4, [rsp+13*%%str] ; t19
-    mova m3, [rsp+17*%%str] ; t20
-    mova m2, [rsp+21*%%str] ; t21
-    mova m1, [rsp+25*%%str] ; t22
+    mova [tmpq+ 3*%%str], m7
+    mova [tmpq+ 7*%%str], m6
+    mova [tmpq+11*%%str], m5
+    mova [tmpq+15*%%str], m4
+    mova [tmpq+19*%%str], m3
+    mova [tmpq+23*%%str], m2
+    mova [tmpq+27*%%str], m1
+    mova [tmpq+31*%%str], m0
+    mova m15, [tmpq+ 2*%%str] ; t8
+    mova m14, [tmpq+ 6*%%str] ; t9
+    mova m13, [tmpq+10*%%str] ; t10
+    mova m12, [tmpq+14*%%str] ; t11
+    mova m11, [tmpq+18*%%str] ; t12
+    mova m10, [tmpq+22*%%str] ; t13
+    mova m9, [tmpq+26*%%str] ; t14
+    mova m8, [tmpq+30*%%str] ; t15
+    mova m7, [tmpq+ 1*%%str] ; t16
+    mova m6, [tmpq+ 5*%%str] ; t17
+    mova m5, [tmpq+ 9*%%str] ; t18
+    mova m4, [tmpq+13*%%str] ; t19
+    mova m3, [tmpq+17*%%str] ; t20
+    mova m2, [tmpq+21*%%str] ; t21
+    mova m1, [tmpq+25*%%str] ; t22
     SUMSUB_BA w, 7, 8, 0
-    mova [rsp+ 2*%%str], m8
-    mova m0, [rsp+29*%%str] ; t23
+    mova [tmpq+ 2*%%str], m8
+    mova m0, [tmpq+29*%%str] ; t23
     SUMSUB_BA w, 6, 9, 8
     SUMSUB_BA w, 5, 10, 8
     SUMSUB_BA w, 4, 11, 8
@@ -1462,29 +1458,29 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     SUMSUB_BA w, 0, 15, 8
     TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
-    mova [rsp+ 1*%%str], m0
-    mova [rsp+ 5*%%str], m1
-    mova [rsp+ 9*%%str], m2
-    mova [rsp+13*%%str], m3
-    mova [rsp+17*%%str], m4
-    mova [rsp+21*%%str], m5
-    mova [rsp+25*%%str], m6
-    mova [rsp+29*%%str], m7
-    mova m8, [rsp+ 2*%%str]
+    mova [tmpq+ 1*%%str], m0
+    mova [tmpq+ 5*%%str], m1
+    mova [tmpq+ 9*%%str], m2
+    mova [tmpq+13*%%str], m3
+    mova [tmpq+17*%%str], m4
+    mova [tmpq+21*%%str], m5
+    mova [tmpq+25*%%str], m6
+    mova [tmpq+29*%%str], m7
+    mova m8, [tmpq+ 2*%%str]
     TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
-    mova [rsp+ 2*%%str], m8
-    mova [rsp+ 6*%%str], m9
-    mova [rsp+10*%%str], m10
-    mova [rsp+14*%%str], m11
-    mova [rsp+18*%%str], m12
-    mova [rsp+22*%%str], m13
-    mova [rsp+26*%%str], m14
-    mova [rsp+30*%%str], m15
+    mova [tmpq+ 2*%%str], m8
+    mova [tmpq+ 6*%%str], m9
+    mova [tmpq+10*%%str], m10
+    mova [tmpq+14*%%str], m11
+    mova [tmpq+18*%%str], m12
+    mova [tmpq+22*%%str], m13
+    mova [tmpq+26*%%str], m14
+    mova [tmpq+30*%%str], m15
 %else
-    ; t0-7 is in [rsp+{0,4,8,12,16,20,24,28}*%%str]
-    ; t8-15 is in [rsp+{2,6,10,14,18,22,26,30}*%%str]
-    ; t16-19 and t23 is in [rsp+{1,5,9,13,29}*%%str]
+    ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+    ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+    ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
     ; t20-22 is in m4-6
     ; t24-31 is in m8-15
     pxor m7, m7
@@ -1507,55 +1503,55 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
 %endmacro
     ; store t0-1 and t30-31
-    mova m0, [rsp+ 0*%%str]
-    mova m1, [rsp+ 4*%%str]
+    mova m0, [tmpq+ 0*%%str]
+    mova m1, [tmpq+ 4*%%str]
     %%STORE_2X2 0, 1, 14, 15, 2, 3, 7
     ; store t2-3 and t28-29
-    mova m0, [rsp+ 8*%%str]
-    mova m1, [rsp+12*%%str]
+    mova m0, [tmpq+ 8*%%str]
+    mova m1, [tmpq+12*%%str]
     %%STORE_2X2 0, 1, 12, 13, 2, 3, 7
     ; store t4-5 and t26-27
-    mova m0, [rsp+16*%%str]
-    mova m1, [rsp+20*%%str]
+    mova m0, [tmpq+16*%%str]
+    mova m1, [tmpq+20*%%str]
     %%STORE_2X2 0, 1, 10, 11, 2, 3, 7
     ; store t6-7 and t24-25
-    mova m0, [rsp+24*%%str]
-    mova m1, [rsp+28*%%str]
+    mova m0, [tmpq+24*%%str]
+    mova m1, [tmpq+28*%%str]
     %%STORE_2X2 0, 1, 8, 9, 2, 3, 7
     ; store t8-9 and t22-23
-    mova m0, [rsp+ 2*%%str]
-    mova m1, [rsp+ 6*%%str]
-    mova m8, [rsp+29*%%str]
+    mova m0, [tmpq+ 2*%%str]
+    mova m1, [tmpq+ 6*%%str]
+    mova m8, [tmpq+29*%%str]
     %%STORE_2X2 0, 1, 6, 8, 2, 3, 7
     ; store t10-11 and t20-21
-    mova m0, [rsp+10*%%str]
-    mova m1, [rsp+14*%%str]
+    mova m0, [tmpq+10*%%str]
+    mova m1, [tmpq+14*%%str]
     %%STORE_2X2 0, 1, 4, 5, 2, 3, 7
     ; store t12-13 and t18-19
-    mova m0, [rsp+18*%%str]
-    mova m1, [rsp+22*%%str]
-    mova m5, [rsp+13*%%str]
-    mova m4, [rsp+ 9*%%str]
+    mova m0, [tmpq+18*%%str]
+    mova m1, [tmpq+22*%%str]
+    mova m5, [tmpq+13*%%str]
+    mova m4, [tmpq+ 9*%%str]
     %%STORE_2X2 0, 1, 4, 5, 2, 3, 7
     ; store t14-17
-    mova m0, [rsp+26*%%str]
-    mova m1, [rsp+30*%%str]
-    mova m5, [rsp+ 5*%%str]
-    mova m4, [rsp+ 1*%%str]
+    mova m0, [tmpq+26*%%str]
+    mova m1, [tmpq+30*%%str]
+    mova m5, [tmpq+ 5*%%str]
+    mova m4, [tmpq+ 1*%%str]
     %%STORE_2X2 0, 1, 4, 5, 2, 3, 7, 0
 %endif
 %endmacro
 %macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
 INIT_XMM %1
-cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
+cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
     cmp eobd, 135
     jg .idctfull
     cmp eobd, 34
@@ -1580,8 +1576,9 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
     VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
     RET
-    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2
+    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
 .idct8x8:
+    mov tmpq, rsp
     VP9_IDCT32_1D blockq, 1, 8
     mov stride30q, strideq ; stride
@@ -1592,12 +1589,11 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
 .loop2_8x8:
     mov dstq, dst_bakq
     lea dst_endq, [dst_bakq+stride30q]
-    VP9_IDCT32_1D rsp, 2, 8
+    VP9_IDCT32_1D tmpq, 2, 8
     add dst_bakq, 8
-    add rsp, 16
+    add tmpq, 16
     dec cntd
     jg .loop2_8x8
-    sub rsp, 64
     ; at the end of the loop, m7 should still be zero
     ; use that to zero out block coefficients
@@ -1606,29 +1602,29 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
 .idct16x16:
     mov cntd, 2
+    mov tmpq, rsp
 .loop1_16x16:
     VP9_IDCT32_1D blockq, 1, 16
     add blockq, 16
-    add rsp, 512
+    add tmpq, 512
     dec cntd
     jg .loop1_16x16
     sub blockq, 32
-    sub rsp, 1024
     mov stride30q, strideq ; stride
     lea stride2q, [strideq*2] ; stride*2
     shl stride30q, 5 ; stride*32
     mov cntd, 4
+    mov tmpq, rsp
     sub stride30q, stride2q ; stride*30
 .loop2_16x16:
     mov dstq, dst_bakq
     lea dst_endq, [dst_bakq+stride30q]
-    VP9_IDCT32_1D rsp, 2, 16
+    VP9_IDCT32_1D tmpq, 2, 16
     add dst_bakq, 8
-    add rsp, 16
+    add tmpq, 16
     dec cntd
     jg .loop2_16x16
-    sub rsp, 64
     ; at the end of the loop, m7 should still be zero
     ; use that to zero out block coefficients
@@ -1637,29 +1633,29 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
 .idctfull:
     mov cntd, 4
+    mov tmpq, rsp
 .loop1_full:
     VP9_IDCT32_1D blockq, 1
     add blockq, 16
-    add rsp, 512
+    add tmpq, 512
     dec cntd
     jg .loop1_full
     sub blockq, 64
-    sub rsp, 2048
     mov stride30q, strideq ; stride
     lea stride2q, [strideq*2] ; stride*2
     shl stride30q, 5 ; stride*32
     mov cntd, 4
+    mov tmpq, rsp
     sub stride30q, stride2q ; stride*30
 .loop2_full:
     mov dstq, dst_bakq
     lea dst_endq, [dst_bakq+stride30q]
-    VP9_IDCT32_1D rsp, 2
+    VP9_IDCT32_1D tmpq, 2
    add dst_bakq, 8
-    add rsp, 16
+    add tmpq, 16
     dec cntd
     jg .loop2_full
-    sub rsp, 64
     ; at the end of the loop, m7 should still be zero
     ; use that to zero out block coefficients