diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c index 80dece700d..24d7799d58 100644 --- a/libavcodec/i386/fft_3dn2.c +++ b/libavcodec/i386/fft_3dn2.c @@ -154,20 +154,23 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, in1 = input; in2 = input + n2 - 1; for(k = 0; k < n4; k++) { + // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it asm volatile( - "movd %1, %%mm0 \n\t" - "movd %3, %%mm1 \n\t" - "punpckldq %2, %%mm0 \n\t" - "punpckldq %4, %%mm1 \n\t" + "movd %0, %%mm0 \n\t" + "movd %2, %%mm1 \n\t" + "punpckldq %1, %%mm0 \n\t" + "punpckldq %3, %%mm1 \n\t" "movq %%mm0, %%mm2 \n\t" "pfmul %%mm1, %%mm0 \n\t" "pswapd %%mm1, %%mm1 \n\t" "pfmul %%mm1, %%mm2 \n\t" "pfpnacc %%mm2, %%mm0 \n\t" + ::"m"(in2[-2*k]), "m"(in1[2*k]), + "m"(tcos[k]), "m"(tsin[k]) + ); + asm volatile( "movq %%mm0, %0 \n\t" :"=m"(z[revtab[k]]) - :"m"(in2[-2*k]), "m"(in1[2*k]), - "m"(tcos[k]), "m"(tsin[k]) ); } @@ -190,11 +193,15 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, ); } + z += n8; asm volatile("movd %0, %%mm7" ::"r"(1<<31)); for(k = 0; k < n8; k++) { asm volatile( - "movq %4, %%mm0 \n\t" - "pswapd %5, %%mm1 \n\t" + "movq %0, %%mm0 \n\t" + "pswapd %1, %%mm1 \n\t" + ::"m"(z[k]), "m"(z[-1-k]) + ); + asm volatile( "movq %%mm0, %%mm2 \n\t" "pxor %%mm7, %%mm2 \n\t" "punpckldq %%mm1, %%mm2 \n\t" @@ -209,8 +216,7 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, "movq %%mm3, %3 \n\t" // { z[n8-1-k].im, -z[n8+k].re } :"=m"(output[2*k]), "=m"(output[n2-2-2*k]), "=m"(output[n2+2*k]), "=m"(output[n-2-2*k]) - :"m"(z[n8+k]), "m"(z[n8-1-k]) - :"memory" + ::"memory" ); } asm volatile("emms");