(commit by michael)

MMX & MMX2 quantizer; C DCT permutation bugfix; don't copy input on intra-only encodings if it can be avoided; don't draw edges on intra-only stuff. Originally committed as revision 281 to svn://svn.ffmpeg.org/ffmpeg/trunk
2025-03-23 04:24:35 +02:00 · 2002-01-27 13:30:18 +00:00 · 2002-01-27 13:30:18 +00:00 · 2f349de286
commit 2f349de286
parent 580b82fa24
5 changed files with 309 additions and 112 deletions
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@ -49,6 +49,12 @@ UINT8 zigzag_direct[64] = {
    53, 60, 61, 54, 47, 55, 62, 63
 };

+/* inverse of the unpermuted zigzag_direct, plus 1, for the MMX quantizer:
+ * dsputil_init() stores inv_zigzag_direct16[zigzag_direct[i]] = i+1 ahead of
+ * its "permute for IDCT" step, so entry j holds the 1-based zigzag position
+ * of coefficient j */
+UINT16 __align8 inv_zigzag_direct16[64];
+
+/* copy of zigzag_direct taken in dsputil_init() ahead of its "permute for
+ * IDCT" step, for the MMX quantizer */
+UINT8 zigzag_direct_noperm[64];
+
 UINT8 ff_alternate_horizontal_scan[64] = {
    0,  1,  2,  3,  8,  9, 16, 17, 
    10, 11,  4,  5,  6,  7, 15, 14,
@ -83,6 +89,42 @@ static UINT8 simple_mmx_permutation[64]={
 	0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 };

+/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
+ * Fixed-point reciprocal table: spot checks show inverse[b] == 2^32/b rounded
+ * up, with inverse[1] held at 2^32-1 so that it fits in 32 bits and
+ * inverse[0] a placeholder 0.  The MMX quantizer uses it to replace the intra
+ * DC division by the DC scale with a 32x32->64 bit multiply whose high half
+ * is the quotient. */
+UINT32 inverse[256]={
+         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
+ 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
+ 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
+ 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
+ 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
+ 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
+  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
+  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
+  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
+  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
+  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
+  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
+  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
+  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
+  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
+  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
+  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
+  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
+  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
+  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
+  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
+  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
+  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
+  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
+  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
+  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
+  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
+  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
+  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
+  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
+  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
+  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
+};
+
 /* used to skip zeros at the end */
 UINT8 zigzag_end[64];

@ -515,6 +557,9 @@ void dsputil_init(void)
    else
        for(i=0; i<64; i++) permutation[i]=i;

+    for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
+    for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
+    
    if (use_permuted_idct) {
        /* permute for IDCT */
        for(i=0;i<64;i++) {
--- a/libavcodec/i386/mpegvideo_mmx.c
+++ b/libavcodec/i386/mpegvideo_mmx.c
@ -22,9 +22,16 @@

 #include "../dsputil.h"
 #include "../mpegvideo.h"
+#include "../avcodec.h"
+#include "../mangle.h"

 extern UINT8 zigzag_end[64];
 extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w);
+extern int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale);
+
+extern UINT8 zigzag_direct_noperm[64];
+extern UINT16 inv_zigzag_direct16[64];
+extern UINT32 inverse[256];

 #if 0

@ -252,7 +259,7 @@ static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
        }
    } else {
        i = 0;
-    unquant_even:
+//    unquant_even:
        quant_matrix = s->non_intra_matrix;
 	/* Align on 4 elements boundary */
 	while(i&7)
@ -411,6 +418,20 @@ static void draw_edges_mmx(UINT8 *buf, int wrap, int width, int height, int w)
    }
 }

+static volatile int esp_temp; /* slot where the permutation asm in mpegvideo_mmx_template.c
+                               * parks %esp while it uses sp as a data register; a single
+                               * file-scope variable shared by both template instances */
+
+void unused_var_warning_killer(){ /* touches esp_temp from C; the only other references
+                                   * are inside asm strings (via MANGLE), which presumably
+                                   * do not count as a use for the compiler's warning */
+	esp_temp++;
+}
+
+#undef HAVE_MMX2 /* first inclusion of the template: plain-MMX code path,
+                  * RENAME(dct_quantize) expands to dct_quantize_MMX */
+#define RENAME(a) a ## _MMX
+#include "mpegvideo_mmx_template.c"
+
+#define HAVE_MMX2 /* second inclusion: MMX2 code path (pshufw/pmaxsw/pminsw),
+                   * RENAME(dct_quantize) expands to dct_quantize_MMX2 */
+#undef RENAME
+#define RENAME(a) a ## _MMX2
+#include "mpegvideo_mmx_template.c"

 void MPV_common_init_mmx(MpegEncContext *s)
 {
@ -421,5 +442,11 @@ void MPV_common_init_mmx(MpegEncContext *s)
        	s->dct_unquantize = dct_unquantize_mpeg1_mmx;
 	
 	draw_edges = draw_edges_mmx;
+
+	if(mm_flags & MM_MMXEXT){
+	        dct_quantize= dct_quantize_MMX2;
+	}else{
+		dct_quantize= dct_quantize_MMX;
+	}
    }
 }
--- a/libavcodec/i386/mpegvideo_mmx_template.c
+++ b/libavcodec/i386/mpegvideo_mmx_template.c
@ -0,0 +1,201 @@
+/*
+    Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#undef SPREADW /* SPREADW(reg): asm text that replicates the low 16-bit word of
+                * MMX register reg into all four of its words */
+#undef PMAXW /* PMAXW(a,b): asm text for b = per-word maximum of a and b; both
+              * macros are re-defined on every inclusion of this template */
+#ifdef HAVE_MMX2
+#define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t"
+#define PMAXW(a,b) "pmaxsw " #a ", " #b " \n\t"
+
+#else /* plain MMX fallback: spread with two punpcklwd; maximum as an
+       * unsigned-saturating subtract followed by an add, i.e. an unsigned
+       * maximum where pmaxsw above is a signed one */
+#define SPREADW(a) \
+	"punpcklwd " #a ", " #a " \n\t"\
+	"punpcklwd " #a ", " #a " \n\t"
+#define PMAXW(a,b) \
+	"psubusw " #a ", " #b " \n\t"\
+	"paddw " #a ", " #b " \n\t"
+#endif
+
+static int RENAME(dct_quantize)(MpegEncContext *s,
+                            DCTELEM *block, int n,
+                            int qscale)
+{ /* Runs av_fdct() on block, quantizes the coefficients into temp_block and
+   * copies them back into block at their IDCT-permuted positions.
+   * n selects the DC scale of intra blocks (n < 4: y_dc_scale, otherwise
+   * c_dc_scale).  qscale is not referenced in this body; the quantizer
+   * factors come from the precomputed s->q_*_matrix16 tables.
+   * Returns last_non_zero_p1 - 1, where last_non_zero_p1 is the largest
+   * inv_zigzag_direct16[] value (1-based zigzag position) of a non-zero
+   * quantized coefficient; it starts at 1 for intra and 0 for inter blocks.
+   * NOTE(review): temp_block is static and the permutation asm parks %esp in
+   * the file-scope esp_temp, so this looks non-reentrant -- confirm. */
+    int i, level, last_non_zero_p1, q;
+    const UINT16 *qmat;
+    static __align8 INT16 temp_block[64];
+    int minLevel, maxLevel;
+    
+    if(s->avctx!=NULL && s->avctx->codec->id==CODEC_ID_MPEG4){
+	/* mpeg4 */
+        minLevel= -2048;
+	maxLevel= 2047;
+    }else if(s->out_format==FMT_MPEG1){
+	/* mpeg1 */
+        minLevel= -255;
+	maxLevel= 255;
+    }else{
+	/* h263 / msmpeg4 */
+        minLevel= -128;
+	maxLevel= 127;
+    }
+
+    av_fdct (block);
+    
+    if (s->mb_intra) {
+        int dummy;
+        if (n < 4)
+            q = s->y_dc_scale;
+        else
+            q = s->c_dc_scale;
+        
+        /* note: block[0] is assumed to be positive
+         * DC: temp_block[0] = (block[0] + (q >> 1)) / q, computed in the
+         * enabled branch as a 32x32->64 bit multiply by inverse[q] whose high
+         * half (edx) is taken as the quotient */
+#if 1
+	asm volatile (
+		"xorl %%edx, %%edx	\n\t"
+		"mul %%ebx		\n\t"
+		: "=d" (temp_block[0]), "=a"(dummy)
+		: "a" (block[0] + (q >> 1)), "b" (inverse[q])
+	);
+#else
+	asm volatile (
+		"xorl %%edx, %%edx	\n\t"
+		"divw %%bx		\n\t"
+		"movzwl %%ax, %%eax	\n\t"
+		: "=a" (temp_block[0])
+		: "a" (block[0] + (q >> 1)), "b" (q)
+		: "%edx"
+	);
+#endif
+//        temp_block[0] = (block[0] + (q >> 1)) / q;
+        i = 1;
+        last_non_zero_p1 = 1;
+        if (s->out_format == FMT_H263) {
+            qmat = s->q_non_intra_matrix16;
+        } else {
+            qmat = s->q_intra_matrix16;
+        }
+        for(i=1;i<4;i++) { /* coefficients 1..3 are quantized in C; i is 4 afterwards,
+                            * so the MMX loop below starts at coefficient 4 */
+            level = block[i] * qmat[i];
+            level = level / (1 << (QMAT_SHIFT_MMX - 3));
+            /* XXX: currently, this code is not optimal. the range should be:
+               mpeg1: -255..255
+               mpeg2: -2048..2047
+               h263:  -128..127
+               mpeg4: -2048..2047
+            */
+            if (level > maxLevel)
+                level = maxLevel;
+            else if (level < minLevel)
+                level = minLevel;
+            temp_block[i] = level;
+
+	    if(level) 
+	        if(last_non_zero_p1 < inv_zigzag_direct16[i]) last_non_zero_p1= inv_zigzag_direct16[i];
+	    block[i]=0;
+        }
+    } else {
+        i = 0;
+        last_non_zero_p1 = 0;
+        qmat = s->q_non_intra_matrix16;
+    }
+
+    asm volatile( /* XXX: small rounding bug, but it shouldn't matter
+	 * Four coefficients per iteration, from byte offset 2*i - 128 up to 0
+	 * relative to the ends of the arrays: multiply block by qmat keeping
+	 * the high 16 bits (pmulhw), subtract the sign mask of the input
+	 * (i.e. add 1 where the input was negative), clamp using mm3/mm4
+	 * (and mm5 in the plain-MMX variant), store the result to temp_block,
+	 * zero the source words in block, and keep in mm2 the running maximum
+	 * of inv_zigzag_direct16[] over the outputs that are non-zero.
+	 * The reduction after the loop leaves the low byte of that maximum
+	 * in last_non_zero_p1.
+	 * NOTE(review): neither the MMX registers nor "memory" appear in a
+	 * clobber list although block and temp_block are written -- confirm
+	 * the compiler cannot reorder accesses around this statement.
+	 */
+	"movd %3, %%mm3			\n\t"
+	SPREADW(%%mm3)
+	"movd %4, %%mm4			\n\t"
+	SPREADW(%%mm4)
+	"movd %5, %%mm5			\n\t"
+	SPREADW(%%mm5)
+	"pxor %%mm7, %%mm7		\n\t"
+	"movd %%eax, %%mm2		\n\t"
+	SPREADW(%%mm2)
+	"movl %6, %%eax			\n\t"
+	".balign 16			\n\t"
+	"1:				\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq (%2, %%eax), %%mm1	\n\t"
+	"movq %%mm0, %%mm6		\n\t"
+	"psraw $15, %%mm6		\n\t"
+	"pmulhw %%mm0, %%mm1		\n\t"
+	"psubsw %%mm6, %%mm1		\n\t"
+#ifdef HAVE_MMX2
+	"pminsw %%mm3, %%mm1		\n\t"
+	"pmaxsw %%mm4, %%mm1		\n\t"
+#else
+	"paddsw %%mm3, %%mm1		\n\t"
+	"psubusw %%mm4, %%mm1		\n\t"
+	"paddsw %%mm5, %%mm1		\n\t"
+#endif
+	"movq %%mm1, (%8, %%eax)	\n\t"
+	"pcmpeqw %%mm7, %%mm1		\n\t"
+	"movq (%7, %%eax), %%mm0	\n\t"
+	"movq %%mm7, (%1, %%eax)	\n\t"
+	"pandn %%mm0, %%mm1		\n\t"
+	PMAXW(%%mm1, %%mm2)
+	"addl $8, %%eax			\n\t"
+	" js 1b				\n\t"
+	"movq %%mm2, %%mm0		\n\t"
+	"psrlq $32, %%mm2		\n\t"
+	PMAXW(%%mm0, %%mm2)
+	"movq %%mm2, %%mm0		\n\t"
+	"psrlq $16, %%mm2		\n\t"
+	PMAXW(%%mm0, %%mm2)
+	"movd %%mm2, %%eax		\n\t"
+	"movzbl %%al, %%eax		\n\t"
+	: "+a" (last_non_zero_p1)
+	: "r" (block+64), "r" (qmat+64), 
+#ifdef HAVE_MMX2
+	  "m" (maxLevel),          "m" (minLevel),                    "m" (0 /* dummy */), "g" (2*i - 128),
+#else
+	  "m" (0x7FFF - maxLevel), "m" (0x7FFF -maxLevel + minLevel), "m" (minLevel),      "g" (2*i - 128),
+#endif
+	  "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
+    );
+// last_non_zero_p1=64;       
+    /* permute for IDCT
+     * Two coefficients per iteration: with j = zigzag_direct_noperm[k],
+     * block[permutation[j]] = temp_block[j], for k starting at 0 and
+     * running to last_non_zero_p1 (one entry past it when last_non_zero_p1
+     * is odd; entries 0 and 1 are copied even when it is 0, because the
+     * loop condition is tested after the body).
+     * ebp and sp are used as extra scratch registers, so ebp is pushed and
+     * esp is parked in esp_temp for the duration of the loop.
+     * NOTE(review): esp is not a valid stack pointer inside the loop and
+     * esp_temp is a single global -- confirm nothing can use the stack
+     * asynchronously or run this concurrently on the targets in use. */
+    asm volatile(
+	"movl %0, %%eax			\n\t"
+	"pushl %%ebp			\n\t"
+	"movl %%esp, " MANGLE(esp_temp) "\n\t"
+	"1:				\n\t"
+	"movzbl (%1, %%eax), %%ebx	\n\t"
+	"movzbl 1(%1, %%eax), %%ebp	\n\t"
+	"movw (%2, %%ebx, 2), %%cx	\n\t"
+	"movw (%2, %%ebp, 2), %%sp	\n\t"
+	"movzbl " MANGLE(permutation) "(%%ebx), %%ebx\n\t"
+	"movzbl " MANGLE(permutation) "(%%ebp), %%ebp\n\t"
+	"movw %%cx, (%3, %%ebx, 2)	\n\t"
+	"movw %%sp, (%3, %%ebp, 2)	\n\t"
+	"addl $2, %%eax			\n\t"
+	" js 1b				\n\t"
+	"movl " MANGLE(esp_temp) ", %%esp\n\t"
+	"popl %%ebp			\n\t"
+	: 
+	: "g" (-last_non_zero_p1), "d" (zigzag_direct_noperm+last_non_zero_p1), "S" (temp_block), "D" (block)
+	: "%eax", "%ebx", "%ecx"
+	);
+/*
+    for(i=0; i<last_non_zero_p1; i++)
+    {
+       int j= zigzag_direct_noperm[i];
+       block[block_permute_op(j)]= temp_block[j];
+    }
+*/
+//block_permute(block);
+    return last_non_zero_p1 - 1;
+}
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@ -35,12 +35,10 @@ static void dct_unquantize_mpeg1_c(MpegEncContext *s,
                                   DCTELEM *block, int n, int qscale);
 static void dct_unquantize_h263_c(MpegEncContext *s, 
                                  DCTELEM *block, int n, int qscale);
-static int dct_quantize(MpegEncContext *s, DCTELEM *block, int n, int qscale);
-static int dct_quantize_mmx(MpegEncContext *s, 
-                            DCTELEM *block, int n,
-                            int qscale);
 static void draw_edges_c(UINT8 *buf, int wrap, int width, int height, int w);
+static int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale);

+int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale)= dct_quantize_c;
 void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w)= draw_edges_c;

 #define EDGE_WIDTH 16
@ -74,29 +72,29 @@ int motion_estimation_method = ME_LOG;

 extern UINT8 zigzag_end[64];

-/* XXX: should use variable shift ? */
-#define QMAT_SHIFT_MMX 19
-#define QMAT_SHIFT 25
-
-static void convert_matrix(int *qmat, const UINT16 *quant_matrix, int qscale)
+static void convert_matrix(int *qmat, UINT16 *qmat16, const UINT16 *quant_matrix, int qscale)
 {
    int i;

    if (av_fdct == jpeg_fdct_ifast) {
        for(i=0;i<64;i++) {
            /* 16 <= qscale * quant_matrix[i] <= 7905 */
-            /* 19952 <= aanscales[i] * qscale * quant_matrix[i] <= 249205026 */
+            /* 19952         <= aanscales[i] * qscale * quant_matrix[i]           <= 249205026 */
+            /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
+            /* 3444240       >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */
            
-            qmat[i] = (int)((UINT64_C(1) << (QMAT_SHIFT + 11)) / 
-                            (aanscales[i] * qscale * quant_matrix[i]));
+            qmat[block_permute_op(i)] = (int)((UINT64_C(1) << (QMAT_SHIFT + 11)) / 
+                            (aanscales[i] * qscale * quant_matrix[block_permute_op(i)]));
        }
    } else {
        for(i=0;i<64;i++) {
            /* We can safely suppose that 16 <= quant_matrix[i] <= 255
-               So 16 <= qscale * quant_matrix[i] <= 7905
-               so (1 << QMAT_SHIFT) / 16 >= qmat[i] >= (1 << QMAT_SHIFT) / 7905
+               So 16           <= qscale * quant_matrix[i]             <= 7905
+               so (1<<19) / 16 >= (1<<19) / (qscale * quant_matrix[i]) >= (1<<19) / 7905
+               so 32768        >= (1<<19) / (qscale * quant_matrix[i]) >= 67
            */
-            qmat[i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]);
+            qmat[i]   = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]);
+            qmat16[i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[block_permute_op(i)]);
        }
    }
 }
@ -418,7 +416,7 @@ void MPV_frame_start(MpegEncContext *s)
 void MPV_frame_end(MpegEncContext *s)
 {
    /* draw edge for correct motion prediction if outside */
-    if (s->pict_type != B_TYPE) {
+    if (s->pict_type != B_TYPE && !s->intra_only) {
      if(s->avctx==NULL || s->avctx->codec->id!=CODEC_ID_MPEG4){
        draw_edges(s->current_picture[0], s->linesize, s->mb_width*16, s->mb_height*16, EDGE_WIDTH);
        draw_edges(s->current_picture[1], s->linesize/2, s->mb_width*8, s->mb_height*8, EDGE_WIDTH/2);
@ -457,7 +455,7 @@ int MPV_encode_picture(AVCodecContext *avctx,
    avctx->key_frame = (s->pict_type == I_TYPE);
    
    MPV_frame_start(s);
-
+    
    for(i=0;i<3;i++) {
        UINT8 *src = pict->data[i];
        UINT8 *dest = s->current_picture[i];
@ -472,11 +470,15 @@ int MPV_encode_picture(AVCodecContext *avctx,
            h >>= 1;
        }

-        for(j=0;j<h;j++) {
-            memcpy(dest, src, w);
-            dest += dest_wrap;
-            src += src_wrap;
-        }
+	if(s->intra_only && dest_wrap==src_wrap){
+	    s->current_picture[i] = pict->data[i];
+	}else {
+            for(j=0;j<h;j++) {
+                memcpy(dest, src, w);
+                dest += dest_wrap;
+                src += src_wrap;
+            }
+	}
        s->new_picture[i] = s->current_picture[i];
    }

@ -873,10 +875,10 @@ static void encode_picture(MpegEncContext *s, int picture_number)
        s->intra_matrix[0] = default_intra_matrix[0];
        for(i=1;i<64;i++)
            s->intra_matrix[i] = (default_intra_matrix[i] * s->qscale) >> 3;
-        convert_matrix(s->q_intra_matrix, s->intra_matrix, 8);
+        convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->intra_matrix, 8);
    } else {
-        convert_matrix(s->q_intra_matrix, s->intra_matrix, s->qscale);
-        convert_matrix(s->q_non_intra_matrix, s->non_intra_matrix, s->qscale);
+        convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->intra_matrix, s->qscale);
+        convert_matrix(s->q_non_intra_matrix, s->q_non_intra_matrix16, s->non_intra_matrix, s->qscale);
    }

    switch(s->out_format) {
@ -1011,14 +1013,8 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                s->y_dc_scale = 8;
                s->c_dc_scale = 8;
            }
-
            for(i=0;i<6;i++) {
-                int last_index;
-                if (av_fdct == jpeg_fdct_ifast)
-                    last_index = dct_quantize(s, s->block[i], i, s->qscale);
-                else
-                    last_index = dct_quantize_mmx(s, s->block[i], i, s->qscale);
-                s->block_last_index[i] = last_index;
+                s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale);
            }

            /* huffman encode */
@ -1060,7 +1056,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
    //    fprintf(stderr,"\nNumber of GOB: %d", s->gob_number);
 }

-static int dct_quantize(MpegEncContext *s, 
+static int dct_quantize_c(MpegEncContext *s, 
                        DCTELEM *block, int n,
                        int qscale)
 {
@ -1157,85 +1153,7 @@ static int dct_quantize(MpegEncContext *s,
                level = maxLevel;
            else if (level < minLevel)
                level = minLevel;
-            block[j] = level;
-            last_non_zero = i;
-        } else {
-            block[j] = 0;
-        }
-    }
-    return last_non_zero;
-}

-static int dct_quantize_mmx(MpegEncContext *s, 
-                            DCTELEM *block, int n,
-                            int qscale)
-{
-    int i, j, level, last_non_zero, q;
-    const int *qmat;
-    int minLevel, maxLevel;
-
-    if(s->avctx!=NULL && s->avctx->codec->id==CODEC_ID_MPEG4){
-	/* mpeg4 */
-        minLevel= -2048;
-	maxLevel= 2047;
-    }else if(s->out_format==FMT_MPEG1){
-	/* mpeg1 */
-        minLevel= -255;
-	maxLevel= 255;
-    }else{
-	/* h263 / msmpeg4 */
-        minLevel= -128;
-	maxLevel= 127;
-    }
-
-    av_fdct (block);
-    
-    /* we need this permutation so that we correct the IDCT
-       permutation. will be moved into DCT code */
-    block_permute(block);
-
-    if (s->mb_intra) {
-        if (n < 4)
-            q = s->y_dc_scale;
-        else
-            q = s->c_dc_scale;
-        
-        /* note: block[0] is assumed to be positive */
-        block[0] = (block[0] + (q >> 1)) / q;
-        i = 1;
-        last_non_zero = 0;
-        if (s->out_format == FMT_H263) {
-            qmat = s->q_non_intra_matrix;
-        } else {
-            qmat = s->q_intra_matrix;
-        }
-    } else {
-        i = 0;
-        last_non_zero = -1;
-        qmat = s->q_non_intra_matrix;
-    }
-
-    for(;i<64;i++) {
-        j = zigzag_direct[i];
-        level = block[j];
-        level = level * qmat[j];
-        /* XXX: slight error for the low range. Test should be equivalent to
-           (level <= -(1 << (QMAT_SHIFT_MMX - 3)) || level >= (1 <<
-           (QMAT_SHIFT_MMX - 3)))
-        */
-        if (((level << (31 - (QMAT_SHIFT_MMX - 3))) >> (31 - (QMAT_SHIFT_MMX - 3))) != 
-            level) {
-            level = level / (1 << (QMAT_SHIFT_MMX - 3));
-            /* XXX: currently, this code is not optimal. the range should be:
-               mpeg1: -255..255
-               mpeg2: -2048..2047
-               h263:  -128..127
-               mpeg4: -2048..2047
-            */
-            if (level > maxLevel)
-                level = maxLevel;
-            else if (level < minLevel)
-                level = minLevel;
            block[j] = level;
            last_non_zero = i;
        } else {
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@ -30,6 +30,9 @@ enum OutputFormat {

 #define MPEG_BUF_SIZE (16 * 1024)

+#define QMAT_SHIFT_MMX 19
+#define QMAT_SHIFT 25
+
 typedef struct MpegEncContext {
    struct AVCodecContext *avctx;
    /* the following parameters must be initialized before encoding */
@ -120,6 +123,9 @@ typedef struct MpegEncContext {
    /* precomputed matrix (combine qscale and DCT renorm) */
    int q_intra_matrix[64];
    int q_non_intra_matrix[64];
+    /* 16-bit versions of the above for the MMX quantizer; these are stored in unpermuted order */
+    UINT16 __align8 q_intra_matrix16[64] ;
+    UINT16 __align8 q_non_intra_matrix16[64];
    int block_last_index[6];  /* last non zero coefficient in block */

    void *opaque; /* private data for the user */