From da6505ad2fc8ef045401a3d9f980586ac5cf808c Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 3 Aug 2012 16:58:26 +0200
Subject: [PATCH 01/13] dsputil: make add_hfyu_left_prediction_sse4() support
 unaligned src.

This makes add_hfyu_left_prediction_sse4() handle sources that are not
16-byte aligned in its own function rather than by proxying the call to
add_hfyu_left_prediction_ssse3(). This fixes a crash on Win64, since the
sse4 version clobbers xmm6, but the ssse3 version (which uses MMX regs)
does not restore it, thus leading to XMM clobbering and RSP being off.

Fixes bug 342.
---
 libavcodec/x86/dsputil_yasm.asm | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 70a0aa12e7..af2de15a25 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -388,12 +388,16 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to
     RET
 
 
-%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
+%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
     add     srcq, wq
     add     dstq, wq
     neg     wq
 %%.loop:
+%if %2
     mova    m1, [srcq+wq]
+%else
+    movu    m1, [srcq+wq]
+%endif
     mova    m2, m1
     psllw   m1, 8
     paddb   m1, m2
@@ -435,7 +439,7 @@ cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
     mova    m3, [pb_zz11zz55zz99zzdd]
     movd    m0, leftm
     psllq   m0, 56
-    ADD_HFYU_LEFT_LOOP 1
+    ADD_HFYU_LEFT_LOOP 1, 1
 
 INIT_XMM
 cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
@@ -446,12 +450,14 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
     movd    m0, leftm
     pslldq  m0, 15
     test    srcq, 15
-    jnz add_hfyu_left_prediction_ssse3.skip_prologue
+    jnz .src_unaligned
     test    dstq, 15
-    jnz .unaligned
-    ADD_HFYU_LEFT_LOOP 1
-.unaligned:
-    ADD_HFYU_LEFT_LOOP 0
+    jnz .dst_unaligned
+    ADD_HFYU_LEFT_LOOP 1, 1
+.dst_unaligned:
+    ADD_HFYU_LEFT_LOOP 0, 1
+.src_unaligned:
+    ADD_HFYU_LEFT_LOOP 0, 0
 
 
 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)

From 98d0d19208959766a58f13dd6a678d1f765a26ac Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 2 Aug 2012 20:46:09 -0700
Subject: [PATCH 02/13] lagarith: pad RGB buffer by 1 byte.

For left HFYU prediction, we predict from the buffer buf+1 using 8- or
16-byte reads. This means that aligning the buffer by 16 bytes is in
itself not sufficient, because if the width itself is 16- or 8-byte
aligned, the buffer will not be padded, and thus a read of size 16 at
buf+1 will overflow boundaries at the right edge. Padding the buffer by
1 byte is sufficient to not overflow its boundaries.

Fixes bug 342.
---
 libavcodec/lagarith.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/lagarith.c b/libavcodec/lagarith.c
index 35f5a07e4c..e72674c481 100644
--- a/libavcodec/lagarith.c
+++ b/libavcodec/lagarith.c
@@ -553,7 +553,7 @@ static int lag_decode_frame(AVCodecContext *avctx,
 
         if (!l->rgb_planes) {
             l->rgb_stride = FFALIGN(avctx->width, 16);
-            l->rgb_planes = av_malloc(l->rgb_stride * avctx->height * planes);
+            l->rgb_planes = av_malloc(l->rgb_stride * avctx->height * planes + 1);
             if (!l->rgb_planes) {
                 av_log(avctx, AV_LOG_ERROR, "cannot allocate temporary buffer\n");
                 return AVERROR(ENOMEM);

From 7191e1c49037f0ddc25515938ad379185ddd0c9c Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 2 Aug 2012 20:46:08 -0700
Subject: [PATCH 03/13] lagarith: fix color plane inversion for YUY2 output.

---
 libavcodec/lagarith.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/lagarith.c b/libavcodec/lagarith.c
index e72674c481..124ecee161 100644
--- a/libavcodec/lagarith.c
+++ b/libavcodec/lagarith.c
@@ -618,11 +618,11 @@ static int lag_decode_frame(AVCodecContext *avctx,
         lag_decode_arith_plane(l, p->data[0], avctx->width, avctx->height,
                                p->linesize[0], buf + offset_ry,
                                buf_size - offset_ry);
-        lag_decode_arith_plane(l, p->data[2], avctx->width / 2,
-                               avctx->height, p->linesize[2],
-                               buf + offset_gu, buf_size - offset_gu);
         lag_decode_arith_plane(l, p->data[1], avctx->width / 2,
                                avctx->height, p->linesize[1],
+                               buf + offset_gu, buf_size - offset_gu);
+        lag_decode_arith_plane(l, p->data[2], avctx->width / 2,
+                               avctx->height, p->linesize[2],
                                buf + offset_bv, buf_size - offset_bv);
         break;
     case FRAME_ARITH_YV12:

From 36936080239ba9e162f9f5d0f722ce579cb606cf Mon Sep 17 00:00:00 2001
From: Sean McGovern <gseanmcg@gmail.com>
Date: Thu, 2 Aug 2012 15:37:28 -0400
Subject: [PATCH 04/13] wmapro: prevent division by zero when sample rate is
 unspecified

This fixes Bugzilla #327.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
---
 libavcodec/wmaprodec.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libavcodec/wmaprodec.c b/libavcodec/wmaprodec.c
index 699c1b7503..5fcafa29a7 100644
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -340,6 +340,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
+    if (s->avctx->sample_rate <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "invalid sample rate\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     s->num_channels = avctx->channels;
 
     if (s->num_channels < 0) {

From 66adb7ce1bc1cc5e3f1c4b1cd9f20ac68086a486 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 3 Aug 2012 12:06:38 -0700
Subject: [PATCH 05/13] Revert "wmapro: prevent division by zero when sample
 rate is unspecified"

This reverts commit 36936080239ba9e162f9f5d0f722ce579cb606cf. It was
already applied; no idea why it didn't error out while re-applying it.
---
 libavcodec/wmaprodec.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/libavcodec/wmaprodec.c b/libavcodec/wmaprodec.c
index 5fcafa29a7..699c1b7503 100644
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -340,11 +340,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->avctx->sample_rate <= 0) {
-        av_log(avctx, AV_LOG_ERROR, "invalid sample rate\n");
-        return AVERROR_INVALIDDATA;
-    }
-
     s->num_channels = avctx->channels;
 
     if (s->num_channels < 0) {

From 239fdf1b4a3dd9decad157d4694837cffa917021 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Sun, 8 Jul 2012 18:42:12 +0200
Subject: [PATCH 06/13] x86: build: replace mmx2 by mmxext

Refactoring mmx2/mmxext YASM code with cpuflags will force renames.
So switching to a consistent naming scheme beforehand is sensible.
The name "mmxext" is more official and widespread and also the name
of the CPU flag, as reported e.g. by the Linux kernel.
---
 Doxyfile                                |  2 +-
 configure                               | 12 ++++++------
 doc/APIchanges                          |  5 +++++
 libavcodec/dct-test.c                   |  4 ++--
 libavcodec/motion-test.c                |  4 ++--
 libavcodec/x86/ac3dsp.asm               |  2 +-
 libavcodec/x86/ac3dsp_mmx.c             |  2 +-
 libavcodec/x86/cavsdsp_mmx.c            |  2 +-
 libavcodec/x86/dsputil_mmx.c            |  4 ++--
 libavcodec/x86/dsputilenc_mmx.c         |  7 +++----
 libavcodec/x86/h264_intrapred_init.c    |  4 ++--
 libavcodec/x86/h264dsp_mmx.c            |  6 +++---
 libavcodec/x86/motion_est_mmx.c         |  2 +-
 libavcodec/x86/mpegvideo_mmx.c          | 10 +++++-----
 libavcodec/x86/mpegvideo_mmx_template.c |  2 +-
 libavcodec/x86/pngdsp-init.c            |  2 +-
 libavcodec/x86/rv34dsp_init.c           |  2 +-
 libavcodec/x86/rv40dsp_init.c           |  2 +-
 libavcodec/x86/snowdsp_mmx.c            |  2 +-
 libavcodec/x86/vc1dsp_mmx.c             |  4 ++--
 libavcodec/x86/vp3dsp_init.c            |  2 +-
 libavcodec/x86/vp8dsp-init.c            |  2 +-
 libavfilter/x86/gradfun.c               |  6 +++---
 libavfilter/x86/yadif.c                 |  6 +++---
 libavutil/cpu.c                         |  8 ++++----
 libavutil/cpu.h                         |  5 +++++
 libavutil/version.h                     |  2 +-
 libavutil/x86/cpu.c                     |  4 ++--
 libswscale/swscale.c                    |  4 ++--
 libswscale/swscale.h                    |  3 +++
 libswscale/utils.c                      | 12 ++++++------
 libswscale/version.h                    |  2 +-
 libswscale/x86/rgb2rgb.c                | 12 ++++++------
 libswscale/x86/rgb2rgb_template.c       | 18 +++++++++---------
 libswscale/x86/swscale.c                | 16 ++++++++--------
 libswscale/x86/swscale_template.c       | 18 +++++++++---------
 libswscale/x86/yuv2rgb.c                | 16 ++++++++--------
 libswscale/x86/yuv2rgb_template.c       | 12 ++++++------
 38 files changed, 120 insertions(+), 108 deletions(-)

diff --git a/Doxyfile b/Doxyfile
index 457cf5140a..8e0dcf39f9 100644
--- a/Doxyfile
+++ b/Doxyfile
@@ -1378,7 +1378,7 @@ PREDEFINED             = "__attribute__(x)=" \
                          "DEF(x)=x ## _TMPL" \
                          HAVE_AV_CONFIG_H \
                          HAVE_MMX \
-                         HAVE_MMX2 \
+                         HAVE_MMXEXT \
                          HAVE_AMD3DNOW \
                          "DECLARE_ALIGNED(a,t,n)=t n" \
                          "offsetof(x,y)=0x42"
diff --git a/configure b/configure
index a39ead7b78..b139008d2b 100755
--- a/configure
+++ b/configure
@@ -243,7 +243,7 @@ Optimization options (experts only):
   --disable-amd3dnow       disable 3DNow! optimizations
   --disable-amd3dnowext    disable 3DNow! extended optimizations
   --disable-mmx            disable MMX optimizations
-  --disable-mmx2           disable MMX2 optimizations
+  --disable-mmxext         disable MMXEXT optimizations
   --disable-sse            disable SSE optimizations
   --disable-ssse3          disable SSSE3 optimizations
   --disable-avx            disable AVX optimizations
@@ -1054,7 +1054,7 @@ ARCH_EXT_LIST='
     fma4
     mmi
     mmx
-    mmx2
+    mmxext
     neon
     ppc4xx
     sse
@@ -1302,7 +1302,7 @@ x86_64_suggest="cmov fast_cmov"
 amd3dnow_deps="mmx"
 amd3dnowext_deps="amd3dnow"
 mmx_deps="x86"
-mmx2_deps="mmx"
+mmxext_deps="mmx"
 sse_deps="mmx"
 ssse3_deps="sse"
 avx_deps="ssse3"
@@ -2861,9 +2861,9 @@ EOF
     # check whether xmm clobbers are supported
     check_asm xmm_clobbers '"":::"%xmm0"'
 
-    # check whether binutils is new enough to compile SSSE3/MMX2
+    # check whether binutils is new enough to compile SSSE3/MMXEXT
     enabled ssse3 && check_asm ssse3 '"pabsw %xmm0, %xmm0"'
-    enabled mmx2  && check_asm mmx2  '"pmaxub %mm0, %mm1"'
+    enabled mmxext && check_asm mmxext '"pmaxub %mm0, %mm1"'
 
     if ! disabled_any asm mmx yasm; then
         if check_cmd $yasmexe --version; then
@@ -3307,7 +3307,7 @@ echo "runtime cpu detection     ${runtime_cpudetect-no}"
 if enabled x86; then
     echo "${yasmexe}                      ${yasm-no}"
     echo "MMX enabled               ${mmx-no}"
-    echo "MMX2 enabled              ${mmx2-no}"
+    echo "MMXEXT enabled            ${mmxext-no}"
     echo "3DNow! enabled            ${amd3dnow-no}"
     echo "3DNow! extended enabled   ${amd3dnowext-no}"
     echo "SSE enabled               ${sse-no}"
diff --git a/doc/APIchanges b/doc/APIchanges
index 058fab6334..ecb2e871e9 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -13,6 +13,11 @@ libavutil:     2011-04-18
 
 API changes, most recent first:
 
+2012-08-03 - xxxxxxx - lavu 51.37.1 - cpu.h
+                       lsws 2.1.1   - swscale.h
+  Rename AV_CPU_FLAG_MMX2  ---> AV_CPU_FLAG_MMXEXT.
+  Rename SWS_CPU_CAPS_MMX2 ---> SWS_CPU_CAPS_MMXEXT.
+
 2012-07-xx - xxxxxxx - lavf 54.13.0 - avformat.h
   Add AVFMT_FLAG_NOBUFFER for low latency use cases.
 
diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index 3aa752ba9e..21a3397e4d 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -87,7 +87,7 @@ static const struct algo fdct_tab[] = {
 
 #if HAVE_MMX && HAVE_INLINE_ASM
     { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
-    { "MMX2",           ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMX2    },
+    { "MMXEXT",         ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMXEXT  },
     { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
 #endif
 
@@ -111,7 +111,7 @@ static const struct algo idct_tab[] = {
 #if HAVE_MMX && HAVE_INLINE_ASM
     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
-    { "XVID-MMX2",      ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMX2, 1 },
+    { "XVID-MMXEXT",    ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMXEXT, 1 },
     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 #endif
 
diff --git a/libavcodec/motion-test.c b/libavcodec/motion-test.c
index 2c3a0af939..ccf14f4761 100644
--- a/libavcodec/motion-test.c
+++ b/libavcodec/motion-test.c
@@ -116,8 +116,8 @@ int main(int argc, char **argv)
     AVCodecContext *ctx;
     int c;
     DSPContext cctx, mmxctx;
-    int flags[2] = { AV_CPU_FLAG_MMX, AV_CPU_FLAG_MMX2 };
-    int flags_size = HAVE_MMX2 ? 2 : 1;
+    int flags[2] = { AV_CPU_FLAG_MMX, AV_CPU_FLAG_MMXEXT };
+    int flags_size = HAVE_MMXEXT ? 2 : 1;
 
     if (argc > 1) {
         help();
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index f1e73d375a..ef828bb0d5 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -68,7 +68,7 @@ cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
 %define LOOP_ALIGN
 INIT_MMX
 AC3_EXPONENT_MIN mmx
-%if HAVE_MMX2
+%if HAVE_MMXEXT
 %define PMINUB PMINUB_MMXEXT
 %define LOOP_ALIGN ALIGN 16
 AC3_EXPONENT_MIN mmxext
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 1a43183aa0..0ac8685c6d 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -65,7 +65,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
             c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
         }
     }
-    if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT && HAVE_MMXEXT) {
         c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx2;
     }
diff --git a/libavcodec/x86/cavsdsp_mmx.c b/libavcodec/x86/cavsdsp_mmx.c
index b3d2c27441..4ed7d8e598 100644
--- a/libavcodec/x86/cavsdsp_mmx.c
+++ b/libavcodec/x86/cavsdsp_mmx.c
@@ -486,7 +486,7 @@ void ff_cavsdsp_init_mmx(CAVSDSPContext *c, AVCodecContext *avctx)
     int mm_flags = av_get_cpu_flags();
 
 #if HAVE_INLINE_ASM
-    if (mm_flags & AV_CPU_FLAG_MMX2)  ff_cavsdsp_init_mmx2 (c, avctx);
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) ff_cavsdsp_init_mmx2(c, avctx);
     if (mm_flags & AV_CPU_FLAG_3DNOW) ff_cavsdsp_init_3dnow(c, avctx);
 #endif /* HAVE_INLINE_ASM */
 }
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index d26f6126a8..3daa09b314 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -3028,7 +3028,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
                     c->idct_add              = ff_idct_xvid_sse2_add;
                     c->idct                  = ff_idct_xvid_sse2;
                     c->idct_permutation_type = FF_SSE2_IDCT_PERM;
-                } else if (mm_flags & AV_CPU_FLAG_MMX2) {
+                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                     c->idct_put              = ff_idct_xvid_mmx2_put;
                     c->idct_add              = ff_idct_xvid_mmx2_add;
                     c->idct                  = ff_idct_xvid_mmx2;
@@ -3044,7 +3044,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
         dsputil_init_mmx(c, avctx, mm_flags);
     }
 
-    if (mm_flags & AV_CPU_FLAG_MMX2)
+    if (mm_flags & AV_CPU_FLAG_MMXEXT)
         dsputil_init_mmx2(c, avctx, mm_flags);
 
     if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW)
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 3cac979ef0..0ac4d2c10d 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -1111,7 +1111,7 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
             (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) {
             if(mm_flags & AV_CPU_FLAG_SSE2){
                 c->fdct = ff_fdct_sse2;
-            }else if(mm_flags & AV_CPU_FLAG_MMX2){
+            } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                 c->fdct = ff_fdct_mmx2;
             }else{
                 c->fdct = ff_fdct_mmx;
@@ -1144,8 +1144,7 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
 
         c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
 
-
-        if (mm_flags & AV_CPU_FLAG_MMX2) {
+        if (mm_flags & AV_CPU_FLAG_MMXEXT) {
             c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
             c->vsad[4]= vsad_intra16_mmx2;
 
@@ -1186,7 +1185,7 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
 
-        if (mm_flags & AV_CPU_FLAG_MMX2) {
+        if (mm_flags & AV_CPU_FLAG_MMXEXT) {
             c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx2;
             c->hadamard8_diff[1] = ff_hadamard8_diff_mmx2;
         }
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index c6d4c709ec..08c35c7138 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -198,7 +198,7 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
             }
         }
 
-        if (mm_flags & AV_CPU_FLAG_MMX2) {
+        if (mm_flags & AV_CPU_FLAG_MMXEXT) {
             h->pred16x16[HOR_PRED8x8            ] = ff_pred16x16_horizontal_mmx2;
             h->pred16x16[DC_PRED8x8             ] = ff_pred16x16_dc_mmx2;
             if (chroma_format_idc == 1)
@@ -308,7 +308,7 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
             }
         }
     } else if (bit_depth == 10) {
-        if (mm_flags & AV_CPU_FLAG_MMX2) {
+        if (mm_flags & AV_CPU_FLAG_MMXEXT) {
             h->pred4x4[DC_PRED             ] = ff_pred4x4_dc_10_mmxext;
             h->pred4x4[HOR_UP_PRED         ] = ff_pred4x4_horizontal_up_10_mmxext;
 
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 5e02a46236..0612ffbb8b 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -218,7 +218,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 #if HAVE_YASM
     int mm_flags = av_get_cpu_flags();
 
-    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2)
+    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMXEXT)
         c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmx2;
 
     if (bit_depth == 8) {
@@ -236,7 +236,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             if (mm_flags & AV_CPU_FLAG_CMOV)
                 c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
 
-            if (mm_flags & AV_CPU_FLAG_MMX2) {
+            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                 c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmx2;
                 c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2;
                 c->h264_idct_add16   = ff_h264_idct_add16_8_mmx2;
@@ -304,7 +304,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
         }
     } else if (bit_depth == 10) {
         if (mm_flags & AV_CPU_FLAG_MMX) {
-            if (mm_flags & AV_CPU_FLAG_MMX2) {
+            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
 #if ARCH_X86_32
                 c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmx2;
                 c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmx2;
diff --git a/libavcodec/x86/motion_est_mmx.c b/libavcodec/x86/motion_est_mmx.c
index 5aed655657..ab845c1129 100644
--- a/libavcodec/x86/motion_est_mmx.c
+++ b/libavcodec/x86/motion_est_mmx.c
@@ -444,7 +444,7 @@ void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
         c->sad[0]= sad16_mmx;
         c->sad[1]= sad8_mmx;
     }
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
         c->pix_abs[0][0] = sad16_mmx2;
         c->pix_abs[1][0] = sad8_mmx2;
 
diff --git a/libavcodec/x86/mpegvideo_mmx.c b/libavcodec/x86/mpegvideo_mmx.c
index a242c19aec..85f6866342 100644
--- a/libavcodec/x86/mpegvideo_mmx.c
+++ b/libavcodec/x86/mpegvideo_mmx.c
@@ -595,15 +595,15 @@ static void  denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
 #define HAVE_SSSE3 0
 
 #undef HAVE_SSE2
-#undef HAVE_MMX2
+#undef HAVE_MMXEXT
 #define HAVE_SSE2 0
-#define HAVE_MMX2 0
+#define HAVE_MMXEXT 0
 #define RENAME(a) a ## _MMX
 #define RENAMEl(a) a ## _mmx
 #include "mpegvideo_mmx_template.c"
 
-#undef HAVE_MMX2
-#define HAVE_MMX2 1
+#undef HAVE_MMXEXT
+#define HAVE_MMXEXT 1
 #undef RENAME
 #undef RENAMEl
 #define RENAME(a) a ## _MMX2
@@ -660,7 +660,7 @@ void ff_MPV_common_init_mmx(MpegEncContext *s)
 #endif
             if(mm_flags & AV_CPU_FLAG_SSE2){
                 s->dct_quantize= dct_quantize_SSE2;
-            } else if(mm_flags & AV_CPU_FLAG_MMX2){
+            } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                 s->dct_quantize= dct_quantize_MMX2;
             } else {
                 s->dct_quantize= dct_quantize_MMX;
diff --git a/libavcodec/x86/mpegvideo_mmx_template.c b/libavcodec/x86/mpegvideo_mmx_template.c
index 13653c89b1..53e09bdbfe 100644
--- a/libavcodec/x86/mpegvideo_mmx_template.c
+++ b/libavcodec/x86/mpegvideo_mmx_template.c
@@ -48,7 +48,7 @@
 #define MMREG_WIDTH "8"
 #define MM "%%mm"
 #define MOVQ "movq"
-#if HAVE_MMX2
+#if HAVE_MMXEXT
 #define SPREADW(a) "pshufw $0, "a", "a" \n\t"
 #define PMAXW(a,b) "pmaxsw "a", "b"     \n\t"
 #define PMAX(a,b) \
diff --git a/libavcodec/x86/pngdsp-init.c b/libavcodec/x86/pngdsp-init.c
index 136e92eed0..aa21847db2 100644
--- a/libavcodec/x86/pngdsp-init.c
+++ b/libavcodec/x86/pngdsp-init.c
@@ -41,7 +41,7 @@ void ff_pngdsp_init_x86(PNGDSPContext *dsp)
     if (flags & AV_CPU_FLAG_MMX)
         dsp->add_bytes_l2         = ff_add_bytes_l2_mmx;
 #endif
-    if (flags & AV_CPU_FLAG_MMX2)
+    if (flags & AV_CPU_FLAG_MMXEXT)
         dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmx2;
     if (flags & AV_CPU_FLAG_SSE2)
         dsp->add_bytes_l2         = ff_add_bytes_l2_sse2;
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index d91818c375..7284a9beaf 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -37,7 +37,7 @@ av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c, DSPContext *dsp)
 
     if (mm_flags & AV_CPU_FLAG_MMX)
         c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
         c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmx2;
         c->rv34_idct_add         = ff_rv34_idct_add_mmx2;
     }
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 99ba0d5737..bee33657b5 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -200,7 +200,7 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
         QPEL_MC_SET(put_, _mmx)
 #endif
     }
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
         c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
         c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx2;
diff --git a/libavcodec/x86/snowdsp_mmx.c b/libavcodec/x86/snowdsp_mmx.c
index 38f3246fda..770cc1cc73 100644
--- a/libavcodec/x86/snowdsp_mmx.c
+++ b/libavcodec/x86/snowdsp_mmx.c
@@ -889,7 +889,7 @@ void ff_dwt_init_x86(DWTContext *c)
             c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
         }
         else{
-            if(mm_flags & AV_CPU_FLAG_MMX2){
+            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
             c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
 #if HAVE_7REGS
             c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 717f74f287..aae08c2364 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -757,7 +757,7 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
         dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
     }
 
-    if (mm_flags & AV_CPU_FLAG_MMX2){
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
         dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2;
         dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2;
         dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2;
@@ -798,7 +798,7 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd;
     }
 
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
         ASSIGN_LF(mmx2);
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd;
     } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 704d4a6927..45af041a6e 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -49,7 +49,7 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
     }
 #endif
 
-    if (HAVE_MMX2 && cpuflags & AV_CPU_FLAG_MMX2) {
+    if (HAVE_MMXEXT && cpuflags & AV_CPU_FLAG_MMXEXT) {
         c->idct_dc_add = ff_vp3_idct_dc_add_mmx2;
 
         if (!(flags & CODEC_FLAG_BITEXACT)) {
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 589804fa34..64dd8ceadf 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -350,7 +350,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 
     /* note that 4-tap width=16 functions are missing because w=16
      * is only used for luma, and luma is always a copy or sixtap. */
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
         VP8_MC_FUNC(2, 4, mmx2);
         VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
 #if ARCH_X86_32
diff --git a/libavfilter/x86/gradfun.c b/libavfilter/x86/gradfun.c
index f75268a660..b45256d011 100644
--- a/libavfilter/x86/gradfun.c
+++ b/libavfilter/x86/gradfun.c
@@ -28,7 +28,7 @@
 DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
 DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
 
-#if HAVE_MMX2
+#if HAVE_MMXEXT
 static void gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
 {
     intptr_t x;
@@ -173,8 +173,8 @@ av_cold void ff_gradfun_init_x86(GradFunContext *gf)
     int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_INLINE_ASM
-#if HAVE_MMX2
-    if (cpu_flags & AV_CPU_FLAG_MMX2)
+#if HAVE_MMXEXT
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
         gf->filter_line = gradfun_filter_line_mmx2;
 #endif
 #if HAVE_SSSE3
diff --git a/libavfilter/x86/yadif.c b/libavfilter/x86/yadif.c
index beb0824d4f..81b536acda 100644
--- a/libavfilter/x86/yadif.c
+++ b/libavfilter/x86/yadif.c
@@ -45,7 +45,7 @@ DECLARE_ASM_CONST(16, const xmm_reg, pw_1) = {0x0001000100010001ULL, 0x000100010
 #undef COMPILE_TEMPLATE_SSE
 #endif
 
-#if HAVE_MMX2
+#if HAVE_MMXEXT
 #undef RENAME
 #define RENAME(a) a ## _mmx2
 #include "yadif_template.c"
@@ -58,8 +58,8 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
     int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_INLINE_ASM
-#if HAVE_MMX2
-    if (cpu_flags & AV_CPU_FLAG_MMX2)
+#if HAVE_MMXEXT
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
         yadif->filter_line = yadif_filter_line_mmx2;
 #endif
 #if HAVE_SSE
diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index c641106fff..1fb40511ff 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -47,10 +47,10 @@ void av_set_cpu_flags_mask(int mask)
 
 int av_parse_cpu_flags(const char *s)
 {
-#define CPUFLAG_MMX2     (AV_CPU_FLAG_MMX      | AV_CPU_FLAG_MMX2 | AV_CPU_FLAG_CMOV)
+#define CPUFLAG_MMXEXT   (AV_CPU_FLAG_MMX      | AV_CPU_FLAG_MMXEXT | AV_CPU_FLAG_CMOV)
 #define CPUFLAG_3DNOW    (AV_CPU_FLAG_3DNOW    | AV_CPU_FLAG_MMX)
 #define CPUFLAG_3DNOWEXT (AV_CPU_FLAG_3DNOWEXT | CPUFLAG_3DNOW)
-#define CPUFLAG_SSE      (AV_CPU_FLAG_SSE      | CPUFLAG_MMX2)
+#define CPUFLAG_SSE      (AV_CPU_FLAG_SSE      | CPUFLAG_MMXEXT)
 #define CPUFLAG_SSE2     (AV_CPU_FLAG_SSE2     | CPUFLAG_SSE)
 #define CPUFLAG_SSE2SLOW (AV_CPU_FLAG_SSE2SLOW | CPUFLAG_SSE2)
 #define CPUFLAG_SSE3     (AV_CPU_FLAG_SSE3     | CPUFLAG_SSE2)
@@ -67,7 +67,7 @@ int av_parse_cpu_flags(const char *s)
         { "altivec" , NULL, 0, AV_OPT_TYPE_CONST, { AV_CPU_FLAG_ALTIVEC  },    .unit = "flags" },
 #elif ARCH_X86
         { "mmx"     , NULL, 0, AV_OPT_TYPE_CONST, { AV_CPU_FLAG_MMX      },    .unit = "flags" },
-        { "mmx2"    , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_MMX2         },    .unit = "flags" },
+        { "mmxext"  , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_MMXEXT       },    .unit = "flags" },
         { "sse"     , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE          },    .unit = "flags" },
         { "sse2"    , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE2         },    .unit = "flags" },
         { "sse2slow", NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE2SLOW     },    .unit = "flags" },
@@ -129,7 +129,7 @@ static const struct {
     { AV_CPU_FLAG_ALTIVEC,   "altivec"    },
 #elif ARCH_X86
     { AV_CPU_FLAG_MMX,       "mmx"        },
-    { AV_CPU_FLAG_MMX2,      "mmx2"       },
+    { AV_CPU_FLAG_MMXEXT,    "mmxext"     },
     { AV_CPU_FLAG_SSE,       "sse"        },
     { AV_CPU_FLAG_SSE2,      "sse2"       },
     { AV_CPU_FLAG_SSE2SLOW,  "sse2(slow)" },
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index f477c83e13..01f7201a5e 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -21,11 +21,16 @@
 #ifndef AVUTIL_CPU_H
 #define AVUTIL_CPU_H
 
+#include "version.h"
+
 #define AV_CPU_FLAG_FORCE    0x80000000 /* force usage of selected flags (OR) */
 
     /* lower 16 bits - CPU features */
 #define AV_CPU_FLAG_MMX          0x0001 ///< standard MMX
+#define AV_CPU_FLAG_MMXEXT       0x0002 ///< SSE integer functions or AMD MMX ext
+#if LIBAVUTIL_VERSION_MAJOR < 52
 #define AV_CPU_FLAG_MMX2         0x0002 ///< SSE integer functions or AMD MMX ext
+#endif
 #define AV_CPU_FLAG_3DNOW        0x0004 ///< AMD 3DNOW
 #define AV_CPU_FLAG_SSE          0x0008 ///< SSE functions
 #define AV_CPU_FLAG_SSE2         0x0010 ///< PIV SSE2 functions
diff --git a/libavutil/version.h b/libavutil/version.h
index f55a99fd48..bd60d3f282 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -38,7 +38,7 @@
 
 #define LIBAVUTIL_VERSION_MAJOR 51
 #define LIBAVUTIL_VERSION_MINOR 37
-#define LIBAVUTIL_VERSION_MICRO  0
+#define LIBAVUTIL_VERSION_MICRO  1
 
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
                                                LIBAVUTIL_VERSION_MINOR, \
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index d3b1bd5ea6..a63b564985 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -122,7 +122,7 @@ int ff_get_cpu_flags_x86(void)
         if (std_caps & (1 << 23))
             rval |= AV_CPU_FLAG_MMX;
         if (std_caps & (1 << 25))
-            rval |= AV_CPU_FLAG_MMX2;
+            rval |= AV_CPU_FLAG_MMXEXT;
 #if HAVE_SSE
         if (std_caps & (1 << 25))
             rval |= AV_CPU_FLAG_SSE;
@@ -159,7 +159,7 @@ int ff_get_cpu_flags_x86(void)
         if (ext_caps & (1 << 23))
             rval |= AV_CPU_FLAG_MMX;
         if (ext_caps & (1 << 22))
-            rval |= AV_CPU_FLAG_MMX2;
+            rval |= AV_CPU_FLAG_MMXEXT;
 
         /* Allow for selectively disabling SSE2 functions on AMD processors
            with SSE2 support but not SSE4a. This includes Athlon64, some
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 0f8ef2b15c..9da250e1d1 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -661,8 +661,8 @@ static int swScale(SwsContext *c, const uint8_t *src[],
     if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf)
         fillPlane(dst[3], dstStride[3], dstW, dstY - lastDstY, lastDstY, 255);
 
-#if HAVE_MMX2 && HAVE_INLINE_ASM
-    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
+#if HAVE_MMXEXT && HAVE_INLINE_ASM
+    if (av_get_cpu_flags() & AV_CPU_FLAG_MMXEXT)
         __asm__ volatile ("sfence" ::: "memory");
 #endif
     emms_c();
diff --git a/libswscale/swscale.h b/libswscale/swscale.h
index 30c87be760..c3efd48b1a 100644
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@@ -82,7 +82,10 @@ const char *swscale_license(void);
  * are only provided for API compatibility.
  */
 #define SWS_CPU_CAPS_MMX      0x80000000
+#define SWS_CPU_CAPS_MMXEXT   0x20000000
+#if LIBSWSCALE_VERSION_MAJOR < 3
 #define SWS_CPU_CAPS_MMX2     0x20000000
+#endif
 #define SWS_CPU_CAPS_3DNOW    0x40000000
 #define SWS_CPU_CAPS_ALTIVEC  0x10000000
 #define SWS_CPU_CAPS_BFIN     0x01000000
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 4960194f37..9a57405d2c 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -577,7 +577,7 @@ fail:
     return ret;
 }
 
-#if HAVE_MMX2 && HAVE_INLINE_ASM
+#if HAVE_MMXEXT && HAVE_INLINE_ASM
 static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode,
                            int16_t *filter, int32_t *filterPos, int numSplits)
 {
@@ -740,7 +740,7 @@ static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode,
 
     return fragmentPos + 1;
 }
-#endif /* HAVE_MMX2 && HAVE_INLINE_ASM */
+#endif /* HAVE_MMXEXT && HAVE_INLINE_ASM */
 
 static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
 {
@@ -973,7 +973,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     FF_ALLOC_OR_GOTO(c, c->formatConvBuffer,
                      (FFALIGN(srcW, 16) * 2 * FFALIGN(c->srcBpc, 8) >> 3) + 16,
                      fail);
-    if (HAVE_MMX2 && HAVE_INLINE_ASM && cpu_flags & AV_CPU_FLAG_MMX2 &&
+    if (HAVE_MMXEXT && HAVE_INLINE_ASM && cpu_flags & AV_CPU_FLAG_MMXEXT &&
         c->srcBpc == 8 && c->dstBpc <= 10) {
         c->canMMX2BeUsed = (dstW >= srcW && (dstW & 31) == 0 &&
                             (srcW & 15) == 0) ? 1 : 0;
@@ -1012,7 +1012,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
 
     /* precalculate horizontal scaler filter coefficients */
     {
-#if HAVE_MMX2 && HAVE_INLINE_ASM
+#if HAVE_MMXEXT && HAVE_INLINE_ASM
 // can't downscale !!!
         if (c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) {
             c->lumMmx2FilterCodeSize = initMMX2HScaler(dstW, c->lumXInc, NULL,
@@ -1048,7 +1048,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
             mprotect(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize, PROT_EXEC | PROT_READ);
 #endif
         } else
-#endif /* HAVE_MMX2 && HAVE_INLINE_ASM */
+#endif /* HAVE_MMXEXT && HAVE_INLINE_ASM */
         {
             const int filterAlign =
                 (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? 4 :
@@ -1208,7 +1208,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
 #endif
                sws_format_name(dstFormat));
 
-        if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2)
+        if (HAVE_MMXEXT && cpu_flags & AV_CPU_FLAG_MMXEXT)
             av_log(c, AV_LOG_INFO, "using MMX2\n");
         else if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)
             av_log(c, AV_LOG_INFO, "using 3DNOW\n");
diff --git a/libswscale/version.h b/libswscale/version.h
index 32bb2f5887..acbdf6b012 100644
--- a/libswscale/version.h
+++ b/libswscale/version.h
@@ -28,7 +28,7 @@
 
 #define LIBSWSCALE_VERSION_MAJOR 2
 #define LIBSWSCALE_VERSION_MINOR 1
-#define LIBSWSCALE_VERSION_MICRO 0
+#define LIBSWSCALE_VERSION_MICRO 1
 
 #define LIBSWSCALE_VERSION_INT  AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
                                                LIBSWSCALE_VERSION_MINOR, \
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index f940888a70..066749c22f 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -85,7 +85,7 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
 
 //Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.
 
-#define COMPILE_TEMPLATE_MMX2 0
+#define COMPILE_TEMPLATE_MMXEXT 0
 #define COMPILE_TEMPLATE_AMD3DNOW 0
 #define COMPILE_TEMPLATE_SSE2 0
 
@@ -96,8 +96,8 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
 
 //MMX2 versions
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
-#define COMPILE_TEMPLATE_MMX2 1
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
 #define RENAME(a) a ## _MMX2
 #include "rgb2rgb_template.c"
 
@@ -110,10 +110,10 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
 
 //3DNOW versions
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
+#undef COMPILE_TEMPLATE_MMXEXT
 #undef COMPILE_TEMPLATE_SSE2
 #undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMX2 0
+#define COMPILE_TEMPLATE_MMXEXT 0
 #define COMPILE_TEMPLATE_SSE2 0
 #define COMPILE_TEMPLATE_AMD3DNOW 1
 #define RENAME(a) a ## _3DNOW
@@ -137,7 +137,7 @@ av_cold void rgb2rgb_init_x86(void)
         rgb2rgb_init_MMX();
     if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)
         rgb2rgb_init_3DNOW();
-    if (HAVE_MMX2     && cpu_flags & AV_CPU_FLAG_MMX2)
+    if (HAVE_MMXEXT   && cpu_flags & AV_CPU_FLAG_MMXEXT)
         rgb2rgb_init_MMX2();
     if (HAVE_SSE      && cpu_flags & AV_CPU_FLAG_SSE2)
         rgb2rgb_init_SSE2();
diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c
index c255610193..3374f45908 100644
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@@ -35,7 +35,7 @@
 #if COMPILE_TEMPLATE_AMD3DNOW
 #define PREFETCH  "prefetch"
 #define PAVGB     "pavgusb"
-#elif COMPILE_TEMPLATE_MMX2
+#elif COMPILE_TEMPLATE_MMXEXT
 #define PREFETCH "prefetchnta"
 #define PAVGB     "pavgb"
 #else
@@ -49,7 +49,7 @@
 #define EMMS     "emms"
 #endif
 
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 #define MOVNTQ "movntq"
 #define SFENCE "sfence"
 #else
@@ -1148,7 +1148,7 @@ static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst,
         PREFETCH"     32(%1, %0)        \n\t"
         "movq           (%1, %0), %%mm0 \n\t"
         "movq          8(%1, %0), %%mm1 \n\t"
-# if COMPILE_TEMPLATE_MMX2
+# if COMPILE_TEMPLATE_MMXEXT
         "pshufw      $177, %%mm0, %%mm3 \n\t"
         "pshufw      $177, %%mm1, %%mm5 \n\t"
         "pand       %%mm7, %%mm0        \n\t"
@@ -1512,7 +1512,7 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 }
 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
 
-#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
 {
     int x,y;
@@ -1602,7 +1602,7 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWid
                      SFENCE"     \n\t"
                      :::"memory");
 }
-#endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
 
 #if !COMPILE_TEMPLATE_AMD3DNOW
 /**
@@ -1810,7 +1810,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
             "1:                                         \n\t"
             PREFETCH"    64(%0, %%"REG_d")              \n\t"
             PREFETCH"    64(%1, %%"REG_d")              \n\t"
-#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
             "movq          (%0, %%"REG_d"), %%mm0       \n\t"
             "movq          (%1, %%"REG_d"), %%mm1       \n\t"
             "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
@@ -1871,7 +1871,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
             "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
             "psraw                      $7, %%mm0       \n\t"
 
-#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
             "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
             "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
             "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
@@ -2592,9 +2592,9 @@ static inline void RENAME(rgb2rgb_init)(void)
     yuyvtoyuv422       = RENAME(yuyvtoyuv422);
 #endif /* !COMPILE_TEMPLATE_SSE2 */
 
-#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
     planar2x           = RENAME(planar2x);
-#endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
     rgb24toyv12        = RENAME(rgb24toyv12);
 
     yuyvtoyuv420       = RENAME(yuyvtoyuv420);
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index a0c18c79fc..57d270b09d 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -72,16 +72,16 @@ DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
 //MMX versions
 #if HAVE_MMX
 #undef RENAME
-#define COMPILE_TEMPLATE_MMX2 0
+#define COMPILE_TEMPLATE_MMXEXT 0
 #define RENAME(a) a ## _MMX
 #include "swscale_template.c"
 #endif
 
 //MMX2 versions
-#if HAVE_MMX2
+#if HAVE_MMXEXT
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
-#define COMPILE_TEMPLATE_MMX2 1
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
 #define RENAME(a) a ## _MMX2
 #include "swscale_template.c"
 #endif
@@ -308,8 +308,8 @@ av_cold void ff_sws_init_swScale_mmx(SwsContext *c)
 #if HAVE_INLINE_ASM
     if (cpu_flags & AV_CPU_FLAG_MMX)
         sws_init_swScale_MMX(c);
-#if HAVE_MMX2
-    if (cpu_flags & AV_CPU_FLAG_MMX2)
+#if HAVE_MMXEXT
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
         sws_init_swScale_MMX2(c);
 #endif
 #endif /* HAVE_INLINE_ASM */
@@ -360,7 +360,7 @@ switch(c->dstBpc){ \
     if (cpu_flags & AV_CPU_FLAG_MMX) {
         ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
         ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
-        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMXEXT);
 
         switch (c->srcFormat) {
         case PIX_FMT_Y400A:
@@ -392,7 +392,7 @@ switch(c->dstBpc){ \
             break;
         }
     }
-    if (cpu_flags & AV_CPU_FLAG_MMX2) {
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx2, , 1);
     }
 #endif
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index ad2b32f27b..e9816cf0a6 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -22,13 +22,13 @@
 #undef MOVNTQ
 #undef PREFETCH
 
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 #define PREFETCH "prefetchnta"
 #else
 #define PREFETCH  " # nop"
 #endif
 
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
 #else
 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
@@ -567,7 +567,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
     "cmp  "#dstw", "#index"     \n\t"\
     " jb       1b               \n\t"
 
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 #undef WRITEBGR24
 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
 #else
@@ -1371,7 +1371,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
     }
 }
 
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                  int dstWidth, const uint8_t *src,
                                  int srcW, int xInc)
@@ -1553,7 +1553,7 @@ static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
         dst2[i] = src2[srcW-1]*128;
     }
 }
-#endif /* COMPILE_TEMPLATE_MMX2 */
+#endif /* COMPILE_TEMPLATE_MMXEXT */
 
 static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 {
@@ -1616,17 +1616,17 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 
     if (c->srcBpc == 8 && c->dstBpc <= 10) {
     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
     if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
     {
         c->hyscale_fast = RENAME(hyscale_fast);
         c->hcscale_fast = RENAME(hcscale_fast);
     } else {
-#endif /* COMPILE_TEMPLATE_MMX2 */
+#endif /* COMPILE_TEMPLATE_MMXEXT */
         c->hyscale_fast = NULL;
         c->hcscale_fast = NULL;
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
     }
-#endif /* COMPILE_TEMPLATE_MMX2 */
+#endif /* COMPILE_TEMPLATE_MMXEXT */
     }
 }
diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
index b4bcbf53c2..501993ae42 100644
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@@ -52,20 +52,20 @@ DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
 //MMX versions
 #if HAVE_MMX
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
-#define COMPILE_TEMPLATE_MMX2 0
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 0
 #define RENAME(a) a ## _MMX
 #include "yuv2rgb_template.c"
 #endif /* HAVE_MMX */
 
 //MMX2 versions
-#if HAVE_MMX2
+#if HAVE_MMXEXT
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
-#define COMPILE_TEMPLATE_MMX2 1
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
 #define RENAME(a) a ## _MMX2
 #include "yuv2rgb_template.c"
-#endif /* HAVE_MMX2 */
+#endif /* HAVE_MMXEXT */
 
 #endif /* HAVE_INLINE_ASM */
 
@@ -78,8 +78,8 @@ av_cold SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
         c->srcFormat != PIX_FMT_YUVA420P)
         return NULL;
 
-#if HAVE_MMX2
-    if (cpu_flags & AV_CPU_FLAG_MMX2) {
+#if HAVE_MMXEXT
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
         switch (c->dstFormat) {
         case PIX_FMT_RGB24:  return yuv420_rgb24_MMX2;
         case PIX_FMT_BGR24:  return yuv420_bgr24_MMX2;
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index 5d1fa5b309..a71fd13862 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -25,7 +25,7 @@
 #undef EMMS
 #undef SFENCE
 
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 #define MOVNTQ "movntq"
 #define SFENCE "sfence"
 #else
@@ -182,7 +182,7 @@
     "paddusb "GREEN_DITHER"(%4), %%mm2\n\t"      \
     "paddusb "RED_DITHER"(%4),   %%mm1\n\t"      \
 
-#if !COMPILE_TEMPLATE_MMX2
+#if !COMPILE_TEMPLATE_MMXEXT
 static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
                                        int srcStride[],
                                        int srcSliceY, int srcSliceH,
@@ -238,7 +238,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
     YUV2RGB_OPERANDS
     YUV2RGB_ENDFUNC
 }
-#endif /* !COMPILE_TEMPLATE_MMX2 */
+#endif /* !COMPILE_TEMPLATE_MMXEXT */
 
 #define RGB_PACK24(blue, red)\
     "packuswb  %%mm3,      %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
@@ -255,7 +255,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
     "punpckhwd %%mm6,      %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
     RGB_PACK24_B
 
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
 DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
 DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
@@ -362,7 +362,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
     MOVNTQ "   %%mm5,       16(%1)\n\t"      \
     MOVNTQ "   %%mm"alpha", 24(%1)\n\t"      \
 
-#if !COMPILE_TEMPLATE_MMX2
+#if !COMPILE_TEMPLATE_MMXEXT
 static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
                                        int srcStride[],
                                        int srcSliceY, int srcSliceH,
@@ -449,4 +449,4 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
 }
 #endif
 
-#endif /* !COMPILE_TEMPLATE_MMX2 */
+#endif /* !COMPILE_TEMPLATE_MMXEXT */

From 0c3ff1982c5da0abfb27a7d2328d742a37257698 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Wed, 1 Aug 2012 19:28:08 +0200
Subject: [PATCH 07/13] x86: dct32: port to cpuflags

---
 libavcodec/x86/dct32_sse.asm | 74 ++++++++++++------------------------
 1 file changed, 25 insertions(+), 49 deletions(-)

diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index e3c8a45545..9d6169ca66 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -42,39 +42,24 @@ ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
 align 32
 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
 
-%macro BUTTERFLY_SSE 4
-    movaps %4, %1
-    subps  %1, %2
-    addps  %2, %4
-    mulps  %1, %3
+%macro BUTTERFLY 4
+    subps  %4, %1, %2
+    addps  %2, %2, %1
+    mulps  %1, %4, %3
 %endmacro
 
-%macro BUTTERFLY_AVX 4
-    vsubps  %4, %1, %2
-    vaddps  %2, %2, %1
-    vmulps  %1, %4, %3
-%endmacro
-
-%macro BUTTERFLY0_SSE 5
-    movaps %4, %1
-    shufps %1, %1, %5
-    xorps  %4, %2
-    addps  %1, %4
-    mulps  %1, %3
-%endmacro
-
-%macro BUTTERFLY0_SSE2 5
+%macro BUTTERFLY0 5
+%if cpuflag(sse2) && notcpuflag(avx)
     pshufd %4, %1, %5
     xorps  %1, %2
     addps  %1, %4
     mulps  %1, %3
-%endmacro
-
-%macro BUTTERFLY0_AVX 5
-    vshufps %4, %1, %1, %5
-    vxorps  %1, %1, %2
-    vaddps  %4, %4, %1
-    vmulps  %1, %4, %3
+%else
+    shufps %4, %1, %1, %5
+    xorps  %1, %1, %2
+    addps  %4, %4, %1
+    mulps  %1, %4, %3
+%endif
 %endmacro
 
 %macro BUTTERFLY2 4
@@ -206,14 +191,11 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
     movss [outq+116], m6
 %endmacro
 
-%define BUTTERFLY  BUTTERFLY_AVX
-%define BUTTERFLY0 BUTTERFLY0_AVX
-
-INIT_YMM
+INIT_YMM avx
 SECTION_TEXT
 %if HAVE_AVX
 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
-cglobal dct32_float_avx, 2,3,8, out, in, tmp
+cglobal dct32_float, 2,3,8, out, in, tmp
     ; pass 1
     vmovaps     m4, [inq+0]
     vinsertf128 m5, m5, [inq+96], 1
@@ -286,9 +268,6 @@ INIT_XMM
     RET
 %endif
 
-%define BUTTERFLY  BUTTERFLY_SSE
-%define BUTTERFLY0 BUTTERFLY0_SSE
-
 %if ARCH_X86_64
 %define SPILL SWAP
 %define UNSPILL SWAP
@@ -411,10 +390,9 @@ INIT_XMM
 %endif
 
 
-INIT_XMM
-%macro DCT32_FUNC 1
 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
-cglobal dct32_float_%1, 2,3,16, out, in, tmp
+%macro DCT32_FUNC 0
+cglobal dct32_float, 2, 3, 16, out, in, tmp
     ; pass 1
 
     movaps      m0, [inq+0]
@@ -498,18 +476,16 @@ cglobal dct32_float_%1, 2,3,16, out, in, tmp
     RET
 %endmacro
 
-%macro LOAD_INV_SSE 2
+%macro LOAD_INV 2
+%if cpuflag(sse2)
+    pshufd      %1, %2, 0x1b
+%elif cpuflag(sse)
     movaps      %1, %2
     shufps      %1, %1, 0x1b
+%endif
 %endmacro
 
-%define LOAD_INV LOAD_INV_SSE
-DCT32_FUNC sse
-
-%macro LOAD_INV_SSE2 2
-    pshufd      %1, %2, 0x1b
-%endmacro
-
-%define LOAD_INV LOAD_INV_SSE2
-%define BUTTERFLY0 BUTTERFLY0_SSE2
-DCT32_FUNC sse2
+INIT_XMM sse
+DCT32_FUNC
+INIT_XMM sse2
+DCT32_FUNC

From 4a8143e73ccb805a2e5a54f752d77f20efe20bd3 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 3 Aug 2012 14:59:16 -0700
Subject: [PATCH 08/13] fft: 3dnow: fix register name typo in DECL_IMDCT macro

Signed-off-by: Diego Biurrun <diego@biurrun.de>
---
 libavcodec/x86/fft_mmx.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 7c0e9de311..6082d9ee36 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -1040,7 +1040,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     mova [r1+r5*8], m0
     mova [r1+r6*8], m2
     add    r4, 2
-    sub    r4, 2
+    sub    r3, 2
 %else
 %if ARCH_X86_64
     movzx  r5,  word [rrevtab+r4-4]

From 9829a81bcd3e9ef26c4bbc2959bfb65159dbd314 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Thu, 2 Aug 2012 22:53:47 +0100
Subject: [PATCH 09/13] ARM: vp56: allow inline asm to build with clang

The clang integrated assembler does not support pre-UAL syntax,
while gcc requires pre-UAL syntax for ARM code.  A patch[1] for
clang to support the old syntax as well has been ignored since
January.

This patch chooses the syntax appropriate for each compiler,
allowing both to build the code.  Notably, this change allows
building for iphone with the latest Apple Xcode update.

[1] http://llvm.org/bugs/show_bug.cgi?id=11855

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/arm/vp56_arith.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h
index ef30ffe897..d1a8837000 100644
--- a/libavcodec/arm/vp56_arith.h
+++ b/libavcodec/arm/vp56_arith.h
@@ -29,6 +29,14 @@
 #   define T(x)
 #endif
 
+#if CONFIG_THUMB || defined __clang__
+#   define L(x)
+#   define U(x) x
+#else
+#   define L(x) x
+#   define U(x)
+#endif
+
 #if HAVE_ARMV6 && HAVE_INLINE_ASM
 
 #define vp56_rac_get_prob vp56_rac_get_prob_armv6
@@ -42,8 +50,8 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
     __asm__ ("adds    %3,  %3,  %0           \n"
              "itt     cs                     \n"
              "cmpcs   %7,  %4                \n"
-           A("ldrcsh  %2,  [%4], #2          \n")
-           T("ldrhcs  %2,  [%4], #2          \n")
+           L("ldrcsh  %2,  [%4], #2          \n")
+           U("ldrhcs  %2,  [%4], #2          \n")
              "rsb     %0,  %6,  #256         \n"
              "smlabb  %0,  %5,  %6,  %0      \n"
            T("itttt   cs                     \n")
@@ -80,8 +88,8 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
     __asm__ ("adds    %3,  %3,  %0           \n"
              "itt     cs                     \n"
              "cmpcs   %7,  %4                \n"
-           A("ldrcsh  %2,  [%4], #2          \n")
-           T("ldrhcs  %2,  [%4], #2          \n")
+           L("ldrcsh  %2,  [%4], #2          \n")
+           U("ldrhcs  %2,  [%4], #2          \n")
              "rsb     %0,  %6,  #256         \n"
              "smlabb  %0,  %5,  %6,  %0      \n"
            T("itttt   cs                     \n")

From e6cd698955c87b78534a5a897d595d7315103689 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Thu, 2 Aug 2012 23:18:08 +0100
Subject: [PATCH 10/13] ARMv6: vp8: fix stack allocation with Apple's assembler

In the GNU assembler, a relational expression, bizarrely, has the
value -1 if true, whereas in Apple's it is +1.  This patch makes
sure the correct expression is used in both cases.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/arm/vp8dsp_armv6.S | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/libavcodec/arm/vp8dsp_armv6.S b/libavcodec/arm/vp8dsp_armv6.S
index a26a2a9813..40be926f9f 100644
--- a/libavcodec/arm/vp8dsp_armv6.S
+++ b/libavcodec/arm/vp8dsp_armv6.S
@@ -1226,7 +1226,13 @@ vp8_mc_1                bilin,  8, v
 vp8_mc_1                bilin,  4, h
 vp8_mc_1                bilin,  4, v
 
-#define TMPSIZE \size * (8 - 8*(\size > 4) + \ytaps - 1)
+/* True relational expressions have the value -1 in the GNU assembler,
+   +1 in Apple's. */
+#ifdef __APPLE__
+#   define TMPSIZE \size * (8 + 8*(\size > 4) + \ytaps - 1)
+#else
+#   define TMPSIZE \size * (8 - 8*(\size > 4) + \ytaps - 1)
+#endif
 
 .macro  vp8_mc_hv       name, size, h, v, ytaps
 function ff_put_vp8_\name\size\()_\h\v\()_armv6, export=1

From c5d5d178800daf6ea22530cfc9a5f5f0036d6f1a Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 2 Aug 2012 20:45:29 -0700
Subject: [PATCH 11/13] fate: add tests for lagarith lossless video codec.

Based on patch by Oana Stratulat <oanaandreeastratulat@gmail.com>.
---
 tests/fate/lossless-video.mak | 15 +++++++++++++++
 tests/ref/fate/lagarith-rgb24 |  5 +++++
 tests/ref/fate/lagarith-rgb32 | 26 ++++++++++++++++++++++++++
 tests/ref/fate/lagarith-yuy2  |  2 ++
 tests/ref/fate/lagarith-yv12  |  3 +++
 5 files changed, 51 insertions(+)
 create mode 100644 tests/ref/fate/lagarith-rgb24
 create mode 100644 tests/ref/fate/lagarith-rgb32
 create mode 100644 tests/ref/fate/lagarith-yuy2
 create mode 100644 tests/ref/fate/lagarith-yv12

diff --git a/tests/fate/lossless-video.mak b/tests/fate/lossless-video.mak
index 4871296a18..2ac3dd3478 100644
--- a/tests/fate/lossless-video.mak
+++ b/tests/fate/lossless-video.mak
@@ -1,3 +1,18 @@
+FATE_LAGARITH += fate-lagarith-rgb24
+fate-lagarith-rgb24: CMD = framecrc -i $(SAMPLES)/lagarith/lag-rgb24.avi
+
+FATE_LAGARITH += fate-lagarith-rgb32
+fate-lagarith-rgb32: CMD = framecrc -i $(SAMPLES)/lagarith/lag-rgb32.avi
+
+FATE_LAGARITH += fate-lagarith-yuy2
+fate-lagarith-yuy2: CMD = framecrc -i $(SAMPLES)/lagarith/lag-yuy2.avi
+
+FATE_LAGARITH += fate-lagarith-yv12
+fate-lagarith-yv12: CMD = framecrc -i $(SAMPLES)/lagarith/lag-yv12.avi
+
+FATE_SAMPLES_AVCONV += $(FATE_LAGARITH)
+fate-lagarith: $(FATE_LAGARITH)
+
 FATE_LOCO += fate-loco-rgb
 fate-loco-rgb: CMD = framecrc -i $(SAMPLES)/loco/pig-loco-rgb.avi
 
diff --git a/tests/ref/fate/lagarith-rgb24 b/tests/ref/fate/lagarith-rgb24
new file mode 100644
index 0000000000..1eb2bc441c
--- /dev/null
+++ b/tests/ref/fate/lagarith-rgb24
@@ -0,0 +1,5 @@
+#tb 0: 100/2997
+0,          0,          0,        1,   368640, 0x26f74db2
+0,          1,          1,        1,   368640, 0x63b29ea4
+0,          2,          2,        1,   368640, 0x19467f03
+0,          3,          3,        1,   368640, 0x5fdc3575
diff --git a/tests/ref/fate/lagarith-rgb32 b/tests/ref/fate/lagarith-rgb32
new file mode 100644
index 0000000000..490e2e5c7d
--- /dev/null
+++ b/tests/ref/fate/lagarith-rgb32
@@ -0,0 +1,26 @@
+#tb 0: 1001/24000
+0,          0,          0,        1,  1382400, 0x00000000
+0,          1,          1,        1,  1382400, 0x00000000
+0,          2,          2,        1,  1382400, 0x00000000
+0,          3,          3,        1,  1382400, 0x00000000
+0,          4,          4,        1,  1382400, 0x00000000
+0,          5,          5,        1,  1382400, 0xf95bde46
+0,          6,          6,        1,  1382400, 0x4f4c0393
+0,          7,          7,        1,  1382400, 0xe5aa40db
+0,          8,          8,        1,  1382400, 0xc25a8ba2
+0,          9,          9,        1,  1382400, 0x9db3150d
+0,         10,         10,        1,  1382400, 0x730e64b3
+0,         11,         11,        1,  1382400, 0xf8fd7edf
+0,         12,         12,        1,  1382400, 0x0114798a
+0,         13,         13,        1,  1382400, 0x7571210f
+0,         14,         14,        1,  1382400, 0x552ae59d
+0,         15,         15,        1,  1382400, 0x7ae0c946
+0,         16,         16,        1,  1382400, 0x0818c3ef
+0,         17,         17,        1,  1382400, 0x8257cac4
+0,         18,         18,        1,  1382400, 0x7762a979
+0,         19,         19,        1,  1382400, 0x282af57a
+0,         20,         20,        1,  1382400, 0x3f42de50
+0,         21,         21,        1,  1382400, 0xc42d5f93
+0,         22,         22,        1,  1382400, 0x18775c90
+0,         23,         23,        1,  1382400, 0x34befa90
+0,         24,         24,        1,  1382400, 0xd33d5f53
diff --git a/tests/ref/fate/lagarith-yuy2 b/tests/ref/fate/lagarith-yuy2
new file mode 100644
index 0000000000..c5aed92d61
--- /dev/null
+++ b/tests/ref/fate/lagarith-yuy2
@@ -0,0 +1,2 @@
+#tb 0: 1/10
+0,          0,          0,        1,  1572864, 0xeed76a7d
diff --git a/tests/ref/fate/lagarith-yv12 b/tests/ref/fate/lagarith-yv12
new file mode 100644
index 0000000000..c9c9ff372f
--- /dev/null
+++ b/tests/ref/fate/lagarith-yv12
@@ -0,0 +1,3 @@
+#tb 0: 1/60
+0,          0,          0,        1,    92160, 0x1dfdf5c1
+0,          1,          1,        1,    92160, 0x6965884f

From 73486e3b612aa07b94f1b5ad9c11d7450841ce1b Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 4 Aug 2012 12:24:23 +0100
Subject: [PATCH 12/13] fate: force pix_fmt in lagarith-rgb32 test

This makes big and little endian systems use the same output
format.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 tests/fate/lossless-video.mak | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fate/lossless-video.mak b/tests/fate/lossless-video.mak
index 2ac3dd3478..0a6b544531 100644
--- a/tests/fate/lossless-video.mak
+++ b/tests/fate/lossless-video.mak
@@ -2,7 +2,7 @@ FATE_LAGARITH += fate-lagarith-rgb24
 fate-lagarith-rgb24: CMD = framecrc -i $(SAMPLES)/lagarith/lag-rgb24.avi
 
 FATE_LAGARITH += fate-lagarith-rgb32
-fate-lagarith-rgb32: CMD = framecrc -i $(SAMPLES)/lagarith/lag-rgb32.avi
+fate-lagarith-rgb32: CMD = framecrc -i $(SAMPLES)/lagarith/lag-rgb32.avi -pix_fmt bgra
 
 FATE_LAGARITH += fate-lagarith-yuy2
 fate-lagarith-yuy2: CMD = framecrc -i $(SAMPLES)/lagarith/lag-yuy2.avi

From 8821ae649e61097ec57ca58472c3e4239c82913c Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 29 Jul 2012 19:22:19 -0400
Subject: [PATCH 13/13] lavr: fix handling of custom mix matrices

Adds some validation for changing parameters after setting the matrix and
fixes a bug in the conversion path setup.
---
 libavresample/audio_mix.c        | 10 +++++++++-
 libavresample/audio_mix_matrix.c | 21 +++++++++++++++------
 libavresample/utils.c            |  5 ++---
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/libavresample/audio_mix.c b/libavresample/audio_mix.c
index 2c2a356844..e8ab2e3ee7 100644
--- a/libavresample/audio_mix.c
+++ b/libavresample/audio_mix.c
@@ -314,7 +314,15 @@ int ff_audio_mix_init(AVAudioResampleContext *avr)
     }
 
     /* build matrix if the user did not already set one */
-    if (!avr->am->matrix) {
+    if (avr->am->matrix) {
+        if (avr->am->coeff_type != avr->mix_coeff_type      ||
+            avr->am->in_layout  != avr->in_channel_layout   ||
+            avr->am->out_layout != avr->out_channel_layout) {
+            av_log(avr, AV_LOG_ERROR,
+                   "Custom matrix does not match current parameters\n");
+            return AVERROR(EINVAL);
+        }
+    } else {
         int i, j;
         char in_layout_name[128];
         char out_layout_name[128];
diff --git a/libavresample/audio_mix_matrix.c b/libavresample/audio_mix_matrix.c
index f7121c846d..522a177f20 100644
--- a/libavresample/audio_mix_matrix.c
+++ b/libavresample/audio_mix_matrix.c
@@ -294,8 +294,8 @@ int avresample_get_matrix(AVAudioResampleContext *avr, double *matrix,
     in_channels  = av_get_channel_layout_nb_channels(avr->in_channel_layout);
     out_channels = av_get_channel_layout_nb_channels(avr->out_channel_layout);
 
-    if ( in_channels < 0 ||  in_channels > AVRESAMPLE_MAX_CHANNELS ||
-        out_channels < 0 || out_channels > AVRESAMPLE_MAX_CHANNELS) {
+    if ( in_channels <= 0 ||  in_channels > AVRESAMPLE_MAX_CHANNELS ||
+        out_channels <= 0 || out_channels > AVRESAMPLE_MAX_CHANNELS) {
         av_log(avr, AV_LOG_ERROR, "Invalid channel layouts\n");
         return AVERROR(EINVAL);
     }
@@ -332,6 +332,7 @@ int avresample_get_matrix(AVAudioResampleContext *avr, double *matrix,
         av_log(avr, AV_LOG_ERROR, "Invalid mix coeff type\n");
         return AVERROR(EINVAL);
     }
+
     return 0;
 }
 
@@ -343,14 +344,16 @@ int avresample_set_matrix(AVAudioResampleContext *avr, const double *matrix,
     in_channels  = av_get_channel_layout_nb_channels(avr->in_channel_layout);
     out_channels = av_get_channel_layout_nb_channels(avr->out_channel_layout);
 
-    if ( in_channels < 0 ||  in_channels > AVRESAMPLE_MAX_CHANNELS ||
-        out_channels < 0 || out_channels > AVRESAMPLE_MAX_CHANNELS) {
+    if ( in_channels <= 0 ||  in_channels > AVRESAMPLE_MAX_CHANNELS ||
+        out_channels <= 0 || out_channels > AVRESAMPLE_MAX_CHANNELS) {
         av_log(avr, AV_LOG_ERROR, "Invalid channel layouts\n");
         return AVERROR(EINVAL);
     }
 
-    if (avr->am->matrix)
-        av_freep(avr->am->matrix);
+    if (avr->am->matrix) {
+        av_free(avr->am->matrix[0]);
+        avr->am->matrix = NULL;
+    }
 
 #define CONVERT_MATRIX(type, expr)                                          \
     avr->am->matrix_## type[0] = av_mallocz(out_channels * in_channels *    \
@@ -386,5 +389,11 @@ int avresample_set_matrix(AVAudioResampleContext *avr, const double *matrix,
     /* TODO: detect situations where we can just swap around pointers
              instead of doing matrix multiplications with 0.0 and 1.0 */
 
+    /* set AudioMix params */
+    avr->am->in_layout    = avr->in_channel_layout;
+    avr->am->out_layout   = avr->out_channel_layout;
+    avr->am->in_channels  = in_channels;
+    avr->am->out_channels = out_channels;
+
     return 0;
 }
diff --git a/libavresample/utils.c b/libavresample/utils.c
index 89a82b9dda..2d83372bb8 100644
--- a/libavresample/utils.c
+++ b/libavresample/utils.c
@@ -48,9 +48,8 @@ int avresample_open(AVAudioResampleContext *avr)
     avr->resample_channels = FFMIN(avr->in_channels, avr->out_channels);
     avr->downmix_needed    = avr->in_channels  > avr->out_channels;
     avr->upmix_needed      = avr->out_channels > avr->in_channels ||
-                             avr->am->matrix                      ||
-                             (avr->out_channels == avr->in_channels &&
-                              avr->in_channel_layout != avr->out_channel_layout);
+                             (!avr->downmix_needed && (avr->am->matrix ||
+                              avr->in_channel_layout != avr->out_channel_layout));
     avr->mixing_needed     = avr->downmix_needed || avr->upmix_needed;
 
     /* set resampling parameters */