Merge remote-tracking branch 'qatar/master'

* qatar/master:
  lavr: fix handling of custom mix matrices
  fate: force pix_fmt in lagarith-rgb32 test
  fate: add tests for lagarith lossless video codec.
  ARMv6: vp8: fix stack allocation with Apple's assembler
  ARM: vp56: allow inline asm to build with clang
  fft: 3dnow: fix register name typo in DECL_IMDCT macro
  x86: dct32: port to cpuflags
  x86: build: replace mmx2 by mmxext
  Revert "wmapro: prevent division by zero when sample rate is unspecified"
  wmapro: prevent division by zero when sample rate is unspecified
  lagarith: fix color plane inversion for YUY2 output.
  lagarith: pad RGB buffer by 1 byte.
  dsputil: make add_hfyu_left_prediction_sse4() support unaligned src.

Conflicts:
    doc/APIchanges
    libavcodec/lagarith.c
    libavfilter/x86/gradfun.c
    libavutil/cpu.h
    libavutil/version.h
    libswscale/utils.c
    libswscale/version.h
    libswscale/x86/yuv2rgb.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
2025-07-16 22:42:38 +02:00 · 2012-08-04 22:39:25 +02:00
parent 88fc1438c6 8821ae649e
commit e776ee8f29
52 changed files with 254 additions and 181 deletions
--- a/Doxyfile
+++ b/Doxyfile
@ -1378,7 +1378,7 @@ PREDEFINED             = "__attribute__(x)=" \
                         "DEF(x)=x ## _TMPL" \
                         HAVE_AV_CONFIG_H \
                         HAVE_MMX \
-                         HAVE_MMX2 \
+                         HAVE_MMXEXT \
                         HAVE_AMD3DNOW \
                         "DECLARE_ALIGNED(a,t,n)=t n" \
                         "offsetof(x,y)=0x42"
--- a/configure
+++ b/configure
@ -267,7 +267,7 @@ Optimization options (experts only):
  --disable-amd3dnow       disable 3DNow! optimizations
  --disable-amd3dnowext    disable 3DNow! extended optimizations
  --disable-mmx            disable MMX optimizations
-  --disable-mmx2           disable MMX2 optimizations
+  --disable-mmxext         disable MMXEXT optimizations
  --disable-sse            disable SSE optimizations
  --disable-ssse3          disable SSSE3 optimizations
  --disable-avx            disable AVX optimizations
@ -1182,7 +1182,7 @@ ARCH_EXT_LIST='
    fma4
    mmi
    mmx
-    mmx2
+    mmxext
    neon
    ppc4xx
    sse
@ -1459,7 +1459,7 @@ x86_64_suggest="cmov fast_cmov"
 amd3dnow_deps="mmx"
 amd3dnowext_deps="amd3dnow"
 mmx_deps="x86"
-mmx2_deps="mmx"
+mmxext_deps="mmx"
 sse_deps="mmx"
 ssse3_deps="sse"
 avx_deps="ssse3"
@ -3194,9 +3194,9 @@ EOF
    # check whether xmm clobbers are supported
    check_asm xmm_clobbers '"":::"%xmm0"'

-    # check whether binutils is new enough to compile SSSE3/MMX2
+    # check whether binutils is new enough to compile SSSE3/MMXEXT
    enabled ssse3 && check_asm ssse3 '"pabsw %xmm0, %xmm0"'
-    enabled mmx2  && check_asm mmx2  '"pmaxub %mm0, %mm1"'
+    enabled mmxext && check_asm mmxext '"pmaxub %mm0, %mm1"'

    if ! disabled_any asm mmx yasm; then
        if check_cmd $yasmexe --version; then
@ -3748,7 +3748,7 @@ echo "runtime cpu detection     ${runtime_cpudetect-no}"
 if enabled x86; then
    echo "${yasmexe}                      ${yasm-no}"
    echo "MMX enabled               ${mmx-no}"
-    echo "MMX2 enabled              ${mmx2-no}"
+    echo "MMXEXT enabled            ${mmxext-no}"
    echo "3DNow! enabled            ${amd3dnow-no}"
    echo "3DNow! extended enabled   ${amd3dnowext-no}"
    echo "SSE enabled               ${sse-no}"
@ -4019,6 +4019,7 @@ cat > $TMPH <<EOF
 #define EXTERN_PREFIX "${extern_prefix}"
 #define EXTERN_ASM ${extern_prefix}
 #define SLIBSUF "$SLIBSUF"
+#define HAVE_MMX2 HAVE_MMXEXT
 EOF

 test -n "$assert_level" &&
--- a/doc/APIchanges
+++ b/doc/APIchanges
@ -70,6 +70,11 @@ API changes, most recent first:
 2012-03-26 - a67d9cf - lavfi 2.66.100
  Add avfilter_fill_frame_from_{audio_,}buffer_ref() functions.

+2012-08-03 - xxxxxxx - lavu 51.37.1 - cpu.h
+                       lsws 2.1.1   - swscale.h
+  Rename AV_CPU_FLAG_MMX2  ---> AV_CPU_FLAG_MMXEXT.
+  Rename SWS_CPU_CAPS_MMX2 ---> SWS_CPU_CAPS_MMXEXT.
+
 2012-07-xx - xxxxxxx - lavf 54.13.0 - avformat.h
  Add AVFMT_FLAG_NOBUFFER for low latency use cases.

--- a/libavcodec/arm/vp56_arith.h
+++ b/libavcodec/arm/vp56_arith.h
@ -29,6 +29,14 @@
 #   define T(x)
 #endif

+#if CONFIG_THUMB || defined __clang__
+#   define L(x)
+#   define U(x) x
+#else
+#   define L(x) x
+#   define U(x)
+#endif
+
 #if HAVE_ARMV6 && HAVE_INLINE_ASM

 #define vp56_rac_get_prob vp56_rac_get_prob_armv6
@ -42,8 +50,8 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
    __asm__ ("adds    %3,  %3,  %0           \n"
             "itt     cs                     \n"
             "cmpcs   %7,  %4                \n"
-           A("ldrcsh  %2,  [%4], #2          \n")
-           T("ldrhcs  %2,  [%4], #2          \n")
+           L("ldrcsh  %2,  [%4], #2          \n")
+           U("ldrhcs  %2,  [%4], #2          \n")
             "rsb     %0,  %6,  #256         \n"
             "smlabb  %0,  %5,  %6,  %0      \n"
           T("itttt   cs                     \n")
@ -80,8 +88,8 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
    __asm__ ("adds    %3,  %3,  %0           \n"
             "itt     cs                     \n"
             "cmpcs   %7,  %4                \n"
-           A("ldrcsh  %2,  [%4], #2          \n")
-           T("ldrhcs  %2,  [%4], #2          \n")
+           L("ldrcsh  %2,  [%4], #2          \n")
+           U("ldrhcs  %2,  [%4], #2          \n")
             "rsb     %0,  %6,  #256         \n"
             "smlabb  %0,  %5,  %6,  %0      \n"
           T("itttt   cs                     \n")
--- a/libavcodec/arm/vp8dsp_armv6.S
+++ b/libavcodec/arm/vp8dsp_armv6.S
@ -1226,7 +1226,13 @@ vp8_mc_1                bilin,  8, v
 vp8_mc_1                bilin,  4, h
 vp8_mc_1                bilin,  4, v

+/* True relational expressions have the value -1 in the GNU assembler,
+   +1 in Apple's. */
+#ifdef __APPLE__
+#   define TMPSIZE \size * (8 + 8*(\size > 4) + \ytaps - 1)
+#else
 #   define TMPSIZE \size * (8 - 8*(\size > 4) + \ytaps - 1)
+#endif

 .macro  vp8_mc_hv       name, size, h, v, ytaps
 function ff_put_vp8_\name\size\()_\h\v\()_armv6, export=1
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@ -87,7 +87,7 @@ static const struct algo fdct_tab[] = {

 #if HAVE_MMX && HAVE_INLINE_ASM
    { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
-    { "MMX2",           ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMX2    },
+    { "MMXEXT",         ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMXEXT  },
    { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
 #endif

@ -132,7 +132,7 @@ static const struct algo idct_tab[] = {
 #endif
    { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
    { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
-    { "XVID-MMX2",      ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMX2, 1 },
+    { "XVID-MMXEXT",    ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMXEXT, 1 },
    { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 #if ARCH_X86_64 && HAVE_YASM
    { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
--- a/libavcodec/motion-test.c
+++ b/libavcodec/motion-test.c
@ -116,8 +116,8 @@ int main(int argc, char **argv)
    AVCodecContext *ctx;
    int c;
    DSPContext cctx, mmxctx;
-    int flags[2] = { AV_CPU_FLAG_MMX, AV_CPU_FLAG_MMX2 };
-    int flags_size = HAVE_MMX2 ? 2 : 1;
+    int flags[2] = { AV_CPU_FLAG_MMX, AV_CPU_FLAG_MMXEXT };
+    int flags_size = HAVE_MMXEXT ? 2 : 1;

    if (argc > 1) {
        help();
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@ -68,7 +68,7 @@ cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
 %define LOOP_ALIGN
 INIT_MMX
 AC3_EXPONENT_MIN mmx
-%if HAVE_MMX2
+%if HAVE_MMXEXT
 %define PMINUB PMINUB_MMXEXT
 %define LOOP_ALIGN ALIGN 16
 AC3_EXPONENT_MIN mmxext
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@ -65,7 +65,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
        }
    }
-    if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT && HAVE_MMXEXT) {
        c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx2;
    }
--- a/libavcodec/x86/cavsdsp_mmx.c
+++ b/libavcodec/x86/cavsdsp_mmx.c
@ -486,7 +486,7 @@ void ff_cavsdsp_init_mmx(CAVSDSPContext *c, AVCodecContext *avctx)
    int mm_flags = av_get_cpu_flags();

 #if HAVE_INLINE_ASM
-    if (mm_flags & AV_CPU_FLAG_MMX2)  ff_cavsdsp_init_mmx2 (c, avctx);
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) ff_cavsdsp_init_mmx2(c, avctx);
    if (mm_flags & AV_CPU_FLAG_3DNOW) ff_cavsdsp_init_3dnow(c, avctx);
 #endif /* HAVE_INLINE_ASM */
 }
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@ -42,39 +42,24 @@ ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
 align 32
 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000

-%macro BUTTERFLY_SSE 4
-    movaps %4, %1
-    subps  %1, %2
-    addps  %2, %4
-    mulps  %1, %3
+%macro BUTTERFLY 4
+    subps  %4, %1, %2
+    addps  %2, %2, %1
+    mulps  %1, %4, %3
 %endmacro

-%macro BUTTERFLY_AVX 4
-    vsubps  %4, %1, %2
-    vaddps  %2, %2, %1
-    vmulps  %1, %4, %3
-%endmacro
-
-%macro BUTTERFLY0_SSE 5
-    movaps %4, %1
-    shufps %1, %1, %5
-    xorps  %4, %2
-    addps  %1, %4
-    mulps  %1, %3
-%endmacro
-
-%macro BUTTERFLY0_SSE2 5
+%macro BUTTERFLY0 5
+%if cpuflag(sse2) && notcpuflag(avx)
    pshufd %4, %1, %5
    xorps  %1, %2
    addps  %1, %4
    mulps  %1, %3
-%endmacro
-
-%macro BUTTERFLY0_AVX 5
-    vshufps %4, %1, %1, %5
-    vxorps  %1, %1, %2
-    vaddps  %4, %4, %1
-    vmulps  %1, %4, %3
+%else
+    shufps %4, %1, %1, %5
+    xorps  %1, %1, %2
+    addps  %4, %4, %1
+    mulps  %1, %4, %3
+%endif
 %endmacro

 %macro BUTTERFLY2 4
@ -206,14 +191,11 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
    movss [outq+116], m6
 %endmacro

-%define BUTTERFLY  BUTTERFLY_AVX
-%define BUTTERFLY0 BUTTERFLY0_AVX
-
-INIT_YMM
+INIT_YMM avx
 SECTION_TEXT
 %if HAVE_AVX
 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
-cglobal dct32_float_avx, 2,3,8, out, in, tmp
+cglobal dct32_float, 2,3,8, out, in, tmp
    ; pass 1
    vmovaps     m4, [inq+0]
    vinsertf128 m5, m5, [inq+96], 1
@ -286,9 +268,6 @@ INIT_XMM
    RET
 %endif

-%define BUTTERFLY  BUTTERFLY_SSE
-%define BUTTERFLY0 BUTTERFLY0_SSE
-
 %if ARCH_X86_64
 %define SPILL SWAP
 %define UNSPILL SWAP
@ -411,10 +390,9 @@ INIT_XMM
 %endif


-INIT_XMM
-%macro DCT32_FUNC 1
 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
-cglobal dct32_float_%1, 2,3,16, out, in, tmp
+%macro DCT32_FUNC 0
+cglobal dct32_float, 2, 3, 16, out, in, tmp
    ; pass 1

    movaps      m0, [inq+0]
@ -498,18 +476,16 @@ cglobal dct32_float_%1, 2,3,16, out, in, tmp
    RET
 %endmacro

-%macro LOAD_INV_SSE 2
+%macro LOAD_INV 2
+%if cpuflag(sse2)
+    pshufd      %1, %2, 0x1b
+%elif cpuflag(sse)
    movaps      %1, %2
    shufps      %1, %1, 0x1b
+%endif
 %endmacro

-%define LOAD_INV LOAD_INV_SSE
-DCT32_FUNC sse
-
-%macro LOAD_INV_SSE2 2
-    pshufd      %1, %2, 0x1b
-%endmacro
-
-%define LOAD_INV LOAD_INV_SSE2
-%define BUTTERFLY0 BUTTERFLY0_SSE2
-DCT32_FUNC sse2
+INIT_XMM sse
+DCT32_FUNC
+INIT_XMM sse2
+DCT32_FUNC
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@ -3171,7 +3171,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
-                } else if (mm_flags & AV_CPU_FLAG_MMX2) {
+                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                    c->idct_put              = ff_idct_xvid_mmx2_put;
                    c->idct_add              = ff_idct_xvid_mmx2_add;
                    c->idct                  = ff_idct_xvid_mmx2;
@ -3187,7 +3187,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
        dsputil_init_mmx(c, avctx, mm_flags);
    }

-    if (mm_flags & AV_CPU_FLAG_MMX2)
+    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmx2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW)
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@ -388,12 +388,16 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to
    RET


-%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
+%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
 %%.loop:
+%if %2
    mova    m1, [srcq+wq]
+%else
+    movu    m1, [srcq+wq]
+%endif
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
@ -435,7 +439,7 @@ cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
-    ADD_HFYU_LEFT_LOOP 1
+    ADD_HFYU_LEFT_LOOP 1, 1

 INIT_XMM
 cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
@ -446,12 +450,14 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
-    jnz add_hfyu_left_prediction_ssse3.skip_prologue
+    jnz .src_unaligned
    test    dstq, 15
-    jnz .unaligned
-    ADD_HFYU_LEFT_LOOP 1
-.unaligned:
-    ADD_HFYU_LEFT_LOOP 0
+    jnz .dst_unaligned
+    ADD_HFYU_LEFT_LOOP 1, 1
+.dst_unaligned:
+    ADD_HFYU_LEFT_LOOP 0, 1
+.src_unaligned:
+    ADD_HFYU_LEFT_LOOP 0, 0


 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@ -1112,7 +1112,7 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
            (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) {
            if(mm_flags & AV_CPU_FLAG_SSE2){
                c->fdct = ff_fdct_sse2;
-            }else if(mm_flags & AV_CPU_FLAG_MMX2){
+            } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
@ -1145,8 +1145,7 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

-
-        if (mm_flags & AV_CPU_FLAG_MMX2) {
+        if (mm_flags & AV_CPU_FLAG_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

@ -1187,7 +1186,7 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;

-        if (mm_flags & AV_CPU_FLAG_MMX2) {
+        if (mm_flags & AV_CPU_FLAG_MMXEXT) {
            c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx2;
            c->hadamard8_diff[1] = ff_hadamard8_diff_mmx2;
        }
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@ -1041,7 +1041,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
    mova [r1+r5*8], m0
    mova [r1+r6*8], m2
    add    r4, 2
-    sub    r4, 2
+    sub    r3, 2
 %else
 %if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@ -198,7 +198,7 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
            }
        }

-        if (mm_flags & AV_CPU_FLAG_MMX2) {
+        if (mm_flags & AV_CPU_FLAG_MMXEXT) {
            h->pred16x16[HOR_PRED8x8            ] = ff_pred16x16_horizontal_mmx2;
            h->pred16x16[DC_PRED8x8             ] = ff_pred16x16_dc_mmx2;
            if (chroma_format_idc == 1)
@ -308,7 +308,7 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
            }
        }
    } else if (bit_depth == 10) {
-        if (mm_flags & AV_CPU_FLAG_MMX2) {
+        if (mm_flags & AV_CPU_FLAG_MMXEXT) {
            h->pred4x4[DC_PRED             ] = ff_pred4x4_dc_10_mmxext;
            h->pred4x4[HOR_UP_PRED         ] = ff_pred4x4_horizontal_up_10_mmxext;

--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@ -218,7 +218,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 #if HAVE_YASM
    int mm_flags = av_get_cpu_flags();

-    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2)
+    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMXEXT)
        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmx2;

    if (bit_depth == 8) {
@ -236,7 +236,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
            if (mm_flags & AV_CPU_FLAG_CMOV)
                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;

-            if (mm_flags & AV_CPU_FLAG_MMX2) {
+            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmx2;
                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2;
                c->h264_idct_add16   = ff_h264_idct_add16_8_mmx2;
@ -304,7 +304,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
        }
    } else if (bit_depth == 10) {
        if (mm_flags & AV_CPU_FLAG_MMX) {
-            if (mm_flags & AV_CPU_FLAG_MMX2) {
+            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
 #if ARCH_X86_32
                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmx2;
                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmx2;
--- a/libavcodec/x86/motion_est_mmx.c
+++ b/libavcodec/x86/motion_est_mmx.c
@ -444,7 +444,7 @@ void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
        c->sad[0]= sad16_mmx;
        c->sad[1]= sad8_mmx;
    }
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        c->pix_abs[0][0] = sad16_mmx2;
        c->pix_abs[1][0] = sad8_mmx2;

--- a/libavcodec/x86/mpegvideo_mmx.c
+++ b/libavcodec/x86/mpegvideo_mmx.c
@ -595,15 +595,15 @@ static void  denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
 #define HAVE_SSSE3 0

 #undef HAVE_SSE2
-#undef HAVE_MMX2
+#undef HAVE_MMXEXT
 #define HAVE_SSE2 0
-#define HAVE_MMX2 0
+#define HAVE_MMXEXT 0
 #define RENAME(a) a ## _MMX
 #define RENAMEl(a) a ## _mmx
 #include "mpegvideo_mmx_template.c"

-#undef HAVE_MMX2
-#define HAVE_MMX2 1
+#undef HAVE_MMXEXT
+#define HAVE_MMXEXT 1
 #undef RENAME
 #undef RENAMEl
 #define RENAME(a) a ## _MMX2
@ -660,7 +660,7 @@ void ff_MPV_common_init_mmx(MpegEncContext *s)
 #endif
            if(mm_flags & AV_CPU_FLAG_SSE2){
                s->dct_quantize= dct_quantize_SSE2;
-            } else if(mm_flags & AV_CPU_FLAG_MMX2){
+            } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                s->dct_quantize= dct_quantize_MMX2;
            } else {
                s->dct_quantize= dct_quantize_MMX;
--- a/libavcodec/x86/mpegvideo_mmx_template.c
+++ b/libavcodec/x86/mpegvideo_mmx_template.c
@ -48,7 +48,7 @@
 #define MMREG_WIDTH "8"
 #define MM "%%mm"
 #define MOVQ "movq"
-#if HAVE_MMX2
+#if HAVE_MMXEXT
 #define SPREADW(a) "pshufw $0, "a", "a" \n\t"
 #define PMAXW(a,b) "pmaxsw "a", "b"     \n\t"
 #define PMAX(a,b) \
--- a/libavcodec/x86/pngdsp-init.c
+++ b/libavcodec/x86/pngdsp-init.c
@ -41,7 +41,7 @@ void ff_pngdsp_init_x86(PNGDSPContext *dsp)
    if (flags & AV_CPU_FLAG_MMX)
        dsp->add_bytes_l2         = ff_add_bytes_l2_mmx;
 #endif
-    if (flags & AV_CPU_FLAG_MMX2)
+    if (flags & AV_CPU_FLAG_MMXEXT)
        dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmx2;
    if (flags & AV_CPU_FLAG_SSE2)
        dsp->add_bytes_l2         = ff_add_bytes_l2_sse2;
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@ -37,7 +37,7 @@ av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c, DSPContext *dsp)

    if (mm_flags & AV_CPU_FLAG_MMX)
        c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmx2;
        c->rv34_idct_add         = ff_rv34_idct_add_mmx2;
    }
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@ -204,7 +204,7 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
        QPEL_MC_SET(put_, _mmx)
 #endif
    }
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
        c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2;
        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx2;
--- a/libavcodec/x86/snowdsp_mmx.c
+++ b/libavcodec/x86/snowdsp_mmx.c
@ -889,7 +889,7 @@ void ff_dwt_init_x86(DWTContext *c)
            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
        }
        else{
-            if(mm_flags & AV_CPU_FLAG_MMX2){
+            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
 #if HAVE_7REGS
            c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@ -760,7 +760,7 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
            dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd;
    }

-    if (mm_flags & AV_CPU_FLAG_MMX2){
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2;
@ -810,7 +810,7 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
    if (mm_flags & AV_CPU_FLAG_MMX) {
    }

-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        ASSIGN_LF(mmx2);
    }

--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@ -49,7 +49,7 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
    }
 #endif

-    if (HAVE_MMX2 && cpuflags & AV_CPU_FLAG_MMX2) {
+    if (HAVE_MMXEXT && cpuflags & AV_CPU_FLAG_MMXEXT) {
        c->idct_dc_add = ff_vp3_idct_dc_add_mmx2;

        if (!(flags & CODEC_FLAG_BITEXACT)) {
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@ -350,7 +350,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)

    /* note that 4-tap width=16 functions are missing because w=16
     * is only used for luma, and luma is always a copy or sixtap. */
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        VP8_MC_FUNC(2, 4, mmx2);
        VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
 #if ARCH_X86_32
--- a/libavfilter/x86/gradfun.c
+++ b/libavfilter/x86/gradfun.c
@ -28,7 +28,7 @@
 DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
 DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};

-#if HAVE_MMX2
+#if HAVE_MMXEXT
 static void gradfun_filter_line_mmx2(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers)
 {
    intptr_t x;
@ -173,8 +173,8 @@ av_cold void ff_gradfun_init_x86(GradFunContext *gf)
    int cpu_flags = av_get_cpu_flags();

 #if HAVE_INLINE_ASM
-#if HAVE_MMX2
-    if (cpu_flags & AV_CPU_FLAG_MMX2)
+#if HAVE_MMXEXT
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
        gf->filter_line = gradfun_filter_line_mmx2;
 #endif
 #if HAVE_SSSE3
--- a/libavfilter/x86/yadif.c
+++ b/libavfilter/x86/yadif.c
@ -45,7 +45,7 @@ DECLARE_ASM_CONST(16, const xmm_reg, pw_1) = {0x0001000100010001ULL, 0x000100010
 #undef COMPILE_TEMPLATE_SSE
 #endif

-#if HAVE_MMX2
+#if HAVE_MMXEXT
 #undef RENAME
 #define RENAME(a) a ## _mmx2
 #include "yadif_template.c"
@ -58,8 +58,8 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
    int cpu_flags = av_get_cpu_flags();

 #if HAVE_INLINE_ASM
-#if HAVE_MMX2
-    if (cpu_flags & AV_CPU_FLAG_MMX2)
+#if HAVE_MMXEXT
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
        yadif->filter_line = yadif_filter_line_mmx2;
 #endif
 #if HAVE_SSE
--- a/libavresample/audio_mix.c
+++ b/libavresample/audio_mix.c
@ -314,7 +314,15 @@ int ff_audio_mix_init(AVAudioResampleContext *avr)
    }

    /* build matrix if the user did not already set one */
-    if (!avr->am->matrix) {
+    if (avr->am->matrix) {
+        if (avr->am->coeff_type != avr->mix_coeff_type      ||
+            avr->am->in_layout  != avr->in_channel_layout   ||
+            avr->am->out_layout != avr->out_channel_layout) {
+            av_log(avr, AV_LOG_ERROR,
+                   "Custom matrix does not match current parameters\n");
+            return AVERROR(EINVAL);
+        }
+    } else {
        int i, j;
        char in_layout_name[128];
        char out_layout_name[128];
--- a/libavresample/audio_mix_matrix.c
+++ b/libavresample/audio_mix_matrix.c
@ -294,8 +294,8 @@ int avresample_get_matrix(AVAudioResampleContext *avr, double *matrix,
    in_channels  = av_get_channel_layout_nb_channels(avr->in_channel_layout);
    out_channels = av_get_channel_layout_nb_channels(avr->out_channel_layout);

-    if ( in_channels < 0 ||  in_channels > AVRESAMPLE_MAX_CHANNELS ||
-        out_channels < 0 || out_channels > AVRESAMPLE_MAX_CHANNELS) {
+    if ( in_channels <= 0 ||  in_channels > AVRESAMPLE_MAX_CHANNELS ||
+        out_channels <= 0 || out_channels > AVRESAMPLE_MAX_CHANNELS) {
        av_log(avr, AV_LOG_ERROR, "Invalid channel layouts\n");
        return AVERROR(EINVAL);
    }
@ -332,6 +332,7 @@ int avresample_get_matrix(AVAudioResampleContext *avr, double *matrix,
        av_log(avr, AV_LOG_ERROR, "Invalid mix coeff type\n");
        return AVERROR(EINVAL);
    }
+
    return 0;
 }

@ -343,14 +344,16 @@ int avresample_set_matrix(AVAudioResampleContext *avr, const double *matrix,
    in_channels  = av_get_channel_layout_nb_channels(avr->in_channel_layout);
    out_channels = av_get_channel_layout_nb_channels(avr->out_channel_layout);

-    if ( in_channels < 0 ||  in_channels > AVRESAMPLE_MAX_CHANNELS ||
-        out_channels < 0 || out_channels > AVRESAMPLE_MAX_CHANNELS) {
+    if ( in_channels <= 0 ||  in_channels > AVRESAMPLE_MAX_CHANNELS ||
+        out_channels <= 0 || out_channels > AVRESAMPLE_MAX_CHANNELS) {
        av_log(avr, AV_LOG_ERROR, "Invalid channel layouts\n");
        return AVERROR(EINVAL);
    }

-    if (avr->am->matrix)
-        av_freep(avr->am->matrix);
+    if (avr->am->matrix) {
+        av_free(avr->am->matrix[0]);
+        avr->am->matrix = NULL;
+    }

 #define CONVERT_MATRIX(type, expr)                                          \
    avr->am->matrix_## type[0] = av_mallocz(out_channels * in_channels *    \
@ -386,5 +389,11 @@ int avresample_set_matrix(AVAudioResampleContext *avr, const double *matrix,
    /* TODO: detect situations where we can just swap around pointers
             instead of doing matrix multiplications with 0.0 and 1.0 */

+    /* set AudioMix params */
+    avr->am->in_layout    = avr->in_channel_layout;
+    avr->am->out_layout   = avr->out_channel_layout;
+    avr->am->in_channels  = in_channels;
+    avr->am->out_channels = out_channels;
+
    return 0;
 }
--- a/libavresample/utils.c
+++ b/libavresample/utils.c
@ -48,9 +48,8 @@ int avresample_open(AVAudioResampleContext *avr)
    avr->resample_channels = FFMIN(avr->in_channels, avr->out_channels);
    avr->downmix_needed    = avr->in_channels  > avr->out_channels;
    avr->upmix_needed      = avr->out_channels > avr->in_channels ||
-                             avr->am->matrix                      ||
-                             (avr->out_channels == avr->in_channels &&
-                              avr->in_channel_layout != avr->out_channel_layout);
+                             (!avr->downmix_needed && (avr->am->matrix ||
+                              avr->in_channel_layout != avr->out_channel_layout));
    avr->mixing_needed     = avr->downmix_needed || avr->upmix_needed;

    /* set resampling parameters */
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@ -49,10 +49,10 @@ void av_set_cpu_flags_mask(int mask)

 int av_parse_cpu_flags(const char *s)
 {
-#define CPUFLAG_MMX2     (AV_CPU_FLAG_MMX      | AV_CPU_FLAG_MMX2 | AV_CPU_FLAG_CMOV)
+#define CPUFLAG_MMXEXT   (AV_CPU_FLAG_MMX      | AV_CPU_FLAG_MMXEXT | AV_CPU_FLAG_CMOV)
 #define CPUFLAG_3DNOW    (AV_CPU_FLAG_3DNOW    | AV_CPU_FLAG_MMX)
 #define CPUFLAG_3DNOWEXT (AV_CPU_FLAG_3DNOWEXT | CPUFLAG_3DNOW)
-#define CPUFLAG_SSE      (AV_CPU_FLAG_SSE      | CPUFLAG_MMX2)
+#define CPUFLAG_SSE      (AV_CPU_FLAG_SSE      | CPUFLAG_MMXEXT)
 #define CPUFLAG_SSE2     (AV_CPU_FLAG_SSE2     | CPUFLAG_SSE)
 #define CPUFLAG_SSE2SLOW (AV_CPU_FLAG_SSE2SLOW | CPUFLAG_SSE2)
 #define CPUFLAG_SSE3     (AV_CPU_FLAG_SSE3     | CPUFLAG_SSE2)
@ -69,7 +69,7 @@ int av_parse_cpu_flags(const char *s)
        { "altivec" , NULL, 0, AV_OPT_TYPE_CONST, { AV_CPU_FLAG_ALTIVEC  },    .unit = "flags" },
 #elif ARCH_X86
        { "mmx"     , NULL, 0, AV_OPT_TYPE_CONST, { AV_CPU_FLAG_MMX      },    .unit = "flags" },
-        { "mmx2"    , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_MMX2         },    .unit = "flags" },
+        { "mmxext"  , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_MMXEXT       },    .unit = "flags" },
        { "sse"     , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE          },    .unit = "flags" },
        { "sse2"    , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE2         },    .unit = "flags" },
        { "sse2slow", NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE2SLOW     },    .unit = "flags" },
@ -174,7 +174,7 @@ static const struct {
    { AV_CPU_FLAG_ALTIVEC,   "altivec"    },
 #elif ARCH_X86
    { AV_CPU_FLAG_MMX,       "mmx"        },
-    { AV_CPU_FLAG_MMX2,      "mmx2"       },
+    { AV_CPU_FLAG_MMXEXT,    "mmxext"     },
    { AV_CPU_FLAG_SSE,       "sse"        },
    { AV_CPU_FLAG_SSE2,      "sse2"       },
    { AV_CPU_FLAG_SSE2SLOW,  "sse2(slow)" },
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@ -27,6 +27,7 @@

    /* lower 16 bits - CPU features */
 #define AV_CPU_FLAG_MMX          0x0001 ///< standard MMX
+#define AV_CPU_FLAG_MMXEXT       0x0002 ///< SSE integer functions or AMD MMX ext
 #define AV_CPU_FLAG_MMX2         0x0002 ///< SSE integer functions or AMD MMX ext
 #define AV_CPU_FLAG_3DNOW        0x0004 ///< AMD 3DNOW
 #define AV_CPU_FLAG_SSE          0x0008 ///< SSE functions
--- a/libavutil/utils.c
+++ b/libavutil/utils.c
@ -33,6 +33,7 @@ unsigned avutil_version(void)
    av_assert0(AVMEDIA_TYPE_ATTACHMENT == 4);
    av_assert0(AV_PICTURE_TYPE_BI == 7);
    av_assert0(LIBAVUTIL_VERSION_MICRO >= 100);
+    av_assert0(HAVE_MMX2 == HAVE_MMXEXT);

    return LIBAVUTIL_VERSION_INT;
 }
--- a/libavutil/version.h
+++ b/libavutil/version.h
@ -40,7 +40,7 @@

 #define LIBAVUTIL_VERSION_MAJOR 51
 #define LIBAVUTIL_VERSION_MINOR 66
-#define LIBAVUTIL_VERSION_MICRO 100
+#define LIBAVUTIL_VERSION_MICRO 101

 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
                                               LIBAVUTIL_VERSION_MINOR, \
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@ -122,7 +122,7 @@ int ff_get_cpu_flags_x86(void)
        if (std_caps & (1 << 23))
            rval |= AV_CPU_FLAG_MMX;
        if (std_caps & (1 << 25))
-            rval |= AV_CPU_FLAG_MMX2;
+            rval |= AV_CPU_FLAG_MMXEXT;
 #if HAVE_SSE
        if (std_caps & (1 << 25))
            rval |= AV_CPU_FLAG_SSE;
@ -159,7 +159,7 @@ int ff_get_cpu_flags_x86(void)
        if (ext_caps & (1 << 23))
            rval |= AV_CPU_FLAG_MMX;
        if (ext_caps & (1 << 22))
-            rval |= AV_CPU_FLAG_MMX2;
+            rval |= AV_CPU_FLAG_MMXEXT;

        /* Allow for selectively disabling SSE2 functions on AMD processors
           with SSE2 support but not SSE4a. This includes Athlon64, some
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@ -663,8 +663,8 @@ static int swScale(SwsContext *c, const uint8_t *src[],
    if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY - lastDstY, lastDstY, 255);

-#if HAVE_MMX2 && HAVE_INLINE_ASM
-    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
+#if HAVE_MMXEXT && HAVE_INLINE_ASM
+    if (av_get_cpu_flags() & AV_CPU_FLAG_MMXEXT)
        __asm__ volatile ("sfence" ::: "memory");
 #endif
    emms_c();
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@ -82,7 +82,10 @@ const char *swscale_license(void);
 * are only provided for API compatibility.
 */
 #define SWS_CPU_CAPS_MMX      0x80000000
+#define SWS_CPU_CAPS_MMXEXT   0x20000000
+#if LIBSWSCALE_VERSION_MAJOR < 3
 #define SWS_CPU_CAPS_MMX2     0x20000000
+#endif
 #define SWS_CPU_CAPS_3DNOW    0x40000000
 #define SWS_CPU_CAPS_ALTIVEC  0x10000000
 #define SWS_CPU_CAPS_BFIN     0x01000000
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@ -599,7 +599,7 @@ fail:
    return ret;
 }

-#if HAVE_MMX2 && HAVE_INLINE_ASM
+#if HAVE_MMXEXT && HAVE_INLINE_ASM
 static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode,
                           int16_t *filter, int32_t *filterPos, int numSplits)
 {
@ -762,7 +762,7 @@ static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode,

    return fragmentPos + 1;
 }
-#endif /* HAVE_MMX2 && HAVE_INLINE_ASM */
+#endif /* HAVE_MMXEXT && HAVE_INLINE_ASM */

 static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
 {
@ -1024,7 +1024,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
        c->srcBpc = 16;
    if (c->dstBpc == 16)
        dst_stride <<= 1;
-    if (HAVE_MMX2 && HAVE_INLINE_ASM && cpu_flags & AV_CPU_FLAG_MMX2 &&
+    if (HAVE_MMXEXT && HAVE_INLINE_ASM && cpu_flags & AV_CPU_FLAG_MMXEXT &&
        c->srcBpc == 8 && c->dstBpc <= 14) {
        c->canMMX2BeUsed = (dstW >= srcW && (dstW & 31) == 0 &&
                            (srcW & 15) == 0) ? 1 : 0;
@ -1063,7 +1063,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,

    /* precalculate horizontal scaler filter coefficients */
    {
-#if HAVE_MMX2 && HAVE_INLINE_ASM
+#if HAVE_MMXEXT && HAVE_INLINE_ASM
 // can't downscale !!!
        if (c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) {
            c->lumMmx2FilterCodeSize = initMMX2HScaler(dstW, c->lumXInc, NULL,
@ -1107,7 +1107,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
            mprotect(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize, PROT_EXEC | PROT_READ);
 #endif
        } else
-#endif /* HAVE_MMX2 && HAVE_INLINE_ASM */
+#endif /* HAVE_MMXEXT && HAVE_INLINE_ASM */
        {
            const int filterAlign =
                (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? 4 :
@ -1273,7 +1273,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
 #endif
               av_get_pix_fmt_name(dstFormat));

-        if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2)
+        if (HAVE_MMXEXT && cpu_flags & AV_CPU_FLAG_MMXEXT)
            av_log(c, AV_LOG_INFO, "using MMX2\n");
        else if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)
            av_log(c, AV_LOG_INFO, "using 3DNOW\n");
--- a/libswscale/version.h
+++ b/libswscale/version.h
@ -28,7 +28,7 @@

 #define LIBSWSCALE_VERSION_MAJOR 2
 #define LIBSWSCALE_VERSION_MINOR 1
-#define LIBSWSCALE_VERSION_MICRO 100
+#define LIBSWSCALE_VERSION_MICRO 101

 #define LIBSWSCALE_VERSION_INT  AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
                                               LIBSWSCALE_VERSION_MINOR, \
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@ -88,7 +88,7 @@ DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;

 //Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.

-#define COMPILE_TEMPLATE_MMX2 0
+#define COMPILE_TEMPLATE_MMXEXT 0
 #define COMPILE_TEMPLATE_AMD3DNOW 0
 #define COMPILE_TEMPLATE_SSE2 0

@ -99,8 +99,8 @@ DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;

 //MMX2 versions
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
-#define COMPILE_TEMPLATE_MMX2 1
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
 #define RENAME(a) a ## _MMX2
 #include "rgb2rgb_template.c"

@ -113,10 +113,10 @@ DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;

 //3DNOW versions
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
+#undef COMPILE_TEMPLATE_MMXEXT
 #undef COMPILE_TEMPLATE_SSE2
 #undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMX2 0
+#define COMPILE_TEMPLATE_MMXEXT 0
 #define COMPILE_TEMPLATE_SSE2 0
 #define COMPILE_TEMPLATE_AMD3DNOW 1
 #define RENAME(a) a ## _3DNOW
@ -140,7 +140,7 @@ av_cold void rgb2rgb_init_x86(void)
        rgb2rgb_init_MMX();
    if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)
        rgb2rgb_init_3DNOW();
-    if (HAVE_MMX2     && cpu_flags & AV_CPU_FLAG_MMX2)
+    if (HAVE_MMXEXT   && cpu_flags & AV_CPU_FLAG_MMXEXT)
        rgb2rgb_init_MMX2();
    if (HAVE_SSE      && cpu_flags & AV_CPU_FLAG_SSE2)
        rgb2rgb_init_SSE2();
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@ -35,7 +35,7 @@
 #if COMPILE_TEMPLATE_AMD3DNOW
 #define PREFETCH  "prefetch"
 #define PAVGB     "pavgusb"
-#elif COMPILE_TEMPLATE_MMX2
+#elif COMPILE_TEMPLATE_MMXEXT
 #define PREFETCH "prefetchnta"
 #define PAVGB     "pavgb"
 #else
@ -49,7 +49,7 @@
 #define EMMS     "emms"
 #endif

-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 #define MOVNTQ "movntq"
 #define SFENCE "sfence"
 #else
@ -1136,7 +1136,7 @@ static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst,
        PREFETCH"     32(%1, %0)        \n\t"
        "movq           (%1, %0), %%mm0 \n\t"
        "movq          8(%1, %0), %%mm1 \n\t"
-# if COMPILE_TEMPLATE_MMX2
+# if COMPILE_TEMPLATE_MMXEXT
        "pshufw      $177, %%mm0, %%mm3 \n\t"
        "pshufw      $177, %%mm1, %%mm5 \n\t"
        "pand       %%mm7, %%mm0        \n\t"
@ -1500,7 +1500,7 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 }
 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */

-#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
 {
    int x,y;
@ -1590,7 +1590,7 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWid
                     SFENCE"     \n\t"
                     :::"memory");
 }
-#endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */

 #if !COMPILE_TEMPLATE_AMD3DNOW
 /**
@ -1798,7 +1798,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
            "1:                                         \n\t"
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
            PREFETCH"    64(%1, %%"REG_d")              \n\t"
-#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
            "movq          (%0, %%"REG_d"), %%mm0       \n\t"
            "movq          (%1, %%"REG_d"), %%mm1       \n\t"
            "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
@ -1859,7 +1859,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
            "psraw                      $7, %%mm0       \n\t"

-#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
            "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
            "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
            "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
@ -2580,9 +2580,9 @@ static inline void RENAME(rgb2rgb_init)(void)
    yuyvtoyuv422       = RENAME(yuyvtoyuv422);
 #endif /* !COMPILE_TEMPLATE_SSE2 */

-#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
    planar2x           = RENAME(planar2x);
-#endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
    rgb24toyv12        = RENAME(rgb24toyv12);

    yuyvtoyuv420       = RENAME(yuyvtoyuv420);
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@ -74,16 +74,16 @@ DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
 //MMX versions
 #if HAVE_MMX
 #undef RENAME
-#define COMPILE_TEMPLATE_MMX2 0
+#define COMPILE_TEMPLATE_MMXEXT 0
 #define RENAME(a) a ## _MMX
 #include "swscale_template.c"
 #endif

 //MMX2 versions
-#if HAVE_MMX2
+#if HAVE_MMXEXT
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
-#define COMPILE_TEMPLATE_MMX2 1
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
 #define RENAME(a) a ## _MMX2
 #include "swscale_template.c"
 #endif
@ -375,8 +375,8 @@ av_cold void ff_sws_init_swScale_mmx(SwsContext *c)
 #if HAVE_INLINE_ASM
    if (cpu_flags & AV_CPU_FLAG_MMX)
        sws_init_swScale_MMX(c);
-#if HAVE_MMX2
-    if (cpu_flags & AV_CPU_FLAG_MMX2)
+#if HAVE_MMXEXT
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
        sws_init_swScale_MMX2(c);
    if (cpu_flags & AV_CPU_FLAG_SSE3){
        if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
@ -439,7 +439,7 @@ switch(c->dstBpc){ \
    if (cpu_flags & AV_CPU_FLAG_MMX) {
        ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
        ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
-        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMXEXT);

        switch (c->srcFormat) {
        case PIX_FMT_Y400A:
@ -471,7 +471,7 @@ switch(c->dstBpc){ \
            break;
        }
    }
-    if (cpu_flags & AV_CPU_FLAG_MMX2) {
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx2, , 1);
    }
 #endif
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@ -23,13 +23,13 @@
 #undef MOVNTQ2
 #undef PREFETCH

-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 #define PREFETCH "prefetchnta"
 #else
 #define PREFETCH  " # nop"
 #endif

-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
 #define MOVNTQ2 "movntq "
 #else
@ -38,7 +38,7 @@
 #endif
 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

-#if !COMPILE_TEMPLATE_MMX2
+#if !COMPILE_TEMPLATE_MMXEXT
 static av_always_inline void
 dither_8to16(const uint8_t *srcDither, int rot)
 {
@ -641,7 +641,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"

-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 #undef WRITEBGR24
 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
 #else
@ -1445,7 +1445,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
    }
 }

-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                 int dstWidth, const uint8_t *src,
                                 int srcW, int xInc)
@ -1627,7 +1627,7 @@ static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
        dst2[i] = src2[srcW-1]*128;
    }
 }
-#endif /* COMPILE_TEMPLATE_MMX2 */
+#endif /* COMPILE_TEMPLATE_MMXEXT */

 static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 {
@ -1691,17 +1691,17 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)

    if (c->srcBpc == 8 && c->dstBpc <= 14) {
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    } else {
-#endif /* COMPILE_TEMPLATE_MMX2 */
+#endif /* COMPILE_TEMPLATE_MMXEXT */
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
    }
-#endif /* COMPILE_TEMPLATE_MMX2 */
+#endif /* COMPILE_TEMPLATE_MMXEXT */
    }
 }
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@ -52,20 +52,20 @@ DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
 //MMX versions
 #if HAVE_MMX
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
-#define COMPILE_TEMPLATE_MMX2 0
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 0
 #define RENAME(a) a ## _MMX
 #include "yuv2rgb_template.c"
 #endif /* HAVE_MMX */

 //MMX2 versions
-#if HAVE_MMX2
+#if HAVE_MMXEXT
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
-#define COMPILE_TEMPLATE_MMX2 1
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
 #define RENAME(a) a ## _MMX2
 #include "yuv2rgb_template.c"
-#endif /* HAVE_MMX2 */
+#endif /* HAVE_MMXEXT */

 #endif /* HAVE_INLINE_ASM */

@ -74,8 +74,8 @@ av_cold SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
 #if HAVE_INLINE_ASM
    int cpu_flags = av_get_cpu_flags();

-#if HAVE_MMX2
-    if (cpu_flags & AV_CPU_FLAG_MMX2) {
+#if HAVE_MMXEXT
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
        switch (c->dstFormat) {
        case PIX_FMT_RGB24:  return yuv420_rgb24_MMX2;
        case PIX_FMT_BGR24:  return yuv420_bgr24_MMX2;
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@ -25,7 +25,7 @@
 #undef EMMS
 #undef SFENCE

-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 #define MOVNTQ "movntq"
 #define SFENCE "sfence"
 #else
@ -181,7 +181,7 @@
    "paddusb "GREEN_DITHER"(%4), %%mm2\n\t"      \
    "paddusb "RED_DITHER"(%4),   %%mm1\n\t"      \

-#if !COMPILE_TEMPLATE_MMX2
+#if !COMPILE_TEMPLATE_MMXEXT
 static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
                                       int srcSliceY, int srcSliceH,
@ -237,7 +237,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
    YUV2RGB_OPERANDS
    YUV2RGB_ENDFUNC
 }
-#endif /* !COMPILE_TEMPLATE_MMX2 */
+#endif /* !COMPILE_TEMPLATE_MMXEXT */

 #define RGB_PACK24(blue, red)\
    "packuswb  %%mm3,      %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
@ -254,7 +254,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
    "punpckhwd %%mm6,      %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
    RGB_PACK24_B

-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
 DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
 DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
 DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
@ -361,7 +361,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
    MOVNTQ "   %%mm5,       16(%1)\n\t"      \
    MOVNTQ "   %%mm"alpha", 24(%1)\n\t"      \

-#if !COMPILE_TEMPLATE_MMX2
+#if !COMPILE_TEMPLATE_MMXEXT
 static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
                                       int srcSliceY, int srcSliceH,
@ -448,4 +448,4 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
 }
 #endif

-#endif /* !COMPILE_TEMPLATE_MMX2 */
+#endif /* !COMPILE_TEMPLATE_MMXEXT */
--- a/tests/fate/lossless-video.mak
+++ b/tests/fate/lossless-video.mak
@ -1,3 +1,18 @@
+FATE_LAGARITH += fate-lagarith-rgb24
+fate-lagarith-rgb24: CMD = framecrc -i $(SAMPLES)/lagarith/lag-rgb24.avi
+
+FATE_LAGARITH += fate-lagarith-rgb32
+fate-lagarith-rgb32: CMD = framecrc -i $(SAMPLES)/lagarith/lag-rgb32.avi -pix_fmt bgra
+
+FATE_LAGARITH += fate-lagarith-yuy2
+fate-lagarith-yuy2: CMD = framecrc -i $(SAMPLES)/lagarith/lag-yuy2.avi
+
+FATE_LAGARITH += fate-lagarith-yv12
+fate-lagarith-yv12: CMD = framecrc -i $(SAMPLES)/lagarith/lag-yv12.avi
+
+FATE_SAMPLES_AVCONV += $(FATE_LAGARITH)
+fate-lagarith: $(FATE_LAGARITH)
+
 FATE_LOCO += fate-loco-rgb
 fate-loco-rgb: CMD = framecrc -i $(SAMPLES)/loco/pig-loco-rgb.avi

--- a/tests/ref/fate/lagarith-rgb24
+++ b/tests/ref/fate/lagarith-rgb24
@ -0,0 +1,5 @@
+#tb 0: 100/2997
+0,          0,          0,        1,   368640, 0x26f74db2
+0,          1,          1,        1,   368640, 0x63b29ea4
+0,          2,          2,        1,   368640, 0x19467f03
+0,          3,          3,        1,   368640, 0x5fdc3575
--- a/tests/ref/fate/lagarith-rgb32
+++ b/tests/ref/fate/lagarith-rgb32
@ -0,0 +1,26 @@
+#tb 0: 1001/24000
+0,          0,          0,        1,  1382400, 0x00000000
+0,          1,          1,        1,  1382400, 0x00000000
+0,          2,          2,        1,  1382400, 0x00000000
+0,          3,          3,        1,  1382400, 0x00000000
+0,          4,          4,        1,  1382400, 0x00000000
+0,          5,          5,        1,  1382400, 0xf95bde46
+0,          6,          6,        1,  1382400, 0x4f4c0393
+0,          7,          7,        1,  1382400, 0xe5aa40db
+0,          8,          8,        1,  1382400, 0xc25a8ba2
+0,          9,          9,        1,  1382400, 0x9db3150d
+0,         10,         10,        1,  1382400, 0x730e64b3
+0,         11,         11,        1,  1382400, 0xf8fd7edf
+0,         12,         12,        1,  1382400, 0x0114798a
+0,         13,         13,        1,  1382400, 0x7571210f
+0,         14,         14,        1,  1382400, 0x552ae59d
+0,         15,         15,        1,  1382400, 0x7ae0c946
+0,         16,         16,        1,  1382400, 0x0818c3ef
+0,         17,         17,        1,  1382400, 0x8257cac4
+0,         18,         18,        1,  1382400, 0x7762a979
+0,         19,         19,        1,  1382400, 0x282af57a
+0,         20,         20,        1,  1382400, 0x3f42de50
+0,         21,         21,        1,  1382400, 0xc42d5f93
+0,         22,         22,        1,  1382400, 0x18775c90
+0,         23,         23,        1,  1382400, 0x34befa90
+0,         24,         24,        1,  1382400, 0xd33d5f53
--- a/tests/ref/fate/lagarith-yuy2
+++ b/tests/ref/fate/lagarith-yuy2
@ -0,0 +1,2 @@
+#tb 0: 1/10
+0,          0,          0,        1,  1572864, 0xeed76a7d
--- a/tests/ref/fate/lagarith-yv12
+++ b/tests/ref/fate/lagarith-yv12
@ -0,0 +1,3 @@
+#tb 0: 1/60
+0,          0,          0,        1,    92160, 0x1dfdf5c1
+0,          1,          1,        1,    92160, 0x6965884f