From 8ad77b65b548a6b2f4707265ebd7e97f956acf0b Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <jason@x264.com>
Date: Tue, 10 May 2011 07:08:24 -0700
Subject: [PATCH 01/32] Update x86 H.264 deblock asm

Includes AVX versions from x264.
---
 libavcodec/x86/h264_deblock.asm | 391 ++++++++++++++++++--------------
 libavcodec/x86/h264dsp_mmx.c    |  56 +++--
 libavcodec/x86/x86util.asm      |  19 +-
 3 files changed, 275 insertions(+), 191 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 01778a45cb..081c0e1aef 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -1,10 +1,11 @@
 ;*****************************************************************************
-;* MMX/SSE2-optimized H.264 deblocking code
+;* MMX/SSE2/AVX-optimized H.264 deblocking code
 ;*****************************************************************************
-;* Copyright (C) 2005-2008 x264 project
+;* Copyright (C) 2005-2011 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Oskar Arvidsson <oskar@irock.se>
 ;*
 ;* This file is part of Libav.
 ;*
@@ -26,96 +27,135 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-SECTION_RODATA
+SECTION .text
 
 cextern pb_0
 cextern pb_1
 cextern pb_3
 cextern pb_A1
 
-SECTION .text
-
 ; expands to [base],...,[base+7*stride]
 %define PASS8ROWS(base, base3, stride, stride3) \
     [base], [base+stride], [base+stride*2], [base3], \
     [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
 
-; in: 8 rows of 4 bytes in %1..%8
+%define PASS8ROWS(base, base3, stride, stride3, offset) \
+    PASS8ROWS(base+offset, base3+offset, stride, stride3)
+
+; in: 8 rows of 4 bytes in %4..%11
 ; out: 4 rows of 8 bytes in m0..m3
-%macro TRANSPOSE4x8_LOAD 8
-    movd       m0, %1
-    movd       m2, %2
-    movd       m1, %3
-    movd       m3, %4
-    punpcklbw  m0, m2
-    punpcklbw  m1, m3
-    movq       m2, m0
-    punpcklwd  m0, m1
-    punpckhwd  m2, m1
+%macro TRANSPOSE4x8_LOAD 11
+    movh       m0, %4
+    movh       m2, %5
+    movh       m1, %6
+    movh       m3, %7
+    punpckl%1  m0, m2
+    punpckl%1  m1, m3
+    mova       m2, m0
+    punpckl%2  m0, m1
+    punpckh%2  m2, m1
 
-    movd       m4, %5
-    movd       m6, %6
-    movd       m5, %7
-    movd       m7, %8
-    punpcklbw  m4, m6
-    punpcklbw  m5, m7
-    movq       m6, m4
-    punpcklwd  m4, m5
-    punpckhwd  m6, m5
+    movh       m4, %8
+    movh       m6, %9
+    movh       m5, %10
+    movh       m7, %11
+    punpckl%1  m4, m6
+    punpckl%1  m5, m7
+    mova       m6, m4
+    punpckl%2  m4, m5
+    punpckh%2  m6, m5
 
-    movq       m1, m0
-    movq       m3, m2
-    punpckldq  m0, m4
-    punpckhdq  m1, m4
-    punpckldq  m2, m6
-    punpckhdq  m3, m6
+    punpckh%3  m1, m0, m4
+    punpckh%3  m3, m2, m6
+    punpckl%3  m0, m4
+    punpckl%3  m2, m6
 %endmacro
 
 ; in: 4 rows of 8 bytes in m0..m3
 ; out: 8 rows of 4 bytes in %1..%8
-%macro TRANSPOSE8x4_STORE 8
-    movq       m4, m0
-    movq       m5, m1
-    movq       m6, m2
-    punpckhdq  m4, m4
-    punpckhdq  m5, m5
-    punpckhdq  m6, m6
+%macro TRANSPOSE8x4B_STORE 8
+    punpckhdq  m4, m0, m0
+    punpckhdq  m5, m1, m1
+    punpckhdq  m6, m2, m2
 
     punpcklbw  m0, m1
     punpcklbw  m2, m3
-    movq       m1, m0
-    punpcklwd  m0, m2
-    punpckhwd  m1, m2
-    movd       %1, m0
-    punpckhdq  m0, m0
-    movd       %2, m0
-    movd       %3, m1
+    punpcklwd  m1, m0, m2
+    punpckhwd  m0, m2
+    movh       %1, m1
     punpckhdq  m1, m1
-    movd       %4, m1
+    movh       %2, m1
+    movh       %3, m0
+    punpckhdq  m0, m0
+    movh       %4, m0
 
     punpckhdq  m3, m3
     punpcklbw  m4, m5
     punpcklbw  m6, m3
-    movq       m5, m4
-    punpcklwd  m4, m6
-    punpckhwd  m5, m6
-    movd       %5, m4
-    punpckhdq  m4, m4
-    movd       %6, m4
-    movd       %7, m5
+    punpcklwd  m5, m4, m6
+    punpckhwd  m4, m6
+    movh       %5, m5
     punpckhdq  m5, m5
-    movd       %8, m5
+    movh       %6, m5
+    movh       %7, m4
+    punpckhdq  m4, m4
+    movh       %8, m4
+%endmacro
+
+%macro TRANSPOSE4x8B_LOAD 8
+    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
+%endmacro
+
+%macro TRANSPOSE4x8W_LOAD 8
+%if mmsize==16
+    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
+%else
+    SWAP  1, 4, 2, 3
+    mova  m0, [t5]
+    mova  m1, [t5+r1]
+    mova  m2, [t5+r1*2]
+    mova  m3, [t5+t6]
+    TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+%endmacro
+
+%macro TRANSPOSE8x2W_STORE 8
+    punpckhwd  m0, m1, m2
+    punpcklwd  m1, m2
+%if mmsize==8
+    movd       %3, m0
+    movd       %1, m1
+    psrlq      m1, 32
+    psrlq      m0, 32
+    movd       %2, m1
+    movd       %4, m0
+%else
+    movd       %5, m0
+    movd       %1, m1
+    psrldq     m1, 4
+    psrldq     m0, 4
+    movd       %2, m1
+    movd       %6, m0
+    psrldq     m1, 4
+    psrldq     m0, 4
+    movd       %3, m1
+    movd       %7, m0
+    psrldq     m1, 4
+    psrldq     m0, 4
+    movd       %4, m1
+    movd       %8, m0
+%endif
 %endmacro
 
 %macro SBUTTERFLY3 4
-    movq       %4, %2
+    punpckh%1  %4, %2, %3
     punpckl%1  %2, %3
-    punpckh%1  %4, %3
 %endmacro
 
 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
 %macro TRANSPOSE6x8_MEM 9
+    RESET_MM_PERMUTATION
     movq  m0, %1
     movq  m1, %2
     movq  m2, %3
@@ -123,30 +163,32 @@ SECTION .text
     movq  m4, %5
     movq  m5, %6
     movq  m6, %7
-    SBUTTERFLY3 bw, m0, m1, m7
-    SBUTTERFLY3 bw, m2, m3, m1
-    SBUTTERFLY3 bw, m4, m5, m3
-    movq  [%9+0x10], m1
-    SBUTTERFLY3 bw, m6, %8, m5
-    SBUTTERFLY3 wd, m0, m2, m1
-    SBUTTERFLY3 wd, m4, m6, m2
+    SBUTTERFLY bw, 0, 1, 7
+    SBUTTERFLY bw, 2, 3, 7
+    SBUTTERFLY bw, 4, 5, 7
+    movq  [%9+0x10], m3
+    SBUTTERFLY3 bw, m6, %8, m7
+    SBUTTERFLY wd, 0, 2, 3
+    SBUTTERFLY wd, 4, 6, 3
     punpckhdq m0, m4
     movq  [%9+0x00], m0
-    SBUTTERFLY3 wd, m7, [%9+0x10], m6
-    SBUTTERFLY3 wd, m3, m5, m4
-    SBUTTERFLY3 dq, m7, m3, m0
-    SBUTTERFLY3 dq, m1, m2, m5
-    punpckldq m6, m4
-    movq  [%9+0x10], m1
-    movq  [%9+0x20], m5
-    movq  [%9+0x30], m7
-    movq  [%9+0x40], m0
-    movq  [%9+0x50], m6
+    SBUTTERFLY3 wd, m1, [%9+0x10], m3
+    SBUTTERFLY wd, 5, 7, 0
+    SBUTTERFLY dq, 1, 5, 0
+    SBUTTERFLY dq, 2, 6, 0
+    punpckldq m3, m7
+    movq  [%9+0x10], m2
+    movq  [%9+0x20], m6
+    movq  [%9+0x30], m1
+    movq  [%9+0x40], m5
+    movq  [%9+0x50], m3
+    RESET_MM_PERMUTATION
 %endmacro
 
 ; in: 8 rows of 8 in %1..%8
 ; out: 8 rows of 8 in %9..%16
 %macro TRANSPOSE8x8_MEM 16
+    RESET_MM_PERMUTATION
     movq  m0, %1
     movq  m1, %2
     movq  m2, %3
@@ -154,38 +196,44 @@ SECTION .text
     movq  m4, %5
     movq  m5, %6
     movq  m6, %7
-    SBUTTERFLY3 bw, m0, m1, m7
-    SBUTTERFLY3 bw, m2, m3, m1
-    SBUTTERFLY3 bw, m4, m5, m3
-    SBUTTERFLY3 bw, m6, %8, m5
-    movq  %9,  m3
-    SBUTTERFLY3 wd, m0, m2, m3
-    SBUTTERFLY3 wd, m4, m6, m2
-    SBUTTERFLY3 wd, m7, m1, m6
-    movq  %11, m2
-    movq  m2,  %9
-    SBUTTERFLY3 wd, m2, m5, m1
-    SBUTTERFLY3 dq, m0, m4, m5
-    SBUTTERFLY3 dq, m7, m2, m4
+    SBUTTERFLY bw, 0, 1, 7
+    SBUTTERFLY bw, 2, 3, 7
+    SBUTTERFLY bw, 4, 5, 7
+    SBUTTERFLY3 bw, m6, %8, m7
+    movq  %9,  m5
+    SBUTTERFLY wd, 0, 2, 5
+    SBUTTERFLY wd, 4, 6, 5
+    SBUTTERFLY wd, 1, 3, 5
+    movq  %11, m6
+    movq  m6,  %9
+    SBUTTERFLY wd, 6, 7, 5
+    SBUTTERFLY dq, 0, 4, 5
+    SBUTTERFLY dq, 1, 6, 5
     movq  %9,  m0
-    movq  %10, m5
-    movq  %13, m7
-    movq  %14, m4
-    SBUTTERFLY3 dq, m3, %11, m0
-    SBUTTERFLY3 dq, m6, m1, m5
-    movq  %11, m3
+    movq  %10, m4
+    movq  %13, m1
+    movq  %14, m6
+    SBUTTERFLY3 dq, m2, %11, m0
+    SBUTTERFLY dq, 3, 7, 4
+    movq  %11, m2
     movq  %12, m0
-    movq  %15, m6
-    movq  %16, m5
+    movq  %15, m3
+    movq  %16, m7
+    RESET_MM_PERMUTATION
 %endmacro
 
 ; out: %4 = |%1-%2|>%3
 ; clobbers: %5
 %macro DIFF_GT 5
+%if avx_enabled == 0
     mova    %5, %2
     mova    %4, %1
     psubusb %5, %1
     psubusb %4, %2
+%else
+    psubusb %5, %2, %1
+    psubusb %4, %1, %2
+%endif
     por     %4, %5
     psubusb %4, %3
 %endmacro
@@ -193,32 +241,28 @@ SECTION .text
 ; out: %4 = |%1-%2|>%3
 ; clobbers: %5
 %macro DIFF_GT2 5
+%ifdef ARCH_X86_64
+    psubusb %5, %2, %1
+    psubusb %4, %1, %2
+%else
     mova    %5, %2
     mova    %4, %1
     psubusb %5, %1
     psubusb %4, %2
+%endif
     psubusb %5, %3
     psubusb %4, %3
     pcmpeqb %4, %5
 %endmacro
 
-%macro SPLATW 1
-%ifidn m0, xmm0
-    pshuflw  %1, %1, 0
-    punpcklqdq %1, %1
-%else
-    pshufw   %1, %1, 0
-%endif
-%endmacro
-
 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
 ; out: m5=beta-1, m7=mask, %3=alpha-1
 ; clobbers: m4,m6
 %macro LOAD_MASK 2-3
     movd     m4, %1
     movd     m5, %2
-    SPLATW   m4
-    SPLATW   m5
+    SPLATW   m4, m4
+    SPLATW   m5, m5
     packuswb m4, m4  ; 16x alpha-1
     packuswb m5, m5  ; 16x beta-1
 %if %0>2
@@ -237,8 +281,7 @@ SECTION .text
 ; out: m1=p0' m2=q0'
 ; clobbers: m0,3-6
 %macro DEBLOCK_P0_Q0 0
-    mova    m5, m1
-    pxor    m5, m2       ; p0^q0
+    pxor    m5, m1, m2   ; p0^q0
     pand    m5, [pb_1]   ; (p0^q0)&1
     pcmpeqb m4, m4
     pxor    m3, m4
@@ -264,14 +307,12 @@ SECTION .text
 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
 ; clobbers: q2, tmp, tc0
 %macro LUMA_Q1 6
-    mova    %6, m1
-    pavgb   %6, m2
+    pavgb   %6, m1, m2
     pavgb   %2, %6       ; avg(p2,avg(p0,q0))
     pxor    %6, %3
     pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
     psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
-    mova    %6, %1
-    psubusb %6, %5
+    psubusb %6, %1, %5
     paddusb %5, %1
     pmaxub  %2, %6
     pminub  %2, %5
@@ -280,10 +321,10 @@ SECTION .text
 
 %ifdef ARCH_X86_64
 ;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-INIT_XMM
-cglobal x264_deblock_v_luma_sse2, 5,5,10
+%macro DEBLOCK_LUMA 1
+cglobal deblock_v_luma_%1, 5,5,10
     movd    m8, [r4] ; tc0
     lea     r4, [r1*3]
     dec     r2d        ; alpha-1
@@ -307,8 +348,7 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10
     movdqa  m3, [r4] ; p2
     DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
     pand    m6, m9
-    mova    m7, m8
-    psubb   m7, m6
+    psubb   m7, m8, m6
     pand    m6, m8
     LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
 
@@ -326,10 +366,10 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10
     RET
 
 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_deblock_h_luma_sse2, 5,7
+cglobal deblock_h_luma_%1, 5,7
     movsxd r10, r1d
     lea    r11, [r10+r10*2]
     lea    r6,  [r0-4]
@@ -350,13 +390,13 @@ cglobal x264_deblock_h_luma_sse2, 5,7
 
     ; vertical filter
     ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+    ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
     lea    r0, [pix_tmp+0x30]
     mov    r1d, 0x10
 %ifdef WIN64
     mov    [rsp+0x20], r4
 %endif
-    call   x264_deblock_v_luma_sse2
+    call   deblock_v_luma_%1
 
     ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
     add    r6, 2
@@ -365,7 +405,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7
     movq   m1, [pix_tmp+0x28]
     movq   m2, [pix_tmp+0x38]
     movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
+    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r10, r11)
 
     shl    r10, 3
     sub    r6,  r10
@@ -375,7 +415,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7
     movq   m1, [pix_tmp+0x20]
     movq   m2, [pix_tmp+0x30]
     movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
+    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r10, r11)
 
 %ifdef WIN64
     add    rsp, 0x98
@@ -383,14 +423,20 @@ cglobal x264_deblock_h_luma_sse2, 5,7
     add    rsp, 0x68
 %endif
     RET
+%endmacro
+
+INIT_XMM
+DEBLOCK_LUMA sse2
+INIT_AVX
+DEBLOCK_LUMA avx
 
 %else
 
 %macro DEBLOCK_LUMA 3
 ;-----------------------------------------------------------------------------
-; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_%1, 5,5
+cglobal deblock_%2_luma_%1, 5,5
     lea     r4, [r1*3]
     dec     r2     ; alpha-1
     neg     r4
@@ -419,8 +465,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
     DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
     pand    m6, m4
     pand    m4, [esp+%3] ; tc
-    mova    m7, m4
-    psubb   m7, m6
+    psubb   m7, m4, m6
     pand    m6, m4
     LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
 
@@ -441,10 +486,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5
     RET
 
 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_deblock_h_luma_%1, 0,5
+cglobal deblock_h_luma_%1, 0,5
     mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]
@@ -467,11 +512,11 @@ cglobal x264_deblock_h_luma_%1, 0,5
     PUSH   dword r2m
     PUSH   dword 16
     PUSH   dword r0
-    call   x264_deblock_%2_luma_%1
+    call   deblock_%2_luma_%1
 %ifidn %2, v8
     add    dword [esp   ], 8 ; pix_tmp+0x38
     add    dword [esp+16], 2 ; tc0+2
-    call   x264_deblock_%2_luma_%1
+    call   deblock_%2_luma_%1
 %endif
     ADD    esp, 20
 
@@ -484,7 +529,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
     movq   m1, [pix_tmp+0x20]
     movq   m2, [pix_tmp+0x30]
     movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)
+    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)
 
     lea    r0, [r0+r3*8]
     lea    r1, [r1+r3*8]
@@ -492,7 +537,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
     movq   m1, [pix_tmp+0x28]
     movq   m2, [pix_tmp+0x38]
     movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)
+    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)
 
     ADD    esp, pad
     RET
@@ -502,22 +547,34 @@ INIT_MMX
 DEBLOCK_LUMA mmxext, v8, 8
 INIT_XMM
 DEBLOCK_LUMA sse2, v, 16
+INIT_AVX
+DEBLOCK_LUMA avx, v, 16
 
 %endif ; ARCH
 
 
 
 %macro LUMA_INTRA_P012 4 ; p0..p3 in memory
+%ifdef ARCH_X86_64
+    pavgb t0, p2, p1
+    pavgb t1, p0, q0
+%else
     mova  t0, p2
     mova  t1, p0
     pavgb t0, p1
     pavgb t1, q0
+%endif
     pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
     mova  t5, t1
+%ifdef ARCH_X86_64
+    paddb t2, p2, p1
+    paddb t3, p0, q0
+%else
     mova  t2, p2
     mova  t3, p0
     paddb t2, p1
     paddb t3, q0
+%endif
     paddb t2, t3
     mova  t3, t2
     mova  t4, t2
@@ -527,10 +584,15 @@ DEBLOCK_LUMA sse2, v, 16
     pand  t2, mpb_1
     psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
 
+%ifdef ARCH_X86_64
+    pavgb t1, p2, q1
+    psubb t2, p2, q1
+%else
     mova  t1, p2
     mova  t2, p2
     pavgb t1, q1
     psubb t2, q1
+%endif
     paddb t3, t3
     psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
     pand  t2, mpb_1
@@ -543,10 +605,8 @@ DEBLOCK_LUMA sse2, v, 16
     pand  t3, mpb_1
     psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
 
-    mova  t3, p0
-    mova  t2, p0
-    pxor  t3, q1
-    pavgb t2, q1
+    pxor  t3, p0, q1
+    pavgb t2, p0, q1
     pand  t3, mpb_1
     psubb t2, t3
     pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
@@ -560,9 +620,8 @@ DEBLOCK_LUMA sse2, v, 16
     mova  %1, t1 ; store p0
 
     mova  t1, %4 ; p3
-    mova  t2, t1
+    paddb t2, t1, p2
     pavgb t1, p2
-    paddb t2, p2
     pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
     paddb t2, t2
     paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
@@ -624,9 +683,9 @@ DEBLOCK_LUMA sse2, v, 16
 %endif
 
 ;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
+cglobal deblock_%2_luma_intra_%1, 4,6,16
 %ifndef ARCH_X86_64
     sub     esp, 0x60
 %endif
@@ -686,9 +745,9 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
 INIT_MMX
 %ifdef ARCH_X86_64
 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1, 4,7
+cglobal deblock_h_luma_intra_%1, 4,7
     movsxd r10, r1d
     lea    r11, [r10*3]
     lea    r6,  [r0-4]
@@ -704,7 +763,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7
 
     lea    r0,  [pix_tmp+0x40]
     mov    r1,  0x10
-    call   x264_deblock_v_luma_intra_%1
+    call   deblock_v_luma_intra_%1
 
     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
     lea    r5, [r6+r11]
@@ -717,7 +776,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7
     add    rsp, 0x88
     RET
 %else
-cglobal x264_deblock_h_luma_intra_%1, 2,4
+cglobal deblock_h_luma_intra_%1, 2,4
     lea    r3,  [r1*3]
     sub    r0,  4
     lea    r2,  [r0+r3]
@@ -736,10 +795,10 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
     PUSH   dword r2m
     PUSH   dword 16
     PUSH   r0
-    call   x264_deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_%1
 %ifidn %2, v8
     add    dword [rsp], 8 ; pix_tmp+8
-    call   x264_deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_%1
 %endif
     ADD    esp, 16
 
@@ -760,13 +819,13 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
 
 INIT_XMM
 DEBLOCK_LUMA_INTRA sse2, v
+INIT_AVX
+DEBLOCK_LUMA_INTRA avx , v
 %ifndef ARCH_X86_64
 INIT_MMX
 DEBLOCK_LUMA_INTRA mmxext, v8
 %endif
 
-
-
 INIT_MMX
 
 %macro CHROMA_V_START 0
@@ -790,23 +849,23 @@ INIT_MMX
 %define t6 r6
 
 ;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_mmxext, 5,6
+cglobal deblock_v_chroma_mmxext, 5,6
     CHROMA_V_START
     movq  m0, [t5]
     movq  m1, [t5+r1]
     movq  m2, [r0]
     movq  m3, [r0+r1]
-    call x264_chroma_inter_body_mmxext
+    call ff_chroma_inter_body_mmxext
     movq  [t5+r1], m1
     movq  [r0], m2
     RET
 
 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_mmxext, 5,7
+cglobal deblock_h_chroma_mmxext, 5,7
 %ifdef ARCH_X86_64
     %define buf0 [rsp-24]
     %define buf1 [rsp-16]
@@ -815,17 +874,17 @@ cglobal x264_deblock_h_chroma_mmxext, 5,7
     %define buf1 r2m
 %endif
     CHROMA_H_START
-    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
+    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     movq  buf0, m0
     movq  buf1, m3
-    call x264_chroma_inter_body_mmxext
+    call ff_chroma_inter_body_mmxext
     movq  m0, buf0
     movq  m3, buf1
-    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
     RET
 
 ALIGN 16
-x264_chroma_inter_body_mmxext:
+ff_chroma_inter_body_mmxext:
     LOAD_MASK  r2d, r3d
     movd       m6, [r4] ; tc0
     punpcklbw  m6, m6
@@ -850,31 +909,31 @@ x264_chroma_inter_body_mmxext:
 %define t6 r5
 
 ;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_mmxext, 4,5
     CHROMA_V_START
     movq  m0, [t5]
     movq  m1, [t5+r1]
     movq  m2, [r0]
     movq  m3, [r0+r1]
-    call x264_chroma_intra_body_mmxext
+    call ff_chroma_intra_body_mmxext
     movq  [t5+r1], m1
     movq  [r0], m2
     RET
 
 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_mmxext, 4,6
     CHROMA_H_START
-    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
-    call x264_chroma_intra_body_mmxext
-    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
+    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+    call ff_chroma_intra_body_mmxext
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
     RET
 
 ALIGN 16
-x264_chroma_intra_body_mmxext:
+ff_chroma_intra_body_mmxext:
     LOAD_MASK r2d, r3d
     movq   m5, m1
     movq   m6, m2
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 3a783a39ab..7d27c02ea2 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -219,11 +219,11 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
 }
 
 #define LF_FUNC(DIR, TYPE, OPT) \
-void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
-                                               int alpha, int beta, int8_t *tc0);
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
+                                                  int alpha, int beta, int8_t *tc0);
 #define LF_IFUNC(DIR, TYPE, OPT) \
-void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
-                                               int alpha, int beta);
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
+                                                  int alpha, int beta);
 
 LF_FUNC (h,  chroma,       mmxext)
 LF_IFUNC(h,  chroma_intra, mmxext)
@@ -234,18 +234,18 @@ LF_FUNC (h,  luma,         mmxext)
 LF_IFUNC(h,  luma_intra,   mmxext)
 #if HAVE_YASM && ARCH_X86_32
 LF_FUNC (v8, luma,         mmxext)
-static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
     if((tc0[0] & tc0[1]) >= 0)
-        ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
+        ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
     if((tc0[2] & tc0[3]) >= 0)
-        ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
+        ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
 }
 LF_IFUNC(v8, luma_intra,   mmxext)
-static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
+static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
 {
-    ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
-    ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
 }
 #endif
 
@@ -253,6 +253,10 @@ LF_FUNC (h,  luma,         sse2)
 LF_IFUNC(h,  luma_intra,   sse2)
 LF_FUNC (v,  luma,         sse2)
 LF_IFUNC(v,  luma_intra,   sse2)
+LF_FUNC (h,  luma,         avx)
+LF_IFUNC(h,  luma_intra,   avx)
+LF_FUNC (v,  luma,         avx)
+LF_IFUNC(v,  luma_intra,   avx)
 
 /***********************************/
 /* weighted prediction */
@@ -314,15 +318,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
             c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
             c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
 
-            c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext;
-            c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext;
-            c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext;
-            c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext;
+            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_mmxext;
+            c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_mmxext;
+            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_mmxext;
+            c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_mmxext;
 #if ARCH_X86_32
-            c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext;
-            c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext;
-            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
-            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
+            c->h264_v_loop_filter_luma= ff_deblock_v_luma_mmxext;
+            c->h264_h_loop_filter_luma= ff_deblock_h_luma_mmxext;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext;
 #endif
             c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
             c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
@@ -360,10 +364,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
                 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
 
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
-                c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
-                c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
-                c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_sse2;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_sse2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2;
 #endif
 
                 c->h264_idct_add16 = ff_h264_idct_add16_sse2;
@@ -377,6 +381,14 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
                 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
                 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
             }
+            if (mm_flags&AV_CPU_FLAG_AVX) {
+#if HAVE_ALIGNED_STACK
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_avx;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_avx;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx;
+#endif
+            }
         }
     }
 #endif
diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm
index f731616270..28baf7a96a 100644
--- a/libavcodec/x86/x86util.asm
+++ b/libavcodec/x86/x86util.asm
@@ -24,16 +24,20 @@
 ;******************************************************************************
 
 %macro SBUTTERFLY 4
+%if avx_enabled == 0
     mova      m%4, m%2
     punpckl%1 m%2, m%3
     punpckh%1 m%4, m%3
+%else
+    punpckh%1 m%4, m%2, m%3
+    punpckl%1 m%2, m%3
+%endif
     SWAP %3, %4
 %endmacro
 
 %macro SBUTTERFLY2 4
-    mova      m%4, m%2
-    punpckh%1 m%2, m%3
-    punpckl%1 m%4, m%3
+    punpckl%1 m%4, m%2, m%3
+    punpckh%1 m%2, m%2, m%3
     SWAP %2, %4, %3
 %endmacro
 
@@ -444,3 +448,12 @@
 %macro PMINUB_MMXEXT 3 ; dst, src, ignored
     pminub   %1, %2
 %endmacro
+
+%macro SPLATW 2-3 0
+%if mmsize == 16
+    pshuflw    %1, %2, (%3)*0x55
+    punpcklqdq %1, %1
+%else
+    pshufw     %1, %2, (%3)*0x55
+%endif
+%endmacro

From 9f3d6ca4f16e9b1f6f89424e9d946bb3a6a40d91 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <jason@x264.com>
Date: Tue, 10 May 2011 08:55:12 -0700
Subject: [PATCH 02/32] Port x86 10-bit H.264 deblock asm from x264

---
 libavcodec/x86/Makefile               |   1 +
 libavcodec/x86/dsputil_mmx.c          |   1 +
 libavcodec/x86/h264_deblock.asm       |  34 +-
 libavcodec/x86/h264_deblock_10bit.asm | 804 ++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_mmx.c          | 125 ++--
 libavcodec/x86/x86util.asm            |   5 +
 6 files changed, 907 insertions(+), 63 deletions(-)
 create mode 100644 libavcodec/x86/h264_deblock_10bit.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index f8d456d3ea..1cde9517a5 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -9,6 +9,7 @@ YASM-OBJS-$(CONFIG_FFT)                += x86/fft_mmx.o                 \
 
 MMX-OBJS-$(CONFIG_H264DSP)             += x86/h264dsp_mmx.o
 YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o            \
+                                          x86/h264_deblock_10bit.o      \
                                           x86/h264_weight.o             \
                                           x86/h264_idct.o               \
 
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index a0cb11aa40..1cc6991666 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -43,6 +43,7 @@ DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
 {0x8000000080000000ULL, 0x8000000080000000ULL};
 
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_1  ) = 0x0001000100010001ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2  ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 081c0e1aef..37866812e7 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -324,7 +324,7 @@ cextern pb_A1
 ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 %macro DEBLOCK_LUMA 1
-cglobal deblock_v_luma_%1, 5,5,10
+cglobal deblock_v_luma_8_%1, 5,5,10
     movd    m8, [r4] ; tc0
     lea     r4, [r1*3]
     dec     r2d        ; alpha-1
@@ -369,7 +369,7 @@ cglobal deblock_v_luma_%1, 5,5,10
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal deblock_h_luma_%1, 5,7
+cglobal deblock_h_luma_8_%1, 5,7
     movsxd r10, r1d
     lea    r11, [r10+r10*2]
     lea    r6,  [r0-4]
@@ -396,7 +396,7 @@ cglobal deblock_h_luma_%1, 5,7
 %ifdef WIN64
     mov    [rsp+0x20], r4
 %endif
-    call   deblock_v_luma_%1
+    call   deblock_v_luma_8_%1
 
     ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
     add    r6, 2
@@ -436,7 +436,7 @@ DEBLOCK_LUMA avx
 ;-----------------------------------------------------------------------------
 ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%2_luma_%1, 5,5
+cglobal deblock_%2_luma_8_%1, 5,5
     lea     r4, [r1*3]
     dec     r2     ; alpha-1
     neg     r4
@@ -489,7 +489,7 @@ cglobal deblock_%2_luma_%1, 5,5
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal deblock_h_luma_%1, 0,5
+cglobal deblock_h_luma_8_%1, 0,5
     mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]
@@ -512,11 +512,11 @@ cglobal deblock_h_luma_%1, 0,5
     PUSH   dword r2m
     PUSH   dword 16
     PUSH   dword r0
-    call   deblock_%2_luma_%1
+    call   deblock_%2_luma_8_%1
 %ifidn %2, v8
     add    dword [esp   ], 8 ; pix_tmp+0x38
     add    dword [esp+16], 2 ; tc0+2
-    call   deblock_%2_luma_%1
+    call   deblock_%2_luma_8_%1
 %endif
     ADD    esp, 20
 
@@ -685,7 +685,7 @@ DEBLOCK_LUMA avx, v, 16
 ;-----------------------------------------------------------------------------
 ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%2_luma_intra_%1, 4,6,16
+cglobal deblock_%2_luma_intra_8_%1, 4,6,16
 %ifndef ARCH_X86_64
     sub     esp, 0x60
 %endif
@@ -747,7 +747,7 @@ INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_%1, 4,7
+cglobal deblock_h_luma_intra_8_%1, 4,7
     movsxd r10, r1d
     lea    r11, [r10*3]
     lea    r6,  [r0-4]
@@ -763,7 +763,7 @@ cglobal deblock_h_luma_intra_%1, 4,7
 
     lea    r0,  [pix_tmp+0x40]
     mov    r1,  0x10
-    call   deblock_v_luma_intra_%1
+    call   deblock_v_luma_intra_8_%1
 
     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
     lea    r5, [r6+r11]
@@ -776,7 +776,7 @@ cglobal deblock_h_luma_intra_%1, 4,7
     add    rsp, 0x88
     RET
 %else
-cglobal deblock_h_luma_intra_%1, 2,4
+cglobal deblock_h_luma_intra_8_%1, 2,4
     lea    r3,  [r1*3]
     sub    r0,  4
     lea    r2,  [r0+r3]
@@ -795,10 +795,10 @@ cglobal deblock_h_luma_intra_%1, 2,4
     PUSH   dword r2m
     PUSH   dword 16
     PUSH   r0
-    call   deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_8_%1
 %ifidn %2, v8
     add    dword [rsp], 8 ; pix_tmp+8
-    call   deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_8_%1
 %endif
     ADD    esp, 16
 
@@ -851,7 +851,7 @@ INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_mmxext, 5,6
+cglobal deblock_v_chroma_8_mmxext, 5,6
     CHROMA_V_START
     movq  m0, [t5]
     movq  m1, [t5+r1]
@@ -865,7 +865,7 @@ cglobal deblock_v_chroma_mmxext, 5,6
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_mmxext, 5,7
+cglobal deblock_h_chroma_8_mmxext, 5,7
 %ifdef ARCH_X86_64
     %define buf0 [rsp-24]
     %define buf1 [rsp-16]
@@ -911,7 +911,7 @@ ff_chroma_inter_body_mmxext:
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_8_mmxext, 4,5
     CHROMA_V_START
     movq  m0, [t5]
     movq  m1, [t5+r1]
@@ -925,7 +925,7 @@ cglobal deblock_v_chroma_intra_mmxext, 4,5
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_8_mmxext, 4,6
     CHROMA_H_START
     TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     call ff_chroma_intra_body_mmxext
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
new file mode 100644
index 0000000000..402ed9bfac
--- /dev/null
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -0,0 +1,804 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Oskar Arvidsson <oskar@irock.se>
+;*          Loren Merritt <lorenm@u.washington.edu>
+;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA
+
+pw_pixel_max: times 8 dw ((1 << 10)-1)
+
+SECTION .text
+
+cextern pw_2
+cextern pw_4
+
+; out: %4 = |%1-%2|-%3
+; clobbers: %5
+%macro ABS_SUB 5
+    psubusw %5, %2, %1
+    psubusw %4, %1, %2
+    por     %4, %5
+    psubw   %4, %3
+%endmacro
+
+; out: %4 = |%1-%2|<%3
+%macro DIFF_LT   5
+    psubusw %4, %2, %1
+    psubusw %5, %1, %2
+    por     %5, %4 ; |%1-%2|
+    pxor    %4, %4
+    psubw   %5, %3 ; |%1-%2|-%3
+    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
+%endmacro
+
+%macro LOAD_AB 4
+    movd       %1, %3
+    movd       %2, %4
+    SPLATW     %1, %1
+    SPLATW     %2, %2
+%endmacro
+
+; in:  %2=tc reg
+; out: %1=splatted tc
+%macro LOAD_TC 2
+    movd        %1, [%2]
+    punpcklbw   %1, %1
+%if mmsize == 8
+    pshufw      %1, %1, 0
+%else
+    pshuflw     %1, %1, 01010000b
+    pshufd      %1, %1, 01010000b
+%endif
+    psraw       %1, 6
+%endmacro
+
+; in: %1=p1, %2=p0, %3=q0, %4=q1
+;     %5=alpha, %6=beta, %7-%9=tmp
+; out: %7=mask
+%macro LOAD_MASK 9
+    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
+    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
+    pand        %8, %9
+    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
+    pxor        %7, %7
+    pand        %8, %9
+    pcmpgtw     %7, %8
+%endmacro
+
+; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
+; out: %1=p0', m2=q0'
+%macro DEBLOCK_P0_Q0 7
+    psubw   %3, %4
+    pxor    %7, %7
+    paddw   %3, [pw_4]
+    psubw   %7, %5
+    psubw   %6, %2, %1
+    psllw   %6, 2
+    paddw   %3, %6
+    psraw   %3, 3
+    mova    %6, [pw_pixel_max]
+    CLIPW   %3, %7, %5
+    pxor    %7, %7
+    paddw   %1, %3
+    psubw   %2, %3
+    CLIPW   %1, %7, %6
+    CLIPW   %2, %7, %6
+%endmacro
+
+; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
+%macro LUMA_Q1 6
+    pavgw       %6, %3, %4      ; (p0+q0+1)>>1
+    paddw       %1, %6
+    pxor        %6, %6
+    psraw       %1, 1
+    psubw       %6, %5
+    psubw       %1, %2
+    CLIPW       %1, %6, %5
+    paddw       %1, %2
+%endmacro
+
+%macro LUMA_DEBLOCK_ONE 3
+    DIFF_LT     m5, %1, bm, m4, m6
+    pxor        m6, m6
+    mova        %3, m4
+    pcmpgtw     m6, tcm
+    pand        m4, tcm
+    pandn       m6, m7
+    pand        m4, m6
+    LUMA_Q1 m5, %2, m1, m2, m4, m6
+%endmacro
+
+%macro LUMA_H_STORE 2
+%if mmsize == 8
+    movq        [r0-4], m0
+    movq        [r0+r1-4], m1
+    movq        [r0+r1*2-4], m2
+    movq        [r0+%2-4], m3
+%else
+    movq        [r0-4], m0
+    movhps      [r0+r1-4], m0
+    movq        [r0+r1*2-4], m1
+    movhps      [%1-4], m1
+    movq        [%1+r1-4], m2
+    movhps      [%1+r1*2-4], m2
+    movq        [%1+%2-4], m3
+    movhps      [%1+r1*4-4], m3
+%endif
+%endmacro
+
+%macro DEBLOCK_LUMA 1
+;-----------------------------------------------------------------------------
+; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
+    %assign pad 5*mmsize+12-(stack_offset&15)
+    %define tcm [rsp]
+    %define ms1 [rsp+mmsize]
+    %define ms2 [rsp+mmsize*2]
+    %define am  [rsp+mmsize*3]
+    %define bm  [rsp+mmsize*4]
+    SUB        rsp, pad
+    shl        r2d, 2
+    shl        r3d, 2
+    LOAD_AB     m4, m5, r2, r3
+    mov         r3, 32/mmsize
+    mov         r2, r0
+    sub         r0, r1
+    mova        am, m4
+    sub         r0, r1
+    mova        bm, m5
+    sub         r0, r1
+.loop:
+    mova        m0, [r0+r1]
+    mova        m1, [r0+r1*2]
+    mova        m2, [r2]
+    mova        m3, [r2+r1]
+
+    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
+    LOAD_TC     m6, r4
+    mova       tcm, m6
+
+    mova        m5, [r0]
+    LUMA_DEBLOCK_ONE m1, m0, ms1
+    mova   [r0+r1], m5
+
+    mova        m5, [r2+r1*2]
+    LUMA_DEBLOCK_ONE m2, m3, ms2
+    mova   [r2+r1], m5
+
+    pxor        m5, m5
+    mova        m6, tcm
+    pcmpgtw     m5, tcm
+    psubw       m6, ms1
+    pandn       m5, m7
+    psubw       m6, ms2
+    pand        m5, m6
+    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
+    mova [r0+r1*2], m1
+    mova      [r2], m2
+
+    add         r0, mmsize
+    add         r2, mmsize
+    add         r4, mmsize/8
+    dec         r3
+    jg .loop
+    ADD         rsp, pad
+    RET
+
+cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
+    %assign pad 7*mmsize+12-(stack_offset&15)
+    %define tcm [rsp]
+    %define ms1 [rsp+mmsize]
+    %define ms2 [rsp+mmsize*2]
+    %define p1m [rsp+mmsize*3]
+    %define p2m [rsp+mmsize*4]
+    %define am  [rsp+mmsize*5]
+    %define bm  [rsp+mmsize*6]
+    SUB        rsp, pad
+    shl        r2d, 2
+    shl        r3d, 2
+    LOAD_AB     m4, m5, r2, r3
+    mov         r3, r1
+    mova        am, m4
+    add         r3, r1
+    mov         r5, 32/mmsize
+    mova        bm, m5
+    add         r3, r1
+%if mmsize == 16
+    mov         r2, r0
+    add         r2, r3
+%endif
+.loop:
+%if mmsize == 8
+    movq        m2, [r0-8]     ; y q2 q1 q0
+    movq        m7, [r0+0]
+    movq        m5, [r0+r1-8]
+    movq        m3, [r0+r1+0]
+    movq        m0, [r0+r1*2-8]
+    movq        m6, [r0+r1*2+0]
+    movq        m1, [r0+r3-8]
+    TRANSPOSE4x4W 2, 5, 0, 1, 4
+    SWAP         2, 7
+    movq        m7, [r0+r3]
+    TRANSPOSE4x4W 2, 3, 6, 7, 4
+%else
+    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
+    movu        m0, [r0+r1-8]
+    movu        m2, [r0+r1*2-8]
+    movu        m3, [r2-8]
+    TRANSPOSE4x4W 5, 0, 2, 3, 6
+    mova       tcm, m3
+
+    movu        m4, [r2+r1-8]
+    movu        m1, [r2+r1*2-8]
+    movu        m3, [r2+r3-8]
+    movu        m7, [r2+r1*4-8]
+    TRANSPOSE4x4W 4, 1, 3, 7, 6
+
+    mova        m6, tcm
+    punpcklqdq  m6, m7
+    punpckhqdq  m5, m4
+    SBUTTERFLY qdq, 0, 1, 7
+    SBUTTERFLY qdq, 2, 3, 7
+%endif
+
+    mova       p2m, m6
+    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
+    LOAD_TC     m6, r4
+    mova       tcm, m6
+
+    LUMA_DEBLOCK_ONE m1, m0, ms1
+    mova       p1m, m5
+
+    mova        m5, p2m
+    LUMA_DEBLOCK_ONE m2, m3, ms2
+    mova       p2m, m5
+
+    pxor        m5, m5
+    mova        m6, tcm
+    pcmpgtw     m5, tcm
+    psubw       m6, ms1
+    pandn       m5, m7
+    psubw       m6, ms2
+    pand        m5, m6
+    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
+    mova        m0, p1m
+    mova        m3, p2m
+    TRANSPOSE4x4W 0, 1, 2, 3, 4
+    LUMA_H_STORE r2, r3
+
+    add         r4, mmsize/8
+    lea         r0, [r0+r1*(mmsize/2)]
+    lea         r2, [r2+r1*(mmsize/2)]
+    dec         r5
+    jg .loop
+    ADD        rsp, pad
+    RET
+%endmacro
+
+INIT_XMM
+%ifdef ARCH_X86_64
+; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
+;      m12=alpha, m13=beta
+; out: m0=p1', m3=q1', m1=p0', m2=q0'
+; clobbers: m4, m5, m6, m7, m10, m11, m14
+%macro DEBLOCK_LUMA_INTER_SSE2 0
+    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
+    LOAD_TC     m6, r4
+    DIFF_LT     m8, m1, m13, m10, m4
+    DIFF_LT     m9, m2, m13, m11, m4
+    pand        m6, m7
+
+    mova       m14, m6
+    pxor        m4, m4
+    pcmpgtw     m6, m4
+    pand        m6, m14
+
+    mova        m5, m10
+    pand        m5, m6
+    LUMA_Q1 m8, m0, m1, m2, m5, m4
+
+    mova        m5, m11
+    pand        m5, m6
+    LUMA_Q1 m9, m3, m1, m2, m5, m4
+
+    pxor        m4, m4
+    psubw       m6, m10
+    pcmpgtw     m4, m14
+    pandn       m4, m7
+    psubw       m6, m11
+    pand        m4, m6
+    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
+
+    SWAP         0, 8
+    SWAP         3, 9
+%endmacro
+
+%macro DEBLOCK_LUMA_64 1
+cglobal deblock_v_luma_10_%1, 5,5,15
+    %define p2 m8
+    %define p1 m0
+    %define p0 m1
+    %define q0 m2
+    %define q1 m3
+    %define q2 m9
+    %define mask0 m7
+    %define mask1 m10
+    %define mask2 m11
+    shl        r2d, 2
+    shl        r3d, 2
+    LOAD_AB    m12, m13, r2, r3
+    mov         r2, r0
+    sub         r0, r1
+    sub         r0, r1
+    sub         r0, r1
+    mov         r3, 2
+.loop:
+    mova        p2, [r0]
+    mova        p1, [r0+r1]
+    mova        p0, [r0+r1*2]
+    mova        q0, [r2]
+    mova        q1, [r2+r1]
+    mova        q2, [r2+r1*2]
+    DEBLOCK_LUMA_INTER_SSE2
+    mova   [r0+r1], p1
+    mova [r0+r1*2], p0
+    mova      [r2], q0
+    mova   [r2+r1], q1
+    add         r0, mmsize
+    add         r2, mmsize
+    add         r4, 2
+    dec         r3
+    jg .loop
+    REP_RET
+
+cglobal deblock_h_luma_10_%1, 5,7,15
+    shl        r2d, 2
+    shl        r3d, 2
+    LOAD_AB    m12, m13, r2, r3
+    mov         r2, r1
+    add         r2, r1
+    add         r2, r1
+    mov         r5, r0
+    add         r5, r2
+    mov         r6, 2
+.loop:
+    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
+    movu        m0, [r0+r1-8]
+    movu        m2, [r0+r1*2-8]
+    movu        m9, [r5-8]
+    movu        m5, [r5+r1-8]
+    movu        m1, [r5+r1*2-8]
+    movu        m3, [r5+r2-8]
+    movu        m7, [r5+r1*4-8]
+
+    TRANSPOSE4x4W 8, 0, 2, 9, 10
+    TRANSPOSE4x4W 5, 1, 3, 7, 10
+
+    punpckhqdq  m8, m5
+    SBUTTERFLY qdq, 0, 1, 10
+    SBUTTERFLY qdq, 2, 3, 10
+    punpcklqdq  m9, m7
+
+    DEBLOCK_LUMA_INTER_SSE2
+
+    TRANSPOSE4x4W 0, 1, 2, 3, 4
+    LUMA_H_STORE r5, r2
+    add         r4, 2
+    lea         r0, [r0+r1*8]
+    lea         r5, [r5+r1*8]
+    dec         r6
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM
+DEBLOCK_LUMA_64 sse2
+INIT_AVX
+DEBLOCK_LUMA_64 avx
+%endif
+
+%macro SWAPMOVA 2
+%ifid %1
+    SWAP %1, %2
+%else
+    mova %1, %2
+%endif
+%endmacro
+
+; in: t0-t2: tmp registers
+;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
+;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
+%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
+%ifdef ARCH_X86_64
+    paddw     t0, %3, %2
+    mova      t2, %4
+    paddw     t2, %3
+%else
+    mova      t0, %3
+    mova      t2, %4
+    paddw     t0, %2
+    paddw     t2, %3
+%endif
+    paddw     t0, %1
+    paddw     t2, t2
+    paddw     t0, %5
+    paddw     t2, %9
+    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
+    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
+
+    psrlw     t2, 3
+    psrlw     t1, t0, 2
+    psubw     t2, %3
+    psubw     t1, %2
+    pand      t2, %8
+    pand      t1, %8
+    paddw     t2, %3
+    paddw     t1, %2
+    SWAPMOVA %11, t1
+
+    psubw     t1, t0, %3
+    paddw     t0, t0
+    psubw     t1, %5
+    psubw     t0, %3
+    paddw     t1, %6
+    paddw     t1, %2
+    paddw     t0, %6
+    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
+    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
+
+    pxor      t0, t1
+    pxor      t1, %1
+    pand      t0, %8
+    pand      t1, %7
+    pxor      t0, t1
+    pxor      t0, %1
+    SWAPMOVA %10, t0
+    SWAPMOVA %12, t2
+%endmacro
+
+%macro LUMA_INTRA_INIT 1
+    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
+    %define t0 m4
+    %define t1 m5
+    %define t2 m6
+    %define t3 m7
+    %assign i 4
+%rep %1
+    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
+    %assign i i+1
+%endrep
+    SUB    rsp, pad
+%endmacro
+
+; in: %1-%3=tmp, %4=p2, %5=q2
+%macro LUMA_INTRA_INTER 5
+    LOAD_AB t0, t1, r2d, r3d
+    mova    %1, t0
+    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
+%ifdef ARCH_X86_64
+    mova    %2, t0        ; mask0
+    psrlw   t3, %1, 2
+%else
+    mova    t3, %1
+    mova    %2, t0        ; mask0
+    psrlw   t3, 2
+%endif
+    paddw   t3, [pw_2]    ; alpha/4+2
+    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
+    pand    t2, %2
+    mova    t3, %5        ; q2
+    mova    %1, t2        ; mask1
+    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
+    pand    t2, %1
+    mova    t3, %4        ; p2
+    mova    %3, t2        ; mask1q
+    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
+    pand    t2, %1
+    mova    %1, t2        ; mask1p
+%endmacro
+
+%macro LUMA_H_INTRA_LOAD 0
+%if mmsize == 8
+    movu    t0, [r0-8]
+    movu    t1, [r0+r1-8]
+    movu    m0, [r0+r1*2-8]
+    movu    m1, [r0+r4-8]
+    TRANSPOSE4x4W 4, 5, 0, 1, 2
+    mova    t4, t0        ; p3
+    mova    t5, t1        ; p2
+
+    movu    m2, [r0]
+    movu    m3, [r0+r1]
+    movu    t0, [r0+r1*2]
+    movu    t1, [r0+r4]
+    TRANSPOSE4x4W 2, 3, 4, 5, 6
+    mova    t6, t0        ; q2
+    mova    t7, t1        ; q3
+%else
+    movu    t0, [r0-8]
+    movu    t1, [r0+r1-8]
+    movu    m0, [r0+r1*2-8]
+    movu    m1, [r0+r5-8]
+    movu    m2, [r4-8]
+    movu    m3, [r4+r1-8]
+    movu    t2, [r4+r1*2-8]
+    movu    t3, [r4+r5-8]
+    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
+    mova    t4, t0        ; p3
+    mova    t5, t1        ; p2
+    mova    t6, t2        ; q2
+    mova    t7, t3        ; q3
+%endif
+%endmacro
+
+; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
+%macro LUMA_H_INTRA_STORE 9
+%if mmsize == 8
+    TRANSPOSE4x4W %1, %2, %3, %4, %9
+    movq       [r0-8], m%1
+    movq       [r0+r1-8], m%2
+    movq       [r0+r1*2-8], m%3
+    movq       [r0+r4-8], m%4
+    movq       m%1, %8
+    TRANSPOSE4x4W %5, %6, %7, %1, %9
+    movq       [r0], m%5
+    movq       [r0+r1], m%6
+    movq       [r0+r1*2], m%7
+    movq       [r0+r4], m%1
+%else
+    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
+    movq       [r0-8], m%1
+    movq       [r0+r1-8], m%2
+    movq       [r0+r1*2-8], m%3
+    movq       [r0+r5-8], m%4
+    movhps     [r4-8], m%1
+    movhps     [r4+r1-8], m%2
+    movhps     [r4+r1*2-8], m%3
+    movhps     [r4+r5-8], m%4
+%ifnum %8
+    SWAP       %1, %8
+%else
+    mova       m%1, %8
+%endif
+    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
+    movq       [r0], m%5
+    movq       [r0+r1], m%6
+    movq       [r0+r1*2], m%7
+    movq       [r0+r5], m%1
+    movhps     [r4], m%5
+    movhps     [r4+r1], m%6
+    movhps     [r4+r1*2], m%7
+    movhps     [r4+r5], m%1
+%endif
+%endmacro
+
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+%macro DEBLOCK_LUMA_INTRA_64 1
+cglobal deblock_v_luma_intra_10_%1, 4,7,16
+    %define t0 m1
+    %define t1 m2
+    %define t2 m4
+    %define p2 m8
+    %define p1 m9
+    %define p0 m10
+    %define q0 m11
+    %define q1 m12
+    %define q2 m13
+    %define aa m5
+    %define bb m14
+    lea     r4, [r1*4]
+    lea     r5, [r1*3] ; 3*stride
+    neg     r4
+    add     r4, r0     ; pix-4*stride
+    mov     r6, 2
+    mova    m0, [pw_2]
+    shl    r2d, 2
+    shl    r3d, 2
+    LOAD_AB aa, bb, r2d, r3d
+.loop
+    mova    p2, [r4+r1]
+    mova    p1, [r4+2*r1]
+    mova    p0, [r4+r5]
+    mova    q0, [r0]
+    mova    q1, [r0+r1]
+    mova    q2, [r0+2*r1]
+
+    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
+    mova    t2, aa
+    psrlw   t2, 2
+    paddw   t2, m0 ; alpha/4+2
+    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
+    DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta
+    DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta
+    pand    m6, m3
+    pand    m7, m6
+    pand    m6, t1
+    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
+    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
+    add     r0, mmsize
+    add     r4, mmsize
+    dec     r6
+    jg .loop
+    REP_RET
+
+;-----------------------------------------------------------------------------
+; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_luma_intra_10_%1, 4,7,16
+    %define t0 m15
+    %define t1 m14
+    %define t2 m2
+    %define q3 m5
+    %define q2 m8
+    %define q1 m9
+    %define q0 m10
+    %define p0 m11
+    %define p1 m12
+    %define p2 m13
+    %define p3 m4
+    %define spill [rsp]
+    %assign pad 24-(stack_offset&15)
+    SUB     rsp, pad
+    lea     r4, [r1*4]
+    lea     r5, [r1*3] ; 3*stride
+    add     r4, r0     ; pix+4*stride
+    mov     r6, 2
+    mova    m0, [pw_2]
+    shl    r2d, 2
+    shl    r3d, 2
+.loop
+    movu    q3, [r0-8]
+    movu    q2, [r0+r1-8]
+    movu    q1, [r0+r1*2-8]
+    movu    q0, [r0+r5-8]
+    movu    p0, [r4-8]
+    movu    p1, [r4+r1-8]
+    movu    p2, [r4+r1*2-8]
+    movu    p3, [r4+r5-8]
+    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
+
+    LOAD_AB m1, m2, r2d, r3d
+    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
+    psrlw   m1, 2
+    paddw   m1, m0 ; alpha/4+2
+    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
+    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
+    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
+    pand    m6, m3
+    pand    m7, m6
+    pand    m6, t1
+
+    mova spill, q3
+    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
+    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
+    mova    m7, spill
+
+    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
+
+    lea     r0, [r0+r1*8]
+    lea     r4, [r4+r1*8]
+    dec     r6
+    jg .loop
+    ADD    rsp, pad
+    RET
+%endmacro
+
+INIT_XMM
+DEBLOCK_LUMA_INTRA_64 sse2
+INIT_AVX
+DEBLOCK_LUMA_INTRA_64 avx
+
+%endif
+
+%macro DEBLOCK_LUMA_INTRA 1
+;-----------------------------------------------------------------------------
+; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
+    LUMA_INTRA_INIT 3
+    lea     r4, [r1*4]
+    lea     r5, [r1*3]
+    neg     r4
+    add     r4, r0
+    mov     r6, 32/mmsize
+    shl    r2d, 2
+    shl    r3d, 2
+.loop:
+    mova    m0, [r4+r1*2] ; p1
+    mova    m1, [r4+r5]   ; p0
+    mova    m2, [r0]      ; q0
+    mova    m3, [r0+r1]   ; q1
+    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
+    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
+    mova    t3, [r0+r1*2] ; q2
+    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
+    add     r0, mmsize
+    add     r4, mmsize
+    dec     r6
+    jg .loop
+    ADD    rsp, pad
+    RET
+
+;-----------------------------------------------------------------------------
+; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
+    LUMA_INTRA_INIT 8
+%if mmsize == 8
+    lea     r4, [r1*3]
+    mov     r5, 32/mmsize
+%else
+    lea     r4, [r1*4]
+    lea     r5, [r1*3] ; 3*stride
+    add     r4, r0     ; pix+4*stride
+    mov     r6, 32/mmsize
+%endif
+    shl    r2d, 2
+    shl    r3d, 2
+.loop:
+    LUMA_H_INTRA_LOAD
+    LUMA_INTRA_INTER t8, t9, t10, t5, t6
+
+    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
+    mova    t3, t6     ; q2
+    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
+
+    mova    m2, t4
+    mova    m0, t11
+    mova    m1, t5
+    mova    m3, t8
+    mova    m6, t6
+
+    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
+
+    lea     r0, [r0+r1*(mmsize/2)]
+%if mmsize == 8
+    dec     r5
+%else
+    lea     r4, [r4+r1*(mmsize/2)]
+    dec     r6
+%endif
+    jg .loop
+    ADD    rsp, pad
+    RET
+%endmacro
+
+%ifndef ARCH_X86_64
+INIT_MMX
+DEBLOCK_LUMA mmxext
+DEBLOCK_LUMA_INTRA mmxext
+INIT_XMM
+DEBLOCK_LUMA sse2
+DEBLOCK_LUMA_INTRA sse2
+INIT_AVX
+DEBLOCK_LUMA avx
+DEBLOCK_LUMA_INTRA avx
+%endif
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 7d27c02ea2..42dae93f2d 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -218,45 +218,49 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
     );
 }
 
-#define LF_FUNC(DIR, TYPE, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
-                                                  int alpha, int beta, int8_t *tc0);
-#define LF_IFUNC(DIR, TYPE, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
-                                                  int alpha, int beta);
+#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
+                                                                int alpha, int beta, int8_t *tc0);
+#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
+                                                                int alpha, int beta);
 
-LF_FUNC (h,  chroma,       mmxext)
-LF_IFUNC(h,  chroma_intra, mmxext)
-LF_FUNC (v,  chroma,       mmxext)
-LF_IFUNC(v,  chroma_intra, mmxext)
+#define LF_FUNCS(type, depth)\
+LF_FUNC (h,  chroma,       depth, mmxext)\
+LF_IFUNC(h,  chroma_intra, depth, mmxext)\
+LF_FUNC (v,  chroma,       depth, mmxext)\
+LF_IFUNC(v,  chroma_intra, depth, mmxext)\
+LF_FUNC (h,  luma,         depth, mmxext)\
+LF_IFUNC(h,  luma_intra,   depth, mmxext)\
+LF_FUNC (h,  luma,         depth, sse2)\
+LF_IFUNC(h,  luma_intra,   depth, sse2)\
+LF_FUNC (v,  luma,         depth, sse2)\
+LF_IFUNC(v,  luma_intra,   depth, sse2)\
+LF_FUNC (h,  luma,         depth,  avx)\
+LF_IFUNC(h,  luma_intra,   depth,  avx)\
+LF_FUNC (v,  luma,         depth,  avx)\
+LF_IFUNC(v,  luma_intra,   depth,  avx)
 
-LF_FUNC (h,  luma,         mmxext)
-LF_IFUNC(h,  luma_intra,   mmxext)
-#if HAVE_YASM && ARCH_X86_32
-LF_FUNC (v8, luma,         mmxext)
-static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+LF_FUNCS( uint8_t,  8)
+LF_FUNCS(uint16_t, 10)
+
+LF_FUNC (v8, luma,             8, mmxext)
+static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
     if((tc0[0] & tc0[1]) >= 0)
-        ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
+        ff_deblock_v8_luma_8_mmxext(pix+0, stride, alpha, beta, tc0);
     if((tc0[2] & tc0[3]) >= 0)
-        ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
+        ff_deblock_v8_luma_8_mmxext(pix+8, stride, alpha, beta, tc0+2);
 }
-LF_IFUNC(v8, luma_intra,   mmxext)
-static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
+LF_IFUNC(v8, luma_intra,        8, mmxext)
+static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, int alpha, int beta)
 {
-    ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
-    ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmxext(pix+0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmxext(pix+8, stride, alpha, beta);
 }
-#endif
 
-LF_FUNC (h,  luma,         sse2)
-LF_IFUNC(h,  luma_intra,   sse2)
-LF_FUNC (v,  luma,         sse2)
-LF_IFUNC(v,  luma_intra,   sse2)
-LF_FUNC (h,  luma,         avx)
-LF_IFUNC(h,  luma_intra,   avx)
-LF_FUNC (v,  luma,         avx)
-LF_IFUNC(v,  luma_intra,   avx)
+LF_FUNC (v,  luma,            10, mmxext)
+LF_IFUNC(v,  luma_intra,      10, mmxext)
 
 /***********************************/
 /* weighted prediction */
@@ -318,15 +322,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
             c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
             c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
 
-            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_mmxext;
-            c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_mmxext;
-            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_mmxext;
-            c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_mmxext;
+            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext;
+            c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext;
+            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext;
+            c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext;
 #if ARCH_X86_32
-            c->h264_v_loop_filter_luma= ff_deblock_v_luma_mmxext;
-            c->h264_h_loop_filter_luma= ff_deblock_h_luma_mmxext;
-            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext;
-            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext;
+            c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext;
+            c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
 #endif
             c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
             c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
@@ -364,10 +368,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
                 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
 
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_sse2;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_sse2;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2;
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
 #endif
 
                 c->h264_idct_add16 = ff_h264_idct_add16_sse2;
@@ -383,10 +387,39 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
             }
             if (mm_flags&AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_avx;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_avx;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx;
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+#endif
+            }
+        }
+    }
+#endif
+    } else if (bit_depth == 10) {
+#if HAVE_YASM
+    if (mm_flags & AV_CPU_FLAG_MMX) {
+        if (mm_flags & AV_CPU_FLAG_MMX2) {
+#if ARCH_X86_32
+            c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext;
+            c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
+#endif
+            if (mm_flags&AV_CPU_FLAG_SSE2) {
+#if HAVE_ALIGNED_STACK
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
+#endif
+            }
+            if (mm_flags&AV_CPU_FLAG_AVX) {
+#if HAVE_ALIGNED_STACK
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
 #endif
             }
         }
diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm
index 28baf7a96a..c963deff86 100644
--- a/libavcodec/x86/x86util.asm
+++ b/libavcodec/x86/x86util.asm
@@ -457,3 +457,8 @@
     pshufw     %1, %2, (%3)*0x55
 %endif
 %endmacro
+
+%macro CLIPW 3 ;(dst, min, max)
+    pmaxsw %1, %2
+    pminsw %1, %3
+%endmacro

From 918a5409532e1218b011b5c079beb4eb5f45fdd4 Mon Sep 17 00:00:00 2001
From: Alex Converse <aconverse@google.com>
Date: Tue, 10 May 2011 15:10:31 -0700
Subject: [PATCH 03/32] Don't allow unsupported resampling configurations.

---
 libavcodec/resample.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libavcodec/resample.c b/libavcodec/resample.c
index 2185a11ad7..9f0599fb59 100644
--- a/libavcodec/resample.c
+++ b/libavcodec/resample.c
@@ -156,6 +156,11 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
         av_log(NULL, AV_LOG_ERROR, "Resampling with input channels greater than 2 unsupported.\n");
         return NULL;
       }
+    if (output_channels > 2 && !(output_channels == 6 && input_channels == 2)) {
+        av_log(NULL, AV_LOG_ERROR,
+               "Resampling output channel count must be 1 or 2 for mono input and 1, 2 or 6 for stereo input.\n");
+        return NULL;
+    }
 
     s = av_mallocz(sizeof(ReSampleContext));
     if (!s)

From 3e00ababc49bc8ddd149c891199ba2d30beb3118 Mon Sep 17 00:00:00 2001
From: Alex Converse <aconverse@google.com>
Date: Tue, 10 May 2011 14:24:05 -0700
Subject: [PATCH 04/32] Allow resampling with no channel count change for up to
 8 channels.

---
 libavcodec/resample.c | 84 +++++++++++++++++++++----------------------
 1 file changed, 41 insertions(+), 43 deletions(-)

diff --git a/libavcodec/resample.c b/libavcodec/resample.c
index 9f0599fb59..bdd32f439d 100644
--- a/libavcodec/resample.c
+++ b/libavcodec/resample.c
@@ -29,6 +29,8 @@
 #include "libavutil/opt.h"
 #include "libavutil/samplefmt.h"
 
+#define MAX_CHANNELS 8
+
 struct AVResampleContext;
 
 static const char *context_to_name(void *ptr)
@@ -41,7 +43,7 @@ static const AVClass audioresample_context_class = { "ReSampleContext", context_
 
 struct ReSampleContext {
     struct AVResampleContext *resample_context;
-    short *temp[2];
+    short *temp[MAX_CHANNELS];
     int temp_len;
     float ratio;
     /* channel convert */
@@ -104,24 +106,25 @@ static void mono_to_stereo(short *output, short *input, int n1)
     }
 }
 
-/* XXX: should use more abstract 'N' channels system */
-static void stereo_split(short *output1, short *output2, short *input, int n)
+static void deinterleave(short **output, short *input, int channels, int samples)
 {
-    int i;
+    int i, j;
 
-    for(i=0;i<n;i++) {
-        *output1++ = *input++;
-        *output2++ = *input++;
+    for (i = 0; i < samples; i++) {
+        for (j = 0; j < channels; j++) {
+            *output[j]++ = *input++;
+        }
     }
 }
 
-static void stereo_mux(short *output, short *input1, short *input2, int n)
+static void interleave(short *output, short **input, int channels, int samples)
 {
-    int i;
+    int i, j;
 
-    for(i=0;i<n;i++) {
-        *output++ = *input1++;
-        *output++ = *input2++;
+    for (i = 0; i < samples; i++) {
+        for (j = 0; j < channels; j++) {
+            *output++ = *input[j]++;
+        }
     }
 }
 
@@ -151,14 +154,18 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
 {
     ReSampleContext *s;
 
-    if ( input_channels > 2)
+    if (input_channels > MAX_CHANNELS)
       {
-        av_log(NULL, AV_LOG_ERROR, "Resampling with input channels greater than 2 unsupported.\n");
+        av_log(NULL, AV_LOG_ERROR,
+               "Resampling with input channels greater than %d is unsupported.\n",
+               MAX_CHANNELS);
         return NULL;
       }
-    if (output_channels > 2 && !(output_channels == 6 && input_channels == 2)) {
+    if (  output_channels > 2 &&
+        !(output_channels == 6 && input_channels == 2) &&
+          output_channels != input_channels) {
         av_log(NULL, AV_LOG_ERROR,
-               "Resampling output channel count must be 1 or 2 for mono input and 1, 2 or 6 for stereo input.\n");
+               "Resampling output channel count must be 1 or 2 for mono input; 1, 2 or 6 for stereo input; or N for N channel input.\n");
         return NULL;
     }
 
@@ -206,14 +213,6 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
         }
     }
 
-/*
- * AC-3 output is the only case where filter_channels could be greater than 2.
- * input channels can't be greater than 2, so resample the 2 channels and then
- * expand to 6 channels after the resampling.
- */
-    if(s->filter_channels>2)
-      s->filter_channels = 2;
-
 #define TAPS 16
     s->resample_context= av_resample_init(output_rate, input_rate,
                          filter_length, log2_phase_count, linear, cutoff);
@@ -228,9 +227,9 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
 int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples)
 {
     int i, nb_samples1;
-    short *bufin[2];
-    short *bufout[2];
-    short *buftmp2[2], *buftmp3[2];
+    short *bufin[MAX_CHANNELS];
+    short *bufout[MAX_CHANNELS];
+    short *buftmp2[MAX_CHANNELS], *buftmp3[MAX_CHANNELS];
     short *output_bak = NULL;
     int lenout;
 
@@ -291,12 +290,9 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl
         bufin[i]= av_malloc( (nb_samples + s->temp_len) * sizeof(short) );
         memcpy(bufin[i], s->temp[i], s->temp_len * sizeof(short));
         buftmp2[i] = bufin[i] + s->temp_len;
+        bufout[i] = av_malloc(lenout * sizeof(short));
     }
 
-    /* make some zoom to avoid round pb */
-    bufout[0]= av_malloc( lenout * sizeof(short) );
-    bufout[1]= av_malloc( lenout * sizeof(short) );
-
     if (s->input_channels == 2 &&
         s->output_channels == 1) {
         buftmp3[0] = output;
@@ -304,10 +300,11 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl
     } else if (s->output_channels >= 2 && s->input_channels == 1) {
         buftmp3[0] = bufout[0];
         memcpy(buftmp2[0], input, nb_samples*sizeof(short));
-    } else if (s->output_channels >= 2) {
-        buftmp3[0] = bufout[0];
-        buftmp3[1] = bufout[1];
-        stereo_split(buftmp2[0], buftmp2[1], input, nb_samples);
+    } else if (s->output_channels >= s->input_channels && s->input_channels >= 2) {
+        for (i = 0; i < s->input_channels; i++) {
+            buftmp3[i] = bufout[i];
+        }
+        deinterleave(buftmp2, input, s->input_channels, nb_samples);
     } else {
         buftmp3[0] = output;
         memcpy(buftmp2[0], input, nb_samples*sizeof(short));
@@ -329,10 +326,10 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl
 
     if (s->output_channels == 2 && s->input_channels == 1) {
         mono_to_stereo(output, buftmp3[0], nb_samples1);
-    } else if (s->output_channels == 2) {
-        stereo_mux(output, buftmp3[0], buftmp3[1], nb_samples1);
-    } else if (s->output_channels == 6) {
+    } else if (s->output_channels == 6 && s->input_channels == 2) {
         ac3_5p1_mux(output, buftmp3[0], buftmp3[1], nb_samples1);
+    } else if (s->output_channels == s->input_channels && s->input_channels >= 2) {
+        interleave(output, buftmp3, s->output_channels, nb_samples1);
     }
 
     if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
@@ -348,19 +345,20 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl
         }
     }
 
-    for(i=0; i<s->filter_channels; i++)
+    for (i = 0; i < s->filter_channels; i++) {
         av_free(bufin[i]);
+        av_free(bufout[i]);
+    }
 
-    av_free(bufout[0]);
-    av_free(bufout[1]);
     return nb_samples1;
 }
 
 void audio_resample_close(ReSampleContext *s)
 {
+    int i;
     av_resample_close(s->resample_context);
-    av_freep(&s->temp[0]);
-    av_freep(&s->temp[1]);
+    for (i = 0; i < s->filter_channels; i++)
+        av_freep(&s->temp[i]);
     av_freep(&s->buffer[0]);
     av_freep(&s->buffer[1]);
     av_audio_convert_free(s->convert_ctx[0]);

From ffc437c026dd0e1b8e5d9114163b4e95999b95fd Mon Sep 17 00:00:00 2001
From: Alex Converse <aconverse@google.com>
Date: Tue, 10 May 2011 16:58:01 -0700
Subject: [PATCH 05/32] cosmetics: Fix crazy formatting in resample.

---
 libavcodec/resample.c | 97 ++++++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 47 deletions(-)

diff --git a/libavcodec/resample.c b/libavcodec/resample.c
index bdd32f439d..0bebe1ab88 100644
--- a/libavcodec/resample.c
+++ b/libavcodec/resample.c
@@ -39,7 +39,9 @@ static const char *context_to_name(void *ptr)
 }
 
 static const AVOption options[] = {{NULL}};
-static const AVClass audioresample_context_class = { "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT };
+static const AVClass audioresample_context_class = {
+    "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT
+};
 
 struct ReSampleContext {
     struct AVResampleContext *resample_context;
@@ -50,9 +52,9 @@ struct ReSampleContext {
     int input_channels, output_channels, filter_channels;
     AVAudioConvert *convert_ctx[2];
     enum AVSampleFormat sample_fmt[2]; ///< input and output sample format
-    unsigned sample_size[2];         ///< size of one sample in sample_fmt
-    short *buffer[2];                ///< buffers used for conversion to S16
-    unsigned buffer_size[2];         ///< sizes of allocated buffers
+    unsigned sample_size[2];           ///< size of one sample in sample_fmt
+    short *buffer[2];                  ///< buffers used for conversion to S16
+    unsigned buffer_size[2];           ///< sizes of allocated buffers
 };
 
 /* n1: number of samples */
@@ -131,17 +133,17 @@ static void interleave(short *output, short **input, int channels, int samples)
 static void ac3_5p1_mux(short *output, short *input1, short *input2, int n)
 {
     int i;
-    short l,r;
+    short l, r;
 
-    for(i=0;i<n;i++) {
-      l=*input1++;
-      r=*input2++;
-      *output++ = l;           /* left */
-      *output++ = (l/2)+(r/2); /* center */
-      *output++ = r;           /* right */
-      *output++ = 0;           /* left surround */
-      *output++ = 0;           /* right surroud */
-      *output++ = 0;           /* low freq */
+    for (i = 0; i < n; i++) {
+        l = *input1++;
+        r = *input2++;
+        *output++ = l;                  /* left */
+        *output++ = (l / 2) + (r / 2);  /* center */
+        *output++ = r;                  /* right */
+        *output++ = 0;                  /* left surround */
+        *output++ = 0;                  /* right surroud */
+        *output++ = 0;                  /* low freq */
     }
 }
 
@@ -154,27 +156,25 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
 {
     ReSampleContext *s;
 
-    if (input_channels > MAX_CHANNELS)
-      {
+    if (input_channels > MAX_CHANNELS) {
         av_log(NULL, AV_LOG_ERROR,
                "Resampling with input channels greater than %d is unsupported.\n",
                MAX_CHANNELS);
         return NULL;
-      }
-    if (  output_channels > 2 &&
+    }
+    if (output_channels > 2 &&
         !(output_channels == 6 && input_channels == 2) &&
-          output_channels != input_channels) {
+        output_channels != input_channels) {
         av_log(NULL, AV_LOG_ERROR,
                "Resampling output channel count must be 1 or 2 for mono input; 1, 2 or 6 for stereo input; or N for N channel input.\n");
         return NULL;
     }
 
     s = av_mallocz(sizeof(ReSampleContext));
-    if (!s)
-      {
+    if (!s) {
         av_log(NULL, AV_LOG_ERROR, "Can't allocate memory for resample context.\n");
         return NULL;
-      }
+    }
 
     s->ratio = (float)output_rate / (float)input_rate;
 
@@ -185,10 +185,10 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
     if (s->output_channels < s->filter_channels)
         s->filter_channels = s->output_channels;
 
-    s->sample_fmt [0] = sample_fmt_in;
-    s->sample_fmt [1] = sample_fmt_out;
-    s->sample_size[0] = av_get_bits_per_sample_fmt(s->sample_fmt[0])>>3;
-    s->sample_size[1] = av_get_bits_per_sample_fmt(s->sample_fmt[1])>>3;
+    s->sample_fmt[0]  = sample_fmt_in;
+    s->sample_fmt[1]  = sample_fmt_out;
+    s->sample_size[0] = av_get_bits_per_sample_fmt(s->sample_fmt[0]) >> 3;
+    s->sample_size[1] = av_get_bits_per_sample_fmt(s->sample_fmt[1]) >> 3;
 
     if (s->sample_fmt[0] != AV_SAMPLE_FMT_S16) {
         if (!(s->convert_ctx[0] = av_audio_convert_alloc(AV_SAMPLE_FMT_S16, 1,
@@ -214,8 +214,9 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
     }
 
 #define TAPS 16
-    s->resample_context= av_resample_init(output_rate, input_rate,
-                         filter_length, log2_phase_count, linear, cutoff);
+    s->resample_context = av_resample_init(output_rate, input_rate,
+                                           filter_length, log2_phase_count,
+                                           linear, cutoff);
 
     *(const AVClass**)s->resample_context = &audioresample_context_class;
 
@@ -244,7 +245,7 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl
         int ostride[1] = { 2 };
         const void *ibuf[1] = { input };
         void       *obuf[1];
-        unsigned input_size = nb_samples*s->input_channels*2;
+        unsigned input_size = nb_samples * s->input_channels * 2;
 
         if (!s->buffer_size[0] || s->buffer_size[0] < input_size) {
             av_free(s->buffer[0]);
@@ -259,15 +260,16 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl
         obuf[0] = s->buffer[0];
 
         if (av_audio_convert(s->convert_ctx[0], obuf, ostride,
-                             ibuf, istride, nb_samples*s->input_channels) < 0) {
-            av_log(s->resample_context, AV_LOG_ERROR, "Audio sample format conversion failed\n");
+                             ibuf, istride, nb_samples * s->input_channels) < 0) {
+            av_log(s->resample_context, AV_LOG_ERROR,
+                   "Audio sample format conversion failed\n");
             return 0;
         }
 
-        input  = s->buffer[0];
+        input = s->buffer[0];
     }
 
-    lenout= 4*nb_samples * s->ratio + 16;
+    lenout = 4 * nb_samples * s->ratio + 16;
 
     if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
         output_bak = output;
@@ -286,20 +288,19 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl
     }
 
     /* XXX: move those malloc to resample init code */
-    for(i=0; i<s->filter_channels; i++){
-        bufin[i]= av_malloc( (nb_samples + s->temp_len) * sizeof(short) );
+    for (i = 0; i < s->filter_channels; i++) {
+        bufin[i] = av_malloc((nb_samples + s->temp_len) * sizeof(short));
         memcpy(bufin[i], s->temp[i], s->temp_len * sizeof(short));
         buftmp2[i] = bufin[i] + s->temp_len;
         bufout[i] = av_malloc(lenout * sizeof(short));
     }
 
-    if (s->input_channels == 2 &&
-        s->output_channels == 1) {
+    if (s->input_channels == 2 && s->output_channels == 1) {
         buftmp3[0] = output;
         stereo_to_mono(buftmp2[0], input, nb_samples);
     } else if (s->output_channels >= 2 && s->input_channels == 1) {
         buftmp3[0] = bufout[0];
-        memcpy(buftmp2[0], input, nb_samples*sizeof(short));
+        memcpy(buftmp2[0], input, nb_samples * sizeof(short));
     } else if (s->output_channels >= s->input_channels && s->input_channels >= 2) {
         for (i = 0; i < s->input_channels; i++) {
             buftmp3[i] = bufout[i];
@@ -307,21 +308,22 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl
         deinterleave(buftmp2, input, s->input_channels, nb_samples);
     } else {
         buftmp3[0] = output;
-        memcpy(buftmp2[0], input, nb_samples*sizeof(short));
+        memcpy(buftmp2[0], input, nb_samples * sizeof(short));
     }
 
     nb_samples += s->temp_len;
 
     /* resample each channel */
     nb_samples1 = 0; /* avoid warning */
-    for(i=0;i<s->filter_channels;i++) {
+    for (i = 0; i < s->filter_channels; i++) {
         int consumed;
-        int is_last= i+1 == s->filter_channels;
+        int is_last = i + 1 == s->filter_channels;
 
-        nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i], &consumed, nb_samples, lenout, is_last);
-        s->temp_len= nb_samples - consumed;
-        s->temp[i]= av_realloc(s->temp[i], s->temp_len*sizeof(short));
-        memcpy(s->temp[i], bufin[i] + consumed, s->temp_len*sizeof(short));
+        nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i],
+                                  &consumed, nb_samples, lenout, is_last);
+        s->temp_len = nb_samples - consumed;
+        s->temp[i] = av_realloc(s->temp[i], s->temp_len * sizeof(short));
+        memcpy(s->temp[i], bufin[i] + consumed, s->temp_len * sizeof(short));
     }
 
     if (s->output_channels == 2 && s->input_channels == 1) {
@@ -339,8 +341,9 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl
         void       *obuf[1] = { output_bak };
 
         if (av_audio_convert(s->convert_ctx[1], obuf, ostride,
-                             ibuf, istride, nb_samples1*s->output_channels) < 0) {
-            av_log(s->resample_context, AV_LOG_ERROR, "Audio sample format convertion failed\n");
+                             ibuf, istride, nb_samples1 * s->output_channels) < 0) {
+            av_log(s->resample_context, AV_LOG_ERROR,
+                   "Audio sample format convertion failed\n");
             return 0;
         }
     }

From 91199cfe55b1398b23a16b1f55df75e62e05198b Mon Sep 17 00:00:00 2001
From: Benjamin Larsson <benjamin@southpole.se>
Date: Tue, 8 Mar 2011 15:29:46 +0100
Subject: [PATCH 06/32] ffplay: add a dummy option -i so that it is easy to
 switch between ffmpeg -i "file" and ffplay -i "file".

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 cmdutils.c | 4 ++--
 ffplay.c   | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cmdutils.c b/cmdutils.c
index f95778822e..feeea694d8 100644
--- a/cmdutils.c
+++ b/cmdutils.c
@@ -281,8 +281,8 @@ unknown_opt:
                     fprintf(stderr, "%s: failed to set value '%s' for option '%s'\n", argv[0], arg, opt);
                     exit(1);
                 }
-            } else {
-                po->u.func_arg(arg);
+            } else if (po->u.func_arg) {
+                    po->u.func_arg(arg);
             }
             if(po->flags & OPT_EXIT)
                 exit(0);
diff --git a/ffplay.c b/ffplay.c
index 11b307eaca..07727b667a 100644
--- a/ffplay.c
+++ b/ffplay.c
@@ -3019,6 +3019,7 @@ static const OptionDef options[] = {
 #endif
     { "rdftspeed", OPT_INT | HAS_ARG| OPT_AUDIO | OPT_EXPERT, {(void*)&rdftspeed}, "rdft speed", "msecs" },
     { "default", OPT_FUNC2 | HAS_ARG | OPT_AUDIO | OPT_VIDEO | OPT_EXPERT, {(void*)opt_default}, "generic catch all option", "" },
+    { "i", 0, {NULL}, "ffmpeg compatibility dummy option", ""},
     { NULL, },
 };
 

From b568d6d94bda607e4ebb35be68181a8c2a9f5c50 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Date: Sat, 26 Mar 2011 15:26:45 +0100
Subject: [PATCH 07/32] ffmpeg: warns the user when the selected pixel format
 is ignored

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 ffmpeg.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/ffmpeg.c b/ffmpeg.c
index 74cfd095d0..612a413077 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -590,8 +590,15 @@ static void choose_pixel_fmt(AVStream *st, AVCodec *codec)
             if(*p == st->codec->pix_fmt)
                 break;
         }
-        if(*p == -1)
+        if (*p == -1) {
+            if(st->codec->pix_fmt != PIX_FMT_NONE)
+                av_log(NULL, AV_LOG_WARNING,
+                        "Incompatible pixel format '%s' for codec '%s', auto-selecting format '%s'\n",
+                        av_pix_fmt_descriptors[st->codec->pix_fmt].name,
+                        codec->name,
+                        av_pix_fmt_descriptors[codec->pix_fmts[0]].name);
             st->codec->pix_fmt = codec->pix_fmts[0];
+        }
     }
 }
 

From 2ecc5b70fb53d0e2e74b51d1d598af8c842afc68 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Date: Sat, 16 Apr 2011 22:58:13 +0200
Subject: [PATCH 08/32] ffmpeg: improve reporting if size/pixel format changes

Use av_log() rather than fprintf(stderr, ...), and show information
related to the previous size/pixel format configuration.

Consistent with the corresponding message issued in case of audio
configuration change.

Signed-off-by: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 ffmpeg.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ffmpeg.c b/ffmpeg.c
index 612a413077..89755863e4 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -1129,8 +1129,11 @@ static void do_video_out(AVFormatContext *s,
     if (   ost->resample_height != ist->st->codec->height
         || ost->resample_width  != ist->st->codec->width
         || (ost->resample_pix_fmt!= ist->st->codec->pix_fmt) ) {
-
-        fprintf(stderr,"Input Stream #%d.%d frame size changed to %dx%d, %s\n", ist->file_index, ist->index, ist->st->codec->width,     ist->st->codec->height,avcodec_get_pix_fmt_name(ist->st->codec->pix_fmt));
+        av_log(NULL, AV_LOG_INFO,
+               "Input stream #%d.%d frame changed from size:%dx%d fmt:%s to size:%dx%d fmt:%s\n",
+               ist->file_index, ist->index,
+               ost->resample_width  , ost->resample_height  , avcodec_get_pix_fmt_name(ost->resample_pix_fmt),
+               ist->st->codec->width, ist->st->codec->height, avcodec_get_pix_fmt_name(ist->st->codec->pix_fmt));
         if(!ost->video_resample)
             ffmpeg_exit(1);
     }

From c29c2eea8fb35682fdfdcb64c4890e8a25137b2a Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Date: Sun, 17 Apr 2011 01:38:09 +0200
Subject: [PATCH 09/32] ffmpeg: prefer "dec" over "ist->st->codec" in
 do_video_out() snippet

Simplify, ease readability.

Signed-off-by: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 ffmpeg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ffmpeg.c b/ffmpeg.c
index 89755863e4..a5d877a022 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -1132,8 +1132,8 @@ static void do_video_out(AVFormatContext *s,
         av_log(NULL, AV_LOG_INFO,
                "Input stream #%d.%d frame changed from size:%dx%d fmt:%s to size:%dx%d fmt:%s\n",
                ist->file_index, ist->index,
-               ost->resample_width  , ost->resample_height  , avcodec_get_pix_fmt_name(ost->resample_pix_fmt),
-               ist->st->codec->width, ist->st->codec->height, avcodec_get_pix_fmt_name(ist->st->codec->pix_fmt));
+               ost->resample_width, ost->resample_height, avcodec_get_pix_fmt_name(ost->resample_pix_fmt),
+               dec->width         , dec->height         , avcodec_get_pix_fmt_name(dec->pix_fmt));
         if(!ost->video_resample)
             ffmpeg_exit(1);
     }

From 9aa797cd2873562e85d04ea45ef7f49ad2cb07b9 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Date: Sat, 16 Apr 2011 23:11:01 +0200
Subject: [PATCH 10/32] ffmpeg: factorize resampling condition check in
 do_video_out()

Simplify and improve readability.

Signed-off-by: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 ffmpeg.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/ffmpeg.c b/ffmpeg.c
index a5d877a022..647018c083 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -1078,7 +1078,7 @@ static void do_video_out(AVFormatContext *s,
                          AVFrame *in_picture,
                          int *frame_size)
 {
-    int nb_frames, i, ret;
+    int nb_frames, i, ret, resample_changed;
     AVFrame *final_picture, *formatted_picture, *resampling_dst;
     AVCodecContext *enc, *dec;
     double sync_ipts;
@@ -1126,9 +1126,11 @@ static void do_video_out(AVFormatContext *s,
     final_picture = formatted_picture;
     resampling_dst = &ost->pict_tmp;
 
-    if (   ost->resample_height != ist->st->codec->height
-        || ost->resample_width  != ist->st->codec->width
-        || (ost->resample_pix_fmt!= ist->st->codec->pix_fmt) ) {
+    resample_changed = ost->resample_width   != dec->width  ||
+                       ost->resample_height  != dec->height ||
+                       ost->resample_pix_fmt != dec->pix_fmt;
+
+    if (resample_changed) {
         av_log(NULL, AV_LOG_INFO,
                "Input stream #%d.%d frame changed from size:%dx%d fmt:%s to size:%dx%d fmt:%s\n",
                ist->file_index, ist->index,
@@ -1141,10 +1143,7 @@ static void do_video_out(AVFormatContext *s,
 #if !CONFIG_AVFILTER
     if (ost->video_resample) {
         final_picture = &ost->pict_tmp;
-        if(  ost->resample_height != ist->st->codec->height
-          || ost->resample_width  != ist->st->codec->width
-          || (ost->resample_pix_fmt!= ist->st->codec->pix_fmt) ) {
-
+        if (resample_changed) {
             /* initialize a new scaler context */
             sws_freeContext(ost->img_resample_ctx);
             ost->img_resample_ctx = sws_getContext(

From 2b95602e93226bd269676b0edcda5322b5be8444 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Date: Sat, 16 Apr 2011 23:18:22 +0200
Subject: [PATCH 11/32] ffmpeg: reformat resample condition code in transcode()

Signed-off-by: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 ffmpeg.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ffmpeg.c b/ffmpeg.c
index 647018c083..738fb7b922 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -2171,9 +2171,9 @@ static int transcode(AVFormatContext **output_files,
                     fprintf(stderr, "Video pixel format is unknown, stream cannot be encoded\n");
                     ffmpeg_exit(1);
                 }
-                ost->video_resample = (codec->width != icodec->width   ||
-                                       codec->height != icodec->height ||
-                        (codec->pix_fmt != icodec->pix_fmt));
+                ost->video_resample = codec->width   != icodec->width  ||
+                                      codec->height  != icodec->height ||
+                                      codec->pix_fmt != icodec->pix_fmt;
                 if (ost->video_resample) {
 #if !CONFIG_AVFILTER
                     avcodec_get_frame_defaults(&ost->pict_tmp);

From 3fd62c6e247468d792ce8f1d3c458017d1ea9eb5 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Date: Wed, 20 Apr 2011 13:13:09 +0200
Subject: [PATCH 12/32] ffmpeg: call pre_process_video_frame() only if decoding
 is needed

In output_packet(), move the pre_process_video_frame() call inside the
if (ist->decoding_needed) { } block. This way
pre_process_video_frame() is not called when stream-copy has been
selected.

Also simplify.

Signed-off-by: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 ffmpeg.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/ffmpeg.c b/ffmpeg.c
index 738fb7b922..5ab3c7a508 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -1428,7 +1428,7 @@ static int output_packet(AVInputStream *ist, int ist_index,
     int ret, i;
     int got_output;
     AVFrame picture;
-    void *buffer_to_free;
+    void *buffer_to_free = NULL;
     static unsigned int samples_size= 0;
     AVSubtitle subtitle, *subtitle_to_free;
     int64_t pkt_pts = AV_NOPTS_VALUE;
@@ -1530,6 +1530,8 @@ static int output_packet(AVInputStream *ist, int ist_index,
                             ist->st->codec->time_base.den;
                     }
                     avpkt.size = 0;
+                    buffer_to_free = NULL;
+                    pre_process_video_frame(ist, (AVPicture *)&picture, &buffer_to_free);
                     break;
             case AVMEDIA_TYPE_SUBTITLE:
                 ret = avcodec_decode_subtitle2(ist->st->codec,
@@ -1564,12 +1566,6 @@ static int output_packet(AVInputStream *ist, int ist_index,
             avpkt.size = 0;
         }
 
-        buffer_to_free = NULL;
-        if (ist->st->codec->codec_type == AVMEDIA_TYPE_VIDEO) {
-            pre_process_video_frame(ist, (AVPicture *)&picture,
-                                    &buffer_to_free);
-        }
-
 #if CONFIG_AVFILTER
         if (ist->st->codec->codec_type == AVMEDIA_TYPE_VIDEO && ist->input_video_filter) {
             AVRational sar;

From 0b4949b51816bc2fd23ba4c4de183b877b58aa25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Mon, 9 May 2011 20:11:16 +0300
Subject: [PATCH 13/32] rtsp: Only do keepalive using GET_PARAMETER if the
 server supports it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is more like what VLC does. If the server doesn't mention
supporting GET_PARAMETER in response to an OPTIONS request,
VLC doesn't send any keepalive requests at all. After this patch,
libavformat will still send OPTIONS keepalives if GET_PARAMETER
isn't explicitly said to be supported.

Some RTSP cameras don't support GET_PARAMETER, and will
close the connection if this is sent as keepalive request
(but support OPTIONS just fine, but probably don't need any
keepalive at all). Some other cameras don't support using
OPTIONS as keepalive, but require GET_PARAMETER instead.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavformat/rtsp.c    | 4 ++++
 libavformat/rtsp.h    | 5 +++++
 libavformat/rtspdec.c | 4 +++-
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c
index 14111e69fd..2ebf7e0510 100644
--- a/libavformat/rtsp.c
+++ b/libavformat/rtsp.c
@@ -808,6 +808,10 @@ void ff_rtsp_parse_line(RTSPMessageHeader *reply, const char *buf,
         p += strspn(p, SPACE_CHARS);
         if (method && !strcmp(method, "PLAY"))
             rtsp_parse_rtp_info(rt, p);
+    } else if (av_stristart(p, "Public:", &p) && rt) {
+        if (strstr(p, "GET_PARAMETER") &&
+            method && !strcmp(method, "OPTIONS"))
+            rt->get_parameter_supported = 1;
     }
 }
 
diff --git a/libavformat/rtsp.h b/libavformat/rtsp.h
index e1f1df990e..ff66502626 100644
--- a/libavformat/rtsp.h
+++ b/libavformat/rtsp.h
@@ -331,6 +331,11 @@ typedef struct RTSPState {
      * Polling array for udp
      */
     struct pollfd *p;
+
+    /**
+     * Whether the server supports the GET_PARAMETER method.
+     */
+    int get_parameter_supported;
 } RTSPState;
 
 /**
diff --git a/libavformat/rtspdec.c b/libavformat/rtspdec.c
index 866f313d10..ccfc4d8e27 100644
--- a/libavformat/rtspdec.c
+++ b/libavformat/rtspdec.c
@@ -341,7 +341,9 @@ retry:
 
     /* send dummy request to keep TCP connection alive */
     if ((av_gettime() - rt->last_cmd_time) / 1000000 >= rt->timeout / 2) {
-        if (rt->server_type != RTSP_SERVER_REAL) {
+        if (rt->server_type == RTSP_SERVER_WMS ||
+           (rt->server_type != RTSP_SERVER_REAL &&
+            rt->get_parameter_supported)) {
             ff_rtsp_send_cmd_async(s, "GET_PARAMETER", rt->control_uri, NULL);
         } else {
             ff_rtsp_send_cmd_async(s, "OPTIONS", "*", NULL);

From f628559d9a600159576faff9735e069479c9d361 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Mon, 9 May 2011 11:45:13 +0200
Subject: [PATCH 14/32] rotozoom: K&R coding style cosmetics

---
 tests/rotozoom.c | 199 ++++++++++++++++++++++++-----------------------
 1 file changed, 102 insertions(+), 97 deletions(-)

diff --git a/tests/rotozoom.c b/tests/rotozoom.c
index 505072c4f9..03e9afb86f 100644
--- a/tests/rotozoom.c
+++ b/tests/rotozoom.c
@@ -24,47 +24,52 @@
 #include <stdio.h>
 #include <inttypes.h>
 
-#define FIXP (1<<16)
-#define MY_PI 205887 //(M_PI*FIX)
+#define FIXP (1 << 16)
+#define MY_PI 205887 //(M_PI * FIX)
 
-static int64_t int_pow(int64_t a, int p){
-    int64_t v= FIXP;
+static int64_t int_pow(int64_t a, int p)
+{
+    int64_t v = FIXP;
 
-    for(; p; p--){
-        v*= a;
-        v/= FIXP;
+    for (; p; p--) {
+        v *= a;
+        v /= FIXP;
     }
 
     return v;
 }
 
-static int64_t int_sin(int64_t a){
-    if(a<0) a= MY_PI-a; // 0..inf
-    a %= 2*MY_PI;       // 0..2PI
+static int64_t int_sin(int64_t a)
+{
+    if (a < 0)
+        a = MY_PI - a;  // 0..inf
+    a %= 2 * MY_PI;     // 0..2PI
 
-    if(a>=MY_PI*3/2) a -= 2*MY_PI;  // -PI/2 .. 3PI/2
-    if(a>=MY_PI/2  ) a = MY_PI - a; // -PI/2 ..  PI/2
+    if (a >= MY_PI * 3 / 2)
+        a -= 2 * MY_PI; // -PI / 2 .. 3PI / 2
+    if (a >= MY_PI /2)
+        a = MY_PI - a;  // -PI / 2 ..  PI / 2
 
-    return a - int_pow(a, 3)/6 + int_pow(a, 5)/120 - int_pow(a, 7)/5040;
+    return a - int_pow(a, 3) / 6 + int_pow(a, 5) / 120 - int_pow(a, 7) / 5040;
 }
 
 #define SCALEBITS 8
 #define ONE_HALF  (1 << (SCALEBITS - 1))
-#define FIX(x)    ((int) ((x) * (1L<<SCALEBITS) + 0.5))
+#define FIX(x)    ((int) ((x) * (1L << SCALEBITS) + 0.5))
 typedef unsigned char UINT8;
 
 static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr,
-                              UINT8 *src, int width, int height)
+                             UINT8 *src, int width, int height)
 {
     int wrap, wrap3, x, y;
     int r, g, b, r1, g1, b1;
     UINT8 *p;
 
-    wrap = width;
+    wrap  = width;
     wrap3 = width * 3;
     p = src;
-    for(y=0;y<height;y+=2) {
-        for(x=0;x<width;x+=2) {
+    for (y = 0; y < height; y += 2) {
+        for (x = 0; x < width; x += 2) {
             r = p[0];
             g = p[1];
             b = p[2];
@@ -81,7 +86,7 @@ static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr,
             b1 += b;
             lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g +
                       FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
-            p += wrap3;
+            p   += wrap3;
             lum += wrap;
 
             r = p[0];
@@ -104,14 +109,14 @@ static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr,
             cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 +
                       FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128;
             cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 -
-                     FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128;
+                      FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128;
 
             cb++;
             cr++;
-            p += -wrap3 + 2 * 3;
-            lum += -wrap + 2;
+            p   += -wrap3 + 2 * 3;
+            lum += -wrap  + 2;
         }
-        p += wrap3;
+        p   += wrap3;
         lum += wrap;
     }
 }
@@ -119,7 +124,7 @@ static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr,
 /* cif format */
 #define DEFAULT_WIDTH   352
 #define DEFAULT_HEIGHT  288
-#define DEFAULT_NB_PICT 50
+#define DEFAULT_NB_PICT  50
 
 static void pgmyuv_save(const char *filename, int w, int h,
                         unsigned char *rgb_tab)
@@ -130,19 +135,19 @@ static void pgmyuv_save(const char *filename, int w, int h,
     unsigned char *lum_tab, *cb_tab, *cr_tab;
 
     lum_tab = malloc(w * h);
-    cb_tab = malloc((w * h) / 4);
-    cr_tab = malloc((w * h) / 4);
+    cb_tab  = malloc((w * h) / 4);
+    cr_tab  = malloc((w * h) / 4);
 
     rgb24_to_yuv420p(lum_tab, cb_tab, cr_tab, rgb_tab, w, h);
 
-    f = fopen(filename,"wb");
+    f = fopen(filename, "wb");
     fprintf(f, "P5\n%d %d\n%d\n", w, (h * 3) / 2, 255);
     fwrite(lum_tab, 1, w * h, f);
     h2 = h / 2;
     w2 = w / 2;
     cb = cb_tab;
     cr = cr_tab;
-    for(i=0;i<h2;i++) {
+    for (i = 0; i < h2; i++) {
         fwrite(cb, 1, w2, f);
         fwrite(cr, 1, w2, f);
         cb += w2;
@@ -172,96 +177,96 @@ static void put_pixel(int x, int y, int r, int g, int b)
     p[2] = b;
 }
 
-unsigned char tab_r[256*256];
-unsigned char tab_g[256*256];
-unsigned char tab_b[256*256];
+unsigned char tab_r[256 * 256];
+unsigned char tab_g[256 * 256];
+unsigned char tab_b[256 * 256];
 
 int h_cos [360];
 int h_sin [360];
 
-static int ipol(uint8_t *src, int x, int y){
-    int int_x= x>>16;
-    int int_y= y>>16;
-    int frac_x= x&0xFFFF;
-    int frac_y= y&0xFFFF;
-    int s00= src[ ( int_x   &255) + 256*( int_y   &255) ];
-    int s01= src[ ((int_x+1)&255) + 256*( int_y   &255) ];
-    int s10= src[ ( int_x   &255) + 256*((int_y+1)&255) ];
-    int s11= src[ ((int_x+1)&255) + 256*((int_y+1)&255) ];
-    int s0= (((1<<16) - frac_x)*s00 + frac_x*s01)>>8;
-    int s1= (((1<<16) - frac_x)*s10 + frac_x*s11)>>8;
+static int ipol(uint8_t *src, int x, int y)
+{
+    int int_x  = x >> 16;
+    int int_y  = y >> 16;
+    int frac_x = x & 0xFFFF;
+    int frac_y = y & 0xFFFF;
+    int s00    = src[( int_x      & 255) + 256 * ( int_y      & 255)];
+    int s01    = src[((int_x + 1) & 255) + 256 * ( int_y      & 255)];
+    int s10    = src[( int_x      & 255) + 256 * ((int_y + 1) & 255)];
+    int s11    = src[((int_x + 1) & 255) + 256 * ((int_y + 1) & 255)];
+    int s0     = (((1 << 16) - frac_x) * s00 + frac_x * s01) >> 8;
+    int s1     = (((1 << 16) - frac_x) * s10 + frac_x * s11) >> 8;
 
-    return (((1<<16) - frac_y)*s0 + frac_y*s1)>>24;
+    return (((1 << 16) - frac_y) * s0 + frac_y * s1) >> 24;
 }
 
 static void gen_image(int num, int w, int h)
 {
-  const int c = h_cos [num % 360];
-  const int s = h_sin [num % 360];
+    const int c = h_cos [num % 360];
+    const int s = h_sin [num % 360];
 
-  const int xi = -(w/2) * c;
-  const int yi =  (w/2) * s;
+    const int xi = -(w / 2) * c;
+    const int yi =  (w / 2) * s;
 
-  const int xj = -(h/2) * s;
-  const int yj = -(h/2) * c;
-  int i,j;
+    const int xj = -(h / 2) * s;
+    const int yj = -(h / 2) * c;
+    int i, j;
 
-  int x,y;
-  int xprime = xj;
-  int yprime = yj;
+    int x, y;
+    int xprime = xj;
+    int yprime = yj;
 
+    for (j = 0; j < h; j++) {
+        x = xprime + xi + FIXP * w / 2;
+        xprime += s;
 
-  for (j=0;j<h;j++) {
+        y = yprime + yi + FIXP * h / 2;
+        yprime += c;
 
-    x = xprime + xi + FIXP*w/2;
-    xprime += s;
-
-    y = yprime + yi + FIXP*h/2;
-    yprime += c;
-
-    for ( i=0 ; i<w ; i++ ) {
-      x += c;
-      y -= s;
-      put_pixel(i, j, ipol(tab_r, x, y), ipol(tab_g, x, y), ipol(tab_b, x, y));
+        for (i = 0; i < w; i++ ) {
+            x += c;
+            y -= s;
+            put_pixel(i, j, ipol(tab_r, x, y), ipol(tab_g, x, y), ipol(tab_b, x, y));
+        }
     }
-  }
 }
 
 #define W 256
 #define H 256
 
-static void init_demo(const char *filename) {
-  int i,j;
-  int h;
-  int radian;
-  char line[3 * W];
+static void init_demo(const char *filename)
+{
+    int i, j;
+    int h;
+    int radian;
+    char line[3 * W];
 
-  FILE *fichier;
+    FILE *fichier;
 
-  fichier = fopen(filename,"rb");
-  if (!fichier) {
-      perror(filename);
-      exit(1);
-  }
-
-  fread(line, 1, 15, fichier);
-  for (i=0;i<H;i++) {
-    fread(line,1,3*W,fichier);
-    for (j=0;j<W;j++) {
-          tab_r[W*i+j] = line[3*j    ];
-          tab_g[W*i+j] = line[3*j + 1];
-          tab_b[W*i+j] = line[3*j + 2];
+    fichier = fopen(filename, "rb");
+    if (!fichier) {
+        perror(filename);
+        exit(1);
     }
-  }
-  fclose(fichier);
 
-  /* tables sin/cos */
-  for (i=0;i<360;i++) {
-    radian = 2*i*MY_PI/360;
-    h = 2*FIXP + int_sin (radian);
-    h_cos[i] = ( h * int_sin (radian + MY_PI/2) )/2/FIXP;
-    h_sin[i] = ( h * int_sin (radian          ) )/2/FIXP;
-  }
+    fread(line, 1, 15, fichier);
+    for (i = 0; i < H; i++) {
+        fread(line, 1, 3 * W, fichier);
+        for (j = 0; j < W; j++) {
+            tab_r[W * i + j] = line[3 * j    ];
+            tab_g[W * i + j] = line[3 * j + 1];
+            tab_b[W * i + j] = line[3 * j + 2];
+        }
+    }
+    fclose(fichier);
+
+    /* tables sin/cos */
+    for (i = 0; i < 360; i++) {
+        radian = 2 * i * MY_PI / 360;
+        h      = 2 * FIXP + int_sin (radian);
+        h_cos[i] = (h * int_sin(radian + MY_PI / 2)) / 2 / FIXP;
+        h_sin[i] = (h * int_sin(radian)            ) / 2 / FIXP;
+    }
 }
 
 int main(int argc, char **argv)
@@ -279,13 +284,13 @@ int main(int argc, char **argv)
     h = DEFAULT_HEIGHT;
 
     rgb_tab = malloc(w * h * 3);
-    wrap = w * 3;
-    width = w;
-    height = h;
+    wrap    = w * 3;
+    width   = w;
+    height  = h;
 
     init_demo(argv[2]);
 
-    for(i=0;i<DEFAULT_NB_PICT;i++) {
+    for (i = 0; i < DEFAULT_NB_PICT; i++) {
         snprintf(buf, sizeof(buf), "%s%02d.pgm", argv[1], i);
         gen_image(i, w, h);
         pgmyuv_save(buf, w, h, rgb_tab);

From e9c10459a36869b5779a4c6cd52d879f23c6b294 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Mon, 9 May 2011 12:23:55 +0200
Subject: [PATCH 15/32] rotozoom: Drop some unnecessary parentheses.

---
 tests/rotozoom.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/rotozoom.c b/tests/rotozoom.c
index 03e9afb86f..209ed027d1 100644
--- a/tests/rotozoom.c
+++ b/tests/rotozoom.c
@@ -135,13 +135,13 @@ static void pgmyuv_save(const char *filename, int w, int h,
     unsigned char *lum_tab, *cb_tab, *cr_tab;
 
     lum_tab = malloc(w * h);
-    cb_tab  = malloc((w * h) / 4);
-    cr_tab  = malloc((w * h) / 4);
+    cb_tab  = malloc(w * h / 4);
+    cr_tab  = malloc(w * h / 4);
 
     rgb24_to_yuv420p(lum_tab, cb_tab, cr_tab, rgb_tab, w, h);
 
     f = fopen(filename, "wb");
-    fprintf(f, "P5\n%d %d\n%d\n", w, (h * 3) / 2, 255);
+    fprintf(f, "P5\n%d %d\n%d\n", w, h * 3 / 2, 255);
     fwrite(lum_tab, 1, w * h, f);
     h2 = h / 2;
     w2 = w / 2;
@@ -264,8 +264,8 @@ static void init_demo(const char *filename)
     for (i = 0; i < 360; i++) {
         radian = 2 * i * MY_PI / 360;
         h      = 2 * FIXP + int_sin (radian);
-        h_cos[i] = (h * int_sin(radian + MY_PI / 2)) / 2 / FIXP;
-        h_sin[i] = (h * int_sin(radian)            ) / 2 / FIXP;
+        h_cos[i] = h * int_sin(radian + MY_PI / 2) / 2 / FIXP;
+        h_sin[i] = h * int_sin(radian)             / 2 / FIXP;
     }
 }
 

From 5a37c12c82323c0b1f06cf4b8030bcabb554765d Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Mon, 9 May 2011 12:26:00 +0200
Subject: [PATCH 16/32] rotozoom: Drop silly UINT8 typedef.

---
 tests/rotozoom.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/rotozoom.c b/tests/rotozoom.c
index 209ed027d1..642cefa56f 100644
--- a/tests/rotozoom.c
+++ b/tests/rotozoom.c
@@ -56,14 +56,14 @@ static int64_t int_sin(int64_t a)
 #define SCALEBITS 8
 #define ONE_HALF  (1 << (SCALEBITS - 1))
 #define FIX(x)    ((int) ((x) * (1L << SCALEBITS) + 0.5))
-typedef unsigned char UINT8;
 
-static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr,
-                             UINT8 *src, int width, int height)
+static void rgb24_to_yuv420p(unsigned char *lum, unsigned char *cb,
+                             unsigned char *cr, unsigned char *src,
+                             int width, int height)
 {
     int wrap, wrap3, x, y;
     int r, g, b, r1, g1, b1;
-    UINT8 *p;
+    unsigned char *p;
 
     wrap  = width;
     wrap3 = width * 3;

From 2131e8590c447575a1c23bbc9f7e0bf9592d8997 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Mon, 9 May 2011 12:33:45 +0200
Subject: [PATCH 17/32] rotozoom: Make init_demo() return int and check for
 errors on invocation.

---
 tests/rotozoom.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/rotozoom.c b/tests/rotozoom.c
index 642cefa56f..aebcc8e74a 100644
--- a/tests/rotozoom.c
+++ b/tests/rotozoom.c
@@ -234,7 +234,7 @@ static void gen_image(int num, int w, int h)
 #define W 256
 #define H 256
 
-static void init_demo(const char *filename)
+static int init_demo(const char *filename)
 {
     int i, j;
     int h;
@@ -246,7 +246,7 @@ static void init_demo(const char *filename)
     fichier = fopen(filename, "rb");
     if (!fichier) {
         perror(filename);
-        exit(1);
+        return 1;
     }
 
     fread(line, 1, 15, fichier);
@@ -267,6 +267,8 @@ static void init_demo(const char *filename)
         h_cos[i] = h * int_sin(radian + MY_PI / 2) / 2 / FIXP;
         h_sin[i] = h * int_sin(radian)             / 2 / FIXP;
     }
+
+  return 0;
 }
 
 int main(int argc, char **argv)
@@ -288,7 +290,8 @@ int main(int argc, char **argv)
     width   = w;
     height  = h;
 
-    init_demo(argv[2]);
+    if (init_demo(argv[2]))
+        return 1;
 
     for (i = 0; i < DEFAULT_NB_PICT; i++) {
         snprintf(buf, sizeof(buf), "%s%02d.pgm", argv[1], i);

From 771339ca206468636a64a6041852068be2da3dd2 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Mon, 9 May 2011 11:16:55 +0200
Subject: [PATCH 18/32] rotozoom: Return an error value instead of calling
 exit().

---
 tests/rotozoom.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/rotozoom.c b/tests/rotozoom.c
index aebcc8e74a..644ae8b1eb 100644
--- a/tests/rotozoom.c
+++ b/tests/rotozoom.c
@@ -279,7 +279,7 @@ int main(int argc, char **argv)
     if (argc != 3) {
         printf("usage: %s directory/ image.pnm\n"
                "generate a test video stream\n", argv[0]);
-        exit(1);
+        return 1;
     }
 
     w = DEFAULT_WIDTH;

From cbb0930f0ebdb2655296d7ae4424ee922168c5b7 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Mon, 9 May 2011 12:42:15 +0200
Subject: [PATCH 19/32] rotozoom: Check return value of fread().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes the warnings:
tests/rotozoom.c:252: warning: ignoring return value of ‘fread’, declared with attribute warn_unused_result
tests/rotozoom.c:254: warning: ignoring return value of ‘fread’, declared with attribute warn_unused_result
---
 tests/rotozoom.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/rotozoom.c b/tests/rotozoom.c
index 644ae8b1eb..d61ce21c10 100644
--- a/tests/rotozoom.c
+++ b/tests/rotozoom.c
@@ -249,9 +249,11 @@ static int init_demo(const char *filename)
         return 1;
     }
 
-    fread(line, 1, 15, fichier);
+    if (fread(line, 1, 15, fichier) != 15)
+        return 1;
     for (i = 0; i < H; i++) {
-        fread(line, 1, 3 * W, fichier);
+        if (fread(line, 1, 3 * W, fichier) != 3 * W)
+            return 1;
         for (j = 0; j < W; j++) {
             tab_r[W * i + j] = line[3 * j    ];
             tab_g[W * i + j] = line[3 * j + 1];

From e1e0ca70eb0cd469b4ba757e3f72e7540ef04056 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Mon, 9 May 2011 13:00:04 +0200
Subject: [PATCH 20/32] rotozoom: Eliminate French variable name.

---
 tests/rotozoom.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/rotozoom.c b/tests/rotozoom.c
index d61ce21c10..ff817acc3b 100644
--- a/tests/rotozoom.c
+++ b/tests/rotozoom.c
@@ -241,18 +241,18 @@ static int init_demo(const char *filename)
     int radian;
     char line[3 * W];
 
-    FILE *fichier;
+    FILE *input_file;
 
-    fichier = fopen(filename, "rb");
-    if (!fichier) {
+    input_file = fopen(filename, "rb");
+    if (!input_file) {
         perror(filename);
         return 1;
     }
 
-    if (fread(line, 1, 15, fichier) != 15)
+    if (fread(line, 1, 15, input_file) != 15)
         return 1;
     for (i = 0; i < H; i++) {
-        if (fread(line, 1, 3 * W, fichier) != 3 * W)
+        if (fread(line, 1, 3 * W, input_file) != 3 * W)
             return 1;
         for (j = 0; j < W; j++) {
             tab_r[W * i + j] = line[3 * j    ];
@@ -260,7 +260,7 @@ static int init_demo(const char *filename)
             tab_b[W * i + j] = line[3 * j + 2];
         }
     }
-    fclose(fichier);
+    fclose(input_file);
 
     /* tables sin/cos */
     for (i = 0; i < 360; i++) {

From 2caf19e90f270abe1e80a3e85acaf0eb5c9d0aac Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 10 May 2011 20:23:08 -0400
Subject: [PATCH 21/32] h264pred: fix aliasing violations.

Tested to fix Haiku H264/10bit fate failures, may also fix others.
---
 libavcodec/h264pred_template.c | 163 ++++++++++++++++++---------------
 libavcodec/high_bit_depth.h    |   3 +
 2 files changed, 93 insertions(+), 73 deletions(-)

diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
index 066e837cdf..c600133346 100644
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@@ -31,20 +31,21 @@
 static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, int _stride){
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
-    const pixel4 a= ((pixel4*)(src-stride))[0];
-    ((pixel4*)(src+0*stride))[0]= a;
-    ((pixel4*)(src+1*stride))[0]= a;
-    ((pixel4*)(src+2*stride))[0]= a;
-    ((pixel4*)(src+3*stride))[0]= a;
+    const pixel4 a= AV_RN4PA(src-stride);
+
+    AV_WN4PA(src+0*stride, a);
+    AV_WN4PA(src+1*stride, a);
+    AV_WN4PA(src+2*stride, a);
+    AV_WN4PA(src+3*stride, a);
 }
 
 static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, int _stride){
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
-    ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]);
-    ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]);
-    ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]);
-    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]);
+    AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
+    AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
+    AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
+    AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride]));
 }
 
 static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
@@ -52,60 +53,69 @@ static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _strid
     int stride = _stride/sizeof(pixel);
     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                    + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
+    const pixel4 a = PIXEL_SPLAT_X4(dc);
 
-    ((pixel4*)(src+0*stride))[0]=
-    ((pixel4*)(src+1*stride))[0]=
-    ((pixel4*)(src+2*stride))[0]=
-    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
+    AV_WN4PA(src+0*stride, a);
+    AV_WN4PA(src+1*stride, a);
+    AV_WN4PA(src+2*stride, a);
+    AV_WN4PA(src+3*stride, a);
 }
 
 static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
     const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
+    const pixel4 a = PIXEL_SPLAT_X4(dc);
 
-    ((pixel4*)(src+0*stride))[0]=
-    ((pixel4*)(src+1*stride))[0]=
-    ((pixel4*)(src+2*stride))[0]=
-    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
+    AV_WN4PA(src+0*stride, a);
+    AV_WN4PA(src+1*stride, a);
+    AV_WN4PA(src+2*stride, a);
+    AV_WN4PA(src+3*stride, a);
 }
 
 static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
+    const pixel4 a = PIXEL_SPLAT_X4(dc);
 
-    ((pixel4*)(src+0*stride))[0]=
-    ((pixel4*)(src+1*stride))[0]=
-    ((pixel4*)(src+2*stride))[0]=
-    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
+    AV_WN4PA(src+0*stride, a);
+    AV_WN4PA(src+1*stride, a);
+    AV_WN4PA(src+2*stride, a);
+    AV_WN4PA(src+3*stride, a);
 }
 
 static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
-    ((pixel4*)(src+0*stride))[0]=
-    ((pixel4*)(src+1*stride))[0]=
-    ((pixel4*)(src+2*stride))[0]=
-    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
+    const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
+
+    AV_WN4PA(src+0*stride, a);
+    AV_WN4PA(src+1*stride, a);
+    AV_WN4PA(src+2*stride, a);
+    AV_WN4PA(src+3*stride, a);
 }
 
 static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
-    ((pixel4*)(src+0*stride))[0]=
-    ((pixel4*)(src+1*stride))[0]=
-    ((pixel4*)(src+2*stride))[0]=
-    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
+    const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
+
+    AV_WN4PA(src+0*stride, a);
+    AV_WN4PA(src+1*stride, a);
+    AV_WN4PA(src+2*stride, a);
+    AV_WN4PA(src+3*stride, a);
 }
 
 static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
-    ((pixel4*)(src+0*stride))[0]=
-    ((pixel4*)(src+1*stride))[0]=
-    ((pixel4*)(src+2*stride))[0]=
-    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
+    const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
+
+    AV_WN4PA(src+0*stride, a);
+    AV_WN4PA(src+1*stride, a);
+    AV_WN4PA(src+2*stride, a);
+    AV_WN4PA(src+3*stride, a);
 }
 
 
@@ -286,16 +296,16 @@ static void FUNCC(pred16x16_vertical)(uint8_t *_src, int _stride){
     int i;
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
-    const pixel4 a = ((pixel4*)(src-stride))[0];
-    const pixel4 b = ((pixel4*)(src-stride))[1];
-    const pixel4 c = ((pixel4*)(src-stride))[2];
-    const pixel4 d = ((pixel4*)(src-stride))[3];
+    const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
+    const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
+    const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
+    const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3);
 
     for(i=0; i<16; i++){
-        ((pixel4*)(src+i*stride))[0] = a;
-        ((pixel4*)(src+i*stride))[1] = b;
-        ((pixel4*)(src+i*stride))[2] = c;
-        ((pixel4*)(src+i*stride))[3] = d;
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
+        AV_WN4PA(((pixel4*)(src+i*stride))+2, c);
+        AV_WN4PA(((pixel4*)(src+i*stride))+3, d);
     }
 }
 
@@ -305,19 +315,21 @@ static void FUNCC(pred16x16_horizontal)(uint8_t *_src, int stride){
     stride /= sizeof(pixel);
 
     for(i=0; i<16; i++){
-        ((pixel4*)(src+i*stride))[0] =
-        ((pixel4*)(src+i*stride))[1] =
-        ((pixel4*)(src+i*stride))[2] =
-        ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]);
+        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
+
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
+        AV_WN4PA(((pixel4*)(src+i*stride))+2, a);
+        AV_WN4PA(((pixel4*)(src+i*stride))+3, a);
     }
 }
 
 #define PREDICT_16x16_DC(v)\
     for(i=0; i<16; i++){\
-        AV_WN4P(src+ 0, v);\
-        AV_WN4P(src+ 4, v);\
-        AV_WN4P(src+ 8, v);\
-        AV_WN4P(src+12, v);\
+        AV_WN4PA(src+ 0, v);\
+        AV_WN4PA(src+ 4, v);\
+        AV_WN4PA(src+ 8, v);\
+        AV_WN4PA(src+12, v);\
         src += stride;\
     }
 
@@ -432,12 +444,12 @@ static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){
     int i;
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
-    const pixel4 a= ((pixel4*)(src-stride))[0];
-    const pixel4 b= ((pixel4*)(src-stride))[1];
+    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
+    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
 
     for(i=0; i<8; i++){
-        ((pixel4*)(src+i*stride))[0]= a;
-        ((pixel4*)(src+i*stride))[1]= b;
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
     }
 }
 
@@ -447,19 +459,21 @@ static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){
     stride /= sizeof(pixel);
 
     for(i=0; i<8; i++){
-        ((pixel4*)(src+i*stride))[0]=
-        ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]);
+        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
     }
 }
 
 #define PRED8x8_X(n, v)\
 static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\
     int i;\
+    const pixel4 a = PIXEL_SPLAT_X4(v);\
     pixel *src = (pixel*)_src;\
     stride /= sizeof(pixel);\
     for(i=0; i<8; i++){\
-        ((pixel4*)(src+i*stride))[0]=\
-        ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
     }\
 }
 
@@ -483,12 +497,12 @@ static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){
     dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
 
     for(i=0; i<4; i++){
-        ((pixel4*)(src+i*stride))[0]=
-        ((pixel4*)(src+i*stride))[1]= dc0splat;
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat);
     }
     for(i=4; i<8; i++){
-        ((pixel4*)(src+i*stride))[0]=
-        ((pixel4*)(src+i*stride))[1]= dc2splat;
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat);
     }
 }
 
@@ -508,12 +522,12 @@ static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){
     dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
 
     for(i=0; i<4; i++){
-        ((pixel4*)(src+i*stride))[0]= dc0splat;
-        ((pixel4*)(src+i*stride))[1]= dc1splat;
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
     }
     for(i=4; i<8; i++){
-        ((pixel4*)(src+i*stride))[0]= dc0splat;
-        ((pixel4*)(src+i*stride))[1]= dc1splat;
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
     }
 }
 
@@ -536,12 +550,12 @@ static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){
     dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
 
     for(i=0; i<4; i++){
-        ((pixel4*)(src+i*stride))[0]= dc0splat;
-        ((pixel4*)(src+i*stride))[1]= dc1splat;
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
     }
     for(i=4; i<8; i++){
-        ((pixel4*)(src+i*stride))[0]= dc2splat;
-        ((pixel4*)(src+i*stride))[1]= dc3splat;
+        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
+        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
     }
 }
 
@@ -636,8 +650,8 @@ static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
 #define PREDICT_8x8_DC(v) \
     int y; \
     for( y = 0; y < 8; y++ ) { \
-        ((pixel4*)src)[0] = \
-        ((pixel4*)src)[1] = v; \
+        AV_WN4PA(((pixel4*)src)+0, v); \
+        AV_WN4PA(((pixel4*)src)+1, v); \
         src += stride; \
     }
 
@@ -693,6 +707,7 @@ static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_top
     int y;
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
+    pixel4 a, b;
 
     PREDICT_8x8_LOAD_TOP;
     src[0] = t0;
@@ -703,9 +718,11 @@ static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_top
     src[5] = t5;
     src[6] = t6;
     src[7] = t7;
+    a = AV_RN4PA(((pixel4*)src)+0);
+    b = AV_RN4PA(((pixel4*)src)+1);
     for( y = 1; y < 8; y++ ) {
-        ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0];
-        ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1];
+        AV_WN4PA(((pixel4*)(src+y*stride))+0, a);
+        AV_WN4PA(((pixel4*)(src+y*stride))+1, b);
     }
 }
 static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
diff --git a/libavcodec/high_bit_depth.h b/libavcodec/high_bit_depth.h
index 6f2b6a74f4..511cd00f3a 100644
--- a/libavcodec/high_bit_depth.h
+++ b/libavcodec/high_bit_depth.h
@@ -14,6 +14,7 @@
 #   undef rnd_avg_pixel4
 #   undef AV_RN2P
 #   undef AV_RN4P
+#   undef AV_RN4PA
 #   undef AV_WN2P
 #   undef AV_WN4P
 #   undef AV_WN4PA
@@ -46,6 +47,7 @@ CLIP_PIXEL(10)
 #   define    rnd_avg_pixel4    rnd_avg64
 #   define AV_RN2P  AV_RN32
 #   define AV_RN4P  AV_RN64
+#   define AV_RN4PA AV_RN64A
 #   define AV_WN2P  AV_WN32
 #   define AV_WN4P  AV_WN64
 #   define AV_WN4PA AV_WN64A
@@ -61,6 +63,7 @@ CLIP_PIXEL(10)
 #   define    rnd_avg_pixel4    rnd_avg32
 #   define AV_RN2P  AV_RN16
 #   define AV_RN4P  AV_RN32
+#   define AV_RN4PA AV_RN32A
 #   define AV_WN2P  AV_WN16
 #   define AV_WN4P  AV_WN32
 #   define AV_WN4PA AV_WN32A

From bea705752d6448578482cfa022944a9795388f14 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Wed, 11 May 2011 12:22:28 +0200
Subject: [PATCH 22/32] Remove unused softfloat implementation.

The softfloat functionality is unused, not installed and incomplete.
On platforms without floating point units, the compiler provides a softfloat
implementation so there is no point in carrying this code around locally.
---
 doc/avutil.txt        |   1 -
 libavutil/Makefile    |   2 +-
 libavutil/softfloat.c |  72 ------------------------
 libavutil/softfloat.h | 126 ------------------------------------------
 4 files changed, 1 insertion(+), 200 deletions(-)
 delete mode 100644 libavutil/softfloat.c
 delete mode 100644 libavutil/softfloat.h

diff --git a/doc/avutil.txt b/doc/avutil.txt
index 210bd07264..0847683d1d 100644
--- a/doc/avutil.txt
+++ b/doc/avutil.txt
@@ -19,7 +19,6 @@ integer.c               128bit integer math
 lls.c
 mathematics.c           greatest common divisor, integer sqrt, integer log2, ...
 mem.c                   memory allocation routines with guaranteed alignment
-softfloat.c
 
 Headers:
 bswap.h                 big/little/native-endian conversion code
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 8c3cc60dab..1386ebb190 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -75,7 +75,7 @@ OBJS-$(ARCH_ARM) += arm/cpu.o
 OBJS-$(ARCH_PPC) += ppc/cpu.o
 OBJS-$(ARCH_X86) += x86/cpu.o
 
-TESTPROGS = adler32 aes base64 cpu crc des lls md5 pca sha softfloat tree
+TESTPROGS = adler32 aes base64 cpu crc des lls md5 pca sha tree
 TESTPROGS-$(HAVE_LZO1X_999_COMPRESS) += lzo
 
 DIRS = arm bfin sh4 x86
diff --git a/libavutil/softfloat.c b/libavutil/softfloat.c
deleted file mode 100644
index 55969fb141..0000000000
--- a/libavutil/softfloat.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <assert.h>
-#include "softfloat.h"
-#include "common.h"
-#include "log.h"
-
-#undef printf
-
-int main(void){
-    SoftFloat one= av_int2sf(1, 0);
-    SoftFloat sf1, sf2;
-    double d1, d2;
-    int i, j;
-    av_log_set_level(AV_LOG_DEBUG);
-
-    d1= 1;
-    for(i= 0; i<10; i++){
-        d1= 1/(d1+1);
-    }
-    printf("test1 double=%d\n", (int)(d1 * (1<<24)));
-
-    sf1= one;
-    for(i= 0; i<10; i++){
-        sf1= av_div_sf(one, av_normalize_sf(av_add_sf(one, sf1)));
-    }
-    printf("test1 sf    =%d\n", av_sf2int(sf1, 24));
-
-
-    for(i= 0; i<100; i++){
-        START_TIMER
-        d1= i;
-        d2= i/100.0;
-        for(j= 0; j<1000; j++){
-            d1= (d1+1)*d2;
-        }
-        STOP_TIMER("float add mul")
-    }
-    printf("test2 double=%d\n", (int)(d1 * (1<<24)));
-
-    for(i= 0; i<100; i++){
-        START_TIMER
-        sf1= av_int2sf(i, 0);
-        sf2= av_div_sf(av_int2sf(i, 2), av_int2sf(200, 3));
-        for(j= 0; j<1000; j++){
-            sf1= av_mul_sf(av_add_sf(sf1, one),sf2);
-        }
-        STOP_TIMER("softfloat add mul")
-    }
-    printf("test2 sf    =%d (%d %d)\n", av_sf2int(sf1, 24), sf1.exp, sf1.mant);
-    return 0;
-}
diff --git a/libavutil/softfloat.h b/libavutil/softfloat.h
deleted file mode 100644
index 3078bd7496..0000000000
--- a/libavutil/softfloat.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_SOFTFLOAT_H
-#define AVUTIL_SOFTFLOAT_H
-
-#include <stdint.h>
-#include "common.h"
-
-#define MIN_EXP -126
-#define MAX_EXP  126
-#define ONE_BITS 29
-
-typedef struct SoftFloat{
-    int32_t  exp;
-    int32_t mant;
-}SoftFloat;
-
-static av_const SoftFloat av_normalize_sf(SoftFloat a){
-    if(a.mant){
-#if 1
-        while((a.mant + 0x20000000U)<0x40000000U){
-            a.mant += a.mant;
-            a.exp  -= 1;
-        }
-#else
-        int s=ONE_BITS + 1 - av_log2(a.mant ^ (a.mant<<1));
-        a.exp   -= s;
-        a.mant <<= s;
-#endif
-        if(a.exp < MIN_EXP){
-            a.exp = MIN_EXP;
-            a.mant= 0;
-        }
-    }else{
-        a.exp= MIN_EXP;
-    }
-    return a;
-}
-
-static inline av_const SoftFloat av_normalize1_sf(SoftFloat a){
-#if 1
-    if(a.mant + 0x40000000 < 0){
-        a.exp++;
-        a.mant>>=1;
-    }
-    return a;
-#elif 1
-    int t= a.mant + 0x40000000 < 0;
-    return (SoftFloat){a.exp+t, a.mant>>t};
-#else
-    int t= (a.mant + 0x40000000U)>>31;
-    return (SoftFloat){a.exp+t, a.mant>>t};
-#endif
-}
-
-/**
- * @return Will not be more denormalized than a+b. So if either input is
- *         normalized, then the output will not be worse then the other input.
- *         If both are normalized, then the output will be normalized.
- */
-static inline av_const SoftFloat av_mul_sf(SoftFloat a, SoftFloat b){
-    a.exp += b.exp;
-    a.mant = (a.mant * (int64_t)b.mant) >> ONE_BITS;
-    return av_normalize1_sf(a);
-}
-
-/**
- * b has to be normalized and not zero.
- * @return Will not be more denormalized than a.
- */
-static av_const SoftFloat av_div_sf(SoftFloat a, SoftFloat b){
-    a.exp -= b.exp+1;
-    a.mant = ((int64_t)a.mant<<(ONE_BITS+1)) / b.mant;
-    return av_normalize1_sf(a);
-}
-
-static inline av_const int av_cmp_sf(SoftFloat a, SoftFloat b){
-    int t= a.exp - b.exp;
-    if(t<0) return (a.mant >> (-t)) -  b.mant      ;
-    else    return  a.mant          - (b.mant >> t);
-}
-
-static inline av_const SoftFloat av_add_sf(SoftFloat a, SoftFloat b){
-    int t= a.exp - b.exp;
-    if(t<0) return av_normalize1_sf((SoftFloat){b.exp, b.mant + (a.mant >> (-t))});
-    else    return av_normalize1_sf((SoftFloat){a.exp, a.mant + (b.mant >>   t )});
-}
-
-static inline av_const SoftFloat av_sub_sf(SoftFloat a, SoftFloat b){
-    return av_add_sf(a, (SoftFloat){b.exp, -b.mant});
-}
-
-//FIXME sqrt, log, exp, pow, sin, cos
-
-static inline av_const SoftFloat av_int2sf(int v, int frac_bits){
-    return av_normalize_sf((SoftFloat){ONE_BITS-frac_bits, v});
-}
-
-/**
- * Rounding is to -inf.
- */
-static inline av_const int av_sf2int(SoftFloat v, int frac_bits){
-    v.exp += frac_bits - ONE_BITS;
-    if(v.exp >= 0) return v.mant <<  v.exp ;
-    else           return v.mant >>(-v.exp);
-}
-
-#endif /* AVUTIL_SOFTFLOAT_H */

From b437f5b055497fc81b8061de593f3ef27657c1e3 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Date: Mon, 9 May 2011 21:59:20 +0200
Subject: [PATCH 23/32] tiff: add support for inverted FillOrder for
 uncompressed data

Fix decoding of file b.tif, trac issue #168.

Signed-off-by: Diego Biurrun <diego@biurrun.de>
---
 libavcodec/tiff.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/libavcodec/tiff.c b/libavcodec/tiff.c
index 3cc3a42500..1ec78a7125 100644
--- a/libavcodec/tiff.c
+++ b/libavcodec/tiff.c
@@ -170,7 +170,13 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t* dst, int stride, const uin
         }
         switch(s->compr){
         case TIFF_RAW:
-            memcpy(dst, src, width);
+            if (!s->fill_order) {
+                memcpy(dst, src, width);
+            } else {
+                int i;
+                for (i = 0; i < width; i++)
+                    dst[i] = av_reverse[src[i]];
+            }
             src += width;
             break;
         case TIFF_PACKBITS:

From 5c511ad4ce20aff96ec587de1a8be6f28aed4544 Mon Sep 17 00:00:00 2001
From: Baptiste Coudurier <baptiste.coudurier@gmail.com>
Date: Wed, 11 May 2011 10:00:50 +0200
Subject: [PATCH 24/32] swscale: extend YUV422p support to 10bits depth

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
 libavcodec/utils.c  |  2 ++
 libavutil/pixdesc.c | 23 +++++++++++++++++++++++
 libavutil/pixfmt.h  |  3 +++
 3 files changed, 28 insertions(+)

diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 9a19cde9cc..9e879940a9 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -140,6 +140,8 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height, int l
     case PIX_FMT_YUV420P9BE:
     case PIX_FMT_YUV420P10LE:
     case PIX_FMT_YUV420P10BE:
+    case PIX_FMT_YUV422P10LE:
+    case PIX_FMT_YUV422P10BE:
         w_align= 16; //FIXME check for non mpeg style codecs and use less alignment
         h_align= 16;
         if(s->codec_id == CODEC_ID_MPEG2VIDEO || s->codec_id == CODEC_ID_MJPEG || s->codec_id == CODEC_ID_AMV || s->codec_id == CODEC_ID_THP || s->codec_id == CODEC_ID_H264)
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
index a291141436..bff45e522a 100644
--- a/libavutil/pixdesc.c
+++ b/libavutil/pixdesc.c
@@ -809,6 +809,29 @@ const AVPixFmtDescriptor av_pix_fmt_descriptors[PIX_FMT_NB] = {
         },
         .flags = PIX_FMT_BE,
     },
+    [PIX_FMT_YUV422P10LE] = {
+        .name = "yuv422p10le",
+        .nb_components= 3,
+        .log2_chroma_w= 1,
+        .log2_chroma_h= 0,
+        .comp = {
+            {0,1,1,0,9},        /* Y */
+            {1,1,1,0,9},        /* U */
+            {2,1,1,0,9},        /* V */
+        },
+    },
+    [PIX_FMT_YUV422P10BE] = {
+        .name = "yuv422p10be",
+        .nb_components= 3,
+        .log2_chroma_w= 1,
+        .log2_chroma_h= 0,
+        .comp = {
+            {0,1,1,0,9},        /* Y */
+            {1,1,1,0,9},        /* U */
+            {2,1,1,0,9},        /* V */
+        },
+        .flags = PIX_FMT_BE,
+    },
     [PIX_FMT_YUV422P16LE] = {
         .name = "yuv422p16le",
         .nb_components= 3,
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index fafbf9be1a..533eb9f706 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -139,6 +139,8 @@ enum PixelFormat {
     PIX_FMT_YUV420P9LE, ///< planar YUV 4:2:0, 13.5bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
     PIX_FMT_YUV420P10BE,///< planar YUV 4:2:0, 15bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
     PIX_FMT_YUV420P10LE,///< planar YUV 4:2:0, 15bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
+    PIX_FMT_YUV422P10BE,///< planar YUV 4:2:2, 20bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
+    PIX_FMT_YUV422P10LE,///< planar YUV 4:2:2, 20bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
     PIX_FMT_NB,        ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
 };
 
@@ -165,6 +167,7 @@ enum PixelFormat {
 
 #define PIX_FMT_YUV420P9  PIX_FMT_NE(YUV420P9BE , YUV420P9LE)
 #define PIX_FMT_YUV420P10 PIX_FMT_NE(YUV420P10BE, YUV420P10LE)
+#define PIX_FMT_YUV422P10 PIX_FMT_NE(YUV422P10BE, YUV422P10LE)
 #define PIX_FMT_YUV420P16 PIX_FMT_NE(YUV420P16BE, YUV420P16LE)
 #define PIX_FMT_YUV422P16 PIX_FMT_NE(YUV422P16BE, YUV422P16LE)
 #define PIX_FMT_YUV444P16 PIX_FMT_NE(YUV444P16BE, YUV444P16LE)

From 083e715f339e4546e9be8a6e265360be87cca517 Mon Sep 17 00:00:00 2001
From: Reinhard Tartler <siretart@tauware.de>
Date: Wed, 11 May 2011 13:51:11 +0200
Subject: [PATCH 25/32] aac: workaround for compilation on cygwin

On cygwin, math.h needs to be included before float.h because of a bug
in the system headers. Including libavutil/libm.h first works around
this issue.

Longer discussion of the topic:
http://thread.gmane.org/gmane.comp.video.ffmpeg.devel/128582
---
 libavcodec/aaccoder.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c
index 15fe430732..83d3734089 100644
--- a/libavcodec/aaccoder.c
+++ b/libavcodec/aaccoder.c
@@ -30,13 +30,14 @@
  * add sane pulse detection
  ***********************************/
 
+#include "libavutil/libm.h" // brought forward to work around cygwin header breakage
+
 #include <float.h>
 #include "avcodec.h"
 #include "put_bits.h"
 #include "aac.h"
 #include "aacenc.h"
 #include "aactab.h"
-#include "libavutil/libm.h"
 
 /** bits needed to code codebook run value for long windows */
 static const uint8_t run_value_bits_long[64] = {

From c9e81d0783345a7b3b573a03ba5fa79ea90cd8c1 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Wed, 11 May 2011 08:45:17 +0200
Subject: [PATCH 26/32] lavc: deprecate named constants for deprecated
 antialias_algo.

---
 libavcodec/options.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/options.c b/libavcodec/options.c
index b22e53db9d..9c714fb73e 100644
--- a/libavcodec/options.c
+++ b/libavcodec/options.c
@@ -305,11 +305,11 @@ static const AVOption options[]={
 {"error", NULL, OFFSET(error_rate), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #if FF_API_ANTIALIAS_ALGO
 {"antialias", "MP3 antialias algorithm", OFFSET(antialias_algo), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|D, "aa"},
-#endif
 {"auto", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_AUTO }, INT_MIN, INT_MAX, V|D, "aa"},
 {"fastint", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_FASTINT }, INT_MIN, INT_MAX, V|D, "aa"},
 {"int", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_INT }, INT_MIN, INT_MAX, V|D, "aa"},
 {"float", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_FLOAT }, INT_MIN, INT_MAX, V|D, "aa"},
+#endif
 {"qns", "quantizer noise shaping", OFFSET(quantizer_noise_shaping), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"threads", NULL, OFFSET(thread_count), FF_OPT_TYPE_INT, {.dbl = 1 }, INT_MIN, INT_MAX, V|E|D},
 {"me_threshold", "motion estimaton threshold", OFFSET(me_threshold), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX},

From 880fa2183051dfe6a7e44879361f674a906f5aa7 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Wed, 11 May 2011 13:45:12 +0200
Subject: [PATCH 27/32] flacenc: use proper initializers for AVOption default
 values.

default_val was recently changes from double to a union, current code
wasn't updated for that.
---
 libavcodec/flacenc.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index 7685ff6ea0..8624a6d987 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -1330,22 +1330,22 @@ static av_cold int flac_encode_close(AVCodecContext *avctx)
 
 #define FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
 static const AVOption options[] = {
-{ "lpc_coeff_precision", "LPC coefficient precision", offsetof(FlacEncodeContext, options.lpc_coeff_precision), FF_OPT_TYPE_INT, 15, 0, MAX_LPC_PRECISION, FLAGS },
-{ "lpc_type", "LPC algorithm", offsetof(FlacEncodeContext, options.lpc_type), FF_OPT_TYPE_INT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_NB-1, FLAGS, "lpc_type" },
-{ "none",     NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_NONE,     INT_MIN, INT_MAX, FLAGS, "lpc_type" },
-{ "fixed",    NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_FIXED,    INT_MIN, INT_MAX, FLAGS, "lpc_type" },
-{ "levinson", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_LEVINSON, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
-{ "cholesky", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_CHOLESKY, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
-{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes),  FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, FLAGS },
-{ "min_partition_order",  NULL, offsetof(FlacEncodeContext, options.min_partition_order),  FF_OPT_TYPE_INT, -1,      -1, MAX_PARTITION_ORDER, FLAGS },
-{ "max_partition_order",  NULL, offsetof(FlacEncodeContext, options.max_partition_order),  FF_OPT_TYPE_INT, -1,      -1, MAX_PARTITION_ORDER, FLAGS },
-{ "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), FF_OPT_TYPE_INT, -1, -1, ORDER_METHOD_LOG, FLAGS, "predm" },
-{ "estimation", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_EST,    INT_MIN, INT_MAX, FLAGS, "predm" },
-{ "2level",     NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_2LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" },
-{ "4level",     NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_4LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" },
-{ "8level",     NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_8LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" },
-{ "search",     NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_SEARCH, INT_MIN, INT_MAX, FLAGS, "predm" },
-{ "log",        NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_LOG,    INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "lpc_coeff_precision", "LPC coefficient precision", offsetof(FlacEncodeContext, options.lpc_coeff_precision), FF_OPT_TYPE_INT, {.dbl = 15 }, 0, MAX_LPC_PRECISION, FLAGS },
+{ "lpc_type", "LPC algorithm", offsetof(FlacEncodeContext, options.lpc_type), FF_OPT_TYPE_INT, {.dbl = FF_LPC_TYPE_DEFAULT }, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_NB-1, FLAGS, "lpc_type" },
+{ "none",     NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_NONE },     INT_MIN, INT_MAX, FLAGS, "lpc_type" },
+{ "fixed",    NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_FIXED },    INT_MIN, INT_MAX, FLAGS, "lpc_type" },
+{ "levinson", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_LEVINSON }, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
+{ "cholesky", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_CHOLESKY }, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
+{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes),  FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, FLAGS },
+{ "min_partition_order",  NULL, offsetof(FlacEncodeContext, options.min_partition_order),  FF_OPT_TYPE_INT, {.dbl = -1 },      -1, MAX_PARTITION_ORDER, FLAGS },
+{ "max_partition_order",  NULL, offsetof(FlacEncodeContext, options.max_partition_order),  FF_OPT_TYPE_INT, {.dbl = -1 },      -1, MAX_PARTITION_ORDER, FLAGS },
+{ "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, ORDER_METHOD_LOG, FLAGS, "predm" },
+{ "estimation", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_EST },    INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "2level",     NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_2LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "4level",     NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_4LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "8level",     NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_8LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "search",     NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_SEARCH }, INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "log",        NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_LOG },    INT_MIN, INT_MAX, FLAGS, "predm" },
 { NULL },
 };
 

From 36dc49b713f81708e5637c18748c6f666f732caa Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Wed, 11 May 2011 13:56:42 +0200
Subject: [PATCH 28/32] doc/APIchanges: fill in missing hashes and dates.

---
 doc/APIchanges | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/APIchanges b/doc/APIchanges
index 0995b192a4..ee96ddf258 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -13,21 +13,21 @@ libavutil:   2011-04-18
 
 API changes, most recent first:
 
-2011-05-10 - xxxxxxx - lavc 53.3.0 - avcodec.h
+2011-05-10 - 188dea1 - lavc 53.3.0 - avcodec.h
   Deprecate AVLPCType and the following fields in
   AVCodecContext: lpc_coeff_precision, prediction_order_method,
   min_partition_order, max_partition_order, lpc_type, lpc_passes.
   Corresponding FLAC encoder options should be used instead.
 
-2011-04-XX - bebe72f - lavu 51.1.0 - avutil.h
+2011-04-26 - bebe72f - lavu 51.1.0 - avutil.h
   Add AVPictureType enum and av_get_picture_type_char(), deprecate
   FF_*_TYPE defines and av_get_pict_type_char() defined in
   libavcodec/avcodec.h.
 
-2011-04-xx - 10d3940 - lavfi 2.3.0 - avfilter.h
+2011-04-26 - 10d3940 - lavfi 2.3.0 - avfilter.h
   Add pict_type and key_frame fields to AVFilterBufferRefVideo.
 
-2011-04-xx - 7a11c82 - lavfi 2.2.0 - vsrc_buffer
+2011-04-26 - 7a11c82 - lavfi 2.2.0 - vsrc_buffer
   Add sample_aspect_ratio fields to vsrc_buffer arguments
 
 2011-04-21 - 94f7451 - lavc 53.1.0 - avcodec.h

From d2bf42895ac30d228491a8a95a5908351dc32783 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 11 May 2011 08:30:02 -0400
Subject: [PATCH 29/32] h264pred: fix one more aliasing violation.

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/h264pred_template.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
index c600133346..1c1fe0bc31 100644
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@@ -695,10 +695,12 @@ static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, int has_t
 {
     pixel *src = (pixel*)_src;
     int stride = _stride/sizeof(pixel);
+    pixel4 a;
 
     PREDICT_8x8_LOAD_LEFT;
-#define ROW(y) ((pixel4*)(src+y*stride))[0] =\
-               ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y)
+#define ROW(y) a = PIXEL_SPLAT_X4(l##y); \
+               AV_WN4PA(src+y*stride, a); \
+               AV_WN4PA(src+y*stride+4, a);
     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
 #undef ROW
 }

From b44c8ad280c221691560ae9625421416e20c483f Mon Sep 17 00:00:00 2001
From: Anatoly Nenashev <anatoly.nenashev@ovsoft.ru>
Date: Sun, 27 Mar 2011 21:41:48 +0200
Subject: [PATCH 30/32] Fix crash of interlaced MPEG2 decoding

Problem description, preliminary review discussion at
http://thread.gmane.org/gmane.comp.video.ffmpeg.devel/127731
---
 libavcodec/mpegvideo.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index f4de8dded4..2c0525e2ad 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -998,8 +998,13 @@ int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         s->pict_type, s->dropable);*/
 
     if(s->codec_id != CODEC_ID_H264){
-        if((s->last_picture_ptr==NULL || s->last_picture_ptr->data[0]==NULL) && s->pict_type!=AV_PICTURE_TYPE_I){
-            av_log(avctx, AV_LOG_ERROR, "warning: first frame is no keyframe\n");
+        if((s->last_picture_ptr==NULL || s->last_picture_ptr->data[0]==NULL) &&
+           (s->pict_type!=AV_PICTURE_TYPE_I || s->picture_structure != PICT_FRAME)){
+            if (s->pict_type != AV_PICTURE_TYPE_I)
+                av_log(avctx, AV_LOG_ERROR, "warning: first frame is no keyframe\n");
+            else if (s->picture_structure != PICT_FRAME)
+                av_log(avctx, AV_LOG_INFO, "allocate dummy last picture for field based first keyframe\n");
+
             /* Allocate a dummy frame */
             i= ff_find_unused_picture(s, 0);
             s->last_picture_ptr= &s->picture[i];

From 9aa91043f30cee1419555c0e299c94e655b0930a Mon Sep 17 00:00:00 2001
From: Baptiste Coudurier <baptiste.coudurier@gmail.com>
Date: Tue, 12 Apr 2011 15:29:09 -0700
Subject: [PATCH 31/32] Port SMPTE S302M audio decoder from FFmbc 0.3.

---
 Changelog              |   1 +
 doc/general.texi       |   1 +
 libavcodec/Makefile    |   1 +
 libavcodec/allcodecs.c |   1 +
 libavcodec/avcodec.h   |   1 +
 libavcodec/s302m.c     | 141 +++++++++++++++++++++++++++++++++++++++++
 libavformat/mpegts.c   |   1 +
 7 files changed, 147 insertions(+)
 create mode 100644 libavcodec/s302m.c

diff --git a/Changelog b/Changelog
index b5ceea9444..3d3fe6eb57 100644
--- a/Changelog
+++ b/Changelog
@@ -7,6 +7,7 @@ version <next>:
 - Lots of deprecated API cruft removed
 - fft and imdct optimizations for AVX (Sandy Bridge) processors
 - DPX image encoder
+- SMPTE 302M AES3 audio decoder
 
 
 version 0.7_beta1:
diff --git a/doc/general.texi b/doc/general.texi
index 598b9bc873..53482273f9 100644
--- a/doc/general.texi
+++ b/doc/general.texi
@@ -673,6 +673,7 @@ following image formats are supported:
 @item Sierra VMD audio       @tab     @tab  X
     @tab Used in Sierra VMD files.
 @item Smacker audio          @tab     @tab  X
+@item SMPTE 302M AES3 audio  @tab     @tab  X
 @item Speex                  @tab     @tab  E
     @tab supported through external library libspeex
 @item True Audio (TTA)       @tab     @tab  X
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 996b9b4d81..9040b32f57 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -324,6 +324,7 @@ OBJS-$(CONFIG_RV30_DECODER)            += rv30.o rv34.o rv30dsp.o        \
                                           mpegvideo.o error_resilience.o
 OBJS-$(CONFIG_RV40_DECODER)            += rv40.o rv34.o rv40dsp.o        \
                                           mpegvideo.o error_resilience.o
+OBJS-$(CONFIG_S302M_DECODER)           += s302m.o
 OBJS-$(CONFIG_SGI_DECODER)             += sgidec.o
 OBJS-$(CONFIG_SGI_ENCODER)             += sgienc.o rle.o
 OBJS-$(CONFIG_SHORTEN_DECODER)         += shorten.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 694674ce7b..3466ad94fd 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -177,6 +177,7 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC  (RV20, rv20);
     REGISTER_DECODER (RV30, rv30);
     REGISTER_DECODER (RV40, rv40);
+    REGISTER_DECODER (S302M, s302m);
     REGISTER_ENCDEC  (SGI, sgi);
     REGISTER_DECODER (SMACKER, smacker);
     REGISTER_DECODER (SMC, smc);
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 876ba8c21b..2eb218ba4f 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -232,6 +232,7 @@ enum CodecID {
     CODEC_ID_PCM_F64LE,
     CODEC_ID_PCM_BLURAY,
     CODEC_ID_PCM_LXF,
+    CODEC_ID_S302M,
 
     /* various ADPCM codecs */
     CODEC_ID_ADPCM_IMA_QT= 0x11000,
diff --git a/libavcodec/s302m.c b/libavcodec/s302m.c
new file mode 100644
index 0000000000..fb1fd867d0
--- /dev/null
+++ b/libavcodec/s302m.c
@@ -0,0 +1,141 @@
+/*
+ * SMPTE 302M decoder
+ * Copyright (c) 2008 Laurent Aimar <fenrir@videolan.org>
+ * Copyright (c) 2009 Baptiste Coudurier <baptiste.coudurier@gmail.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+
+#define AES3_HEADER_LEN 4
+
+static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
+                                    int buf_size)
+{
+    uint32_t h;
+    int frame_size, channels, id, bits;
+
+    if (buf_size <= AES3_HEADER_LEN) {
+        av_log(avctx, AV_LOG_ERROR, "frame is too short\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /*
+     * AES3 header :
+     * size:            16
+     * number channels   2
+     * channel_id        8
+     * bits per samples  2
+     * alignments        4
+     */
+
+    h = AV_RB32(buf);
+    frame_size =  (h >> 16) & 0xffff;
+    channels   = ((h >> 14) & 0x0003) * 2 +  2;
+    id         =  (h >>  6) & 0x00ff;
+    bits       = ((h >>  4) & 0x0003) * 4 + 16;
+
+    if (AES3_HEADER_LEN + frame_size != buf_size || bits > 24) {
+        av_log(avctx, AV_LOG_ERROR, "frame has invalid header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* Set output properties */
+    avctx->bits_per_coded_sample = bits;
+    if (bits > 16)
+        avctx->sample_fmt = SAMPLE_FMT_S32;
+    else
+        avctx->sample_fmt = SAMPLE_FMT_S16;
+
+    avctx->channels    = channels;
+    avctx->sample_rate = 48000;
+    avctx->bit_rate    = 48000 * avctx->channels * (avctx->bits_per_coded_sample + 4) +
+                         32 * (48000 / (buf_size * 8 /
+                                        (avctx->channels *
+                                         (avctx->bits_per_coded_sample + 4))));
+
+    return frame_size;
+}
+
+static int s302m_decode_frame(AVCodecContext *avctx, void *data,
+                              int *data_size, AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size       = avpkt->size;
+
+    int frame_size = s302m_parse_frame_header(avctx, buf, buf_size);
+    if (frame_size < 0)
+        return frame_size;
+
+    buf_size -= AES3_HEADER_LEN;
+    buf      += AES3_HEADER_LEN;
+
+    if (*data_size < 4 * buf_size * 8 / (avctx->bits_per_coded_sample + 4))
+        return -1;
+
+    if (avctx->bits_per_coded_sample == 24) {
+        uint32_t *o = data;
+        for (; buf_size > 6; buf_size -= 7) {
+            *o++ = (av_reverse[buf[2]]        << 24) |
+                   (av_reverse[buf[1]]        << 16) |
+                   (av_reverse[buf[0]]        <<  8);
+            *o++ = (av_reverse[buf[6] & 0xf0] << 28) |
+                   (av_reverse[buf[5]]        << 20) |
+                   (av_reverse[buf[4]]        << 12) |
+                   (av_reverse[buf[3] & 0x0f] <<  8);
+            buf += 7;
+        }
+        *data_size = (uint8_t*) o - (uint8_t*) data;
+    } else if (avctx->bits_per_coded_sample == 20) {
+        uint32_t *o = data;
+        for (; buf_size > 5; buf_size -= 6) {
+            *o++ = (av_reverse[buf[2] & 0xf0] << 28) |
+                   (av_reverse[buf[1]]        << 20) |
+                   (av_reverse[buf[0]]        << 12);
+            *o++ = (av_reverse[buf[5] & 0xf0] << 28) |
+                   (av_reverse[buf[4]]        << 20) |
+                   (av_reverse[buf[3]]        << 12);
+            buf += 6;
+        }
+        *data_size = (uint8_t*) o - (uint8_t*) data;
+    } else {
+        uint16_t *o = data;
+        for (; buf_size > 4; buf_size -= 5) {
+            *o++ = (av_reverse[buf[1]]        <<  8) |
+                    av_reverse[buf[0]];
+            *o++ = (av_reverse[buf[4] & 0xf0] << 12) |
+                   (av_reverse[buf[3]]        <<  4) |
+                    av_reverse[buf[2] & 0x0f];
+            buf += 5;
+        }
+        *data_size = (uint8_t*) o - (uint8_t*) data;
+    }
+
+    return buf - avpkt->data;
+}
+
+
+AVCodec ff_s302m_decoder = {
+    .name           = "s302m",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = CODEC_ID_S302M,
+    .priv_data_size = 0,
+    .decode         = s302m_decode_frame,
+    .long_name      = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
+};
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index 3130eb9ff5..cdefb84864 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -524,6 +524,7 @@ static const StreamType MISC_types[] = {
 static const StreamType REGD_types[] = {
     { MKTAG('d','r','a','c'), AVMEDIA_TYPE_VIDEO, CODEC_ID_DIRAC },
     { MKTAG('A','C','-','3'), AVMEDIA_TYPE_AUDIO,   CODEC_ID_AC3 },
+    { MKTAG('B','S','S','D'), AVMEDIA_TYPE_AUDIO, CODEC_ID_S302M },
     { 0 },
 };
 

From 5705b02079449c685a3dd337fcc3a8b440dca4a0 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <jason@x264.com>
Date: Wed, 11 May 2011 10:11:55 -0700
Subject: [PATCH 32/32] 10-bit H.264 x86 chroma v loopfilter asm

Also delete some unused deblock asm macros.
---
 libavcodec/x86/h264_deblock.asm       |  41 ----------
 libavcodec/x86/h264_deblock_10bit.asm | 106 ++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_mmx.c          |  16 +++-
 3 files changed, 121 insertions(+), 42 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 37866812e7..0cf013f58f 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -106,47 +106,6 @@ cextern pb_A1
     TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
 %endmacro
 
-%macro TRANSPOSE4x8W_LOAD 8
-%if mmsize==16
-    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
-%else
-    SWAP  1, 4, 2, 3
-    mova  m0, [t5]
-    mova  m1, [t5+r1]
-    mova  m2, [t5+r1*2]
-    mova  m3, [t5+t6]
-    TRANSPOSE4x4W 0, 1, 2, 3, 4
-%endif
-%endmacro
-
-%macro TRANSPOSE8x2W_STORE 8
-    punpckhwd  m0, m1, m2
-    punpcklwd  m1, m2
-%if mmsize==8
-    movd       %3, m0
-    movd       %1, m1
-    psrlq      m1, 32
-    psrlq      m0, 32
-    movd       %2, m1
-    movd       %4, m0
-%else
-    movd       %5, m0
-    movd       %1, m1
-    psrldq     m1, 4
-    psrldq     m0, 4
-    movd       %2, m1
-    movd       %6, m0
-    psrldq     m1, 4
-    psrldq     m0, 4
-    movd       %3, m1
-    movd       %7, m0
-    psrldq     m1, 4
-    psrldq     m0, 4
-    movd       %4, m1
-    movd       %8, m0
-%endif
-%endmacro
-
 %macro SBUTTERFLY3 4
     punpckh%1  %4, %2, %3
     punpckl%1  %2, %3
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index 402ed9bfac..c253d02954 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -34,6 +34,7 @@ pw_pixel_max: times 8 dw ((1 << 10)-1)
 SECTION .text
 
 cextern pw_2
+cextern pw_3
 cextern pw_4
 
 ; out: %4 = |%1-%2|-%3
@@ -802,3 +803,108 @@ INIT_AVX
 DEBLOCK_LUMA avx
 DEBLOCK_LUMA_INTRA avx
 %endif
+
+; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
+; out: %1=p0', %2=q0'
+%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
+    mova    %6, [pw_2]
+    paddw   %6, %3
+    paddw   %6, %4
+    paddw   %7, %6, %2
+    paddw   %6, %1
+    paddw   %6, %3
+    paddw   %7, %4
+    psraw   %6, 2
+    psraw   %7, 2
+    psubw   %6, %1
+    psubw   %7, %2
+    pand    %6, %5
+    pand    %7, %5
+    paddw   %1, %6
+    paddw   %2, %7
+%endmacro
+
+%macro CHROMA_V_LOAD 1
+    mova        m0, [r0]    ; p1
+    mova        m1, [r0+r1] ; p0
+    mova        m2, [%1]    ; q0
+    mova        m3, [%1+r1] ; q1
+%endmacro
+
+%macro CHROMA_V_STORE 0
+    mova [r0+1*r1], m1
+    mova [r0+2*r1], m2
+%endmacro
+
+%macro DEBLOCK_CHROMA 1
+;-----------------------------------------------------------------------------
+; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
+    mov         r5, r0
+    sub         r0, r1
+    sub         r0, r1
+    shl        r2d, 2
+    shl        r3d, 2
+%if mmsize < 16
+    mov         r6, 16/mmsize
+.loop:
+%endif
+    CHROMA_V_LOAD r5
+    LOAD_AB     m4, m5, r2, r3
+    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
+    pxor        m4, m4
+    LOAD_TC     m6, r4
+    psubw       m6, [pw_3]
+    pmaxsw      m6, m4
+    pand        m7, m6
+    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+    CHROMA_V_STORE
+%if mmsize < 16
+    add         r0, mmsize
+    add         r5, mmsize
+    add         r4, mmsize/8
+    dec         r6
+    jg .loop
+    REP_RET
+%else
+    RET
+%endif
+
+;-----------------------------------------------------------------------------
+; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
+    mov         r4, r0
+    sub         r0, r1
+    sub         r0, r1
+    shl        r2d, 2
+    shl        r3d, 2
+%if mmsize < 16
+    mov         r5, 16/mmsize
+.loop:
+%endif
+    CHROMA_V_LOAD r4
+    LOAD_AB     m4, m5, r2, r3
+    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
+    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
+    CHROMA_V_STORE
+%if mmsize < 16
+    add         r0, mmsize
+    add         r4, mmsize
+    dec         r5
+    jg .loop
+    REP_RET
+%else
+    RET
+%endif
+%endmacro
+
+%ifndef ARCH_X86_64
+INIT_MMX
+DEBLOCK_CHROMA mmxext
+%endif
+INIT_XMM
+DEBLOCK_CHROMA sse2
+INIT_AVX
+DEBLOCK_CHROMA avx
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 42dae93f2d..01b11163c8 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -236,10 +236,18 @@ LF_FUNC (h,  luma,         depth, sse2)\
 LF_IFUNC(h,  luma_intra,   depth, sse2)\
 LF_FUNC (v,  luma,         depth, sse2)\
 LF_IFUNC(v,  luma_intra,   depth, sse2)\
+LF_FUNC (h,  chroma,       depth, sse2)\
+LF_IFUNC(h,  chroma_intra, depth, sse2)\
+LF_FUNC (v,  chroma,       depth, sse2)\
+LF_IFUNC(v,  chroma_intra, depth, sse2)\
 LF_FUNC (h,  luma,         depth,  avx)\
 LF_IFUNC(h,  luma_intra,   depth,  avx)\
 LF_FUNC (v,  luma,         depth,  avx)\
-LF_IFUNC(v,  luma_intra,   depth,  avx)
+LF_IFUNC(v,  luma_intra,   depth,  avx)\
+LF_FUNC (h,  chroma,       depth,  avx)\
+LF_IFUNC(h,  chroma_intra, depth,  avx)\
+LF_FUNC (v,  chroma,       depth,  avx)\
+LF_IFUNC(v,  chroma_intra, depth,  avx)
 
 LF_FUNCS( uint8_t,  8)
 LF_FUNCS(uint16_t, 10)
@@ -401,12 +409,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
     if (mm_flags & AV_CPU_FLAG_MMX) {
         if (mm_flags & AV_CPU_FLAG_MMX2) {
 #if ARCH_X86_32
+            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmxext;
+            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmxext;
             c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext;
             c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext;
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
 #endif
             if (mm_flags&AV_CPU_FLAG_SSE2) {
+                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
+                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
 #if HAVE_ALIGNED_STACK
                 c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
                 c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
@@ -415,6 +427,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
 #endif
             }
             if (mm_flags&AV_CPU_FLAG_AVX) {
+                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx;
+                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
 #if HAVE_ALIGNED_STACK
                 c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
                 c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;